codeql/javascript/ql/src/Performance/ReDoS.ql at fix-javascript-example · sj/codeql · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
/**
 * @name Inefficient regular expression
 * @description A regular expression that requires exponential time to match certain inputs
 *              can be a performance bottleneck, and may be vulnerable to denial-of-service
 *              attacks.
 * @kind problem
 * @problem.severity error
 * @precision high
 * @id js/redos
 * @tags security
 *       external/cwe/cwe-730
 *       external/cwe/cwe-400
 */

import javascript

/*
 * This query implements the analysis described in the following two papers:
 *
 *   James Kirrage, Asiri Rathnayake, Hayo Thielecke: Static Analysis for
 *     Regular Expression Denial-of-Service Attacks. NSS 2013.
 *     (http://www.cs.bham.ac.uk/~hxt/research/reg-exp-sec.pdf)
 *   Asiri Rathnayake, Hayo Thielecke: Static Analysis for Regular Expression
 *     Exponential Runtime via Substructural Logics. 2014.
 *     (https://www.cs.bham.ac.uk/~hxt/research/redos_full.pdf)
 *
 * The basic idea is to search for overlapping cycles in the NFA, that is,
 * states `q` such that there are two distinct paths from `q` to itself
 * that consume the same word `w`.
 *
 * For any such state `q`, an attack string can be constructed as follows:
 * concatenate a prefix `v` that takes the NFA to `q` with `n` copies of
 * the word `w` that leads back to `q` along two different paths, followed
 * by a suffix `x` that is _not_ accepted in state `q`. A backtracking
 * implementation will need to explore at least 2^n different ways of going
 * from `q` back to itself while trying to match the `n` copies of `w`
 * before finally giving up.
 *
 * Now in order to identify overlapping cycles, all we have to do is find
 * pumpable forks, that is, states `q` that can transition to two different
 * states `r1` and `r2` on the same input symbol `c`, such that there are
 * paths from both `r1` and `r2` to `q` that consume the same word. The latter
 * condition is equivalent to saying that `(q, q)` is reachable from `(r1, r2)`
 * in the product NFA.
 *
 * This is what the query does. It makes no attempt to construct a prefix
 * leading into `q`, and only a weak one to construct a suffix that ensures
 * rejection; this causes some false positives. Also, the query does not fully
 * handle character classes and does not handle various other features at all;
 * this causes false negatives.
 *
 * Finally, sometimes it depends on the translation whether the NFA generated
 * for a regular expression has a pumpable fork or not. We implement one
 * particular translation, which may result in false positives or negatives
 * relative to some particular JavaScript engine.
 *
 * More precisely, the query constructs an NFA from a regular expression `r`
 * as follows:
 *
 *   * Every sub-term `t` gives rise to an NFA state `Match(t)`, representing
 *     the state of the automaton before attempting to match `t`.
 *   * There is one additional accepting state `Accept(r)`.
 *   * Transitions between states may be labelled with epsilon, or an abstract
 *     input symbol.
 *   * Each abstract input symbol represents a set of concrete input characters:
 *     either a single character, a set of characters represented by a (positive)
 *     character class, or the set of all characters.
 *   * The product automaton is constructed lazily, starting with pair states
 *     `(q, q)` where `q` is a fork, and proceding along an over-approximate
 *     step relation.
 *   * The over-approximate step relation allows transitions along pairs of
 *     abstract input symbols as long as the symbols are not trivially incompatible.
 *   * Once a trace of pairs of abstract input symbols that leads from a fork
 *     back to itself has been identified, we attempt to construct a concrete
 *     string corresponding to it, which may fail.
 *   * Instead of trying to construct a suffix that makes the automaton fail,
 *     we ensure that it isn't possible to reach the accepting state from the
 *     fork along epsilon transitions. In this case, it is very likely (though
 *     not guaranteed) that a rejecting suffix exists.
 */

/**
 * A branch in a disjunction that is the root node in a literal, or a literal
 * whose root node is not a disjunction.
 */
class RegExpRoot extends @regexpterm { // RegExpTerm is abstract, so do not extend it.
  RegExpRoot() {
    exists (RegExpLiteral literal, RegExpAlt alt | alt.getParent() = literal |
      this = alt.getAChild())
    or
    exists (RegExpLiteral literal |
      not exists (RegExpAlt alt | alt.getParent() = literal) and
      this.(RegExpTerm).getParent() = literal)
  }

  /**
   * Holds if this root term is relevant to the ReDoS analysis.
   */
  predicate isRelevant() {
    // there is at least one repetition
    exists (RegExpRepetition rep | getRoot(rep) = this)
    and
    // there are no lookbehinds
    not exists (RegExpLookbehind lbh | getRoot(lbh) = this)
  }

  string toString() { result = this.(RegExpTerm).toString() }
}

/**
 * A term that matches repetitions of a given pattern, that is, `E*`, `E+`, or `E{n,m}`.
 */
class RegExpRepetition extends RegExpParent {
  RegExpRepetition() {
    this instanceof RegExpStar or
    this instanceof RegExpPlus or
    this instanceof RegExpRange
  }
}

/**
 * Gets the root containing the given term, that is, the root of the literal,
 * or a branch of the root disjunction.
 */
RegExpRoot getRoot(RegExpTerm term) {
  result = term or
  result = getRoot(term.getParent())
}

/**
 * An abstract input symbol, representing a set of concrete characters.
 */
newtype TInputSymbol =
  /** An input symbol corresponding to character `c`. */
  Char(string c) { c = any(RegExpConstant cc).getValue() }
  or
  /**
   * An input symbol representing all characters matched by
   * (positive, non-universal) character class `recc`.
   */
  CharClass(RegExpCharacterClass recc) {
    getRoot(recc).isRelevant() and
    not recc.isInverted() and
    not isUniversalClass(recc)
  }
  or
  /** An input symbol representing all characters matched by `.`. */
  Dot()
  or
  /** An input symbol representing all characters. */
  Any()
  or
  /** An epsilon transition in the automaton. */
  Epsilon()


/**
 * Holds if character class `cc` matches all characters.
 */
predicate isUniversalClass(RegExpCharacterClass cc) {
  // [^]
  cc.isInverted() and not exists(cc.getAChild())
  or
  // [\w\W] and similar
  not cc.isInverted() and
  exists (string cce1, string cce2 |
    cce1 = cc.getAChild().(RegExpCharacterClassEscape).getValue() and
    cce2 = cc.getAChild().(RegExpCharacterClassEscape).getValue() |
    cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase()
  )
}

/**
 * An abstract input symbol, representing a set of concrete characters.
 */
class InputSymbol extends TInputSymbol {
  InputSymbol() {
    not this instanceof Epsilon
  }

  string toString() {
    this = Char(result) or
    result = any(RegExpCharacterClass recc | this = CharClass(recc)).toString() or
    this = Dot() and result = "." or
    this = Any() and result = "[^]"
  }
}

/**
 * Gets a lower bound on the characters matched by the given character class term.
 */
string getCCLowerBound(RegExpTerm t) {
  t.getParent() instanceof RegExpCharacterClass and
  (
    result = t.(RegExpConstant).getValue() or
    t.(RegExpCharacterRange).isRange(result, _) or
    exists (string name | name = t.(RegExpCharacterClassEscape).getValue() |
      name = "w" and result = "0" or
      name = "W" and result = "" or
      name = "s" and result = "" or
      name = "S" and result = "")
  )
}

/**
 * The highest character used in a regular expression. Used to represent intervals without an upper bound.
 */
string highestCharacter() { result = max(RegExpConstant c || c.getValue()) }

/**
 * Gets an upper bound on the characters matched by the given character class term.
 */
string getCCUpperBound(RegExpTerm t) {
  t.getParent() instanceof RegExpCharacterClass and
  (
    result = t.(RegExpConstant).getValue() or
    t.(RegExpCharacterRange).isRange(_, result) or
    exists (string name | name = t.(RegExpCharacterClassEscape).getValue() |
      name = "w" and result = "z" or
      name = "W" and result = highestCharacter() or
      name = "s" and result = highestCharacter() or
      name = "S" and result = highestCharacter())
  )
}

/**
 * Holds if `s` belongs to `l` and is a character class whose set of matched characters is contained
 * in the interval `lo-hi`.
 */
predicate hasBounds(RegExpRoot l, InputSymbol s, string lo, string hi) {
  exists (RegExpCharacterClass cc | s = CharClass(cc) |
    l = getRoot(cc) and
    lo = min(getCCLowerBound(cc.getAChild())) and
    hi = max(getCCUpperBound(cc.getAChild())))
}

/**
 * Holds if `s1` and `s2` possibly have a non-empty intersection.
 *
 * This predicate is over-approximate; it is only used for pruning the search space.
 */
predicate compatible(InputSymbol s1, InputSymbol s2) {
  exists (RegExpRoot l, string lo1, string lo2, string hi1, string hi2 |
    hasBounds(l, s1, lo1, hi1) and hasBounds(l, s2, lo2, hi2) and
    max(string s | s = lo1 or s = lo2) <= min(string s | s = hi1 or s = hi2))
  or
  exists (intersect(s1, s2))
}

newtype TState =
  Match(RegExpTerm t) {
    getRoot(t).isRelevant()
  }
  or
  Accept(RegExpRoot l) {
    l.isRelevant()
  }

/**
 * A state in the NFA corresponding to a regular expression.
 *
 * Each regular expression literal `l` has one accepting state
 * `Accept(l)` and one state `Match(t)` for every subterm `t`,
 * which represents the state of the NFA before starting to
 * match `t`.
 */
class State extends TState {
  RegExpParent repr;

  State() {
    this = Match(repr) or this = Accept(repr)
  }

  string toString() {
    result = "Match(" + (RegExpTerm)repr + ")" or
    result = "Accept(" + (RegExpRoot)repr + ")"
  }

  Location getLocation() {
    result = repr.getLocation()
  }
}

class EdgeLabel extends TInputSymbol {
  string toString() {
    this = Epsilon() and result = "" or
    exists (InputSymbol s | this = s and result = s.toString())
  }
}

/**
 * Gets a state the NFA may be in after matching `t`.
 */
State after(RegExpTerm t) {
  exists (RegExpAlt alt | t = alt.getAChild() |
    result = after(alt)
  )
  or
  exists (RegExpSequence seq, int i | t = seq.getChild(i) |
    result = Match(seq.getChild(i+1)) or
    i+1 = seq.getNumChild() and result = after(seq)
  )
  or
  exists (RegExpGroup grp | t = grp.getAChild() |
    result = after(grp)
  )
  or
  exists (RegExpStar star | t = star.getAChild() |
    result = Match(star)
  )
  or
  exists (RegExpPlus plus | t = plus.getAChild() |
    result = Match(plus) or
    result = after(plus)
  )
  or
  exists (RegExpOpt opt | t = opt.getAChild() |
    result = after(opt)
  )
  or
  exists (RegExpRoot root | t = root |
    result = Accept(root)
  )
}

/**
 * Holds if the NFA has a transition from `q1` to `q2` labelled with `lbl`.
 */
predicate delta(State q1, EdgeLabel lbl, State q2) {
  exists (RegExpConstant s |
    q1 = Match(s) and lbl = Char(s.getValue()) and q2 = after(s)
  )
  or
  exists (RegExpDot dot, RegExpLiteral rel |
    q1 = Match(dot) and q2 = after(dot) and rel = dot.getLiteral() |
    if rel.isDotAll() then lbl = Any() else lbl = Dot()
  )
  or
  exists (RegExpCharacterClass cc |
    isUniversalClass(cc) and q1 = Match(cc) and lbl = Any() and q2 = after(cc) or
    q1 = Match(cc) and lbl = CharClass(cc) and q2 = after(cc)
  )
  or
  exists (RegExpAlt alt | lbl = Epsilon() |
    q1 = Match(alt) and q2 = Match(alt.getAChild())
  )
  or
  exists (RegExpSequence seq | lbl = Epsilon() |
    q1 = Match(seq) and q2 = Match(seq.getChild(0))
  )
  or
  exists (RegExpGroup grp | lbl = Epsilon() |
    q1 = Match(grp) and q2 = Match(grp.getChild(0))
  )
  or
  exists (RegExpStar star | lbl = Epsilon() |
    q1 = Match(star) and q2 = Match(star.getChild(0)) or
    q1 = Match(star) and q2 = after(star)
  )
  or
  exists (RegExpPlus plus | lbl = Epsilon() |
    q1 = Match(plus) and q2 = Match(plus.getChild(0))
  )
  or
  exists (RegExpOpt opt | lbl = Epsilon() |
    q1 = Match(opt) and q2 = Match(opt.getChild(0)) or
    q1 = Match(opt) and q2 = after(opt)
  )
}

/**
 * Gets a state that `q` has an epsilon transition to.
 */
State epsilonSucc(State q) {
  delta(q, Epsilon(), result)
}

/**
 * Gets a state that has an epsilon transition to `q`.
 */
State epsilonPred(State q) {
  q = epsilonSucc(result)
}

/**
 * Holds if there is a state `q` that can be reached from `q1`
 * along epsilon edges, such that there is a transition from
 * `q` to `q2` that consumes symbol `s`.
 */
predicate deltaClosed(State q1, InputSymbol s, State q2) {
  delta(epsilonSucc*(q1), s, q2)
}


/**
 * A state in the product automaton.
 *
 * We lazily only construct those states that we are actually
 * going to need: `(q, q)` for every fork state `q`, and any
 * pair of states that can be reached from a pair that we have
 * already constructed. To cut down on the number of states,
 * we only represent states `(q1, q2)` where `q1` is lexicographically
 * no bigger than `q2`.
 */
newtype TStatePair =
  MkStatePair(State q1, State q2) {
    isFork(q1, _, _, _, _) and q2 = q1
    or
    step(_, _, _, q1, q2) and q1.toString() <= q2.toString()
  }

class StatePair extends TStatePair {
  State q1;
  State q2;

  StatePair() { this = MkStatePair(q1, q2) }

  string toString() { result = "(" + q1 + ", " + q2 + ")" }
}

/**
 * Gets the state pair `(q1, q2)` or `(q2, q1)`; note that only
 * one or the other is defined.
 */
StatePair mkStatePair(State q1, State q2) {
  result = MkStatePair(q1, q2) or result = MkStatePair(q2, q1)
}

predicate isStatePair(StatePair p) { any() }

predicate delta2(StatePair q, StatePair r) { step(q, _, _, r) }

/**
 * Gets the minimum length of a path from `q` to `r` in the
 * product automaton.
 */
int statePairDist(StatePair q, StatePair r) =
  shortestDistances(isStatePair/1, delta2/2)(q, r, result)

/**
 * Holds if there are transitions from `q` to `r1` and from `q` to `r2`
 * labelled with `s1` and `s2`, respectively, where `s1` and `s2` do not
 * trivially have an empty intersection.
 *
 * This predicate only holds for states associated with regular expressions
 * that have at least one repetition quantifier in them (otherwise the
 * expression cannot be vulnerable to ReDoS attacks anyway).
 */
pragma[noopt]
predicate isFork(State q, InputSymbol s1, InputSymbol s2, State r1, State r2) {
  exists (State q1, State q2 |
    q1 = epsilonSucc*(q) and delta(q1, s1, r1) and
    q2 = epsilonSucc*(q) and delta(q2, s2, r2) and
    // Use pragma[noopt] to prevent compatible(s1,s2) from being the starting point of the join.
    // From (s1,s2) it would find a huge number of intermediate state pairs (q1,q2) originating from different literals,
    // and discover at the end that no `q` can reach both `q1` and `q2` by epsilon transitions.
    compatible(s1, s2) |
    s1 != s2 or
    r1 != r2 or
    r1 = r2 and q1 != q2
  )
}

/**
 * Holds if there are transitions from the components of `q` to the corresponding
 * components of `r` labelled with `s1` and `s2`, respectively.
 */
predicate step(StatePair q, InputSymbol s1, InputSymbol s2, StatePair r) {
  exists (State r1, State r2 |
    step(q, s1, s2, r1, r2) and r = mkStatePair(r1, r2)
  )
}

/**
 * Holds if there are transitions from the components of `q` to `r1` and `r2`
 * labelled with `s1` and `s2`, respectively.
 */
predicate step(StatePair q, InputSymbol s1, InputSymbol s2, State r1, State r2) {
  exists (State q1, State q2 | q = MkStatePair(q1, q2) |
    deltaClosed(q1, s1, r1) and deltaClosed(q2, s2, r2) and
    compatible(s1, s2)
  )
}

/**
 * A list of pairs of input symbols that describe a path in the product automaton
 * starting from some fork state.
 */
newtype Trace =
  Nil() or
  Step(InputSymbol s1, InputSymbol s2, Trace t) {
    exists (StatePair p |
      isReachableFromFork(_, p, t, _) and
      step(p, s1, s2, _)
    ) or
    t = Nil() and isFork(_, s1, s2, _, _)
  }

/**
 * Gets a character that is represented by both `c` and `d`.
 */
string intersect(InputSymbol c, InputSymbol d) {
  c = Char(result) and (
    d = Char(result) or
    exists (RegExpCharacterClass cc | d = CharClass(cc) |
      exists (RegExpTerm child | child = cc.getAChild() |
        result = child.(RegExpConstant).getValue() or
        exists (string lo, string hi | child.(RegExpCharacterRange).isRange(lo, hi) |
          lo <= result and result <= hi
        )
      )
    ) or
    d = Dot() and not (result = "\n" or result = "\r") or
    d = Any()
  ) or
  exists (RegExpCharacterClass cc | c = CharClass(cc) and result = choose(cc) |
    d = CharClass(cc) or
    d = Dot() and not (result = "\n" or result = "\r") or
    d = Any()
  ) or
  c = Dot() and (
    d = Dot() and result = "a" or
    d = Any() and result = "a"
  ) or
  c = Any() and d = Any() and result = "a"
  or
  result = intersect(d, c)
}

/**
 * Gets a character matched by character class `cc`.
 */
string choose(RegExpCharacterClass cc) {
  result = min(string c |
    exists (RegExpTerm child | child = cc.getAChild() |
      c = child.(RegExpConstant).getValue() or
      child.(RegExpCharacterRange).isRange(c, _)
    )
  )
}

/**
 * Gets a string corresponding to the trace `t`.
 */
string concretise(Trace t) {
  t = Nil() and result = ""
  or
  exists (InputSymbol s1, InputSymbol s2, Trace rest | t = Step(s1, s2, rest) |
    result = concretise(rest) + intersect(s1, s2)
  )
}

/**
 * Holds if `r` is reachable from `(fork, fork)` under input `w`, and there is
 * a path from `r` back to `(fork, fork)` with `rem` steps.
 */
predicate isReachableFromFork(State fork, StatePair r, Trace w, int rem) {
  exists (InputSymbol s1, InputSymbol s2, State q1, State q2 |
    isFork(fork, s1, s2, q1, q2) and
    r = MkStatePair(q1, q2) and
    w = Step(s1, s2, Nil()) and
    rem = statePairDist(r, MkStatePair(fork, fork))
  )
  or
  exists (StatePair p, Trace v, InputSymbol s1, InputSymbol s2 |
    isReachableFromFork(fork, p, v, rem+1) and
    step(p, s1, s2, r) and
    w = Step(s1, s2, v) and
    rem > 0
  )
}

/**
 * Gets a state in the product automaton from which `(fork, fork)` is
 * reachable in zero or more epsilon transitions.
 */
StatePair getAForkPair(State fork) {
  isFork(fork, _, _, _, _) and
  result = MkStatePair(epsilonPred*(fork), epsilonPred*(fork))
}

/**
 * Holds if `fork` is a pumpable fork with word `w`.
 */
predicate isPumpable(State fork, string w) {
  exists (StatePair q, Trace t |
    isReachableFromFork(fork, q, t, _) and
    (
      q = getAForkPair(fork) and w = concretise(t)
      or
      exists (InputSymbol s1, InputSymbol s2 |
        step(q, s1, s2, getAForkPair(fork)) and
        w = concretise(Step(s1, s2, t))
      )
    )
  )
}

/**
 * Gets a state that can be reached from pumpable `fork` consuming
 * the first `i+1` characters of `w`.
 *
 * Character classes are overapproximated as intervals; for example,
 * `[a-ln-z]` is treated the same as `[a-z]`, and hence considered
 * to match `m`, even though in fact it does not. This is fine for
 * our purposes, since we only use this predicate to avoid false
 * positives.
 */
State process(State fork, string w, int i) {
  isPumpable(fork, w) and
  exists (State prev | i = 0 and prev = fork or prev = process(fork, w, i-1) |
    exists (InputSymbol s |
      deltaClosed(prev, s, result) and
      exists(intersect(Char(w.charAt(i)), s))
    )
  )
}

/**
 * Gets the result of backslash-escaping newlines, carriage-returns and
 * backslashes in `s`.
 */
bindingset[s]
string escape(string s) {
  result = s.replaceAll("\\", "\\\\").replaceAll("\n", "\\n").replaceAll("\r", "\\r")
}

from RegExpTerm t, string c
where c = min(string w | isPumpable(Match(t), w)) and not isPumpable(epsilonSucc+(Match(t)), _) and
      not epsilonSucc*(process(Match(t), c, c.length()-1)) = Accept(_)
select t, "This part of the regular expression may cause exponential backtracking on strings " +
          "containing many repetitions of '" + escape(c) + "'."