fix regexp-matching bug

In a pattern like

 a*b

a naive attempt to match will take quadratic time on an input that
contains all "a"s and no "b". To improve that case, the regexp compiler
detects that a match will require a "b" and checks the input for a "b"
to enable linear-time failure.

That optimization mishandled `(?!...)` and `(?<!...)` patterns,
treating the must-not-match subpatterns as things that must match.
So,

  (regexp-match "a*(?!b)" "aaaxy")

returned false, because the input doesn't contain "b".

This commit repairs the optimization.

Closes #1468
This commit is contained in:
Matthew Flatt 2016-09-24 14:43:54 -06:00
parent 5ec147ee40
commit 00644821de
2 changed files with 27 additions and 2 deletions

View File

@ -1800,6 +1800,20 @@
(err/rt-test (regexp "+" #f) (lambda (exn) (regexp-match? "`[+]' follows nothing in pattern" (exn-message exn))))
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Make sure that negated patterns as literal strings are not recorded
;; as "must include this literal string" requirements
(test '("aaa") regexp-match #rx"a*(?!b)" "aaaxy")
(test '("aaa") regexp-match #rx"a*(?<!b)" "aaaxy")
;; Make sure "must match" strings are preserved for non-negated
;; lookahead and lookbehind; the following examples take
;; quadratic time without the "must match" optimization,
;; and return right away with #f with the optimization
(test #f 'optimized (regexp-match #px"a*(?=bc)" (make-bytes 100024 (char->integer #\a))))
(test #f 'optimized (regexp-match #px"a*(?<=bc)" (make-bytes 100024 (char->integer #\a))))
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(report-errs)

View File

@ -281,12 +281,16 @@ regcomp(char *expstr, rxpos exp, int explen, int pcre, Scheme_Object *handler)
longest = 0;
longest_is_ci = 0;
len = 0;
for (; scan != 0; scan = regnext(scan)) {
for (; scan != 0; ) {
int mscan = scan;
while (1) {
int mop;
mop = rOP(mscan);
if (((mop == EXACTLY) || (mop == EXACTLY_CI))
if ((mop == LOOKF) || (mop == LOOKBF)) {
/* skip over part that we don't want to match */
mscan = mscan + rOPLEN(OPERAND(mscan));
mscan = NEXT_OP(mscan);
} else if (((mop == EXACTLY) || (mop == EXACTLY_CI))
&& rOPLEN(OPERAND(mscan)) >= len) {
/* Skip regmust if it contains a null character: */
rxpos ls = OPSTR(OPERAND(mscan));
@ -321,6 +325,13 @@ regcomp(char *expstr, rxpos exp, int explen, int pcre, Scheme_Object *handler)
break;
}
prev_op = rOP(scan);
if ((prev_op == LOOKF) || (prev_op == LOOKBF)) {
/* skip over part that we don't want to match */
scan = scan + rOPLEN(OPERAND(scan));
scan = NEXT_OP(scan);
} else {
scan = regnext(scan);
}
}
if (longest) {
r->regmust = longest;