fix regexp-matching bug
In a pattern like a*b, a naive attempt to match will take quadratic time on an input that contains all "a"s and no "b". To improve that case, the regexp compiler detects that a match will require a "b" and checks the input for a "b" to enable linear-time failure. That optimization mishandled `(?!...)` and `(?<!...)` patterns, treating the must-not-match subpatterns as things that must match. So, (regexp-match "a*(?!b)" "aaaxy") returned false, because the input doesn't contain "b". This commit repairs the optimization. Closes #1468
This commit is contained in:
parent
5ec147ee40
commit
00644821de
|
@ -1800,6 +1800,20 @@
|
||||||
|
|
||||||
(err/rt-test (regexp "+" #f) (lambda (exn) (regexp-match? "`[+]' follows nothing in pattern" (exn-message exn))))
|
(err/rt-test (regexp "+" #f) (lambda (exn) (regexp-match? "`[+]' follows nothing in pattern" (exn-message exn))))
|
||||||
|
|
||||||
|
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; Make sure that negated patterns as literal strings are not recorded
|
||||||
|
;; as "must include this literal string" requirements
|
||||||
|
|
||||||
|
(test '("aaa") regexp-match #rx"a*(?!b)" "aaaxy")
|
||||||
|
(test '("aaa") regexp-match #rx"a*(?<!b)" "aaaxy")
|
||||||
|
|
||||||
|
;; Make sure "must match" strings are preserved for non-negated
|
||||||
|
;; lookahead and lookbehind; the following examples take
|
||||||
|
;; quadratic time without the "must match" optimization,
|
||||||
|
;; and return right away with #f with the optimization
|
||||||
|
(test #f 'optimized (regexp-match #px"a*(?=bc)" (make-bytes 100024 (char->integer #\a))))
|
||||||
|
(test #f 'optimized (regexp-match #px"a*(?<=bc)" (make-bytes 100024 (char->integer #\a))))
|
||||||
|
|
||||||
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
(report-errs)
|
(report-errs)
|
||||||
|
|
|
@ -281,12 +281,16 @@ regcomp(char *expstr, rxpos exp, int explen, int pcre, Scheme_Object *handler)
|
||||||
longest = 0;
|
longest = 0;
|
||||||
longest_is_ci = 0;
|
longest_is_ci = 0;
|
||||||
len = 0;
|
len = 0;
|
||||||
for (; scan != 0; scan = regnext(scan)) {
|
for (; scan != 0; ) {
|
||||||
int mscan = scan;
|
int mscan = scan;
|
||||||
while (1) {
|
while (1) {
|
||||||
int mop;
|
int mop;
|
||||||
mop = rOP(mscan);
|
mop = rOP(mscan);
|
||||||
if (((mop == EXACTLY) || (mop == EXACTLY_CI))
|
if ((mop == LOOKF) || (mop == LOOKBF)) {
|
||||||
|
/* skip over part that we don't want to match */
|
||||||
|
mscan = mscan + rOPLEN(OPERAND(mscan));
|
||||||
|
mscan = NEXT_OP(mscan);
|
||||||
|
} else if (((mop == EXACTLY) || (mop == EXACTLY_CI))
|
||||||
&& rOPLEN(OPERAND(mscan)) >= len) {
|
&& rOPLEN(OPERAND(mscan)) >= len) {
|
||||||
/* Skip regmust if it contains a null character: */
|
/* Skip regmust if it contains a null character: */
|
||||||
rxpos ls = OPSTR(OPERAND(mscan));
|
rxpos ls = OPSTR(OPERAND(mscan));
|
||||||
|
@ -321,6 +325,13 @@ regcomp(char *expstr, rxpos exp, int explen, int pcre, Scheme_Object *handler)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
prev_op = rOP(scan);
|
prev_op = rOP(scan);
|
||||||
|
if ((prev_op == LOOKF) || (prev_op == LOOKBF)) {
|
||||||
|
/* skip over part that we don't want to match */
|
||||||
|
scan = scan + rOPLEN(OPERAND(scan));
|
||||||
|
scan = NEXT_OP(scan);
|
||||||
|
} else {
|
||||||
|
scan = regnext(scan);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (longest) {
|
if (longest) {
|
||||||
r->regmust = longest;
|
r->regmust = longest;
|
||||||
|
|
Loading…
Reference in New Issue
Block a user