fix regexp-matching bug
In a pattern like a*b, a naive attempt to match will take quadratic time on an input that contains all "a"s and no "b". To improve that case, the regexp compiler detects that a match will require a "b" and checks the input for a "b" to enable linear-time failure. That optimization mishandled `(?!...)` and `(?<!...)` patterns, treating the must-not-match subpatterns as things that must match. So, (regexp-match "a*(?!b)" "aaaxy") returned false, because the input doesn't contain "b". This commit repairs the optimization. Closes #1468
This commit is contained in:
parent
5ec147ee40
commit
00644821de
|
@ -1800,6 +1800,20 @@
|
|||
|
||||
(err/rt-test (regexp "+" #f) (lambda (exn) (regexp-match? "`[+]' follows nothing in pattern" (exn-message exn))))
|
||||
|
||||
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Make sure that negated patterns as literal strings are not recorded
|
||||
;; as "must include this literal string" requirements
|
||||
|
||||
(test '("aaa") regexp-match #rx"a*(?!b)" "aaaxy")
|
||||
(test '("aaa") regexp-match #rx"a*(?<!b)" "aaaxy")
|
||||
|
||||
;; Make sure "must match" strings are preserved for non-negated
|
||||
;; lookahead and lookbehind; the following examples take
|
||||
;; quadratic time without the "must match" optimization,
|
||||
;; and return right away with #f with the optimization
|
||||
(test #f 'optimized (regexp-match #px"a*(?=bc)" (make-bytes 100024 (char->integer #\a))))
|
||||
(test #f 'optimized (regexp-match #px"a*(?<=bc)" (make-bytes 100024 (char->integer #\a))))
|
||||
|
||||
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
(report-errs)
|
||||
|
|
|
@ -281,12 +281,16 @@ regcomp(char *expstr, rxpos exp, int explen, int pcre, Scheme_Object *handler)
|
|||
longest = 0;
|
||||
longest_is_ci = 0;
|
||||
len = 0;
|
||||
for (; scan != 0; scan = regnext(scan)) {
|
||||
for (; scan != 0; ) {
|
||||
int mscan = scan;
|
||||
while (1) {
|
||||
int mop;
|
||||
mop = rOP(mscan);
|
||||
if (((mop == EXACTLY) || (mop == EXACTLY_CI))
|
||||
if ((mop == LOOKF) || (mop == LOOKBF)) {
|
||||
/* skip over part that we don't want to match */
|
||||
mscan = mscan + rOPLEN(OPERAND(mscan));
|
||||
mscan = NEXT_OP(mscan);
|
||||
} else if (((mop == EXACTLY) || (mop == EXACTLY_CI))
|
||||
&& rOPLEN(OPERAND(mscan)) >= len) {
|
||||
/* Skip regmust if it contains a null character: */
|
||||
rxpos ls = OPSTR(OPERAND(mscan));
|
||||
|
@ -321,6 +325,13 @@ regcomp(char *expstr, rxpos exp, int explen, int pcre, Scheme_Object *handler)
|
|||
break;
|
||||
}
|
||||
prev_op = rOP(scan);
|
||||
if ((prev_op == LOOKF) || (prev_op == LOOKBF)) {
|
||||
/* skip over part that we don't want to match */
|
||||
scan = scan + rOPLEN(OPERAND(scan));
|
||||
scan = NEXT_OP(scan);
|
||||
} else {
|
||||
scan = regnext(scan);
|
||||
}
|
||||
}
|
||||
if (longest) {
|
||||
r->regmust = longest;
|
||||
|
|
Loading…
Reference in New Issue
Block a user