minor regexp clean-ups

This commit is contained in:
Matthew Flatt 2010-04-15 10:10:19 -04:00
parent f1fdeb3bff
commit 249a9e38f7
3 changed files with 13 additions and 27 deletions

View File

@ -56,13 +56,15 @@
[else (raise-type-error 'regexp-replace-quote
"string or byte string" s)]))
;; This was originally intended to be general, but it has become specialized
;; to deal with the combination of a regexp and a number:
(define (make-regexp-tweaker tweaker)
(let ([t (make-weak-hasheq)])
(lambda (rx)
(let ([t (make-hash)])
(lambda (rx n)
(define-syntax-rule (->str x) (if (bytes? x) (bytes->string/utf-8 x) x))
(define-syntax-rule (->bts x) (if (bytes? x) x (string->bytes/utf-8 x)))
(define-syntax-rule (tweak unwrap wrap convert)
(let ([tweaked (tweaker (unwrap rx))])
(let ([tweaked (tweaker (unwrap rx) n)])
;; the tweaker is allowed to return a regexp
(if (or (regexp? tweaked) (byte-regexp? tweaked))
tweaked
@ -81,8 +83,9 @@
'regexp-tweaker
"regexp, byte regexp, string, or byte string"
rx)]))
(or (hash-ref t rx #f)
(let ([rx* (run-tweak)]) (hash-set! t rx rx*) rx*)))))
(let ([key (cons n rx)])
(or (hash-ref t key #f)
(let ([rx* (run-tweak)]) (hash-set! t key rx*) rx*))))))
(define (regexp-try-match pattern input-port [start-k 0] [end-k #f] [out #f])
(unless (input-port? input-port)
@ -109,8 +112,8 @@
;; Helper macro for the regexp functions below, with some utilities.
;; Length of `s` in its own units: the number of bytes when `s` is a
;; byte string, otherwise whatever `string-length` reports for it.
(define (bstring-length s)
  (cond
    [(bytes? s) (bytes-length s)]
    [else (string-length s)]))
(define (no-empty-edge-matches n)
(make-regexp-tweaker (lambda (rx)
(define no-empty-edge-matches
(make-regexp-tweaker (lambda (rx n)
(if (bytes? rx)
(bytes-append #"(?:"
rx
@ -166,8 +169,7 @@
(let loop ([acc '()] [start start] [end end] [ipre ipre] [0-ok? #t])
(let* ([rx (if 0-ok?
orig-rx
((no-empty-edge-matches (add1 (bytes-length ipre)))
orig-rx))])
(no-empty-edge-matches orig-rx (add1 (bytes-length ipre))))])
(if (and port-success-choose (input-port? string))
;; Input port match, get string

View File

@ -84,7 +84,6 @@ THREAD_LOCAL_DECL(static rxpos regcode) ; /* Code-emit pointer, if less than
THREAD_LOCAL_DECL(static rxpos regcodesize);
THREAD_LOCAL_DECL(static rxpos regcodemax);
THREAD_LOCAL_DECL(static long regmaxlookback);
static int reghasgenlookback; /* FIXME: make this thread local */
/* caches to avoid gc */
THREAD_LOCAL_DECL(static long rx_buffer_size);
@ -170,7 +169,6 @@ regcomp(char *expstr, rxpos exp, int explen, int pcre)
regnpar = 1;
regncounter = 0;
regmaxlookback = 0;
reghasgenlookback = 0;
regcode = 1;
regcodesize = 0;
regcodemax = 0;
@ -202,8 +200,6 @@ regcomp(char *expstr, rxpos exp, int explen, int pcre)
r->nsubexp = regnpar;
r->ncounter = regncounter;
r->maxlookback = regmaxlookback;
if (reghasgenlookback)
r->flags |= REGEXP_LOOKBEHIND;
/* Second pass: emit code. */
regparse = exp;
@ -498,7 +494,7 @@ reg(int paren, int *flagp, int paren_set, int lookahead, int parse_flags)
rxpos br;
rxpos ender;
int parno = 0;
int flags, matchmin, matchmax, maxlookback, brcount, hasgenlookback;
int flags, matchmin, matchmax, maxlookback, brcount;
Scheme_Hash_Table *backdepends;
#ifdef DO_STACK_CHECK
@ -564,7 +560,6 @@ reg(int paren, int *flagp, int paren_set, int lookahead, int parse_flags)
matchmin = regmatchmin;
matchmax = regmatchmax;
maxlookback = regmaxlookback;
hasgenlookback = reghasgenlookback;
brcount = 1;
while (regparsestr[regparse] == '|') {
brcount++;
@ -595,15 +590,12 @@ reg(int paren, int *flagp, int paren_set, int lookahead, int parse_flags)
matchmax = regmatchmax;
if (regmaxlookback > maxlookback)
maxlookback = regmaxlookback;
if (reghasgenlookback)
hasgenlookback = 1;
}
}
regbackdepends = backdepends;
regmatchmin = matchmin;
regmatchmax = matchmax;
regmaxlookback = maxlookback;
reghasgenlookback = hasgenlookback;
if (paren && paren_set) {
Scheme_Object *assumed;
@ -660,7 +652,6 @@ reg(int paren, int *flagp, int paren_set, int lookahead, int parse_flags)
if (matchmax > 0x7FFF)
FAIL("lookbehind match is potentially too long (more than 32767 bytes)");
regmaxlookback = matchmax + maxlookback;
reghasgenlookback = 1;
if (ret + 8 < regcodesize) {
regstr[ret + 5] = (matchmin >> 8);
regstr[ret + 6] = (matchmin & 255);
@ -726,7 +717,7 @@ regbranch(int *flagp, int parse_flags, int without_branch_node)
{
rxpos ret;
rxpos chain, latest;
int flags = 0, matchmin = 0, matchmax = 0, maxlookback = 0, hasgenlookback = 0, pcount = 0, save_flags;
int flags = 0, matchmin = 0, matchmax = 0, maxlookback = 0, pcount = 0, save_flags;
*flagp = (WORST|SPFIXED); /* Tentatively. */
@ -757,8 +748,6 @@ regbranch(int *flagp, int parse_flags, int without_branch_node)
regtail(chain, latest);
if (!(flags&SPFIXED))
*flagp &= ~SPFIXED;
if (reghasgenlookback && (regmaxlookback > matchmin))
hasgenlookback = 1;
if ((regmaxlookback - matchmin) > maxlookback)
maxlookback = regmaxlookback - matchmin;
matchmin += regmatchmin;
@ -771,7 +760,6 @@ regbranch(int *flagp, int parse_flags, int without_branch_node)
regmatchmin = matchmin;
regmatchmax = matchmax;
regmaxlookback = maxlookback;
reghasgenlookback = hasgenlookback;
if (chain == 0) { /* Loop ran zero times. */
latest = regnode(NOTHING);
if (without_branch_node)
@ -1085,7 +1073,6 @@ regatom(int *flagp, int parse_flags, int at_start)
*flagp = (WORST|SPFIXED); /* Tentatively. */
regmatchmin = regmatchmax = 1;
regmaxlookback = 0;
reghasgenlookback = 0;
switch (regparsestr[regparse++]) {
case '^':
@ -1331,12 +1318,10 @@ regatom(int *flagp, int parse_flags, int at_start)
ret = regnode(WORDBOUND);
regmatchmin = regmatchmax = 0;
regmaxlookback = 1;
reghasgenlookback = 1;
} else if ((parse_flags & PARSE_PCRE) && (c == 'B')) {
ret = regnode(NOTWORDBOUND);
regmatchmin = regmatchmax = 0;
regmaxlookback = 1;
reghasgenlookback = 1;
} else if ((parse_flags & PARSE_PCRE) && (c == 'p')) {
ret = regunicode(0);
regmatchmax = MAX_UTF8_CHAR_BYTES;

View File

@ -31,7 +31,6 @@ typedef struct regexp {
#define REGEXP_ANCH 0x04
#define REGEXP_MUST_CI 0x08
#define REGEXP_JIT 0x10
#define REGEXP_LOOKBEHIND 0x20
#ifdef INDIRECT_TO_PROGRAM
# define N_ITO_DELTA(prog, extra, re) extra