minor regexp clean-ups
This commit is contained in:
parent
f1fdeb3bff
commit
249a9e38f7
|
@ -56,13 +56,15 @@
|
||||||
[else (raise-type-error 'regexp-replace-quote
|
[else (raise-type-error 'regexp-replace-quote
|
||||||
"string or byte string" s)]))
|
"string or byte string" s)]))
|
||||||
|
|
||||||
|
;; This was originally intended to be general, but it has become specialized
|
||||||
|
;; to deal with the combination of a regexp and a number:
|
||||||
(define (make-regexp-tweaker tweaker)
|
(define (make-regexp-tweaker tweaker)
|
||||||
(let ([t (make-weak-hasheq)])
|
(let ([t (make-hash)])
|
||||||
(lambda (rx)
|
(lambda (rx n)
|
||||||
(define-syntax-rule (->str x) (if (bytes? x) (bytes->string/utf-8 x) x))
|
(define-syntax-rule (->str x) (if (bytes? x) (bytes->string/utf-8 x) x))
|
||||||
(define-syntax-rule (->bts x) (if (bytes? x) x (string->bytes/utf-8 x)))
|
(define-syntax-rule (->bts x) (if (bytes? x) x (string->bytes/utf-8 x)))
|
||||||
(define-syntax-rule (tweak unwrap wrap convert)
|
(define-syntax-rule (tweak unwrap wrap convert)
|
||||||
(let ([tweaked (tweaker (unwrap rx))])
|
(let ([tweaked (tweaker (unwrap rx) n)])
|
||||||
;; the tweaker is allowed to return a regexp
|
;; the tweaker is allowed to return a regexp
|
||||||
(if (or (regexp? tweaked) (byte-regexp? tweaked))
|
(if (or (regexp? tweaked) (byte-regexp? tweaked))
|
||||||
tweaked
|
tweaked
|
||||||
|
@ -81,8 +83,9 @@
|
||||||
'regexp-tweaker
|
'regexp-tweaker
|
||||||
"regexp, byte regexp, string, or byte string"
|
"regexp, byte regexp, string, or byte string"
|
||||||
rx)]))
|
rx)]))
|
||||||
(or (hash-ref t rx #f)
|
(let ([key (cons n rx)])
|
||||||
(let ([rx* (run-tweak)]) (hash-set! t rx rx*) rx*)))))
|
(or (hash-ref t key #f)
|
||||||
|
(let ([rx* (run-tweak)]) (hash-set! t key rx*) rx*))))))
|
||||||
|
|
||||||
(define (regexp-try-match pattern input-port [start-k 0] [end-k #f] [out #f])
|
(define (regexp-try-match pattern input-port [start-k 0] [end-k #f] [out #f])
|
||||||
(unless (input-port? input-port)
|
(unless (input-port? input-port)
|
||||||
|
@ -109,8 +112,8 @@
|
||||||
;; Helper macro for the regexp functions below, with some utilities.
|
;; Helper macro for the regexp functions below, with some utilities.
|
||||||
(define (bstring-length s)
|
(define (bstring-length s)
|
||||||
(if (bytes? s) (bytes-length s) (string-length s)))
|
(if (bytes? s) (bytes-length s) (string-length s)))
|
||||||
(define (no-empty-edge-matches n)
|
(define no-empty-edge-matches
|
||||||
(make-regexp-tweaker (lambda (rx)
|
(make-regexp-tweaker (lambda (rx n)
|
||||||
(if (bytes? rx)
|
(if (bytes? rx)
|
||||||
(bytes-append #"(?:"
|
(bytes-append #"(?:"
|
||||||
rx
|
rx
|
||||||
|
@ -166,8 +169,7 @@
|
||||||
(let loop ([acc '()] [start start] [end end] [ipre ipre] [0-ok? #t])
|
(let loop ([acc '()] [start start] [end end] [ipre ipre] [0-ok? #t])
|
||||||
(let* ([rx (if 0-ok?
|
(let* ([rx (if 0-ok?
|
||||||
orig-rx
|
orig-rx
|
||||||
((no-empty-edge-matches (add1 (bytes-length ipre)))
|
(no-empty-edge-matches orig-rx (add1 (bytes-length ipre))))])
|
||||||
orig-rx))])
|
|
||||||
(if (and port-success-choose (input-port? string))
|
(if (and port-success-choose (input-port? string))
|
||||||
|
|
||||||
;; Input port match, get string
|
;; Input port match, get string
|
||||||
|
|
|
@ -84,7 +84,6 @@ THREAD_LOCAL_DECL(static rxpos regcode) ; /* Code-emit pointer, if less than
|
||||||
THREAD_LOCAL_DECL(static rxpos regcodesize);
|
THREAD_LOCAL_DECL(static rxpos regcodesize);
|
||||||
THREAD_LOCAL_DECL(static rxpos regcodemax);
|
THREAD_LOCAL_DECL(static rxpos regcodemax);
|
||||||
THREAD_LOCAL_DECL(static long regmaxlookback);
|
THREAD_LOCAL_DECL(static long regmaxlookback);
|
||||||
static int reghasgenlookback; /* FIXME: make this thread local */
|
|
||||||
|
|
||||||
/* caches to avoid gc */
|
/* caches to avoid gc */
|
||||||
THREAD_LOCAL_DECL(static long rx_buffer_size);
|
THREAD_LOCAL_DECL(static long rx_buffer_size);
|
||||||
|
@ -170,7 +169,6 @@ regcomp(char *expstr, rxpos exp, int explen, int pcre)
|
||||||
regnpar = 1;
|
regnpar = 1;
|
||||||
regncounter = 0;
|
regncounter = 0;
|
||||||
regmaxlookback = 0;
|
regmaxlookback = 0;
|
||||||
reghasgenlookback = 0;
|
|
||||||
regcode = 1;
|
regcode = 1;
|
||||||
regcodesize = 0;
|
regcodesize = 0;
|
||||||
regcodemax = 0;
|
regcodemax = 0;
|
||||||
|
@ -202,8 +200,6 @@ regcomp(char *expstr, rxpos exp, int explen, int pcre)
|
||||||
r->nsubexp = regnpar;
|
r->nsubexp = regnpar;
|
||||||
r->ncounter = regncounter;
|
r->ncounter = regncounter;
|
||||||
r->maxlookback = regmaxlookback;
|
r->maxlookback = regmaxlookback;
|
||||||
if (reghasgenlookback)
|
|
||||||
r->flags |= REGEXP_LOOKBEHIND;
|
|
||||||
|
|
||||||
/* Second pass: emit code. */
|
/* Second pass: emit code. */
|
||||||
regparse = exp;
|
regparse = exp;
|
||||||
|
@ -498,7 +494,7 @@ reg(int paren, int *flagp, int paren_set, int lookahead, int parse_flags)
|
||||||
rxpos br;
|
rxpos br;
|
||||||
rxpos ender;
|
rxpos ender;
|
||||||
int parno = 0;
|
int parno = 0;
|
||||||
int flags, matchmin, matchmax, maxlookback, brcount, hasgenlookback;
|
int flags, matchmin, matchmax, maxlookback, brcount;
|
||||||
Scheme_Hash_Table *backdepends;
|
Scheme_Hash_Table *backdepends;
|
||||||
|
|
||||||
#ifdef DO_STACK_CHECK
|
#ifdef DO_STACK_CHECK
|
||||||
|
@ -564,7 +560,6 @@ reg(int paren, int *flagp, int paren_set, int lookahead, int parse_flags)
|
||||||
matchmin = regmatchmin;
|
matchmin = regmatchmin;
|
||||||
matchmax = regmatchmax;
|
matchmax = regmatchmax;
|
||||||
maxlookback = regmaxlookback;
|
maxlookback = regmaxlookback;
|
||||||
hasgenlookback = reghasgenlookback;
|
|
||||||
brcount = 1;
|
brcount = 1;
|
||||||
while (regparsestr[regparse] == '|') {
|
while (regparsestr[regparse] == '|') {
|
||||||
brcount++;
|
brcount++;
|
||||||
|
@ -595,15 +590,12 @@ reg(int paren, int *flagp, int paren_set, int lookahead, int parse_flags)
|
||||||
matchmax = regmatchmax;
|
matchmax = regmatchmax;
|
||||||
if (regmaxlookback > maxlookback)
|
if (regmaxlookback > maxlookback)
|
||||||
maxlookback = regmaxlookback;
|
maxlookback = regmaxlookback;
|
||||||
if (reghasgenlookback)
|
|
||||||
hasgenlookback = 1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
regbackdepends = backdepends;
|
regbackdepends = backdepends;
|
||||||
regmatchmin = matchmin;
|
regmatchmin = matchmin;
|
||||||
regmatchmax = matchmax;
|
regmatchmax = matchmax;
|
||||||
regmaxlookback = maxlookback;
|
regmaxlookback = maxlookback;
|
||||||
reghasgenlookback = hasgenlookback;
|
|
||||||
|
|
||||||
if (paren && paren_set) {
|
if (paren && paren_set) {
|
||||||
Scheme_Object *assumed;
|
Scheme_Object *assumed;
|
||||||
|
@ -660,7 +652,6 @@ reg(int paren, int *flagp, int paren_set, int lookahead, int parse_flags)
|
||||||
if (matchmax > 0x7FFF)
|
if (matchmax > 0x7FFF)
|
||||||
FAIL("lookbehind match is potentially too long (more than 32767 bytes)");
|
FAIL("lookbehind match is potentially too long (more than 32767 bytes)");
|
||||||
regmaxlookback = matchmax + maxlookback;
|
regmaxlookback = matchmax + maxlookback;
|
||||||
reghasgenlookback = 1;
|
|
||||||
if (ret + 8 < regcodesize) {
|
if (ret + 8 < regcodesize) {
|
||||||
regstr[ret + 5] = (matchmin >> 8);
|
regstr[ret + 5] = (matchmin >> 8);
|
||||||
regstr[ret + 6] = (matchmin & 255);
|
regstr[ret + 6] = (matchmin & 255);
|
||||||
|
@ -726,7 +717,7 @@ regbranch(int *flagp, int parse_flags, int without_branch_node)
|
||||||
{
|
{
|
||||||
rxpos ret;
|
rxpos ret;
|
||||||
rxpos chain, latest;
|
rxpos chain, latest;
|
||||||
int flags = 0, matchmin = 0, matchmax = 0, maxlookback = 0, hasgenlookback = 0, pcount = 0, save_flags;
|
int flags = 0, matchmin = 0, matchmax = 0, maxlookback = 0, pcount = 0, save_flags;
|
||||||
|
|
||||||
*flagp = (WORST|SPFIXED); /* Tentatively. */
|
*flagp = (WORST|SPFIXED); /* Tentatively. */
|
||||||
|
|
||||||
|
@ -757,8 +748,6 @@ regbranch(int *flagp, int parse_flags, int without_branch_node)
|
||||||
regtail(chain, latest);
|
regtail(chain, latest);
|
||||||
if (!(flags&SPFIXED))
|
if (!(flags&SPFIXED))
|
||||||
*flagp &= ~SPFIXED;
|
*flagp &= ~SPFIXED;
|
||||||
if (reghasgenlookback && (regmaxlookback > matchmin))
|
|
||||||
hasgenlookback = 1;
|
|
||||||
if ((regmaxlookback - matchmin) > maxlookback)
|
if ((regmaxlookback - matchmin) > maxlookback)
|
||||||
maxlookback = regmaxlookback - matchmin;
|
maxlookback = regmaxlookback - matchmin;
|
||||||
matchmin += regmatchmin;
|
matchmin += regmatchmin;
|
||||||
|
@ -771,7 +760,6 @@ regbranch(int *flagp, int parse_flags, int without_branch_node)
|
||||||
regmatchmin = matchmin;
|
regmatchmin = matchmin;
|
||||||
regmatchmax = matchmax;
|
regmatchmax = matchmax;
|
||||||
regmaxlookback = maxlookback;
|
regmaxlookback = maxlookback;
|
||||||
reghasgenlookback = hasgenlookback;
|
|
||||||
if (chain == 0) { /* Loop ran zero times. */
|
if (chain == 0) { /* Loop ran zero times. */
|
||||||
latest = regnode(NOTHING);
|
latest = regnode(NOTHING);
|
||||||
if (without_branch_node)
|
if (without_branch_node)
|
||||||
|
@ -1085,7 +1073,6 @@ regatom(int *flagp, int parse_flags, int at_start)
|
||||||
*flagp = (WORST|SPFIXED); /* Tentatively. */
|
*flagp = (WORST|SPFIXED); /* Tentatively. */
|
||||||
regmatchmin = regmatchmax = 1;
|
regmatchmin = regmatchmax = 1;
|
||||||
regmaxlookback = 0;
|
regmaxlookback = 0;
|
||||||
reghasgenlookback = 0;
|
|
||||||
|
|
||||||
switch (regparsestr[regparse++]) {
|
switch (regparsestr[regparse++]) {
|
||||||
case '^':
|
case '^':
|
||||||
|
@ -1331,12 +1318,10 @@ regatom(int *flagp, int parse_flags, int at_start)
|
||||||
ret = regnode(WORDBOUND);
|
ret = regnode(WORDBOUND);
|
||||||
regmatchmin = regmatchmax = 0;
|
regmatchmin = regmatchmax = 0;
|
||||||
regmaxlookback = 1;
|
regmaxlookback = 1;
|
||||||
reghasgenlookback = 1;
|
|
||||||
} else if ((parse_flags & PARSE_PCRE) && (c == 'B')) {
|
} else if ((parse_flags & PARSE_PCRE) && (c == 'B')) {
|
||||||
ret = regnode(NOTWORDBOUND);
|
ret = regnode(NOTWORDBOUND);
|
||||||
regmatchmin = regmatchmax = 0;
|
regmatchmin = regmatchmax = 0;
|
||||||
regmaxlookback = 1;
|
regmaxlookback = 1;
|
||||||
reghasgenlookback = 1;
|
|
||||||
} else if ((parse_flags & PARSE_PCRE) && (c == 'p')) {
|
} else if ((parse_flags & PARSE_PCRE) && (c == 'p')) {
|
||||||
ret = regunicode(0);
|
ret = regunicode(0);
|
||||||
regmatchmax = MAX_UTF8_CHAR_BYTES;
|
regmatchmax = MAX_UTF8_CHAR_BYTES;
|
||||||
|
|
|
@ -31,7 +31,6 @@ typedef struct regexp {
|
||||||
#define REGEXP_ANCH 0x04
|
#define REGEXP_ANCH 0x04
|
||||||
#define REGEXP_MUST_CI 0x08
|
#define REGEXP_MUST_CI 0x08
|
||||||
#define REGEXP_JIT 0x10
|
#define REGEXP_JIT 0x10
|
||||||
#define REGEXP_LOOKBEHIND 0x20
|
|
||||||
|
|
||||||
#ifdef INDIRECT_TO_PROGRAM
|
#ifdef INDIRECT_TO_PROGRAM
|
||||||
# define N_ITO_DELTA(prog, extra, re) extra
|
# define N_ITO_DELTA(prog, extra, re) extra
|
||||||
|
|
Loading…
Reference in New Issue
Block a user