improve regexp support (PR 10855, halfway)

This commit is contained in:
Matthew Flatt 2010-04-14 18:40:08 -04:00
parent 3fe4a22dbd
commit 1c34ccec44
13 changed files with 1433 additions and 1026 deletions

View File

@ -108,25 +108,29 @@
;; Helper macro for the regexp functions below, with some utilities.
(define (bstring-length s)
(if (bytes? s) (bytes-length s) (string-length s)))
(define no-empty-edge-matches
(define (no-empty-edge-matches n)
(make-regexp-tweaker (lambda (rx)
(if (bytes? rx)
(bytes-append #"(?=.)(?:" rx #")(?<=.)")
(format "(?=.)(?:~a)(?<=.)" rx)))))
(define (bstring->no-edge-regexp name pattern)
(if (or (regexp? pattern) (byte-regexp? pattern)
(string? pattern) (bytes? pattern))
(no-empty-edge-matches pattern)
(raise-type-error
name "regexp, byte regexp, string, or byte string" pattern)))
(bytes-append #"(?=.)(?:" rx #")(?<=" (make-bytes n (char->integer #\.)) #")")
(format "(?=.)(?:~a)(?<=~a)" rx (make-bytes n (char->integer #\.)))))))
(define-syntax-rule (regexp-loop
name loop start end rx string
name loop start end pattern string
ipre
success-choose failure-k
port-success-k port-success-choose port-failure-k
need-leftover? peek?)
(let ([len (cond [(string? string) (string-length string)]
[(bytes? string) (bytes-length string)]
[else #f])])
(let* ([len (cond [(string? string) (string-length string)]
[(bytes? string) (bytes-length string)]
[else #f])]
[orig-rx (cond [(bytes? pattern) (byte-regexp pattern)]
[(string? pattern) (regexp pattern)]
[(regexp? pattern) pattern]
[(byte-regexp? pattern) pattern]
[else
(raise-type-error 'name
"regexp, byte regexp, string, or byte string"
pattern)])]
[max-lookbehind (regexp-max-lookbehind orig-rx)])
(if peek?
(unless (input-port? string)
(raise-type-error 'name "input port" string))
@ -140,6 +144,8 @@
(and (number? end) (exact? end) (integer? end)
(end . >= . 0)))
(raise-type-error 'name "non-negative exact integer or false" end))
(unless (bytes? ipre)
(raise-type-error 'name "byte string" ipre))
(unless (or (input-port? string) (and len (start . <= . len)))
(raise-mismatch-error
'name
@ -153,66 +159,76 @@
(format "ending offset index out of range [~a,~a]: " start len)
end))
(reverse
(let loop ([acc '()] [start start] [end end])
(let loop ([acc '()] [start start] [end end] [ipre ipre] [rx #f] [rx-lb 0])
(let* ([new-rx-lb (add1 (bytes-length ipre))]
[rx (if (= rx-lb new-rx-lb)
rx
((no-empty-edge-matches new-rx-lb) orig-rx))])
(if (and port-success-choose (input-port? string))
(if (and port-success-choose (input-port? string))
;; Input port match, get string
(let* ([_ (when (positive? start)
;; Skip start chars:
(let ([s (make-bytes 4096)])
(let loop ([n 0])
(unless (= n start)
(let ([m (read-bytes-avail!
s string 0 (min (- start n) 4096))])
(unless (eof-object? m) (loop (+ n m))))))))]
[discarded/leftovers (if need-leftover? #f 0)]
[spitout (if need-leftover?
(open-output-bytes)
(make-output-port
'counter always-evt
(lambda (s start end flush? breakable?)
(let ([c (- end start)])
(set! discarded/leftovers
(+ c discarded/leftovers))
c))
void))]
[end (and end (- end start))]
[m (regexp-match rx string 0 end spitout)]
[m (and m (car m))]
[discarded/leftovers (if need-leftover?
(get-output-bytes spitout)
discarded/leftovers)]
[end (and end m
(- end (if need-leftover?
(bstring-length discarded/leftovers)
discarded/leftovers)
(bstring-length m)))])
(if m
(loop (cons (port-success-choose m discarded/leftovers) acc)
0 end)
(port-failure-k acc discarded/leftovers)))
;; String/port match, get positions
(let ([m (if peek?
(regexp-match-peek-positions rx string start end)
(regexp-match-positions rx string start end))])
(if (not m)
(failure-k acc start end)
(let ([mstart (caar m)] [mend (cdar m)])
(if port-success-k
(port-success-k
(lambda (acc new-start new-end)
(loop acc new-start new-end))
acc start end mstart mend)
(loop (cons (success-choose start mstart mend) acc)
mend end))))))))))
;; Input port match, get string
(let* ([_ (when (positive? start)
;; Skip start chars:
(let ([s (make-bytes 4096)])
(let loop ([n 0])
(unless (= n start)
(let ([m (read-bytes-avail!
s string 0 (min (- start n) 4096))])
(unless (eof-object? m) (loop (+ n m))))))))]
[discarded/leftovers (if need-leftover? #f 0)]
[spitout (if need-leftover?
(open-output-bytes)
(make-output-port
'counter always-evt
(lambda (s start end flush? breakable?)
(let ([c (- end start)])
(set! discarded/leftovers
(+ c discarded/leftovers))
c))
void))]
[end (and end (- end start))])
(let-values ([(m ipre) (regexp-match/end rx string 0 end spitout ipre
max-lookbehind)])
(let* ([m (and m (car m))]
[discarded/leftovers (if need-leftover?
(get-output-bytes spitout)
discarded/leftovers)]
[end (and end m
(- end (if need-leftover?
(bstring-length discarded/leftovers)
discarded/leftovers)
(bstring-length m)))])
(if m
(loop (cons (port-success-choose m discarded/leftovers) acc)
0 end ipre
rx new-rx-lb)
(port-failure-k acc discarded/leftovers)))))
;; String/port match, get positions
(let-values ([(m ipre)
(if peek?
(regexp-match-peek-positions/end rx string start end #f ipre
max-lookbehind)
(regexp-match-positions/end rx string start end #f ipre
max-lookbehind))])
(if (not m)
(failure-k acc start end)
(let ([mstart (caar m)] [mend (cdar m)])
(if port-success-k
(port-success-k
(lambda (acc new-start new-end)
(loop acc new-start new-end ipre rx new-rx-lb))
acc start end mstart mend)
(loop (cons (success-choose start mstart mend) acc)
mend end ipre rx new-rx-lb)))))))))))
;; Returns all the positions at which the pattern matched.
(define (regexp-match-positions* pattern string [start 0] [end #f])
(define (regexp-match-positions* pattern string [start 0] [end #f] [ipre #""])
(regexp-loop
regexp-match-positions* loop start end
(bstring->no-edge-regexp 'regexp-match-positions* pattern) string
pattern string
ipre
;; success-choose:
(lambda (start mstart mend) (cons mstart mend))
;; failure-k:
@ -233,10 +249,11 @@
#f))
;; Returns all the positions at which the pattern matched.
(define (regexp-match-peek-positions* pattern string [start 0] [end #f])
(define (regexp-match-peek-positions* pattern string [start 0] [end #f] [ipre #""])
(regexp-loop
regexp-match-peek-positions* loop start end
(bstring->no-edge-regexp 'regexp-match-peek-positions* pattern) string
pattern string
ipre
;; success-choose:
(lambda (start mstart mend) (cons mstart mend))
;; failure-k:
@ -250,13 +267,13 @@
;; Splits a string into a list by removing any piece which matches
;; the pattern.
(define (regexp-split pattern string [start 0] [end #f])
(define rx (bstring->no-edge-regexp 'regexp-split pattern))
(define buf (if (and (string? string) (byte-regexp? rx))
(define (regexp-split pattern string [start 0] [end #f] [ipre #""])
(define buf (if (and (string? string) (or (byte-regexp? pattern)
(bytes? pattern)))
(string->bytes/utf-8 string (char->integer #\?))
string))
(define sub (if (bytes? buf) subbytes substring))
(regexp-loop regexp-split loop start end rx buf
(regexp-loop regexp-split loop start end pattern buf ipre
;; success-choose:
(lambda (start mstart mend) (sub buf start mstart))
;; failure-k:
@ -272,13 +289,13 @@
#f))
;; Returns all the matches for the pattern in the string.
(define (regexp-match* pattern string [start 0] [end #f])
(define rx (bstring->no-edge-regexp 'regexp-match* pattern))
(define buf (if (and (string? string) (byte-regexp? rx))
(define (regexp-match* pattern string [start 0] [end #f] [ipre #""])
(define buf (if (and (string? string) (or (byte-regexp? pattern)
(bytes? pattern)))
(string->bytes/utf-8 string (char->integer #\?))
string))
(define sub (if (bytes? buf) subbytes substring))
(regexp-loop regexp-match* loop start end rx buf
(regexp-loop regexp-match* loop start end pattern buf ipre
;; success-choose:
(lambda (start mstart mend) (sub buf mstart mend))
;; failure-k:

View File

@ -195,6 +195,16 @@ case-sensitively.
(regexp-match (regexp-quote ".") "apple.scm")
]}
@defproc[(regexp-max-lookbehind [pattern (or/c regexp? byte-regexp?)])
exact-nonnegative-integer?]{
Returns the maximum number of bytes that @scheme[pattern] may consult
before the starting position of a match to determine the match. For
example, the pattern @litchar{(?<=abc)d} consults three bytes
preceding a matching @litchar{d}, while @litchar{e(?<=a..)d} consults
two bytes before a matching @litchar{ed}. A @litchar{^} pattern may
consult a preceding byte to determine whether the current position is
the start of the input or of a line.}
@;------------------------------------------------------------------------
@section{Regexp Matching}
@ -203,7 +213,8 @@ case-sensitively.
[input (or/c string? bytes? input-port?)]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[output-port (or/c output-port? #f) #f])
[output-port (or/c output-port? #f) #f]
[input-prefix bytes? #""])
(if (and (or (string? pattern) (regexp? pattern))
(string? input))
(or/c #f (cons/c string? (listof (or/c string? #f))))
@ -228,9 +239,18 @@ is an input port, and if the end-of-file is reached before
@scheme[start-pos] bytes are skipped, then the match fails.
In @scheme[pattern], a start-of-string @litchar{^} refers to the first
position of @scheme[input] after @scheme[start-pos], and the
end-of-input @litchar{$} refers to the @scheme[end-pos]th position or
(in the case of an input port) the end of file, whichever comes first.
position of @scheme[input] after @scheme[start-pos], assuming that
@scheme[input-prefix] is @scheme[#""]. The end-of-input @litchar{$}
refers to the @scheme[end-pos]th position or (in the case of an input
port) the end of file, whichever comes first, assuming that
@scheme[output-prefix] is @scheme[#f].
The @scheme[input-prefix] specifies bytes that effectively precede
@scheme[input] for the purposes of @litchar{^} and other look-behind
matching. For example, a @scheme[#""] prefix means that @litchar{^}
matches at the beginning of the stream, while a @scheme[#"\n"]
@scheme[input-prefix] means that a start-of-line @litchar{^} can match
the beginning of the input, while a start-of-file @litchar{^} cannot.
If the match fails, @scheme[#f] is returned. If the match succeeds, a
list containing strings or byte string, and possibly @scheme[#f], is
@ -302,7 +322,8 @@ bytes. To avoid such interleaving, use @scheme[regexp-match-peek]
@defproc[(regexp-match* [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f])
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[input-prefix bytes? #""])
(if (and (or (string? pattern) (regexp? pattern))
(string? input))
(listof string?)
@ -315,12 +336,14 @@ results for parenthesized sub-patterns in @scheme[pattern] are not
returned.)
The @scheme[pattern] is used in order to find matches, where each
match attempt starts at the end of the last match. Empty matches are
handled like any matches, returning a zero-length string or byte
sequence (they are more useful in the complementing
@scheme[regexp-split] function). However, the @scheme[pattern] is
restricted from matching an empty string at the beginning (or right
after a previous match) or at the end.
match attempt starts at the end of the last match, and @litchar{$} is
allowed to match the beginning of the input (if @scheme[input-prefix]
is @scheme[#""]) only for the first match. Empty matches are handled
like other matches, returning a zero-length string or byte sequence
(they are more useful in the complementing @scheme[regexp-split]
function), but @scheme[pattern] is restricted from matching an empty
string at the beginning (or right after a previous match) or at the
end.
If @scheme[input] contains no matches (in the range @scheme[start-pos]
to @scheme[end-pos]), @scheme[null] is returned. Otherwise, each item
@ -339,7 +362,8 @@ port).
[input input-port?]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[output-port (or/c output-port? #f) #f])
[output-port (or/c output-port? #f) #f]
[input-prefix bytes? #""])
(if (and (or (string? pattern) (regexp? pattern))
(string? input))
(or/c #f (cons/c string? (listof (or/c string? #f))))
@ -360,7 +384,8 @@ fails.}
[input (or/c string? bytes? input-port?)]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[output-port (or/c output-port? #f) #f])
[output-port (or/c output-port? #f) #f]
[input-prefix bytes? #""])
(or/c (cons/c (cons/c exact-nonnegative-integer?
exact-nonnegative-integer?)
(listof (or/c (cons/c exact-nonnegative-integer?
@ -388,11 +413,11 @@ positions indicate the number of bytes that were read, including
(regexp-match-positions #rx"(-[0-9]*)+" "a-12--345b")
]}
@defproc[(regexp-match-positions* [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f])
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[input-prefix bytes? #""])
(listof (cons/c exact-nonnegative-integer?
exact-nonnegative-integer?))]{
@ -408,7 +433,8 @@ like @scheme[regexp-match*].
[input (or/c string? bytes? input-port?)]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[output-port (or/c output-port? #f) #f])
[output-port (or/c output-port? #f) #f]
[input-prefix bytes? #""])
boolean?]{
Like @scheme[regexp-match], but returns merely @scheme[#t] when the
@ -437,19 +463,20 @@ entire content of @scheme[input] matches @scheme[pattern].
[input input-port?]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[progress (or/c evt #f) #f])
[progress (or/c evt #f) #f]
[input-prefix bytes? #""])
(or/c (cons/c bytes? (listof (or/c bytes? #f)))
#f)]{
Like @scheme[regexp-match] on input ports, but only peeks bytes from
@scheme[input-port] instead of reading them. Furthermore, instead of
@scheme[input] instead of reading them. Furthermore, instead of
an output port, the last optional argument is a progress event for
@scheme[input-port] (see @scheme[port-progress-evt]). If @scheme[progress]
becomes ready, then the match stops peeking from @scheme[input-port]
@scheme[input] (see @scheme[port-progress-evt]). If @scheme[progress]
becomes ready, then the match stops peeking from @scheme[input]
and returns @scheme[#f]. The @scheme[progress] argument can be
@scheme[#f], in which case the peek may continue with inconsistent
information if another process meanwhile reads from
@scheme[input-port].
@scheme[input].
@examples[
(define p (open-input-string "a abcd"))
@ -466,7 +493,8 @@ information if another process meanwhile reads from
[input input-port?]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[progress (or/c evt #f) #f])
[progress (or/c evt #f) #f]
[input-prefix bytes? #""])
(or/c (cons/c (cons/c exact-nonnegative-integer?
exact-nonnegative-integer?)
(listof (or/c (cons/c exact-nonnegative-integer?
@ -475,7 +503,7 @@ information if another process meanwhile reads from
#f)]{
Like @scheme[regexp-match-positions] on input ports, but only peeks
bytes from @scheme[input-port] instead of reading them, and with a
bytes from @scheme[input] instead of reading them, and with a
@scheme[progress] argument like @scheme[regexp-match-peek].}
@ -483,12 +511,13 @@ bytes from @scheme[input-port] instead of reading them, and with a
[input input-port?]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[progress (or/c evt #f) #f])
[progress (or/c evt #f) #f]
[input-prefix bytes? #""])
(or/c (cons/c bytes? (listof (or/c bytes? #f)))
#f)]{
Like @scheme[regexp-match-peek], but it attempts to match only bytes
that are available from @scheme[input-port] without blocking. The
that are available from @scheme[input] without blocking. The
match fails if not-yet-available characters might be used to match
@scheme[pattern].}
@ -497,7 +526,8 @@ match fails if not-yet-available characters might be used to match
[input input-port?]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[progress (or/c evt #f) #f])
[progress (or/c evt #f) #f]
[input-prefix bytes? #""])
(or/c (cons/c (cons/c exact-nonnegative-integer?
exact-nonnegative-integer?)
(listof (or/c (cons/c exact-nonnegative-integer?
@ -506,7 +536,7 @@ match fails if not-yet-available characters might be used to match
#f)]{
Like @scheme[regexp-match-peek-positions], but it attempts to match
only bytes that are available from @scheme[input-port] without
only bytes that are available from @scheme[input] without
blocking. The match fails if not-yet-available characters might be
used to match @scheme[pattern].}
@ -514,20 +544,91 @@ used to match @scheme[pattern].}
@defproc[(regexp-match-peek-positions* [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input input-port?]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f])
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[input-prefix bytes? #""])
(listof (cons/c exact-nonnegative-integer?
exact-nonnegative-integer?))]{
Like @scheme[regexp-match-peek-positions], but returns multiple matches like
@scheme[regexp-match*].}
@defproc[(regexp-match/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[output-port (or/c output-port? #f) #f]
[input-prefix bytes? #""]
[count nonnegative-exact-integer? 1])
(values
(if (and (or (string? pattern) (regexp? pattern))
(string? input))
(or/c #f (cons/c string? (listof (or/c string? #f))))
(or/c #f (cons/c bytes? (listof (or/c bytes? #f)))))
(or/c #f bytes?))]{
Like @scheme[regexp-match], but with a second result: a byte
string of up to @scheme[count] bytes that correspond to the input
(possibly including the @scheme[input-prefix]) leading to the end of
the match; the second result is @scheme[#f] if no match is found.
The second result can be useful as an @scheme[input-prefix] for
attempting a second match on @scheme[input] starting from the end of
the first match. In that case, use @scheme[regexp-max-lookbehind]
to determine an appropriate value for @scheme[count].}
@deftogether[(
@defproc[(regexp-match-positions/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[input-prefix bytes? #""]
[count exact-nonnegative-integer? 1])
(values (listof (cons/c exact-nonnegative-integer?
exact-nonnegative-integer?))
(or/c #f bytes?))]
@defproc[(regexp-match-peek-positions/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input input-port?]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[progress (or/c evt #f) #f]
[input-prefix bytes? #""]
[count exact-nonnegative-integer? 1])
(values
(or/c (cons/c (cons/c exact-nonnegative-integer?
exact-nonnegative-integer?)
(listof (or/c (cons/c exact-nonnegative-integer?
exact-nonnegative-integer?)
#f)))
#f)
(or/c #f bytes?))]
@defproc[(regexp-match-peek-positions-immediate/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input input-port?]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[progress (or/c evt #f) #f]
[input-prefix bytes? #""]
[count exact-nonnegative-integer? 1])
(values
(or/c (cons/c (cons/c exact-nonnegative-integer?
exact-nonnegative-integer?)
(listof (or/c (cons/c exact-nonnegative-integer?
exact-nonnegative-integer?)
#f)))
#f)
(or/c #f bytes?))]
)]{
Like @scheme[regexp-match-positions], etc., but with a second result
like @scheme[regexp-match/end].}
@;------------------------------------------------------------------------
@section{Regexp Splitting}
@defproc[(regexp-split [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f])
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[input-prefix bytes? #""])
(if (and (or (string? pattern) (regexp? pattern))
(string? input))
(cons/c string? (listof string?))
@ -567,7 +668,8 @@ an end-of-file if @scheme[input] is an input port).
[input (or/c string? bytes?)]
[insert (or/c string? bytes?
((string?) () #:rest (listof string?) . ->* . string?)
((bytes?) () #:rest (listof bytes?) . ->* . bytes?))])
((bytes?) () #:rest (listof bytes?) . ->* . bytes?))]
[input-prefix bytes? #""])
(if (and (or (string? pattern) (regexp? pattern))
(string? input))
string?
@ -634,7 +736,8 @@ before the @litchar{\}. For example, the Scheme constant
[input (or/c string? bytes?)]
[insert (or/c string? bytes?
(string? . -> . string?)
(bytes? . -> . bytes?))])
(bytes? . -> . bytes?))]
[input-prefix bytes? #""])
(or/c string? bytes?)]{
Like @scheme[regexp-replace], except that every instance of

View File

@ -1266,12 +1266,12 @@
(arity-test regexp 1 1)
(arity-test regexp? 1 1)
(arity-test regexp-match 2 5)
(arity-test regexp-match-positions 2 5)
(arity-test regexp-match-peek 2 5)
(arity-test regexp-match-peek-positions 2 5)
(arity-test regexp-replace 3 3)
(arity-test regexp-replace* 3 3)
(arity-test regexp-match 2 6)
(arity-test regexp-match-positions 2 6)
(arity-test regexp-match-peek 2 6)
(arity-test regexp-match-peek-positions 2 6)
(arity-test regexp-replace 3 4)
(arity-test regexp-replace* 3 4)
(test #t procedure? car)
(test #f procedure? 'car)

View File

@ -1435,7 +1435,7 @@
#"^(?:a?b?)*$"
(#"(?=(a+?))(\\1ab)" #"aaab" (#"aab" #"a" #"aab"))
(#"(\\w+:)+" #"one:" (#"one:" #"one:"))
(#"$(?<=^(a))" #"a" #f)
(#"$(?<=^(a))" #"a" (#"" #"a"))
(#"(?=(a+?))(\\1ab)" #"aaab" (#"aab" #"a" #"aab"))
(#"^(?=(a+?))\\1ab" #"aaab" #f)
(#"^(?=(a+?))\\1ab" #"aaab" #f)
@ -1710,4 +1710,52 @@
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Check prefixes and suffixes that disallow matches at ^ at start or $ at end:
(let ([try
(lambda (regexp-match input output [output2 output])
(let ([try-succeed
(lambda (output i s e)
(test (output (list "a")) regexp-match #rx"^a" (i) s e #f #"")
(test (output (list "a")) regexp-match #rx"(?m:^a)" (i) s e #f #"\n")
(test (output (list "a")) regexp-match #rx"(?m:^a)" (i) s e #f #"x\n"))])
(try-succeed output (lambda () (input "a")) 0 #f)
(try-succeed output2 (lambda () (input "xay")) 1 2))
(let ([try-fail
(lambda (i s e)
(test #f regexp-match #rx"^a" (i) s e #f #"\n")
(test #f regexp-match #rx"^a" (i) s e #f #"x\n")
(let ([try-always-fail
(lambda (m)
(test #f regexp-match (m #rx"^a") (i) s e #f #"x")
(test #f regexp-match (m #rx"^a") (i) s e #f #"\nx"))])
(try-always-fail values)
(try-always-fail (lambda (rx) (regexp (format "(?m:~a)" (object-name rx)))))))])
(try-fail (lambda () (input "a")) 0 #f)
(try-fail (lambda () (input "xay")) 1 2)))])
(try regexp-match values values)
(try regexp-match? values (lambda (a) (and a #t)))
(try regexp-match string->bytes/utf-8 (lambda (l) (map string->bytes/utf-8 l)))
(try regexp-match open-input-string (lambda (l) (map string->bytes/utf-8 l)))
(try regexp-match-positions values
(lambda (s) (and s '((0 . 1))))
(lambda (s) (and s '((1 . 2))))))
;; regexp-replace* disallows start matching after the first match:
(test "baa" regexp-replace* #rx"^a" "aaa" "b")
;; regexp-replace* disallows start matching after the first match:
(test "bbb" regexp-replace* #rx"(?m:^a\n)" "a\na\na\n" "b")
(test "bba\n" regexp-replace* #rx"(?m:^a[z\n])" "a\naza\n" "b")
(test-values `(("abc") #"c")
(lambda () (regexp-match/end #rx".*" "abc")))
(test-values `(((0 . 3)) #"c")
(lambda () (regexp-match-positions/end #rx".*" "abc")))
(test-values `(((0 . 3)) #"c")
(lambda () (regexp-match-peek-positions/end #rx".*" (open-input-string "abc"))))
(test-values `(((0 . 3)) #"c")
(lambda () (regexp-match-peek-positions-immediate/end #rx".*" (open-input-string "abc"))))
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(report-errs)

View File

@ -75,6 +75,10 @@
(t '("a" "b" "c") eof "[abc]" "a b c" 0)
(t '("a" "b" "c") eof "[abc]" "a b c" 0 #f)
(t '("a" "b" "c") eof "[abc]" "a b c" 0 5)
(t '("a") eof "^." "a b c" 0 5 #"")
(t '() eof "^." "a b c" 0 5 #"x")
(t '("a\n" "b\n" "c\n") eof "(?m:^.\n)" "a\nb\nc\n" 0 6)
(t '("b\n" "c\n") eof "(?m:^.\n)" "a\nb\nc\n" 0 6 #"x")
(for-each (lambda (cvt)
(test '(#"\x80" #"\x80") regexp-match* (cvt #"\x80") #"a\x80z\x80q"))
(list values byte-regexp byte-pregexp))
@ -95,6 +99,10 @@
(t '((0 . 1) (2 . 3) (4 . 5)) eof "[abc]" "a b c" 0)
(t '((0 . 1) (2 . 3) (4 . 5)) eof "[abc]" "a b c" 0 #f)
(t '((0 . 1) (2 . 3) (4 . 5)) eof "[abc]" "a b c" 0 5)
(t '((0 . 1)) eof "^." "a b c" 0 5 #"")
(t '() eof "^." "a b c" 0 5 #"x")
(t '((0 . 2) (2 . 4) (4 . 6)) eof "(?m:^.\n)" "a\nb\nc\n" 0 6)
(t '((2 . 4) (4 . 6)) eof "(?m:^.\n)" "a\nb\nc\n" 0 6 #"x")
(for-each (lambda (cvt)
(test '((1 . 2) (3 . 4)) regexp-match-positions* (cvt #"\x80") #"a\x80z\x80q"))
(list values byte-regexp byte-pregexp))
@ -115,6 +123,10 @@
(t '("" "1" "2" "") eof "[abc]" "a1b2c" 0)
(t '("" "1" "2" "") eof "[abc]" "a1b2c" 0 #f)
(t '("" "1" "2" "") eof "[abc]" "a1b2c" 0 5)
(t '("" " b c") eof "^." "a b c" 0 5 #"")
(t '("a b c") eof "^." "a b c" 0 5 #"x")
(t '("" "" "" "") eof "(?m:^.\n)" "a\nb\nc\n" 0 6)
(t '("a\n" "" "") eof "(?m:^.\n)" "a\nb\nc\n" 0 6 #"x")
(for-each (lambda (cvt)
(test '(#"" #"a" #"z" #"q" #"") regexp-split (cvt #"\x80") #"\x80a\x80z\x80q\x80"))
(list values byte-regexp byte-pregexp))
@ -135,6 +147,10 @@
(t '((0 . 1)) "a b c" "[abc]" "a b c" 0 2)
(t '((0 . 1) (2 . 3) (4 . 5)) "a b c" "[abc]" "a b c" 0 #f)
(t '((0 . 1) (2 . 3) (4 . 5)) "a b c" "[abc]" "a b c" 0 5)
(t '((0 . 1)) "a b c" "^." "a b c" 0 5 #"")
(t '() "a b c" "^." "a b c" 0 5 #"x")
(t '((0 . 2) (2 . 4) (4 . 6)) "a\nb\nc\n" "(?m:^.\n)" "a\nb\nc\n" 0 6)
(t '((2 . 4) (4 . 6)) "a\nb\nc\n" "(?m:^.\n)" "a\nb\nc\n" 0 6 #"x")
;; ---------- tests with zero-length matches ----------
;; Many of these tests can be repeated with Perl. To try something in Perl,
;; put this code in a file:

View File

@ -1,3 +1,7 @@
Version 4.2.5.9
regexp-match* et al. now disable ^ matching on all but the first
match
Version 4.2.5.3
Added chaperones

File diff suppressed because it is too large Load Diff

View File

@ -5130,6 +5130,7 @@ static int mark_regwork_MARK(void *p, struct NewGC *gc) {
gcMARK2(r->endp, gc);
gcMARK2(r->counters, gc);
gcMARK2(r->peekskip, gc);
gcMARK2(r->prefix, gc);
return
gcBYTES_TO_WORDS(sizeof(Regwork));
}
@ -5145,6 +5146,7 @@ static int mark_regwork_FIXUP(void *p, struct NewGC *gc) {
gcFIXUP2(r->endp, gc);
gcFIXUP2(r->counters, gc);
gcFIXUP2(r->peekskip, gc);
gcFIXUP2(r->prefix, gc);
return
gcBYTES_TO_WORDS(sizeof(Regwork));
}

View File

@ -2101,6 +2101,7 @@ mark_regwork {
gcMARK2(r->endp, gc);
gcMARK2(r->counters, gc);
gcMARK2(r->peekskip, gc);
gcMARK2(r->prefix, gc);
size:
gcBYTES_TO_WORDS(sizeof(Regwork));
}

File diff suppressed because it is too large Load Diff

View File

@ -13,7 +13,7 @@
#define USE_COMPILED_STARTUP 1
#define EXPECTED_PRIM_COUNT 983
#define EXPECTED_PRIM_COUNT 988
#define EXPECTED_UNSAFE_COUNT 65
#define EXPECTED_FLFXNUM_COUNT 53

View File

@ -26,11 +26,12 @@ typedef struct regexp {
#endif
} regexp;
#define REGEXP_IS_UTF8 0x01
#define REGEXP_IS_PCRE 0x02
#define REGEXP_ANCH 0x04
#define REGEXP_MUST_CI 0x08
#define REGEXP_JIT 0x10
#define REGEXP_IS_UTF8 0x01
#define REGEXP_IS_PCRE 0x02
#define REGEXP_ANCH 0x04
#define REGEXP_MUST_CI 0x08
#define REGEXP_JIT 0x10
#define REGEXP_LOOKBEHIND 0x20
#ifdef INDIRECT_TO_PROGRAM
# define N_ITO_DELTA(prog, extra, re) extra
@ -217,14 +218,17 @@ typedef struct Regwork {
char *instr;
Scheme_Object *port;
Scheme_Object *unless_evt;
short nonblock, aborted;
char nonblock, aborted;
rxpos instr_size; /* For port reads */
rxpos input_maxend; /* For port reads */
rxpos input, input_end, input_start; /* String-input pointer. */
rxpos boi, bol; /* Beginning of input/line, for ^ check. */
rxpos input_min; /* input_start minus prefix_size */
rxpos boi; /* Beginning of input, for ^ check. */
rxpos *startp; /* Pointer to startp array. */
rxpos *maybep; /* Pointer to tentative startp array. */
rxpos *endp; /* Ditto for endp. */
int *counters; /* For {} counters */
Scheme_Object *peekskip;
char *prefix;
rxpos prefix_len, prefix_delta;
} Regwork;

View File

@ -13,12 +13,12 @@
consistently.)
*/
#define MZSCHEME_VERSION "4.2.5.9"
#define MZSCHEME_VERSION "4.2.5.10"
#define MZSCHEME_VERSION_X 4
#define MZSCHEME_VERSION_Y 2
#define MZSCHEME_VERSION_Z 5
#define MZSCHEME_VERSION_W 9
#define MZSCHEME_VERSION_W 10
#define MZSCHEME_VERSION_MAJOR ((MZSCHEME_VERSION_X * 100) + MZSCHEME_VERSION_Y)
#define MZSCHEME_VERSION_MINOR ((MZSCHEME_VERSION_Z * 1000) + MZSCHEME_VERSION_W)