improve regexp support (PR 10855, halfway)

This commit is contained in:
Matthew Flatt 2010-04-14 18:40:08 -04:00
parent 3fe4a22dbd
commit 1c34ccec44
13 changed files with 1433 additions and 1026 deletions

View File

@ -108,25 +108,29 @@
;; Helper macro for the regexp functions below, with some utilities. ;; Helper macro for the regexp functions below, with some utilities.
(define (bstring-length s) (define (bstring-length s)
(if (bytes? s) (bytes-length s) (string-length s))) (if (bytes? s) (bytes-length s) (string-length s)))
(define no-empty-edge-matches (define (no-empty-edge-matches n)
(make-regexp-tweaker (lambda (rx) (make-regexp-tweaker (lambda (rx)
(if (bytes? rx) (if (bytes? rx)
(bytes-append #"(?=.)(?:" rx #")(?<=.)") (bytes-append #"(?=.)(?:" rx #")(?<=" (make-bytes n (char->integer #\.)) #")")
(format "(?=.)(?:~a)(?<=.)" rx))))) (format "(?=.)(?:~a)(?<=~a)" rx (make-bytes n (char->integer #\.)))))))
(define (bstring->no-edge-regexp name pattern)
(if (or (regexp? pattern) (byte-regexp? pattern)
(string? pattern) (bytes? pattern))
(no-empty-edge-matches pattern)
(raise-type-error
name "regexp, byte regexp, string, or byte string" pattern)))
(define-syntax-rule (regexp-loop (define-syntax-rule (regexp-loop
name loop start end rx string name loop start end pattern string
ipre
success-choose failure-k success-choose failure-k
port-success-k port-success-choose port-failure-k port-success-k port-success-choose port-failure-k
need-leftover? peek?) need-leftover? peek?)
(let ([len (cond [(string? string) (string-length string)] (let* ([len (cond [(string? string) (string-length string)]
[(bytes? string) (bytes-length string)] [(bytes? string) (bytes-length string)]
[else #f])]) [else #f])]
[orig-rx (cond [(bytes? pattern) (byte-regexp pattern)]
[(string? pattern) (regexp pattern)]
[(regexp? pattern) pattern]
[(byte-regexp? pattern) pattern]
[else
(raise-type-error 'name
"regexp, byte regexp, string, or byte string"
pattern)])]
[max-lookbehind (regexp-max-lookbehind orig-rx)])
(if peek? (if peek?
(unless (input-port? string) (unless (input-port? string)
(raise-type-error 'name "input port" string)) (raise-type-error 'name "input port" string))
@ -140,6 +144,8 @@
(and (number? end) (exact? end) (integer? end) (and (number? end) (exact? end) (integer? end)
(end . >= . 0))) (end . >= . 0)))
(raise-type-error 'name "non-negative exact integer or false" end)) (raise-type-error 'name "non-negative exact integer or false" end))
(unless (bytes? ipre)
(raise-type-error 'name "byte string" ipre))
(unless (or (input-port? string) (and len (start . <= . len))) (unless (or (input-port? string) (and len (start . <= . len)))
(raise-mismatch-error (raise-mismatch-error
'name 'name
@ -153,66 +159,76 @@
(format "ending offset index out of range [~a,~a]: " start len) (format "ending offset index out of range [~a,~a]: " start len)
end)) end))
(reverse (reverse
(let loop ([acc '()] [start start] [end end]) (let loop ([acc '()] [start start] [end end] [ipre ipre] [rx #f] [rx-lb 0])
(let* ([new-rx-lb (add1 (bytes-length ipre))]
[rx (if (= rx-lb new-rx-lb)
rx
((no-empty-edge-matches new-rx-lb) orig-rx))])
(if (and port-success-choose (input-port? string))
(if (and port-success-choose (input-port? string)) ;; Input port match, get string
(let* ([_ (when (positive? start)
;; Input port match, get string ;; Skip start chars:
(let* ([_ (when (positive? start) (let ([s (make-bytes 4096)])
;; Skip start chars: (let loop ([n 0])
(let ([s (make-bytes 4096)]) (unless (= n start)
(let loop ([n 0]) (let ([m (read-bytes-avail!
(unless (= n start) s string 0 (min (- start n) 4096))])
(let ([m (read-bytes-avail! (unless (eof-object? m) (loop (+ n m))))))))]
s string 0 (min (- start n) 4096))]) [discarded/leftovers (if need-leftover? #f 0)]
(unless (eof-object? m) (loop (+ n m))))))))] [spitout (if need-leftover?
[discarded/leftovers (if need-leftover? #f 0)] (open-output-bytes)
[spitout (if need-leftover? (make-output-port
(open-output-bytes) 'counter always-evt
(make-output-port (lambda (s start end flush? breakable?)
'counter always-evt (let ([c (- end start)])
(lambda (s start end flush? breakable?) (set! discarded/leftovers
(let ([c (- end start)]) (+ c discarded/leftovers))
(set! discarded/leftovers c))
(+ c discarded/leftovers)) void))]
c)) [end (and end (- end start))])
void))] (let-values ([(m ipre) (regexp-match/end rx string 0 end spitout ipre
[end (and end (- end start))] max-lookbehind)])
[m (regexp-match rx string 0 end spitout)] (let* ([m (and m (car m))]
[m (and m (car m))] [discarded/leftovers (if need-leftover?
[discarded/leftovers (if need-leftover? (get-output-bytes spitout)
(get-output-bytes spitout) discarded/leftovers)]
discarded/leftovers)] [end (and end m
[end (and end m (- end (if need-leftover?
(- end (if need-leftover? (bstring-length discarded/leftovers)
(bstring-length discarded/leftovers) discarded/leftovers)
discarded/leftovers) (bstring-length m)))])
(bstring-length m)))]) (if m
(if m (loop (cons (port-success-choose m discarded/leftovers) acc)
(loop (cons (port-success-choose m discarded/leftovers) acc) 0 end ipre
0 end) rx new-rx-lb)
(port-failure-k acc discarded/leftovers))) (port-failure-k acc discarded/leftovers)))))
;; String/port match, get positions ;; String/port match, get positions
(let ([m (if peek? (let-values ([(m ipre)
(regexp-match-peek-positions rx string start end) (if peek?
(regexp-match-positions rx string start end))]) (regexp-match-peek-positions/end rx string start end #f ipre
(if (not m) max-lookbehind)
(failure-k acc start end) (regexp-match-positions/end rx string start end #f ipre
(let ([mstart (caar m)] [mend (cdar m)]) max-lookbehind))])
(if port-success-k
(port-success-k (if (not m)
(lambda (acc new-start new-end) (failure-k acc start end)
(loop acc new-start new-end)) (let ([mstart (caar m)] [mend (cdar m)])
acc start end mstart mend) (if port-success-k
(loop (cons (success-choose start mstart mend) acc) (port-success-k
mend end)))))))))) (lambda (acc new-start new-end)
(loop acc new-start new-end ipre rx new-rx-lb))
acc start end mstart mend)
(loop (cons (success-choose start mstart mend) acc)
mend end ipre rx new-rx-lb)))))))))))
;; Returns all the positions at which the pattern matched. ;; Returns all the positions at which the pattern matched.
(define (regexp-match-positions* pattern string [start 0] [end #f]) (define (regexp-match-positions* pattern string [start 0] [end #f] [ipre #""])
(regexp-loop (regexp-loop
regexp-match-positions* loop start end regexp-match-positions* loop start end
(bstring->no-edge-regexp 'regexp-match-positions* pattern) string pattern string
ipre
;; success-choose: ;; success-choose:
(lambda (start mstart mend) (cons mstart mend)) (lambda (start mstart mend) (cons mstart mend))
;; failure-k: ;; failure-k:
@ -233,10 +249,11 @@
#f)) #f))
;; Returns all the positions at which the pattern matched. ;; Returns all the positions at which the pattern matched.
(define (regexp-match-peek-positions* pattern string [start 0] [end #f]) (define (regexp-match-peek-positions* pattern string [start 0] [end #f] [ipre #""])
(regexp-loop (regexp-loop
regexp-match-peek-positions* loop start end regexp-match-peek-positions* loop start end
(bstring->no-edge-regexp 'regexp-match-peek-positions* pattern) string pattern string
ipre
;; success-choose: ;; success-choose:
(lambda (start mstart mend) (cons mstart mend)) (lambda (start mstart mend) (cons mstart mend))
;; failure-k: ;; failure-k:
@ -250,13 +267,13 @@
;; Splits a string into a list by removing any piece which matches ;; Splits a string into a list by removing any piece which matches
;; the pattern. ;; the pattern.
(define (regexp-split pattern string [start 0] [end #f]) (define (regexp-split pattern string [start 0] [end #f] [ipre #""])
(define rx (bstring->no-edge-regexp 'regexp-split pattern)) (define buf (if (and (string? string) (or (byte-regexp? pattern)
(define buf (if (and (string? string) (byte-regexp? rx)) (bytes? pattern)))
(string->bytes/utf-8 string (char->integer #\?)) (string->bytes/utf-8 string (char->integer #\?))
string)) string))
(define sub (if (bytes? buf) subbytes substring)) (define sub (if (bytes? buf) subbytes substring))
(regexp-loop regexp-split loop start end rx buf (regexp-loop regexp-split loop start end pattern buf ipre
;; success-choose: ;; success-choose:
(lambda (start mstart mend) (sub buf start mstart)) (lambda (start mstart mend) (sub buf start mstart))
;; failure-k: ;; failure-k:
@ -272,13 +289,13 @@
#f)) #f))
;; Returns all the matches for the pattern in the string. ;; Returns all the matches for the pattern in the string.
(define (regexp-match* pattern string [start 0] [end #f]) (define (regexp-match* pattern string [start 0] [end #f] [ipre #""])
(define rx (bstring->no-edge-regexp 'regexp-match* pattern)) (define buf (if (and (string? string) (or (byte-regexp? pattern)
(define buf (if (and (string? string) (byte-regexp? rx)) (bytes? pattern)))
(string->bytes/utf-8 string (char->integer #\?)) (string->bytes/utf-8 string (char->integer #\?))
string)) string))
(define sub (if (bytes? buf) subbytes substring)) (define sub (if (bytes? buf) subbytes substring))
(regexp-loop regexp-match* loop start end rx buf (regexp-loop regexp-match* loop start end pattern buf ipre
;; success-choose: ;; success-choose:
(lambda (start mstart mend) (sub buf mstart mend)) (lambda (start mstart mend) (sub buf mstart mend))
;; failure-k: ;; failure-k:

View File

@ -195,6 +195,16 @@ case-sensitively.
(regexp-match (regexp-quote ".") "apple.scm") (regexp-match (regexp-quote ".") "apple.scm")
]} ]}
@defproc[(regexp-max-lookbehind [pattern (or/c regexp? byte-regexp?)])
exact-nonnegative-integer?]{
Returns the maximum number of bytes that @scheme[pattern] may consult
before the starting position of a match to determine the match. For
example, the pattern @litchar{(?<=abc)d} consults three bytes
preceding a matching @litchar{d}, while @litchar{e(?<=a..)d} consults
two bytes before a matching @litchar{ed}. A @litchar{^} pattern may
consult a preceding byte to determine whether the current position is
the start of the input or of a line.}
@;------------------------------------------------------------------------ @;------------------------------------------------------------------------
@section{Regexp Matching} @section{Regexp Matching}
@ -203,7 +213,8 @@ case-sensitively.
[input (or/c string? bytes? input-port?)] [input (or/c string? bytes? input-port?)]
[start-pos exact-nonnegative-integer? 0] [start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f] [end-pos (or/c exact-nonnegative-integer? #f) #f]
[output-port (or/c output-port? #f) #f]) [output-port (or/c output-port? #f) #f]
[input-prefix bytes? #""])
(if (and (or (string? pattern) (regexp? pattern)) (if (and (or (string? pattern) (regexp? pattern))
(string? input)) (string? input))
(or/c #f (cons/c string? (listof (or/c string? #f)))) (or/c #f (cons/c string? (listof (or/c string? #f))))
@ -228,9 +239,18 @@ is an input port, and if the end-of-file is reached before
@scheme[start-pos] bytes are skipped, then the match fails. @scheme[start-pos] bytes are skipped, then the match fails.
In @scheme[pattern], a start-of-string @litchar{^} refers to the first In @scheme[pattern], a start-of-string @litchar{^} refers to the first
position of @scheme[input] after @scheme[start-pos], and the position of @scheme[input] after @scheme[start-pos], assuming that
end-of-input @litchar{$} refers to the @scheme[end-pos]th position or @scheme[input-prefix] is @scheme[#""]. The end-of-input @litchar{$}
(in the case of an input port) the end of file, whichever comes first. refers to the @scheme[end-pos]th position or (in the case of an input
port) the end of file, whichever comes first, assuming that
@scheme[output-prefix] is @scheme[#f].
The @scheme[input-prefix] specifies bytes that effectively precede
@scheme[input] for the purposes of @litchar{^} and other look-behind
matching. For example, a @scheme[#""] prefix means that @litchar{^}
matches at the beginning of the stream, while a @scheme[#"\n"]
@scheme[input-prefix] means that a start-of-line @litchar{^} can match
the beginning of the input, while a start-of-file @litchar{^} cannot.
If the match fails, @scheme[#f] is returned. If the match succeeds, a If the match fails, @scheme[#f] is returned. If the match succeeds, a
list containing strings or byte string, and possibly @scheme[#f], is list containing strings or byte string, and possibly @scheme[#f], is
@ -302,7 +322,8 @@ bytes. To avoid such interleaving, use @scheme[regexp-match-peek]
@defproc[(regexp-match* [pattern (or/c string? bytes? regexp? byte-regexp?)] @defproc[(regexp-match* [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)] [input (or/c string? bytes? input-port?)]
[start-pos exact-nonnegative-integer? 0] [start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]) [end-pos (or/c exact-nonnegative-integer? #f) #f]
[input-prefix bytes? #""])
(if (and (or (string? pattern) (regexp? pattern)) (if (and (or (string? pattern) (regexp? pattern))
(string? input)) (string? input))
(listof string?) (listof string?)
@ -315,12 +336,14 @@ results for parenthesized sub-patterns in @scheme[pattern] are not
returned.) returned.)
The @scheme[pattern] is used in order to find matches, where each The @scheme[pattern] is used in order to find matches, where each
match attempt starts at the end of the last match. Empty matches are match attempt starts at the end of the last match, and @litchar{$} is
handled like any matches, returning a zero-length string or byte allowed to match the beginning of the input (if @scheme[input-prefix]
sequence (they are more useful in the complementing is @scheme[#""]) only for the first match. Empty matches are handled
@scheme[regexp-split] function). However, the @scheme[pattern] is like other matches, returning a zero-length string or byte sequence
restricted from matching an empty string at the beginning (or right (they are more useful in the complementing @scheme[regexp-split]
after a previous match) or at the end. function), but @scheme[pattern] is restricted from matching an empty
string at the beginning (or right after a previous match) or at the
end.
If @scheme[input] contains no matches (in the range @scheme[start-pos] If @scheme[input] contains no matches (in the range @scheme[start-pos]
to @scheme[end-pos]), @scheme[null] is returned. Otherwise, each item to @scheme[end-pos]), @scheme[null] is returned. Otherwise, each item
@ -339,7 +362,8 @@ port).
[input input-port?] [input input-port?]
[start-pos exact-nonnegative-integer? 0] [start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f] [end-pos (or/c exact-nonnegative-integer? #f) #f]
[output-port (or/c output-port? #f) #f]) [output-port (or/c output-port? #f) #f]
[input-prefix bytes? #""])
(if (and (or (string? pattern) (regexp? pattern)) (if (and (or (string? pattern) (regexp? pattern))
(string? input)) (string? input))
(or/c #f (cons/c string? (listof (or/c string? #f)))) (or/c #f (cons/c string? (listof (or/c string? #f))))
@ -360,7 +384,8 @@ fails.}
[input (or/c string? bytes? input-port?)] [input (or/c string? bytes? input-port?)]
[start-pos exact-nonnegative-integer? 0] [start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f] [end-pos (or/c exact-nonnegative-integer? #f) #f]
[output-port (or/c output-port? #f) #f]) [output-port (or/c output-port? #f) #f]
[input-prefix bytes? #""])
(or/c (cons/c (cons/c exact-nonnegative-integer? (or/c (cons/c (cons/c exact-nonnegative-integer?
exact-nonnegative-integer?) exact-nonnegative-integer?)
(listof (or/c (cons/c exact-nonnegative-integer? (listof (or/c (cons/c exact-nonnegative-integer?
@ -388,11 +413,11 @@ positions indicate the number of bytes that were read, including
(regexp-match-positions #rx"(-[0-9]*)+" "a-12--345b") (regexp-match-positions #rx"(-[0-9]*)+" "a-12--345b")
]} ]}
@defproc[(regexp-match-positions* [pattern (or/c string? bytes? regexp? byte-regexp?)] @defproc[(regexp-match-positions* [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)] [input (or/c string? bytes? input-port?)]
[start-pos exact-nonnegative-integer? 0] [start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]) [end-pos (or/c exact-nonnegative-integer? #f) #f]
[input-prefix bytes? #""])
(listof (cons/c exact-nonnegative-integer? (listof (cons/c exact-nonnegative-integer?
exact-nonnegative-integer?))]{ exact-nonnegative-integer?))]{
@ -408,7 +433,8 @@ like @scheme[regexp-match*].
[input (or/c string? bytes? input-port?)] [input (or/c string? bytes? input-port?)]
[start-pos exact-nonnegative-integer? 0] [start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f] [end-pos (or/c exact-nonnegative-integer? #f) #f]
[output-port (or/c output-port? #f) #f]) [output-port (or/c output-port? #f) #f]
[input-prefix bytes? #""])
boolean?]{ boolean?]{
Like @scheme[regexp-match], but returns merely @scheme[#t] when the Like @scheme[regexp-match], but returns merely @scheme[#t] when the
@ -437,19 +463,20 @@ entire content of @scheme[input] matches @scheme[pattern].
[input input-port?] [input input-port?]
[start-pos exact-nonnegative-integer? 0] [start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f] [end-pos (or/c exact-nonnegative-integer? #f) #f]
[progress (or/c evt #f) #f]) [progress (or/c evt #f) #f]
[input-prefix bytes? #""])
(or/c (cons/c bytes? (listof (or/c bytes? #f))) (or/c (cons/c bytes? (listof (or/c bytes? #f)))
#f)]{ #f)]{
Like @scheme[regexp-match] on input ports, but only peeks bytes from Like @scheme[regexp-match] on input ports, but only peeks bytes from
@scheme[input-port] instead of reading them. Furthermore, instead of @scheme[input] instead of reading them. Furthermore, instead of
an output port, the last optional argument is a progress event for an output port, the last optional argument is a progress event for
@scheme[input-port] (see @scheme[port-progress-evt]). If @scheme[progress] @scheme[input] (see @scheme[port-progress-evt]). If @scheme[progress]
becomes ready, then the match stops peeking from @scheme[input-port] becomes ready, then the match stops peeking from @scheme[input]
and returns @scheme[#f]. The @scheme[progress] argument can be and returns @scheme[#f]. The @scheme[progress] argument can be
@scheme[#f], in which case the peek may continue with inconsistent @scheme[#f], in which case the peek may continue with inconsistent
information if another process meanwhile reads from information if another process meanwhile reads from
@scheme[input-port]. @scheme[input].
@examples[ @examples[
(define p (open-input-string "a abcd")) (define p (open-input-string "a abcd"))
@ -466,7 +493,8 @@ information if another process meanwhile reads from
[input input-port?] [input input-port?]
[start-pos exact-nonnegative-integer? 0] [start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f] [end-pos (or/c exact-nonnegative-integer? #f) #f]
[progress (or/c evt #f) #f]) [progress (or/c evt #f) #f]
[input-prefix bytes? #""])
(or/c (cons/c (cons/c exact-nonnegative-integer? (or/c (cons/c (cons/c exact-nonnegative-integer?
exact-nonnegative-integer?) exact-nonnegative-integer?)
(listof (or/c (cons/c exact-nonnegative-integer? (listof (or/c (cons/c exact-nonnegative-integer?
@ -475,7 +503,7 @@ information if another process meanwhile reads from
#f)]{ #f)]{
Like @scheme[regexp-match-positions] on input ports, but only peeks Like @scheme[regexp-match-positions] on input ports, but only peeks
bytes from @scheme[input-port] instead of reading them, and with a bytes from @scheme[input] instead of reading them, and with a
@scheme[progress] argument like @scheme[regexp-match-peek].} @scheme[progress] argument like @scheme[regexp-match-peek].}
@ -483,12 +511,13 @@ bytes from @scheme[input-port] instead of reading them, and with a
[input input-port?] [input input-port?]
[start-pos exact-nonnegative-integer? 0] [start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f] [end-pos (or/c exact-nonnegative-integer? #f) #f]
[progress (or/c evt #f) #f]) [progress (or/c evt #f) #f]
[input-prefix bytes? #""])
(or/c (cons/c bytes? (listof (or/c bytes? #f))) (or/c (cons/c bytes? (listof (or/c bytes? #f)))
#f)]{ #f)]{
Like @scheme[regexp-match-peek], but it attempts to match only bytes Like @scheme[regexp-match-peek], but it attempts to match only bytes
that are available from @scheme[input-port] without blocking. The that are available from @scheme[input] without blocking. The
match fails if not-yet-available characters might be used to match match fails if not-yet-available characters might be used to match
@scheme[pattern].} @scheme[pattern].}
@ -497,7 +526,8 @@ match fails if not-yet-available characters might be used to match
[input input-port?] [input input-port?]
[start-pos exact-nonnegative-integer? 0] [start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f] [end-pos (or/c exact-nonnegative-integer? #f) #f]
[progress (or/c evt #f) #f]) [progress (or/c evt #f) #f]
[input-prefix bytes? #""])
(or/c (cons/c (cons/c exact-nonnegative-integer? (or/c (cons/c (cons/c exact-nonnegative-integer?
exact-nonnegative-integer?) exact-nonnegative-integer?)
(listof (or/c (cons/c exact-nonnegative-integer? (listof (or/c (cons/c exact-nonnegative-integer?
@ -506,7 +536,7 @@ match fails if not-yet-available characters might be used to match
#f)]{ #f)]{
Like @scheme[regexp-match-peek-positions], but it attempts to match Like @scheme[regexp-match-peek-positions], but it attempts to match
only bytes that are available from @scheme[input-port] without only bytes that are available from @scheme[input] without
blocking. The match fails if not-yet-available characters might be blocking. The match fails if not-yet-available characters might be
used to match @scheme[pattern].} used to match @scheme[pattern].}
@ -514,20 +544,91 @@ used to match @scheme[pattern].}
@defproc[(regexp-match-peek-positions* [pattern (or/c string? bytes? regexp? byte-regexp?)] @defproc[(regexp-match-peek-positions* [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input input-port?] [input input-port?]
[start-pos exact-nonnegative-integer? 0] [start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]) [end-pos (or/c exact-nonnegative-integer? #f) #f]
[input-prefix bytes? #""])
(listof (cons/c exact-nonnegative-integer? (listof (cons/c exact-nonnegative-integer?
exact-nonnegative-integer?))]{ exact-nonnegative-integer?))]{
Like @scheme[regexp-match-peek-positions], but returns multiple matches like Like @scheme[regexp-match-peek-positions], but returns multiple matches like
@scheme[regexp-match*].} @scheme[regexp-match*].}
@defproc[(regexp-match/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[output-port (or/c output-port? #f) #f]
[input-prefix bytes? #""]
[count nonnegative-exact-integer? 1])
(values
(if (and (or (string? pattern) (regexp? pattern))
(string? input))
(or/c #f (cons/c string? (listof (or/c string? #f))))
(or/c #f (cons/c bytes? (listof (or/c bytes? #f)))))
(or/c #f bytes?))]{
Like @scheme[regexp-match], but with a second result: a byte
string of up to @scheme[count] bytes that correspond to the input
(possibly including the @scheme[input-prefix]) leading to the end of
the match; the second result is @scheme[#f] if no match is found.
The second result can be useful as an @scheme[input-prefix] for
attempting a second match on @scheme[input] starting from the end of
the first match. In that case, use @scheme[regexp-max-lookbehind]
to determine an appropriate value for @scheme[count].}
@deftogether[(
@defproc[(regexp-match-positions/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[input-prefix bytes? #""]
[count exact-nonnegative-integer? 1])
(values (listof (cons/c exact-nonnegative-integer?
exact-nonnegative-integer?))
(or/c #f bytes?))]
@defproc[(regexp-match-peek-positions/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input input-port?]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[progress (or/c evt #f) #f]
[input-prefix bytes? #""]
[count exact-nonnegative-integer? 1])
(values
(or/c (cons/c (cons/c exact-nonnegative-integer?
exact-nonnegative-integer?)
(listof (or/c (cons/c exact-nonnegative-integer?
exact-nonnegative-integer?)
#f)))
#f)
(or/c #f bytes?))]
@defproc[(regexp-match-peek-positions-immediate/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input input-port?]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[progress (or/c evt #f) #f]
[input-prefix bytes? #""]
[count exact-nonnegative-integer? 1])
(values
(or/c (cons/c (cons/c exact-nonnegative-integer?
exact-nonnegative-integer?)
(listof (or/c (cons/c exact-nonnegative-integer?
exact-nonnegative-integer?)
#f)))
#f)
(or/c #f bytes?))]
)]{
Like @scheme[regexp-match-positions], etc., but with a second result
like @scheme[regexp-match/end].}
@;------------------------------------------------------------------------ @;------------------------------------------------------------------------
@section{Regexp Splitting} @section{Regexp Splitting}
@defproc[(regexp-split [pattern (or/c string? bytes? regexp? byte-regexp?)] @defproc[(regexp-split [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)] [input (or/c string? bytes? input-port?)]
[start-pos exact-nonnegative-integer? 0] [start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]) [end-pos (or/c exact-nonnegative-integer? #f) #f]
[input-prefix bytes? #""])
(if (and (or (string? pattern) (regexp? pattern)) (if (and (or (string? pattern) (regexp? pattern))
(string? input)) (string? input))
(cons/c string? (listof string?)) (cons/c string? (listof string?))
@ -567,7 +668,8 @@ an end-of-file if @scheme[input] is an input port).
[input (or/c string? bytes?)] [input (or/c string? bytes?)]
[insert (or/c string? bytes? [insert (or/c string? bytes?
((string?) () #:rest (listof string?) . ->* . string?) ((string?) () #:rest (listof string?) . ->* . string?)
((bytes?) () #:rest (listof bytes?) . ->* . bytes?))]) ((bytes?) () #:rest (listof bytes?) . ->* . bytes?))]
[input-prefix bytes? #""])
(if (and (or (string? pattern) (regexp? pattern)) (if (and (or (string? pattern) (regexp? pattern))
(string? input)) (string? input))
string? string?
@ -634,7 +736,8 @@ before the @litchar{\}. For example, the Scheme constant
[input (or/c string? bytes?)] [input (or/c string? bytes?)]
[insert (or/c string? bytes? [insert (or/c string? bytes?
(string? . -> . string?) (string? . -> . string?)
(bytes? . -> . bytes?))]) (bytes? . -> . bytes?))]
[input-prefix bytes? #""])
(or/c string? bytes?)]{ (or/c string? bytes?)]{
Like @scheme[regexp-replace], except that every instance of Like @scheme[regexp-replace], except that every instance of

View File

@ -1266,12 +1266,12 @@
(arity-test regexp 1 1) (arity-test regexp 1 1)
(arity-test regexp? 1 1) (arity-test regexp? 1 1)
(arity-test regexp-match 2 5) (arity-test regexp-match 2 6)
(arity-test regexp-match-positions 2 5) (arity-test regexp-match-positions 2 6)
(arity-test regexp-match-peek 2 5) (arity-test regexp-match-peek 2 6)
(arity-test regexp-match-peek-positions 2 5) (arity-test regexp-match-peek-positions 2 6)
(arity-test regexp-replace 3 3) (arity-test regexp-replace 3 4)
(arity-test regexp-replace* 3 3) (arity-test regexp-replace* 3 4)
(test #t procedure? car) (test #t procedure? car)
(test #f procedure? 'car) (test #f procedure? 'car)

View File

@ -1435,7 +1435,7 @@
#"^(?:a?b?)*$" #"^(?:a?b?)*$"
(#"(?=(a+?))(\\1ab)" #"aaab" (#"aab" #"a" #"aab")) (#"(?=(a+?))(\\1ab)" #"aaab" (#"aab" #"a" #"aab"))
(#"(\\w+:)+" #"one:" (#"one:" #"one:")) (#"(\\w+:)+" #"one:" (#"one:" #"one:"))
(#"$(?<=^(a))" #"a" #f) (#"$(?<=^(a))" #"a" (#"" #"a"))
(#"(?=(a+?))(\\1ab)" #"aaab" (#"aab" #"a" #"aab")) (#"(?=(a+?))(\\1ab)" #"aaab" (#"aab" #"a" #"aab"))
(#"^(?=(a+?))\\1ab" #"aaab" #f) (#"^(?=(a+?))\\1ab" #"aaab" #f)
(#"^(?=(a+?))\\1ab" #"aaab" #f) (#"^(?=(a+?))\\1ab" #"aaab" #f)
@ -1710,4 +1710,52 @@
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Check prefixes and suffixes that disallow matches at ^ at start or $ at end:
(let ([try
(lambda (regexp-match input output [output2 output])
(let ([try-succeed
(lambda (output i s e)
(test (output (list "a")) regexp-match #rx"^a" (i) s e #f #"")
(test (output (list "a")) regexp-match #rx"(?m:^a)" (i) s e #f #"\n")
(test (output (list "a")) regexp-match #rx"(?m:^a)" (i) s e #f #"x\n"))])
(try-succeed output (lambda () (input "a")) 0 #f)
(try-succeed output2 (lambda () (input "xay")) 1 2))
(let ([try-fail
(lambda (i s e)
(test #f regexp-match #rx"^a" (i) s e #f #"\n")
(test #f regexp-match #rx"^a" (i) s e #f #"x\n")
(let ([try-always-fail
(lambda (m)
(test #f regexp-match (m #rx"^a") (i) s e #f #"x")
(test #f regexp-match (m #rx"^a") (i) s e #f #"\nx"))])
(try-always-fail values)
(try-always-fail (lambda (rx) (regexp (format "(?m:~a)" (object-name rx)))))))])
(try-fail (lambda () (input "a")) 0 #f)
(try-fail (lambda () (input "xay")) 1 2)))])
(try regexp-match values values)
(try regexp-match? values (lambda (a) (and a #t)))
(try regexp-match string->bytes/utf-8 (lambda (l) (map string->bytes/utf-8 l)))
(try regexp-match open-input-string (lambda (l) (map string->bytes/utf-8 l)))
(try regexp-match-positions values
(lambda (s) (and s '((0 . 1))))
(lambda (s) (and s '((1 . 2))))))
;; regexp-replace* disallows start matching after the first match:
(test "baa" regexp-replace* #rx"^a" "aaa" "b")
;; regexp-replace* disallows start matching after the first match:
(test "bbb" regexp-replace* #rx"(?m:^a\n)" "a\na\na\n" "b")
(test "bba\n" regexp-replace* #rx"(?m:^a[z\n])" "a\naza\n" "b")
(test-values `(("abc") #"c")
(lambda () (regexp-match/end #rx".*" "abc")))
(test-values `(((0 . 3)) #"c")
(lambda () (regexp-match-positions/end #rx".*" "abc")))
(test-values `(((0 . 3)) #"c")
(lambda () (regexp-match-peek-positions/end #rx".*" (open-input-string "abc"))))
(test-values `(((0 . 3)) #"c")
(lambda () (regexp-match-peek-positions-immediate/end #rx".*" (open-input-string "abc"))))
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(report-errs) (report-errs)

View File

@ -75,6 +75,10 @@
(t '("a" "b" "c") eof "[abc]" "a b c" 0) (t '("a" "b" "c") eof "[abc]" "a b c" 0)
(t '("a" "b" "c") eof "[abc]" "a b c" 0 #f) (t '("a" "b" "c") eof "[abc]" "a b c" 0 #f)
(t '("a" "b" "c") eof "[abc]" "a b c" 0 5) (t '("a" "b" "c") eof "[abc]" "a b c" 0 5)
(t '("a") eof "^." "a b c" 0 5 #"")
(t '() eof "^." "a b c" 0 5 #"x")
(t '("a\n" "b\n" "c\n") eof "(?m:^.\n)" "a\nb\nc\n" 0 6)
(t '("b\n" "c\n") eof "(?m:^.\n)" "a\nb\nc\n" 0 6 #"x")
(for-each (lambda (cvt) (for-each (lambda (cvt)
(test '(#"\x80" #"\x80") regexp-match* (cvt #"\x80") #"a\x80z\x80q")) (test '(#"\x80" #"\x80") regexp-match* (cvt #"\x80") #"a\x80z\x80q"))
(list values byte-regexp byte-pregexp)) (list values byte-regexp byte-pregexp))
@ -95,6 +99,10 @@
(t '((0 . 1) (2 . 3) (4 . 5)) eof "[abc]" "a b c" 0) (t '((0 . 1) (2 . 3) (4 . 5)) eof "[abc]" "a b c" 0)
(t '((0 . 1) (2 . 3) (4 . 5)) eof "[abc]" "a b c" 0 #f) (t '((0 . 1) (2 . 3) (4 . 5)) eof "[abc]" "a b c" 0 #f)
(t '((0 . 1) (2 . 3) (4 . 5)) eof "[abc]" "a b c" 0 5) (t '((0 . 1) (2 . 3) (4 . 5)) eof "[abc]" "a b c" 0 5)
(t '((0 . 1)) eof "^." "a b c" 0 5 #"")
(t '() eof "^." "a b c" 0 5 #"x")
(t '((0 . 2) (2 . 4) (4 . 6)) eof "(?m:^.\n)" "a\nb\nc\n" 0 6)
(t '((2 . 4) (4 . 6)) eof "(?m:^.\n)" "a\nb\nc\n" 0 6 #"x")
(for-each (lambda (cvt) (for-each (lambda (cvt)
(test '((1 . 2) (3 . 4)) regexp-match-positions* (cvt #"\x80") #"a\x80z\x80q")) (test '((1 . 2) (3 . 4)) regexp-match-positions* (cvt #"\x80") #"a\x80z\x80q"))
(list values byte-regexp byte-pregexp)) (list values byte-regexp byte-pregexp))
@ -115,6 +123,10 @@
(t '("" "1" "2" "") eof "[abc]" "a1b2c" 0) (t '("" "1" "2" "") eof "[abc]" "a1b2c" 0)
(t '("" "1" "2" "") eof "[abc]" "a1b2c" 0 #f) (t '("" "1" "2" "") eof "[abc]" "a1b2c" 0 #f)
(t '("" "1" "2" "") eof "[abc]" "a1b2c" 0 5) (t '("" "1" "2" "") eof "[abc]" "a1b2c" 0 5)
(t '("" " b c") eof "^." "a b c" 0 5 #"")
(t '("a b c") eof "^." "a b c" 0 5 #"x")
(t '("" "" "" "") eof "(?m:^.\n)" "a\nb\nc\n" 0 6)
(t '("a\n" "" "") eof "(?m:^.\n)" "a\nb\nc\n" 0 6 #"x")
(for-each (lambda (cvt) (for-each (lambda (cvt)
(test '(#"" #"a" #"z" #"q" #"") regexp-split (cvt #"\x80") #"\x80a\x80z\x80q\x80")) (test '(#"" #"a" #"z" #"q" #"") regexp-split (cvt #"\x80") #"\x80a\x80z\x80q\x80"))
(list values byte-regexp byte-pregexp)) (list values byte-regexp byte-pregexp))
@ -135,6 +147,10 @@
(t '((0 . 1)) "a b c" "[abc]" "a b c" 0 2) (t '((0 . 1)) "a b c" "[abc]" "a b c" 0 2)
(t '((0 . 1) (2 . 3) (4 . 5)) "a b c" "[abc]" "a b c" 0 #f) (t '((0 . 1) (2 . 3) (4 . 5)) "a b c" "[abc]" "a b c" 0 #f)
(t '((0 . 1) (2 . 3) (4 . 5)) "a b c" "[abc]" "a b c" 0 5) (t '((0 . 1) (2 . 3) (4 . 5)) "a b c" "[abc]" "a b c" 0 5)
(t '((0 . 1)) "a b c" "^." "a b c" 0 5 #"")
(t '() "a b c" "^." "a b c" 0 5 #"x")
(t '((0 . 2) (2 . 4) (4 . 6)) "a\nb\nc\n" "(?m:^.\n)" "a\nb\nc\n" 0 6)
(t '((2 . 4) (4 . 6)) "a\nb\nc\n" "(?m:^.\n)" "a\nb\nc\n" 0 6 #"x")
;; ---------- tests with zero-length matches ---------- ;; ---------- tests with zero-length matches ----------
;; Many of these tests can be repeated with Perl. To try something in Perl, ;; Many of these tests can be repeated with Perl. To try something in Perl,
;; put this code in a file: ;; put this code in a file:

View File

@ -1,3 +1,7 @@
Version 4.2.5.9
regexp-match* et al. now disable ^ matching on all but the first
match
Version 4.2.5.3 Version 4.2.5.3
Added chaperones Added chaperones

File diff suppressed because it is too large Load Diff

View File

@ -5130,6 +5130,7 @@ static int mark_regwork_MARK(void *p, struct NewGC *gc) {
gcMARK2(r->endp, gc); gcMARK2(r->endp, gc);
gcMARK2(r->counters, gc); gcMARK2(r->counters, gc);
gcMARK2(r->peekskip, gc); gcMARK2(r->peekskip, gc);
gcMARK2(r->prefix, gc);
return return
gcBYTES_TO_WORDS(sizeof(Regwork)); gcBYTES_TO_WORDS(sizeof(Regwork));
} }
@ -5145,6 +5146,7 @@ static int mark_regwork_FIXUP(void *p, struct NewGC *gc) {
gcFIXUP2(r->endp, gc); gcFIXUP2(r->endp, gc);
gcFIXUP2(r->counters, gc); gcFIXUP2(r->counters, gc);
gcFIXUP2(r->peekskip, gc); gcFIXUP2(r->peekskip, gc);
gcFIXUP2(r->prefix, gc);
return return
gcBYTES_TO_WORDS(sizeof(Regwork)); gcBYTES_TO_WORDS(sizeof(Regwork));
} }

View File

@ -2101,6 +2101,7 @@ mark_regwork {
gcMARK2(r->endp, gc); gcMARK2(r->endp, gc);
gcMARK2(r->counters, gc); gcMARK2(r->counters, gc);
gcMARK2(r->peekskip, gc); gcMARK2(r->peekskip, gc);
gcMARK2(r->prefix, gc);
size: size:
gcBYTES_TO_WORDS(sizeof(Regwork)); gcBYTES_TO_WORDS(sizeof(Regwork));
} }

File diff suppressed because it is too large Load Diff

View File

@ -13,7 +13,7 @@
#define USE_COMPILED_STARTUP 1 #define USE_COMPILED_STARTUP 1
#define EXPECTED_PRIM_COUNT 983 #define EXPECTED_PRIM_COUNT 988
#define EXPECTED_UNSAFE_COUNT 65 #define EXPECTED_UNSAFE_COUNT 65
#define EXPECTED_FLFXNUM_COUNT 53 #define EXPECTED_FLFXNUM_COUNT 53

View File

@ -26,11 +26,12 @@ typedef struct regexp {
#endif #endif
} regexp; } regexp;
#define REGEXP_IS_UTF8 0x01 #define REGEXP_IS_UTF8 0x01
#define REGEXP_IS_PCRE 0x02 #define REGEXP_IS_PCRE 0x02
#define REGEXP_ANCH 0x04 #define REGEXP_ANCH 0x04
#define REGEXP_MUST_CI 0x08 #define REGEXP_MUST_CI 0x08
#define REGEXP_JIT 0x10 #define REGEXP_JIT 0x10
#define REGEXP_LOOKBEHIND 0x20
#ifdef INDIRECT_TO_PROGRAM #ifdef INDIRECT_TO_PROGRAM
# define N_ITO_DELTA(prog, extra, re) extra # define N_ITO_DELTA(prog, extra, re) extra
@ -217,14 +218,17 @@ typedef struct Regwork {
char *instr; char *instr;
Scheme_Object *port; Scheme_Object *port;
Scheme_Object *unless_evt; Scheme_Object *unless_evt;
short nonblock, aborted; char nonblock, aborted;
rxpos instr_size; /* For port reads */ rxpos instr_size; /* For port reads */
rxpos input_maxend; /* For port reads */ rxpos input_maxend; /* For port reads */
rxpos input, input_end, input_start; /* String-input pointer. */ rxpos input, input_end, input_start; /* String-input pointer. */
rxpos boi, bol; /* Beginning of input/line, for ^ check. */ rxpos input_min; /* input_start minus prefix_size */
rxpos boi; /* Beginning of input, for ^ check. */
rxpos *startp; /* Pointer to startp array. */ rxpos *startp; /* Pointer to startp array. */
rxpos *maybep; /* Pointer to tentative startp array. */ rxpos *maybep; /* Pointer to tentative startp array. */
rxpos *endp; /* Ditto for endp. */ rxpos *endp; /* Ditto for endp. */
int *counters; /* For {} counters */ int *counters; /* For {} counters */
Scheme_Object *peekskip; Scheme_Object *peekskip;
char *prefix;
rxpos prefix_len, prefix_delta;
} Regwork; } Regwork;

View File

@ -13,12 +13,12 @@
consistently.) consistently.)
*/ */
#define MZSCHEME_VERSION "4.2.5.9" #define MZSCHEME_VERSION "4.2.5.10"
#define MZSCHEME_VERSION_X 4 #define MZSCHEME_VERSION_X 4
#define MZSCHEME_VERSION_Y 2 #define MZSCHEME_VERSION_Y 2
#define MZSCHEME_VERSION_Z 5 #define MZSCHEME_VERSION_Z 5
#define MZSCHEME_VERSION_W 9 #define MZSCHEME_VERSION_W 10
#define MZSCHEME_VERSION_MAJOR ((MZSCHEME_VERSION_X * 100) + MZSCHEME_VERSION_Y) #define MZSCHEME_VERSION_MAJOR ((MZSCHEME_VERSION_X * 100) + MZSCHEME_VERSION_Y)
#define MZSCHEME_VERSION_MINOR ((MZSCHEME_VERSION_Z * 1000) + MZSCHEME_VERSION_W) #define MZSCHEME_VERSION_MINOR ((MZSCHEME_VERSION_Z * 1000) + MZSCHEME_VERSION_W)