improve regexp support (PR 10855, halfway)
This commit is contained in:
parent
3fe4a22dbd
commit
1c34ccec44
|
@ -108,25 +108,29 @@
|
|||
;; Helper macro for the regexp functions below, with some utilities.
|
||||
(define (bstring-length s)
|
||||
(if (bytes? s) (bytes-length s) (string-length s)))
|
||||
(define no-empty-edge-matches
|
||||
(define (no-empty-edge-matches n)
|
||||
(make-regexp-tweaker (lambda (rx)
|
||||
(if (bytes? rx)
|
||||
(bytes-append #"(?=.)(?:" rx #")(?<=.)")
|
||||
(format "(?=.)(?:~a)(?<=.)" rx)))))
|
||||
(define (bstring->no-edge-regexp name pattern)
|
||||
(if (or (regexp? pattern) (byte-regexp? pattern)
|
||||
(string? pattern) (bytes? pattern))
|
||||
(no-empty-edge-matches pattern)
|
||||
(raise-type-error
|
||||
name "regexp, byte regexp, string, or byte string" pattern)))
|
||||
(bytes-append #"(?=.)(?:" rx #")(?<=" (make-bytes n (char->integer #\.)) #")")
|
||||
(format "(?=.)(?:~a)(?<=~a)" rx (make-bytes n (char->integer #\.)))))))
|
||||
(define-syntax-rule (regexp-loop
|
||||
name loop start end rx string
|
||||
name loop start end pattern string
|
||||
ipre
|
||||
success-choose failure-k
|
||||
port-success-k port-success-choose port-failure-k
|
||||
need-leftover? peek?)
|
||||
(let ([len (cond [(string? string) (string-length string)]
|
||||
[(bytes? string) (bytes-length string)]
|
||||
[else #f])])
|
||||
(let* ([len (cond [(string? string) (string-length string)]
|
||||
[(bytes? string) (bytes-length string)]
|
||||
[else #f])]
|
||||
[orig-rx (cond [(bytes? pattern) (byte-regexp pattern)]
|
||||
[(string? pattern) (regexp pattern)]
|
||||
[(regexp? pattern) pattern]
|
||||
[(byte-regexp? pattern) pattern]
|
||||
[else
|
||||
(raise-type-error 'name
|
||||
"regexp, byte regexp, string, or byte string"
|
||||
pattern)])]
|
||||
[max-lookbehind (regexp-max-lookbehind orig-rx)])
|
||||
(if peek?
|
||||
(unless (input-port? string)
|
||||
(raise-type-error 'name "input port" string))
|
||||
|
@ -140,6 +144,8 @@
|
|||
(and (number? end) (exact? end) (integer? end)
|
||||
(end . >= . 0)))
|
||||
(raise-type-error 'name "non-negative exact integer or false" end))
|
||||
(unless (bytes? ipre)
|
||||
(raise-type-error 'name "byte string" ipre))
|
||||
(unless (or (input-port? string) (and len (start . <= . len)))
|
||||
(raise-mismatch-error
|
||||
'name
|
||||
|
@ -153,66 +159,76 @@
|
|||
(format "ending offset index out of range [~a,~a]: " start len)
|
||||
end))
|
||||
(reverse
|
||||
(let loop ([acc '()] [start start] [end end])
|
||||
(let loop ([acc '()] [start start] [end end] [ipre ipre] [rx #f] [rx-lb 0])
|
||||
(let* ([new-rx-lb (add1 (bytes-length ipre))]
|
||||
[rx (if (= rx-lb new-rx-lb)
|
||||
rx
|
||||
((no-empty-edge-matches new-rx-lb) orig-rx))])
|
||||
(if (and port-success-choose (input-port? string))
|
||||
|
||||
(if (and port-success-choose (input-port? string))
|
||||
|
||||
;; Input port match, get string
|
||||
(let* ([_ (when (positive? start)
|
||||
;; Skip start chars:
|
||||
(let ([s (make-bytes 4096)])
|
||||
(let loop ([n 0])
|
||||
(unless (= n start)
|
||||
(let ([m (read-bytes-avail!
|
||||
s string 0 (min (- start n) 4096))])
|
||||
(unless (eof-object? m) (loop (+ n m))))))))]
|
||||
[discarded/leftovers (if need-leftover? #f 0)]
|
||||
[spitout (if need-leftover?
|
||||
(open-output-bytes)
|
||||
(make-output-port
|
||||
'counter always-evt
|
||||
(lambda (s start end flush? breakable?)
|
||||
(let ([c (- end start)])
|
||||
(set! discarded/leftovers
|
||||
(+ c discarded/leftovers))
|
||||
c))
|
||||
void))]
|
||||
[end (and end (- end start))]
|
||||
[m (regexp-match rx string 0 end spitout)]
|
||||
[m (and m (car m))]
|
||||
[discarded/leftovers (if need-leftover?
|
||||
(get-output-bytes spitout)
|
||||
discarded/leftovers)]
|
||||
[end (and end m
|
||||
(- end (if need-leftover?
|
||||
(bstring-length discarded/leftovers)
|
||||
discarded/leftovers)
|
||||
(bstring-length m)))])
|
||||
(if m
|
||||
(loop (cons (port-success-choose m discarded/leftovers) acc)
|
||||
0 end)
|
||||
(port-failure-k acc discarded/leftovers)))
|
||||
|
||||
;; String/port match, get positions
|
||||
(let ([m (if peek?
|
||||
(regexp-match-peek-positions rx string start end)
|
||||
(regexp-match-positions rx string start end))])
|
||||
(if (not m)
|
||||
(failure-k acc start end)
|
||||
(let ([mstart (caar m)] [mend (cdar m)])
|
||||
(if port-success-k
|
||||
(port-success-k
|
||||
(lambda (acc new-start new-end)
|
||||
(loop acc new-start new-end))
|
||||
acc start end mstart mend)
|
||||
(loop (cons (success-choose start mstart mend) acc)
|
||||
mend end))))))))))
|
||||
;; Input port match, get string
|
||||
(let* ([_ (when (positive? start)
|
||||
;; Skip start chars:
|
||||
(let ([s (make-bytes 4096)])
|
||||
(let loop ([n 0])
|
||||
(unless (= n start)
|
||||
(let ([m (read-bytes-avail!
|
||||
s string 0 (min (- start n) 4096))])
|
||||
(unless (eof-object? m) (loop (+ n m))))))))]
|
||||
[discarded/leftovers (if need-leftover? #f 0)]
|
||||
[spitout (if need-leftover?
|
||||
(open-output-bytes)
|
||||
(make-output-port
|
||||
'counter always-evt
|
||||
(lambda (s start end flush? breakable?)
|
||||
(let ([c (- end start)])
|
||||
(set! discarded/leftovers
|
||||
(+ c discarded/leftovers))
|
||||
c))
|
||||
void))]
|
||||
[end (and end (- end start))])
|
||||
(let-values ([(m ipre) (regexp-match/end rx string 0 end spitout ipre
|
||||
max-lookbehind)])
|
||||
(let* ([m (and m (car m))]
|
||||
[discarded/leftovers (if need-leftover?
|
||||
(get-output-bytes spitout)
|
||||
discarded/leftovers)]
|
||||
[end (and end m
|
||||
(- end (if need-leftover?
|
||||
(bstring-length discarded/leftovers)
|
||||
discarded/leftovers)
|
||||
(bstring-length m)))])
|
||||
(if m
|
||||
(loop (cons (port-success-choose m discarded/leftovers) acc)
|
||||
0 end ipre
|
||||
rx new-rx-lb)
|
||||
(port-failure-k acc discarded/leftovers)))))
|
||||
|
||||
;; String/port match, get positions
|
||||
(let-values ([(m ipre)
|
||||
(if peek?
|
||||
(regexp-match-peek-positions/end rx string start end #f ipre
|
||||
max-lookbehind)
|
||||
(regexp-match-positions/end rx string start end #f ipre
|
||||
max-lookbehind))])
|
||||
|
||||
(if (not m)
|
||||
(failure-k acc start end)
|
||||
(let ([mstart (caar m)] [mend (cdar m)])
|
||||
(if port-success-k
|
||||
(port-success-k
|
||||
(lambda (acc new-start new-end)
|
||||
(loop acc new-start new-end ipre rx new-rx-lb))
|
||||
acc start end mstart mend)
|
||||
(loop (cons (success-choose start mstart mend) acc)
|
||||
mend end ipre rx new-rx-lb)))))))))))
|
||||
|
||||
;; Returns all the positions at which the pattern matched.
|
||||
(define (regexp-match-positions* pattern string [start 0] [end #f])
|
||||
(define (regexp-match-positions* pattern string [start 0] [end #f] [ipre #""])
|
||||
(regexp-loop
|
||||
regexp-match-positions* loop start end
|
||||
(bstring->no-edge-regexp 'regexp-match-positions* pattern) string
|
||||
pattern string
|
||||
ipre
|
||||
;; success-choose:
|
||||
(lambda (start mstart mend) (cons mstart mend))
|
||||
;; failure-k:
|
||||
|
@ -233,10 +249,11 @@
|
|||
#f))
|
||||
|
||||
;; Returns all the positions at which the pattern matched.
|
||||
(define (regexp-match-peek-positions* pattern string [start 0] [end #f])
|
||||
(define (regexp-match-peek-positions* pattern string [start 0] [end #f] [ipre #""])
|
||||
(regexp-loop
|
||||
regexp-match-peek-positions* loop start end
|
||||
(bstring->no-edge-regexp 'regexp-match-peek-positions* pattern) string
|
||||
pattern string
|
||||
ipre
|
||||
;; success-choose:
|
||||
(lambda (start mstart mend) (cons mstart mend))
|
||||
;; failure-k:
|
||||
|
@ -250,13 +267,13 @@
|
|||
|
||||
;; Splits a string into a list by removing any piece which matches
|
||||
;; the pattern.
|
||||
(define (regexp-split pattern string [start 0] [end #f])
|
||||
(define rx (bstring->no-edge-regexp 'regexp-split pattern))
|
||||
(define buf (if (and (string? string) (byte-regexp? rx))
|
||||
(define (regexp-split pattern string [start 0] [end #f] [ipre #""])
|
||||
(define buf (if (and (string? string) (or (byte-regexp? pattern)
|
||||
(bytes? pattern)))
|
||||
(string->bytes/utf-8 string (char->integer #\?))
|
||||
string))
|
||||
(define sub (if (bytes? buf) subbytes substring))
|
||||
(regexp-loop regexp-split loop start end rx buf
|
||||
(regexp-loop regexp-split loop start end pattern buf ipre
|
||||
;; success-choose:
|
||||
(lambda (start mstart mend) (sub buf start mstart))
|
||||
;; failure-k:
|
||||
|
@ -272,13 +289,13 @@
|
|||
#f))
|
||||
|
||||
;; Returns all the matches for the pattern in the string.
|
||||
(define (regexp-match* pattern string [start 0] [end #f])
|
||||
(define rx (bstring->no-edge-regexp 'regexp-match* pattern))
|
||||
(define buf (if (and (string? string) (byte-regexp? rx))
|
||||
(define (regexp-match* pattern string [start 0] [end #f] [ipre #""])
|
||||
(define buf (if (and (string? string) (or (byte-regexp? pattern)
|
||||
(bytes? pattern)))
|
||||
(string->bytes/utf-8 string (char->integer #\?))
|
||||
string))
|
||||
(define sub (if (bytes? buf) subbytes substring))
|
||||
(regexp-loop regexp-match* loop start end rx buf
|
||||
(regexp-loop regexp-match* loop start end pattern buf ipre
|
||||
;; success-choose:
|
||||
(lambda (start mstart mend) (sub buf mstart mend))
|
||||
;; failure-k:
|
||||
|
|
|
@ -195,6 +195,16 @@ case-sensitively.
|
|||
(regexp-match (regexp-quote ".") "apple.scm")
|
||||
]}
|
||||
|
||||
@defproc[(regexp-max-lookbehind [pattern (or/c regexp? byte-regexp?)])
|
||||
exact-nonnegative-integer?]{
|
||||
|
||||
Returns the maximum number of bytes that @scheme[pattern] may consult
|
||||
before the starting position of a match to determine the match. For
|
||||
example, the pattern @litchar{(?<=abc)d} consults three bytes
|
||||
preceding a matching @litchar{d}, while @litchar{e(?<=a..)d} consults
|
||||
two bytes before a matching @litchar{ed}. A @litchar{^} pattern may
|
||||
consult a preceding byte to determine whether the current position is
|
||||
the start of the input or of a line.}
|
||||
|
||||
@;------------------------------------------------------------------------
|
||||
@section{Regexp Matching}
|
||||
|
@ -203,7 +213,8 @@ case-sensitively.
|
|||
[input (or/c string? bytes? input-port?)]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[output-port (or/c output-port? #f) #f])
|
||||
[output-port (or/c output-port? #f) #f]
|
||||
[input-prefix bytes? #""])
|
||||
(if (and (or (string? pattern) (regexp? pattern))
|
||||
(string? input))
|
||||
(or/c #f (cons/c string? (listof (or/c string? #f))))
|
||||
|
@ -228,9 +239,18 @@ is an input port, and if the end-of-file is reached before
|
|||
@scheme[start-pos] bytes are skipped, then the match fails.
|
||||
|
||||
In @scheme[pattern], a start-of-string @litchar{^} refers to the first
|
||||
position of @scheme[input] after @scheme[start-pos], and the
|
||||
end-of-input @litchar{$} refers to the @scheme[end-pos]th position or
|
||||
(in the case of an input port) the end of file, whichever comes first.
|
||||
position of @scheme[input] after @scheme[start-pos], assuming that
|
||||
@scheme[input-prefix] is @scheme[#""]. The end-of-input @litchar{$}
|
||||
refers to the @scheme[end-pos]th position or (in the case of an input
|
||||
port) the end of file, whichever comes first, assuming that
|
||||
@scheme[output-prefix] is @scheme[#f].
|
||||
|
||||
The @scheme[input-prefix] specifies bytes that effectively precede
|
||||
@scheme[input] for the purposes of @litchar{^} and other look-behind
|
||||
matching. For example, a @scheme[#""] prefix means that @litchar{^}
|
||||
matches at the beginning of the stream, while a @scheme[#"\n"]
|
||||
@scheme[input-prefix] means that a start-of-line @litchar{^} can match
|
||||
the beginning of the input, while a start-of-file @litchar{^} cannot.
|
||||
|
||||
If the match fails, @scheme[#f] is returned. If the match succeeds, a
|
||||
list containing strings or byte string, and possibly @scheme[#f], is
|
||||
|
@ -302,7 +322,8 @@ bytes. To avoid such interleaving, use @scheme[regexp-match-peek]
|
|||
@defproc[(regexp-match* [pattern (or/c string? bytes? regexp? byte-regexp?)]
|
||||
[input (or/c string? bytes? input-port?)]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f])
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[input-prefix bytes? #""])
|
||||
(if (and (or (string? pattern) (regexp? pattern))
|
||||
(string? input))
|
||||
(listof string?)
|
||||
|
@ -315,12 +336,14 @@ results for parenthesized sub-patterns in @scheme[pattern] are not
|
|||
returned.)
|
||||
|
||||
The @scheme[pattern] is used in order to find matches, where each
|
||||
match attempt starts at the end of the last match. Empty matches are
|
||||
handled like any matches, returning a zero-length string or byte
|
||||
sequence (they are more useful in the complementing
|
||||
@scheme[regexp-split] function). However, the @scheme[pattern] is
|
||||
restricted from matching an empty string at the beginning (or right
|
||||
after a previous match) or at the end.
|
||||
match attempt starts at the end of the last match, and @litchar{$} is
|
||||
allowed to match the beginning of the input (if @scheme[input-prefix]
|
||||
is @scheme[#""]) only for the first match. Empty matches are handled
|
||||
like other matches, returning a zero-length string or byte sequence
|
||||
(they are more useful in the complementing @scheme[regexp-split]
|
||||
function), but @scheme[pattern] is restricted from matching an empty
|
||||
string at the beginning (or right after a previous match) or at the
|
||||
end.
|
||||
|
||||
If @scheme[input] contains no matches (in the range @scheme[start-pos]
|
||||
to @scheme[end-pos]), @scheme[null] is returned. Otherwise, each item
|
||||
|
@ -339,7 +362,8 @@ port).
|
|||
[input input-port?]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[output-port (or/c output-port? #f) #f])
|
||||
[output-port (or/c output-port? #f) #f]
|
||||
[input-prefix bytes? #""])
|
||||
(if (and (or (string? pattern) (regexp? pattern))
|
||||
(string? input))
|
||||
(or/c #f (cons/c string? (listof (or/c string? #f))))
|
||||
|
@ -360,7 +384,8 @@ fails.}
|
|||
[input (or/c string? bytes? input-port?)]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[output-port (or/c output-port? #f) #f])
|
||||
[output-port (or/c output-port? #f) #f]
|
||||
[input-prefix bytes? #""])
|
||||
(or/c (cons/c (cons/c exact-nonnegative-integer?
|
||||
exact-nonnegative-integer?)
|
||||
(listof (or/c (cons/c exact-nonnegative-integer?
|
||||
|
@ -388,11 +413,11 @@ positions indicate the number of bytes that were read, including
|
|||
(regexp-match-positions #rx"(-[0-9]*)+" "a-12--345b")
|
||||
]}
|
||||
|
||||
|
||||
@defproc[(regexp-match-positions* [pattern (or/c string? bytes? regexp? byte-regexp?)]
|
||||
[input (or/c string? bytes? input-port?)]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f])
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[input-prefix bytes? #""])
|
||||
(listof (cons/c exact-nonnegative-integer?
|
||||
exact-nonnegative-integer?))]{
|
||||
|
||||
|
@ -408,7 +433,8 @@ like @scheme[regexp-match*].
|
|||
[input (or/c string? bytes? input-port?)]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[output-port (or/c output-port? #f) #f])
|
||||
[output-port (or/c output-port? #f) #f]
|
||||
[input-prefix bytes? #""])
|
||||
boolean?]{
|
||||
|
||||
Like @scheme[regexp-match], but returns merely @scheme[#t] when the
|
||||
|
@ -437,19 +463,20 @@ entire content of @scheme[input] matches @scheme[pattern].
|
|||
[input input-port?]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[progress (or/c evt #f) #f])
|
||||
[progress (or/c evt #f) #f]
|
||||
[input-prefix bytes? #""])
|
||||
(or/c (cons/c bytes? (listof (or/c bytes? #f)))
|
||||
#f)]{
|
||||
|
||||
Like @scheme[regexp-match] on input ports, but only peeks bytes from
|
||||
@scheme[input-port] instead of reading them. Furthermore, instead of
|
||||
@scheme[input] instead of reading them. Furthermore, instead of
|
||||
an output port, the last optional argument is a progress event for
|
||||
@scheme[input-port] (see @scheme[port-progress-evt]). If @scheme[progress]
|
||||
becomes ready, then the match stops peeking from @scheme[input-port]
|
||||
@scheme[input] (see @scheme[port-progress-evt]). If @scheme[progress]
|
||||
becomes ready, then the match stops peeking from @scheme[input]
|
||||
and returns @scheme[#f]. The @scheme[progress] argument can be
|
||||
@scheme[#f], in which case the peek may continue with inconsistent
|
||||
information if another process meanwhile reads from
|
||||
@scheme[input-port].
|
||||
@scheme[input].
|
||||
|
||||
@examples[
|
||||
(define p (open-input-string "a abcd"))
|
||||
|
@ -466,7 +493,8 @@ information if another process meanwhile reads from
|
|||
[input input-port?]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[progress (or/c evt #f) #f])
|
||||
[progress (or/c evt #f) #f]
|
||||
[input-prefix bytes? #""])
|
||||
(or/c (cons/c (cons/c exact-nonnegative-integer?
|
||||
exact-nonnegative-integer?)
|
||||
(listof (or/c (cons/c exact-nonnegative-integer?
|
||||
|
@ -475,7 +503,7 @@ information if another process meanwhile reads from
|
|||
#f)]{
|
||||
|
||||
Like @scheme[regexp-match-positions] on input ports, but only peeks
|
||||
bytes from @scheme[input-port] instead of reading them, and with a
|
||||
bytes from @scheme[input] instead of reading them, and with a
|
||||
@scheme[progress] argument like @scheme[regexp-match-peek].}
|
||||
|
||||
|
||||
|
@ -483,12 +511,13 @@ bytes from @scheme[input-port] instead of reading them, and with a
|
|||
[input input-port?]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[progress (or/c evt #f) #f])
|
||||
[progress (or/c evt #f) #f]
|
||||
[input-prefix bytes? #""])
|
||||
(or/c (cons/c bytes? (listof (or/c bytes? #f)))
|
||||
#f)]{
|
||||
|
||||
Like @scheme[regexp-match-peek], but it attempts to match only bytes
|
||||
that are available from @scheme[input-port] without blocking. The
|
||||
that are available from @scheme[input] without blocking. The
|
||||
match fails if not-yet-available characters might be used to match
|
||||
@scheme[pattern].}
|
||||
|
||||
|
@ -497,7 +526,8 @@ match fails if not-yet-available characters might be used to match
|
|||
[input input-port?]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[progress (or/c evt #f) #f])
|
||||
[progress (or/c evt #f) #f]
|
||||
[input-prefix bytes? #""])
|
||||
(or/c (cons/c (cons/c exact-nonnegative-integer?
|
||||
exact-nonnegative-integer?)
|
||||
(listof (or/c (cons/c exact-nonnegative-integer?
|
||||
|
@ -506,7 +536,7 @@ match fails if not-yet-available characters might be used to match
|
|||
#f)]{
|
||||
|
||||
Like @scheme[regexp-match-peek-positions], but it attempts to match
|
||||
only bytes that are available from @scheme[input-port] without
|
||||
only bytes that are available from @scheme[input] without
|
||||
blocking. The match fails if not-yet-available characters might be
|
||||
used to match @scheme[pattern].}
|
||||
|
||||
|
@ -514,20 +544,91 @@ used to match @scheme[pattern].}
|
|||
@defproc[(regexp-match-peek-positions* [pattern (or/c string? bytes? regexp? byte-regexp?)]
|
||||
[input input-port?]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f])
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[input-prefix bytes? #""])
|
||||
(listof (cons/c exact-nonnegative-integer?
|
||||
exact-nonnegative-integer?))]{
|
||||
|
||||
Like @scheme[regexp-match-peek-positions], but returns multiple matches like
|
||||
@scheme[regexp-match*].}
|
||||
|
||||
@defproc[(regexp-match/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
|
||||
[input (or/c string? bytes? input-port?)]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[output-port (or/c output-port? #f) #f]
|
||||
[input-prefix bytes? #""]
|
||||
[count nonnegative-exact-integer? 1])
|
||||
(values
|
||||
(if (and (or (string? pattern) (regexp? pattern))
|
||||
(string? input))
|
||||
(or/c #f (cons/c string? (listof (or/c string? #f))))
|
||||
(or/c #f (cons/c bytes? (listof (or/c bytes? #f)))))
|
||||
(or/c #f bytes?))]{
|
||||
|
||||
Like @scheme[regexp-match], but with a second result: a byte
|
||||
string of up to @scheme[count] bytes that correspond to the input
|
||||
(possibly including the @scheme[input-prefix]) leading to the end of
|
||||
the match; the second result is @scheme[#f] if no match is found.
|
||||
|
||||
The second result can be useful as an @scheme[input-prefix] for
|
||||
attempting a second match on @scheme[input] starting from the end of
|
||||
the first match. In that case, use @scheme[regexp-max-lookbehind]
|
||||
to determine an appropriate value for @scheme[count].}
|
||||
|
||||
@deftogether[(
|
||||
@defproc[(regexp-match-positions/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
|
||||
[input (or/c string? bytes? input-port?)]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[input-prefix bytes? #""]
|
||||
[count exact-nonnegative-integer? 1])
|
||||
(values (listof (cons/c exact-nonnegative-integer?
|
||||
exact-nonnegative-integer?))
|
||||
(or/c #f bytes?))]
|
||||
@defproc[(regexp-match-peek-positions/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
|
||||
[input input-port?]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[progress (or/c evt #f) #f]
|
||||
[input-prefix bytes? #""]
|
||||
[count exact-nonnegative-integer? 1])
|
||||
(values
|
||||
(or/c (cons/c (cons/c exact-nonnegative-integer?
|
||||
exact-nonnegative-integer?)
|
||||
(listof (or/c (cons/c exact-nonnegative-integer?
|
||||
exact-nonnegative-integer?)
|
||||
#f)))
|
||||
#f)
|
||||
(or/c #f bytes?))]
|
||||
@defproc[(regexp-match-peek-positions-immediate/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
|
||||
[input input-port?]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[progress (or/c evt #f) #f]
|
||||
[input-prefix bytes? #""]
|
||||
[count exact-nonnegative-integer? 1])
|
||||
(values
|
||||
(or/c (cons/c (cons/c exact-nonnegative-integer?
|
||||
exact-nonnegative-integer?)
|
||||
(listof (or/c (cons/c exact-nonnegative-integer?
|
||||
exact-nonnegative-integer?)
|
||||
#f)))
|
||||
#f)
|
||||
(or/c #f bytes?))]
|
||||
)]{
|
||||
|
||||
Like @scheme[regexp-match-positions], etc., but with a second result
|
||||
like @scheme[regexp-match/end].}
|
||||
|
||||
@;------------------------------------------------------------------------
|
||||
@section{Regexp Splitting}
|
||||
|
||||
@defproc[(regexp-split [pattern (or/c string? bytes? regexp? byte-regexp?)]
|
||||
[input (or/c string? bytes? input-port?)]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f])
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[input-prefix bytes? #""])
|
||||
(if (and (or (string? pattern) (regexp? pattern))
|
||||
(string? input))
|
||||
(cons/c string? (listof string?))
|
||||
|
@ -567,7 +668,8 @@ an end-of-file if @scheme[input] is an input port).
|
|||
[input (or/c string? bytes?)]
|
||||
[insert (or/c string? bytes?
|
||||
((string?) () #:rest (listof string?) . ->* . string?)
|
||||
((bytes?) () #:rest (listof bytes?) . ->* . bytes?))])
|
||||
((bytes?) () #:rest (listof bytes?) . ->* . bytes?))]
|
||||
[input-prefix bytes? #""])
|
||||
(if (and (or (string? pattern) (regexp? pattern))
|
||||
(string? input))
|
||||
string?
|
||||
|
@ -634,7 +736,8 @@ before the @litchar{\}. For example, the Scheme constant
|
|||
[input (or/c string? bytes?)]
|
||||
[insert (or/c string? bytes?
|
||||
(string? . -> . string?)
|
||||
(bytes? . -> . bytes?))])
|
||||
(bytes? . -> . bytes?))]
|
||||
[input-prefix bytes? #""])
|
||||
(or/c string? bytes?)]{
|
||||
|
||||
Like @scheme[regexp-replace], except that every instance of
|
||||
|
|
|
@ -1266,12 +1266,12 @@
|
|||
|
||||
(arity-test regexp 1 1)
|
||||
(arity-test regexp? 1 1)
|
||||
(arity-test regexp-match 2 5)
|
||||
(arity-test regexp-match-positions 2 5)
|
||||
(arity-test regexp-match-peek 2 5)
|
||||
(arity-test regexp-match-peek-positions 2 5)
|
||||
(arity-test regexp-replace 3 3)
|
||||
(arity-test regexp-replace* 3 3)
|
||||
(arity-test regexp-match 2 6)
|
||||
(arity-test regexp-match-positions 2 6)
|
||||
(arity-test regexp-match-peek 2 6)
|
||||
(arity-test regexp-match-peek-positions 2 6)
|
||||
(arity-test regexp-replace 3 4)
|
||||
(arity-test regexp-replace* 3 4)
|
||||
|
||||
(test #t procedure? car)
|
||||
(test #f procedure? 'car)
|
||||
|
|
|
@ -1435,7 +1435,7 @@
|
|||
#"^(?:a?b?)*$"
|
||||
(#"(?=(a+?))(\\1ab)" #"aaab" (#"aab" #"a" #"aab"))
|
||||
(#"(\\w+:)+" #"one:" (#"one:" #"one:"))
|
||||
(#"$(?<=^(a))" #"a" #f)
|
||||
(#"$(?<=^(a))" #"a" (#"" #"a"))
|
||||
(#"(?=(a+?))(\\1ab)" #"aaab" (#"aab" #"a" #"aab"))
|
||||
(#"^(?=(a+?))\\1ab" #"aaab" #f)
|
||||
(#"^(?=(a+?))\\1ab" #"aaab" #f)
|
||||
|
@ -1710,4 +1710,52 @@
|
|||
|
||||
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; Check prefixes and suffixes that disallow matches at ^ at start or $ at end:
|
||||
(let ([try
|
||||
(lambda (regexp-match input output [output2 output])
|
||||
(let ([try-succeed
|
||||
(lambda (output i s e)
|
||||
(test (output (list "a")) regexp-match #rx"^a" (i) s e #f #"")
|
||||
(test (output (list "a")) regexp-match #rx"(?m:^a)" (i) s e #f #"\n")
|
||||
(test (output (list "a")) regexp-match #rx"(?m:^a)" (i) s e #f #"x\n"))])
|
||||
(try-succeed output (lambda () (input "a")) 0 #f)
|
||||
(try-succeed output2 (lambda () (input "xay")) 1 2))
|
||||
(let ([try-fail
|
||||
(lambda (i s e)
|
||||
(test #f regexp-match #rx"^a" (i) s e #f #"\n")
|
||||
(test #f regexp-match #rx"^a" (i) s e #f #"x\n")
|
||||
(let ([try-always-fail
|
||||
(lambda (m)
|
||||
(test #f regexp-match (m #rx"^a") (i) s e #f #"x")
|
||||
(test #f regexp-match (m #rx"^a") (i) s e #f #"\nx"))])
|
||||
(try-always-fail values)
|
||||
(try-always-fail (lambda (rx) (regexp (format "(?m:~a)" (object-name rx)))))))])
|
||||
(try-fail (lambda () (input "a")) 0 #f)
|
||||
(try-fail (lambda () (input "xay")) 1 2)))])
|
||||
(try regexp-match values values)
|
||||
(try regexp-match? values (lambda (a) (and a #t)))
|
||||
(try regexp-match string->bytes/utf-8 (lambda (l) (map string->bytes/utf-8 l)))
|
||||
(try regexp-match open-input-string (lambda (l) (map string->bytes/utf-8 l)))
|
||||
(try regexp-match-positions values
|
||||
(lambda (s) (and s '((0 . 1))))
|
||||
(lambda (s) (and s '((1 . 2))))))
|
||||
|
||||
;; regexp-replace* disallows start matching after the first match:
|
||||
(test "baa" regexp-replace* #rx"^a" "aaa" "b")
|
||||
|
||||
;; regexp-replace* disallows start matching after the first match:
|
||||
(test "bbb" regexp-replace* #rx"(?m:^a\n)" "a\na\na\n" "b")
|
||||
(test "bba\n" regexp-replace* #rx"(?m:^a[z\n])" "a\naza\n" "b")
|
||||
|
||||
(test-values `(("abc") #"c")
|
||||
(lambda () (regexp-match/end #rx".*" "abc")))
|
||||
(test-values `(((0 . 3)) #"c")
|
||||
(lambda () (regexp-match-positions/end #rx".*" "abc")))
|
||||
(test-values `(((0 . 3)) #"c")
|
||||
(lambda () (regexp-match-peek-positions/end #rx".*" (open-input-string "abc"))))
|
||||
(test-values `(((0 . 3)) #"c")
|
||||
(lambda () (regexp-match-peek-positions-immediate/end #rx".*" (open-input-string "abc"))))
|
||||
|
||||
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
(report-errs)
|
||||
|
|
|
@ -75,6 +75,10 @@
|
|||
(t '("a" "b" "c") eof "[abc]" "a b c" 0)
|
||||
(t '("a" "b" "c") eof "[abc]" "a b c" 0 #f)
|
||||
(t '("a" "b" "c") eof "[abc]" "a b c" 0 5)
|
||||
(t '("a") eof "^." "a b c" 0 5 #"")
|
||||
(t '() eof "^." "a b c" 0 5 #"x")
|
||||
(t '("a\n" "b\n" "c\n") eof "(?m:^.\n)" "a\nb\nc\n" 0 6)
|
||||
(t '("b\n" "c\n") eof "(?m:^.\n)" "a\nb\nc\n" 0 6 #"x")
|
||||
(for-each (lambda (cvt)
|
||||
(test '(#"\x80" #"\x80") regexp-match* (cvt #"\x80") #"a\x80z\x80q"))
|
||||
(list values byte-regexp byte-pregexp))
|
||||
|
@ -95,6 +99,10 @@
|
|||
(t '((0 . 1) (2 . 3) (4 . 5)) eof "[abc]" "a b c" 0)
|
||||
(t '((0 . 1) (2 . 3) (4 . 5)) eof "[abc]" "a b c" 0 #f)
|
||||
(t '((0 . 1) (2 . 3) (4 . 5)) eof "[abc]" "a b c" 0 5)
|
||||
(t '((0 . 1)) eof "^." "a b c" 0 5 #"")
|
||||
(t '() eof "^." "a b c" 0 5 #"x")
|
||||
(t '((0 . 2) (2 . 4) (4 . 6)) eof "(?m:^.\n)" "a\nb\nc\n" 0 6)
|
||||
(t '((2 . 4) (4 . 6)) eof "(?m:^.\n)" "a\nb\nc\n" 0 6 #"x")
|
||||
(for-each (lambda (cvt)
|
||||
(test '((1 . 2) (3 . 4)) regexp-match-positions* (cvt #"\x80") #"a\x80z\x80q"))
|
||||
(list values byte-regexp byte-pregexp))
|
||||
|
@ -115,6 +123,10 @@
|
|||
(t '("" "1" "2" "") eof "[abc]" "a1b2c" 0)
|
||||
(t '("" "1" "2" "") eof "[abc]" "a1b2c" 0 #f)
|
||||
(t '("" "1" "2" "") eof "[abc]" "a1b2c" 0 5)
|
||||
(t '("" " b c") eof "^." "a b c" 0 5 #"")
|
||||
(t '("a b c") eof "^." "a b c" 0 5 #"x")
|
||||
(t '("" "" "" "") eof "(?m:^.\n)" "a\nb\nc\n" 0 6)
|
||||
(t '("a\n" "" "") eof "(?m:^.\n)" "a\nb\nc\n" 0 6 #"x")
|
||||
(for-each (lambda (cvt)
|
||||
(test '(#"" #"a" #"z" #"q" #"") regexp-split (cvt #"\x80") #"\x80a\x80z\x80q\x80"))
|
||||
(list values byte-regexp byte-pregexp))
|
||||
|
@ -135,6 +147,10 @@
|
|||
(t '((0 . 1)) "a b c" "[abc]" "a b c" 0 2)
|
||||
(t '((0 . 1) (2 . 3) (4 . 5)) "a b c" "[abc]" "a b c" 0 #f)
|
||||
(t '((0 . 1) (2 . 3) (4 . 5)) "a b c" "[abc]" "a b c" 0 5)
|
||||
(t '((0 . 1)) "a b c" "^." "a b c" 0 5 #"")
|
||||
(t '() "a b c" "^." "a b c" 0 5 #"x")
|
||||
(t '((0 . 2) (2 . 4) (4 . 6)) "a\nb\nc\n" "(?m:^.\n)" "a\nb\nc\n" 0 6)
|
||||
(t '((2 . 4) (4 . 6)) "a\nb\nc\n" "(?m:^.\n)" "a\nb\nc\n" 0 6 #"x")
|
||||
;; ---------- tests with zero-length matches ----------
|
||||
;; Many of these tests can be repeated with Perl. To try something in Perl,
|
||||
;; put this code in a file:
|
||||
|
|
|
@ -1,3 +1,7 @@
|
|||
Version 4.2.5.9
|
||||
regexp-match* et al. now disable ^ matching on all but the first
|
||||
match
|
||||
|
||||
Version 4.2.5.3
|
||||
Added chaperones
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -5130,6 +5130,7 @@ static int mark_regwork_MARK(void *p, struct NewGC *gc) {
|
|||
gcMARK2(r->endp, gc);
|
||||
gcMARK2(r->counters, gc);
|
||||
gcMARK2(r->peekskip, gc);
|
||||
gcMARK2(r->prefix, gc);
|
||||
return
|
||||
gcBYTES_TO_WORDS(sizeof(Regwork));
|
||||
}
|
||||
|
@ -5145,6 +5146,7 @@ static int mark_regwork_FIXUP(void *p, struct NewGC *gc) {
|
|||
gcFIXUP2(r->endp, gc);
|
||||
gcFIXUP2(r->counters, gc);
|
||||
gcFIXUP2(r->peekskip, gc);
|
||||
gcFIXUP2(r->prefix, gc);
|
||||
return
|
||||
gcBYTES_TO_WORDS(sizeof(Regwork));
|
||||
}
|
||||
|
|
|
@ -2101,6 +2101,7 @@ mark_regwork {
|
|||
gcMARK2(r->endp, gc);
|
||||
gcMARK2(r->counters, gc);
|
||||
gcMARK2(r->peekskip, gc);
|
||||
gcMARK2(r->prefix, gc);
|
||||
size:
|
||||
gcBYTES_TO_WORDS(sizeof(Regwork));
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -13,7 +13,7 @@
|
|||
|
||||
#define USE_COMPILED_STARTUP 1
|
||||
|
||||
#define EXPECTED_PRIM_COUNT 983
|
||||
#define EXPECTED_PRIM_COUNT 988
|
||||
#define EXPECTED_UNSAFE_COUNT 65
|
||||
#define EXPECTED_FLFXNUM_COUNT 53
|
||||
|
||||
|
|
|
@ -26,11 +26,12 @@ typedef struct regexp {
|
|||
#endif
|
||||
} regexp;
|
||||
|
||||
#define REGEXP_IS_UTF8 0x01
|
||||
#define REGEXP_IS_PCRE 0x02
|
||||
#define REGEXP_ANCH 0x04
|
||||
#define REGEXP_MUST_CI 0x08
|
||||
#define REGEXP_JIT 0x10
|
||||
#define REGEXP_IS_UTF8 0x01
|
||||
#define REGEXP_IS_PCRE 0x02
|
||||
#define REGEXP_ANCH 0x04
|
||||
#define REGEXP_MUST_CI 0x08
|
||||
#define REGEXP_JIT 0x10
|
||||
#define REGEXP_LOOKBEHIND 0x20
|
||||
|
||||
#ifdef INDIRECT_TO_PROGRAM
|
||||
# define N_ITO_DELTA(prog, extra, re) extra
|
||||
|
@ -217,14 +218,17 @@ typedef struct Regwork {
|
|||
char *instr;
|
||||
Scheme_Object *port;
|
||||
Scheme_Object *unless_evt;
|
||||
short nonblock, aborted;
|
||||
char nonblock, aborted;
|
||||
rxpos instr_size; /* For port reads */
|
||||
rxpos input_maxend; /* For port reads */
|
||||
rxpos input, input_end, input_start; /* String-input pointer. */
|
||||
rxpos boi, bol; /* Beginning of input/line, for ^ check. */
|
||||
rxpos input_min; /* input_start minus prefix_size */
|
||||
rxpos boi; /* Beginning of input, for ^ check. */
|
||||
rxpos *startp; /* Pointer to startp array. */
|
||||
rxpos *maybep; /* Pointer to tentative startp array. */
|
||||
rxpos *endp; /* Ditto for endp. */
|
||||
int *counters; /* For {} counters */
|
||||
Scheme_Object *peekskip;
|
||||
char *prefix;
|
||||
rxpos prefix_len, prefix_delta;
|
||||
} Regwork;
|
||||
|
|
|
@ -13,12 +13,12 @@
|
|||
consistently.)
|
||||
*/
|
||||
|
||||
#define MZSCHEME_VERSION "4.2.5.9"
|
||||
#define MZSCHEME_VERSION "4.2.5.10"
|
||||
|
||||
#define MZSCHEME_VERSION_X 4
|
||||
#define MZSCHEME_VERSION_Y 2
|
||||
#define MZSCHEME_VERSION_Z 5
|
||||
#define MZSCHEME_VERSION_W 9
|
||||
#define MZSCHEME_VERSION_W 10
|
||||
|
||||
#define MZSCHEME_VERSION_MAJOR ((MZSCHEME_VERSION_X * 100) + MZSCHEME_VERSION_Y)
|
||||
#define MZSCHEME_VERSION_MINOR ((MZSCHEME_VERSION_Z * 1000) + MZSCHEME_VERSION_W)
|
||||
|
|
Loading…
Reference in New Issue
Block a user