diff --git a/collects/racket/private/string.rkt b/collects/racket/private/string.rkt index 95ce08cc04..76bdaad44e 100644 --- a/collects/racket/private/string.rkt +++ b/collects/racket/private/string.rkt @@ -233,42 +233,91 @@ (port-success-k (lambda (acc new-start new-end) (loop acc new-start new-end ipre 0-ok?)) - acc start end mstart mend) + acc start end m) (loop (success-choose start m acc) mend end ipre 0-ok?))))))))))) ;; Returns all the positions at which the pattern matched. - (define (regexp-match-positions* pattern string [start 0] [end #f] [ipre #""]) - (regexp-loop regexp-match-positions* loop start end pattern string ipre - ;; success-choose: - (lambda (start ms acc) (cons (car ms) acc)) - ;; failure-k: - (lambda (acc start end) acc) - ;; port-success-k: need to shift index of rest as reading; cannot - ;; do a tail call without adding another state variable to the - ;; regexp loop, so this remains inefficient - (lambda (loop acc start end mstart mend) - (append (map (lambda (p) - (cons (+ mend (car p)) (+ mend (cdr p)))) - (loop '() 0 (and end (- end mend)))) - (cons (cons mstart mend) acc))) - ;; other port functions: use string case - #f #f - ;; flags - #f #f)) + (define (regexp-match-positions* pattern string [start 0] [end #f] [ipre #""] + #:match-select [match-select car]) + ;; Note: no need for a #:gap-select, since it is easily inferred from the + ;; resulting matches + (if (eq? match-select car) + ;; common case + (regexp-loop regexp-match-positions* loop start end pattern string ipre + ;; success-choose: + (lambda (start ms acc) (cons (car ms) acc)) + ;; failure-k: + (lambda (acc start end) acc) + ;; port-success-k: need to shift index of rest as reading; cannot + ;; do a tail call without adding another state variable to the + ;; regexp loop, so this remains inefficient + (lambda (loop acc start end ms) + (let ([mstart (caar ms)] [mend (cdar ms)]) + (append (map (lambda (p) (cons (+ mend (car p)) (+ mend (cdr p)))) + (loop '() 0 (and end (- end mend)))) + (cons (car ms) acc)))) + ;; other port functions: use string case + #f #f + ;; flags + #f #f) + ;; using some selector + (regexp-loop regexp-match-positions* loop start end pattern string ipre + ;; success-choose: + (lambda (start ms acc) (cons (match-select ms) acc)) + ;; failure-k: + (lambda (acc start end) acc) + ;; port-success-k: need to shift index of rest as reading; cannot + ;; do a tail call without adding another state variable to the + ;; regexp loop, so this remains inefficient + (lambda (loop acc start end ms) + (let* ([mend (cdar ms)] + [rest (loop '() 0 (and end (- end mend)))] + [s (match-select ms)]) + ;; assumes that it's a valid selector (and always works the same, + ;; ie, does not return a single pair sometimes a list of pairs at + ;; other times) + (append + (map (if (or (and (pair? s) (exact-integer? (car s))) (not s)) + ;; assume that it returns a single match + (lambda (p) (cons (+ mend (car p)) (+ mend (cdr p)))) + ;; assume that it returns a list + (lambda (ps) + (map (lambda (p) + (and p (cons (+ mend (car p)) (+ mend (cdr p))))) + ps))) + rest) + (cons s acc)))) + ;; other port functions: use string case + #f #f + ;; flags + #f #f))) ;; Returns all the positions at which the pattern matched. (define (regexp-match-peek-positions* pattern string [start 0] [end #f] - [ipre #""]) - (regexp-loop regexp-match-peek-positions* loop start end pattern string ipre - ;; success-choose: - (lambda (start ms acc) (cons (car ms) acc)) - ;; failure-k: - (lambda (acc start end) acc) - ;; port functions: use string case - #f #f #f - ;; flags - #f #t)) + [ipre #""] + #:match-select [match-select car]) + (if (eq? match-select car) + ;; common case + (regexp-loop regexp-match-peek-positions* loop start end pattern string ipre + ;; success-choose: + (lambda (start ms acc) (cons (car ms) acc)) + ;; failure-k: + (lambda (acc start end) acc) + ;; port functions: use string case + #f #f #f + ;; flags + #f #t) + ;; using some selector + (regexp-loop regexp-match-peek-positions* loop start end pattern string ipre + ;; success-choose: + (lambda (start ms acc) (cons (match-select ms) acc)) + ;; failure-k: + (lambda (acc start end) acc) + ;; port functions: use string case + #f #f #f + ;; flags + #f #t))) ;; Small helper for the functions below (define (get-buf+sub string pattern) @@ -413,22 +462,91 @@ #t #f)))]) regexp-replace*)) - ;; Returns all the matches for the pattern in the string. - (define (regexp-match* pattern string [start 0] [end #f] [ipre #""]) - (define-values [buf sub] (get-buf+sub string pattern)) - (regexp-loop regexp-match* loop start end pattern buf ipre - ;; success-choose: - (lambda (start ms acc) (cons (sub buf (caar ms) (cdar ms)) acc)) - ;; failure-k: - (lambda (acc start end) acc) - ;; port-success-k: - #f - ;; port-success-choose: - (lambda (leftovers ms acc) (cons (car ms) acc)) - ;; port-failure-k: - (lambda (acc leftover) acc) - ;; flags - #f #f)) + ;; Returns all the matches for the pattern in the string, optionally + ;; other submatches and/or gap strings too. + (define (regexp-match* pattern string [start 0] [end #f] [ipre #""] + #:match-select [match-select car] + #:gap-select [gap-select #f]) + (cond + ;; nonsensical case => throw an error + [(and (not match-select) (not gap-select)) + ;; An alternative would be to throw '(), but that's non-uniform in that + ;; it wouldn't consume the contents of a port. Another alternative is + ;; to do the full code to consume what's needed, but that's spending + ;; cycles to always get a '() result. + (raise + (exn:fail:contract + "regexp-match*: one of `match-select' or `gap-select' must be non-#f" + (current-continuation-marks)))] + ;; no match-select => same as `regexp-split' + [(not match-select) (regexp-split pattern string start end ipre)] + ;; uncommon case => full code + [(not (eq? match-select car)) + (define-values [buf sub] (get-buf+sub string pattern)) + (regexp-loop regexp-explode loop start end pattern buf ipre + ;; success-choose: + (lambda (start ms acc) + (cons (let ([s (match-select ms)]) + ;; assumes a valid selector + (cond [(not (pair? s)) s] ; #f or '() + [(integer? (car s)) (sub buf (car s) (cdr s))] + [else (map (lambda (m) + (and m (sub buf (car m) (cdr m)))) + s)])) + (if gap-select (cons (sub buf start (caar ms)) acc) acc))) + ;; failure-k: + (lambda (acc start end) + (if gap-select + (cons (if end (sub buf start end) (sub buf start)) acc) + acc)) + ;; port-success-k: + #f + ;; port-success-choose: + (lambda (leftovers ms acc) + ;; assumes a valid selector too: here it's used with a bytes list so + ;; it's simple + (cons (match-select ms) (if gap-select (cons leftovers acc) acc))) + ;; port-failure-k: + (lambda (acc leftover) + (if (and gap-select leftover) (cons leftover acc) acc)) + ;; flags + gap-select #f)] + ;; default for matches, but also include gaps + [gap-select + (define-values [buf sub] (get-buf+sub string pattern)) + (regexp-loop regexp-explode loop start end pattern buf ipre + ;; success-choose: + (lambda (start ms acc) + (cons (sub buf (caar ms) (cdar ms)) + (cons (sub buf start (caar ms)) acc))) + ;; failure-k: + (lambda (acc start end) + (cons (if end (sub buf start end) (sub buf start)) acc)) + ;; port-success-k: + #f + ;; port-success-choose: + (lambda (leftovers ms acc) (cons (car ms) (cons leftovers acc))) + ;; port-failure-k: + (lambda (acc leftover) (if leftover (cons leftover acc) acc)) + ;; flags + gap-select #f)] + ;; default case => optimized with specific code (this is the previous + ;; functionality of `regexp-explode*' too) + [else + (define-values [buf sub] (get-buf+sub string pattern)) + (regexp-loop regexp-match* loop start end pattern buf ipre + ;; success-choose: + (lambda (start ms acc) (cons (sub buf (caar ms) (cdar ms)) acc)) + ;; failure-k: + (lambda (acc start end) acc) + ;; port-success-k: + #f + ;; port-success-choose: + (lambda (leftovers ms acc) (cons (car ms) acc)) + ;; port-failure-k: + (lambda (acc leftover) acc) + ;; flags + #f #f)])) (define (regexp-match-exact? p s) (let ([m (regexp-match-positions p s)]) diff --git a/collects/scribblings/reference/regexps.scrbl b/collects/scribblings/reference/regexps.scrbl index df77518a1b..411953812b 100644 --- a/collects/scribblings/reference/regexps.scrbl +++ b/collects/scribblings/reference/regexps.scrbl @@ -400,26 +400,29 @@ bytes. To avoid such interleaving, use @racket[regexp-match-peek] [input (or/c string? bytes? path? input-port?)] [start-pos exact-nonnegative-integer? 0] [end-pos (or/c exact-nonnegative-integer? #f) #f] - [input-prefix bytes? #""]) + [input-prefix bytes? #""] + [#:match-select match-select + (or/c ((listof any?) . -> . (or/c any? (listof any?))) + #f) + car] + [#:gap-select gap-select boolean? #f]) (if (and (or (string? pattern) (regexp? pattern)) (or (string? input) (path? input))) - (listof string?) - (listof bytes?))]{ + (listof (or/c string? (listof (or/c #f string?)))) + (listof (or/c bytes? (listof (or/c #f bytes?)))))]{ Like @racket[regexp-match], but the result is a list of strings or byte strings corresponding to a sequence of matches of -@racket[pattern] in @racket[input]. (Unlike @racket[regexp-match], -results for parenthesized sub-patterns in @racket[pattern] are not -returned.) +@racket[pattern] in @racket[input]. The @racket[pattern] is used in order to find matches, where each match attempt starts at the end of the last match, and @litchar{^} is allowed to match the beginning of the input (if @racket[input-prefix] is @racket[#""]) only for the first match. Empty matches are handled like other matches, returning a zero-length string or byte sequence -(they are more useful in the complementing @racket[regexp-split] -function), but @racket[pattern] is restricted from matching an empty -sequence immediately after an empty match. +(they are more useful in making this a complement of +@racket[regexp-split]), but @racket[pattern] is restricted from +matching an empty sequence immediately after an empty match. If @racket[input] contains no matches (in the range @racket[start-pos] to @racket[end-pos]), @racket[null] is returned. Otherwise, each item @@ -432,6 +435,35 @@ port). @examples[ (regexp-match* #rx"x." "12x4x6") (regexp-match* #rx"x*" "12x4x6") +] + +@racket[match-select] specifies the collected results. The default of +@racket[car] means that the result is the list of matches without +returning parenthesized sub-patterns. It can be given as a `selector' +function which chooses an item from a list, or it can choose a list of +items. For example, you can use @racket[cdr] to get a list of lists +of parenthesized sub-patterns matches, or @racket[values] (as an +identity function) to get the full matches as well. (Note that the +selector must choose an element of its input list or a list of +elements, but it must not inspect its input as they can be either a +list of strings or a list of position pairs. Furthermore, the +selector must be consistent in its choice(s).) + +@examples[ +(regexp-match* #rx"x(.)" "12x4x6" #:match-select cadr) +(regexp-match* #rx"x(.)" "12x4x6" #:match-select values) +] + +In addition, specifying @racket[gap-select] as a non-@racket[#f] value +will make the result an interleaved list of the matches as well as the +separators between them matches, starting and ending with a separator. +In this case, @racket[match-select] can be given as @racket[#f] to +return @emph{only} the separators, making such uses equivalent to +@racket[regexp-split]. + +@examples[ +(regexp-match* #rx"x(.)" "12x4x6" #:match-select cadr #:gap-select #t) +(regexp-match* #rx"x(.)" "12x4x6" #:match-select #f #:gap-select #t) ]} @@ -494,16 +526,27 @@ positions indicate the number of bytes that were read, including [input (or/c string? bytes? path? input-port?)] [start-pos exact-nonnegative-integer? 0] [end-pos (or/c exact-nonnegative-integer? #f) #f] - [input-prefix bytes? #""]) - (listof (cons/c exact-nonnegative-integer? - exact-nonnegative-integer?))]{ + [input-prefix bytes? #""] + [#:match-select match-select + ((listof any?) . -> . (or/c any? (listof any?))) + car]) + (or/c (listof (cons/c exact-nonnegative-integer? + exact-nonnegative-integer?)) + (listof (listof (or/c #f (cons/c exact-nonnegative-integer? + exact-nonnegative-integer?)))))]{ Like @racket[regexp-match-positions], but returns multiple matches like @racket[regexp-match*]. @examples[ (regexp-match-positions* #rx"x." "12x4x6") -]} +(regexp-match-positions* #rx"x(.)" "12x4x6" #:match-select cadr) +] + +Note that unlike @racket[regexp-match*], there is no +@racket[#:gap-select] input keyword, as this information can be easily +inferred from the resulting matches. +} @defproc[(regexp-match? [pattern (or/c string? bytes? regexp? byte-regexp?)] @@ -623,11 +666,13 @@ used to match @racket[pattern].} [start-pos exact-nonnegative-integer? 0] [end-pos (or/c exact-nonnegative-integer? #f) #f] [input-prefix bytes? #""]) - (listof (cons/c exact-nonnegative-integer? - exact-nonnegative-integer?))]{ + (or/c (listof (cons/c exact-nonnegative-integer? + exact-nonnegative-integer?)) + (listof (listof (or/c #f (cons/c exact-nonnegative-integer? + exact-nonnegative-integer?)))))]{ Like @racket[regexp-match-peek-positions], but returns multiple matches like -@racket[regexp-match*].} +@racket[regexp-match-positions*].} @defproc[(regexp-match/end [pattern (or/c string? bytes? regexp? byte-regexp?)] [input (or/c string? bytes? path? input-port?)] diff --git a/collects/tests/racket/string.rktl b/collects/tests/racket/string.rktl index b88d1187be..a580b4f2d7 100644 --- a/collects/tests/racket/string.rktl +++ b/collects/tests/racket/string.rktl @@ -41,23 +41,28 @@ [(string? x) (string->bytes/utf-8 x)] [(pregexp? x) (byte-pregexp (->b (object-name x)))] [else x])) - (define fun* #f) + (define funs '()) + (define (test-funs ks vs res left rx str . args) + (unless (memq regexp-match-peek-positions* funs) + (for ([fun (in-list funs)]) + ;; test with a string + (keyword-apply test ks vs res fun rx str args) + ;; test with a byte-regexp and/or a byte string + (keyword-apply test ks vs (->b res) fun (->b rx) str args) + (keyword-apply test ks vs (->b res) fun rx (->b str) args) + (keyword-apply test ks vs (->b res) fun (->b rx) (->b str) args))) + (for ([fun (in-list funs)]) + ;; test with a port, and test leftovers + (when left + (let ([p (open-input-string str)]) + (keyword-apply test ks vs (->b res) fun rx p args) + (keyword-apply test '() '() left read-string 50 p '()))))) (define t - (case-lambda - [(fun) (set! fun* fun)] - [(res left rx str . args) - (unless (eq? fun* regexp-match-peek-positions*) - ;; test with a string - (apply test res fun* rx str args) - ;; test with a byte-regexp and/or a byte string - (apply test (->b res) fun* (->b rx) str args) - (apply test (->b res) fun* rx (->b str) args) - (apply test (->b res) fun* (->b rx) (->b str) args)) - ;; test with a port, and test leftovers - (when left - (let ([p (open-input-string str)]) - (apply test (->b res) fun* rx p args) - (test left read-string 50 p)))])) + (make-keyword-procedure + test-funs + (case-lambda [(fun*) (set! funs (if (list? fun*) fun* (list fun*)))] + [(res left rx str . args) + (apply test-funs '() '() res left rx str args)]))) ;; -------------------- (t regexp-match*) (t '("a" "b" "c") eof "[abc]" " a b c ") @@ -75,15 +80,65 @@ (t '("a" "b" "c") eof "[abc]" "a b c" 0) (t '("a" "b" "c") eof "[abc]" "a b c" 0 #f) (t '("a" "b" "c") eof "[abc]" "a b c" 0 5) - (t '("a") eof "^." "a b c" 0 5 #"") - (t '() eof "^." "a b c" 0 5 #"x") - (t '("a\n" "b\n" "c\n") eof "(?m:^.\n)" "a\nb\nc\n" 0 6) - (t '("b\n" "c\n") eof "(?m:^.\n)" "a\nb\nc\n" 0 6 #"x") - (for-each (lambda (cvt) - (test '(#"\x80" #"\x80") regexp-match* (cvt #"\x80") #"a\x80z\x80q")) - (list values byte-regexp byte-pregexp)) + (t '("a") eof "^." "a b c" 0 5 #"") + (t '() eof "^." "a b c" 0 5 #"x") + (t '("a\n" "b\n" "c\n") eof "(?m:^.\n)" "a\nb\nc\n" 0 6) + (t '("b\n" "c\n") eof "(?m:^.\n)" "a\nb\nc\n" 0 6 #"x") + (for ([cvt (in-list (list values byte-regexp byte-pregexp))]) + (test '(#"\x80" #"\x80") regexp-match* (cvt #"\x80") #"a\x80z\x80q")) ;; -------------------- - (t regexp-match-positions*) + (define (regexp-explode . xs) (apply regexp-match* #:gap-select #t xs)) + (t regexp-explode) + (t '(" " "a" " " "b" " " "c" " ") eof "[abc]" " a b c ") + (t '("" "a" "+" "b" " = " "c" " ") eof "[abc]" "a+b = c ") + (t '(" " "b" " " "c" " ") eof "[abc]" " a b c " 2) + (t '("" "b" " " "c" " ") eof "[abc]" " a b c " 3) + (t '(" " "a" "") " b c " "[abc]" " a b c " 0 2) + (t '(" " "a" " ") "b c " "[abc]" " a b c " 0 3) + (t '(" " "a" " " "b" " " "c" " ") eof "[abc]" " a b c " 0 #f) + (t '(" " "a" " " "b" " " "c" " ") eof "[abc]" " a b c " 0 7) + (t '("" "a" " " "b" " " "c" "") eof "[abc]" "a b c") + (t '(" " "b" " " "c" "") eof "[abc]" "a b c" 1) + (t '("" "b" " " "c" "") eof "[abc]" "a b c" 2) + (t '("" "a" "") " b c" "[abc]" "a b c" 0 1) + (t '("" "a" " ") "b c" "[abc]" "a b c" 0 2) + (t '("" "a" " " "b" " " "c" "") eof "[abc]" "a b c" 0) + (t '("" "a" " " "b" " " "c" "") eof "[abc]" "a b c" 0 #f) + (t '("" "a" " " "b" " " "c" "") eof "[abc]" "a b c" 0 5) + (t '("" "a" " b c") eof "^." "a b c" 0 5 #"") + (t '("a b c") eof "^." "a b c" 0 5 #"x") + (t '("" "a\n" "" "b\n" "" "c\n" "") eof "(?m:^.\n)" "a\nb\nc\n" 0 6) + (t '("a\n" "b\n" "" "c\n" "") eof "(?m:^.\n)" "a\nb\nc\n" 0 6 #"x") + (for ([cvt (in-list (list values byte-regexp byte-pregexp))]) + (test '(#"a" #"\x80" #"z" #"\x80" #"q") + regexp-explode (cvt #"\x80") #"a\x80z\x80q")) + ;; -------------------- + (t regexp-match*) ; some tests with a match-select etc + ;; (tests with #f for #:match-select and #t for #:gap-select done below with + ;; `regexp-split') + (err/rt-test (regexp-match* "[abc]" "a b c" + #:match-select #f #:gap-select #f)) + (t '("a" "b" "c") eof + "<([abc])>" " " #:match-select cadr) + (t '(("a") ("b") ("c")) eof + "<([abc])>" " " #:match-select cdr) + (t '(("" "a") ("" "b") ("" "c")) eof + "<([abc])>" " " #:match-select values) + (t '("" "a" " + " "b" " = " "c" "") eof + "<([abc])>" " + = " #:match-select cadr #:gap-select #t) + (t '("" ("" "a") " + " ("" "b") " = " ("" "c") "") eof + "<([abc])>" " + = " #:match-select values #:gap-select #t) + (t '("" (")?" ")?" "