diff --git a/collects/scheme/base.ss b/collects/scheme/base.ss index 0f21be5c76..08b90e1867 100644 --- a/collects/scheme/base.ss +++ b/collects/scheme/base.ss @@ -16,9 +16,12 @@ call-with-input-file call-with-output-file with-input-from-file - with-output-to-file) + with-output-to-file + regexp-replace*) (all-from "private/list.ss") - (all-from "private/string.ss") + (all-from-except "private/string.ss" + -regexp-replace*) + (rename -regexp-replace* regexp-replace*) identifier? (all-from "private/stxcase-scheme.ss") (all-from "private/qqstx.ss") diff --git a/collects/scheme/private/string.ss b/collects/scheme/private/string.ss index 5a141395a3..5522a6b734 100644 --- a/collects/scheme/private/string.ss +++ b/collects/scheme/private/string.ss @@ -8,7 +8,8 @@ regexp-match-peek-positions* regexp-split regexp-match-exact? - regexp-try-match) + regexp-try-match + -regexp-replace*) (require (for-syntax "stxcase-scheme.ss")) ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -111,8 +112,11 @@ (define (no-empty-edge-matches n) (make-regexp-tweaker (lambda (rx) (if (bytes? rx) - (bytes-append #"(?=.)(?:" rx #")(?<=" (make-bytes n (char->integer #\.)) #")") - (format "(?=.)(?:~a)(?<=~a)" rx (make-bytes n (char->integer #\.))))))) + (bytes-append #"(?:" + rx + #")(?<=" (make-bytes n (char->integer #\.)) #")") + (format "(?:~a)(?<=~a)" + rx (make-bytes n (char->integer #\.))))))) (define-syntax-rule (regexp-loop name loop start end pattern string ipre @@ -159,11 +163,11 @@ (format "ending offset index out of range [~a,~a]: " start len) end)) (reverse - (let loop ([acc '()] [start start] [end end] [ipre ipre] [rx #f] [rx-lb 0]) - (let* ([new-rx-lb (add1 (bytes-length ipre))] - [rx (if (= rx-lb new-rx-lb) - rx - ((no-empty-edge-matches new-rx-lb) orig-rx))]) + (let loop ([acc '()] [start start] [end end] [ipre ipre] [0-ok? #t]) + (let* ([rx (if 0-ok? + orig-rx + ((no-empty-edge-matches (add1 (bytes-length ipre))) + orig-rx))]) (if (and port-success-choose (input-port? string)) ;; Input port match, get string @@ -187,42 +191,47 @@ c)) void))] [end (and end (- end start))]) - (let-values ([(m ipre) (regexp-match/end rx string 0 end spitout ipre + (let-values ([(ms ipre) (regexp-match/end rx + string 0 end spitout ipre max-lookbehind)]) - (let* ([m (and m (car m))] + (let* ([m (and ms (car ms))] [discarded/leftovers (if need-leftover? (get-output-bytes spitout) discarded/leftovers)] + [skipped (if need-leftover? + (bstring-length discarded/leftovers) + discarded/leftovers)] + [got (and m (bstring-length m))] [end (and end m - (- end (if need-leftover? - (bstring-length discarded/leftovers) - discarded/leftovers) - (bstring-length m)))]) + (- end skipped got))]) (if m - (loop (cons (port-success-choose m discarded/leftovers) acc) - 0 end ipre - rx new-rx-lb) + (let ([0-ok? (not (zero? got))]) + (loop (port-success-choose m discarded/leftovers ms acc) + 0 end ipre 0-ok?)) (port-failure-k acc discarded/leftovers))))) ;; String/port match, get positions (let-values ([(m ipre) (if peek? - (regexp-match-peek-positions/end rx string start end #f ipre + (regexp-match-peek-positions/end rx + string start end #f ipre max-lookbehind) - (regexp-match-positions/end rx string start end #f ipre + (regexp-match-positions/end rx + string start end #f ipre max-lookbehind))]) - (if (not m) (failure-k acc start end) - (let ([mstart (caar m)] [mend (cdar m)]) + (let* ([mstart (caar m)] + [mend (cdar m)] + [0-ok? (not (= mstart mend))]) (if port-success-k (port-success-k (lambda (acc new-start new-end) - (loop acc new-start new-end ipre rx new-rx-lb)) + (loop acc new-start new-end ipre 0-ok?)) acc start end mstart mend) - (loop (cons (success-choose start mstart mend) acc) - mend end ipre rx new-rx-lb))))))))))) - + (loop (success-choose start mstart mend m acc) + mend end ipre 0-ok?))))))))))) + ;; Returns all the positions at which the pattern matched. (define (regexp-match-positions* pattern string [start 0] [end #f] [ipre #""]) (regexp-loop @@ -230,7 +239,7 @@ pattern string ipre ;; success-choose: - (lambda (start mstart mend) (cons mstart mend)) + (lambda (start mstart mend ms acc) (cons (cons mstart mend) acc)) ;; failure-k: (lambda (acc start end) acc) ;; port-success-k: need to shift index of rest as reading; cannot @@ -255,7 +264,7 @@ pattern string ipre ;; success-choose: - (lambda (start mstart mend) (cons mstart mend)) + (lambda (start mstart mend ms acc) (cons (cons mstart mend) acc)) ;; failure-k: (lambda (acc start end) acc) ;; port functions: use string case @@ -275,19 +284,133 @@ (define sub (if (bytes? buf) subbytes substring)) (regexp-loop regexp-split loop start end pattern buf ipre ;; success-choose: - (lambda (start mstart mend) (sub buf start mstart)) + (lambda (start mstart mend ms acc) (cons (sub buf start mstart) acc)) ;; failure-k: (lambda (acc start end) (cons (if end (sub buf start end) (sub buf start)) acc)) ;; port-success-k: #f ;; port-success-choose: - (lambda (match-string leftovers) leftovers) + (lambda (match-string leftovers ms acc) (cons leftovers acc)) ;; port-failure-k: (lambda (acc leftover) (if leftover (cons leftover acc) acc)) #t #f)) + ;; Like splitting, but insert a replacement between matches + (define -regexp-replace* + (let ([regexp-replace* + (lambda (pattern string orig-replacement [ipre #""]) + (define buf (if (and (string? string) (or (byte-regexp? pattern) + (bytes? pattern))) + (string->bytes/utf-8 string (char->integer #\?)) + string)) + (define sub (if (bytes? buf) subbytes substring)) + (define start 0) + (define end #f) + (define needs-string? + (and (or (string? pattern) (regexp? pattern)) + (string? string))) + (define replacement + (if (and (not needs-string?) (string? orig-replacement)) + (string->bytes/utf-8 orig-replacement) + orig-replacement)) + (define (check proc arg) + (let ([v (proc arg)]) + (unless (if needs-string? + (string? v) + (bytes? v)) + (raise-mismatch-error '|regexp-replace* (calling given filter procedure)| + (if needs-string? + "expected a string result: " + "expected a byte string result: ") + v)) + v)) + (define rx:sub #rx#"^(?:[^&\\]*[\\][&\\])*[^&\\]*(?:&|[\\](?=[^&\\]|$))") + (define need-replac? (and (not (procedure? replacement)) + (regexp-match? rx:sub replacement))) + (define (replac ms str) + (if need-replac? + ((if (string? str) bytes->string/utf-8 values) + (apply + bytes-append + (let ([str (if (string? str) (string->bytes/utf-8 str) str)] + [get-match (lambda (n) + (if (n . < . (length ms)) + (let* ([p (list-ref ms n)] + [s (if (pair? p) + (sub buf (car p) (cdr p)) + p)]) + (if (string? s) + (string->bytes/utf-8 s) + s)) + #""))]) + (let loop ([pos 0]) + (let ([m (regexp-match-positions #rx#"[\\&]" str pos)]) + (if m + (cons (subbytes str pos (caar m)) + (cond + [(equal? (char->integer #\&) (bytes-ref str (caar m))) + (cons (get-match 0) (loop (cdar m)))] + [(= (cdar m) (bytes-length str)) + ;; \ with no following character + (list (get-match 0))] + [(let ([next (bytes-ref str (cdar m))]) + (or (and (equal? (char->integer #\&) next) + #"&") + (and (equal? (char->integer #\\) next) + #"\\"))) + => (lambda (s) + (cons s (loop (add1 (cdar m)))))] + [else + (let ([n (regexp-match #rx#"^[0-9]+" str (cdar m))]) + (if n + (cons (get-match (string->number (bytes->string/utf-8 (car n)))) + (loop (+ (cdar m) (bytes-length (car n))))) + (cons (get-match 0) + (loop (cdar m)))))])) + (list (subbytes str pos)))))))) + str)) + (when (or (string? pattern) (bytes? pattern) + (regexp? pattern) (byte-regexp? pattern)) + (unless (or (string? string) + (bytes? string)) + (raise-type-error 'regexp-replace* "string or byte string" string)) + (unless (or (string? replacement) + (bytes? replacement) + (and (procedure? replacement) + (procedure-arity-includes? replacement 1))) + (raise-type-error 'regexp-replace* "string, byte string, or procedure (arity 1)" + replacement)) + (when (and needs-string? (bytes? replacement)) + (raise-mismatch-error 'regexp-replace* + "cannot replace a string with a byte string: " + replacement))) + (apply + (if (bytes? buf) bytes-append string-append) + (regexp-loop + regexp-replace* loop start end pattern buf ipre + ;; success-choose: + (lambda (start mstart mend ms acc) (list* (if (procedure? replacement) + (check + replacement + (sub buf mstart mend)) + (replac ms replacement)) + (sub buf start mstart) + acc)) + ;; failure-k: + (lambda (acc start end) + (cons (if end (sub buf start end) (sub buf start)) acc)) + ;; port-success-k: + #f + ;; port-success-choose: + #f + ;; port-failure-k: + #f + #t + #f)))]) + regexp-replace*)) + ;; Returns all the matches for the pattern in the string. (define (regexp-match* pattern string [start 0] [end #f] [ipre #""]) (define buf (if (and (string? string) (or (byte-regexp? pattern) @@ -297,13 +420,13 @@ (define sub (if (bytes? buf) subbytes substring)) (regexp-loop regexp-match* loop start end pattern buf ipre ;; success-choose: - (lambda (start mstart mend) (sub buf mstart mend)) + (lambda (start mstart mend ms acc) (cons (sub buf mstart mend) acc)) ;; failure-k: (lambda (acc start end) acc) ;; port-success-k: #f ;; port-success-choose: - (lambda (match-string leftovers) match-string) + (lambda (match-string leftovers ms acc) (cons match-string acc)) ;; port-failure-k: (lambda (acc leftover) acc) #f diff --git a/collects/scribblings/reference/regexps.scrbl b/collects/scribblings/reference/regexps.scrbl index 3aa1b5a1a8..531149ef53 100644 --- a/collects/scribblings/reference/regexps.scrbl +++ b/collects/scribblings/reference/regexps.scrbl @@ -336,14 +336,13 @@ results for parenthesized sub-patterns in @scheme[pattern] are not returned.) The @scheme[pattern] is used in order to find matches, where each -match attempt starts at the end of the last match, and @litchar{$} is +match attempt starts at the end of the last match, and @litchar{^} is allowed to match the beginning of the input (if @scheme[input-prefix] is @scheme[#""]) only for the first match. Empty matches are handled like other matches, returning a zero-length string or byte sequence (they are more useful in the complementing @scheme[regexp-split] function), but @scheme[pattern] is restricted from matching an empty -string at the beginning (or right after a previous match) or at the -end. +sequence immediately after an empty match. If @scheme[input] contains no matches (in the range @scheme[start-pos] to @scheme[end-pos]), @scheme[null] is returned. Otherwise, each item @@ -355,6 +354,7 @@ port). @examples[ (regexp-match* #rx"x." "12x4x6") +(regexp-match* #rx"x*" "12x4x6") ]} @@ -639,8 +639,8 @@ strings (if @scheme[pattern] is a string or character regexp and @scheme[input] is a string) or byte strings (otherwise) from in @scheme[input] that are separated by matches to @scheme[pattern]. Adjacent matches are separated with @scheme[""] or -@scheme[#""]. Zero-length matches are treated the same as in -@scheme[regexp-match*], but are more useful in this case. +@scheme[#""]. Zero-length matches are treated the same as for +@scheme[regexp-match*]. If @scheme[input] contains no matches (in the range @scheme[start-pos] to @scheme[end-pos]), the result is a list containing @scheme[input]'s diff --git a/collects/tests/mzscheme/basic.ss b/collects/tests/mzscheme/basic.ss index b619613f0e..34b8b8300d 100644 --- a/collects/tests/mzscheme/basic.ss +++ b/collects/tests/mzscheme/basic.ss @@ -1062,13 +1062,15 @@ (test '(#"") regexp-match "$" (open-input-string "123") 3) (test '(#"") regexp-match-peek "" (open-input-string "123") 3) -(test "1b2b3" regexp-replace* "" "123" "b") +(test "b1b2b3b" regexp-replace* "" "123" "b") (test "1b23" regexp-replace* "(?=2)" "123" "b") -(test "ax\u03BB" regexp-replace* "" "a\u03BB" "x") -(test "ax\u03BBxb" regexp-replace* "" "a\u03BBb" "x") -(test #"ax\316x\273xb" regexp-replace* #"" "a\u03BBb" #"x") -(test "1=2===3" regexp-replace* "2*" "123" (lambda (s) (string-append "=" s "="))) -(test "1=2===3==4" regexp-replace* "2*" "1234" (lambda (s) (string-append "=" s "="))) +(test "xax\u03BBx" regexp-replace* "" "a\u03BB" "x") +(test "xax\u03BBxbx" regexp-replace* "" "a\u03BBb" "x") +(test #"xax\316x\273xbx" regexp-replace* #"" "a\u03BBb" #"x") +(test "==1=2===3==" regexp-replace* "2*" "123" (lambda (s) (string-append "=" s "="))) +(test "==1=2===3==4==" regexp-replace* "2*" "1234" (lambda (s) (string-append "=" s "="))) + +(test "x&b\\ab=cy&w\\aw=z" regexp-replace* #rx"a(.)" "xabcyawz" "\\&\\1\\\\&\\99=") ;; Test weird port offsets: (define (test-weird-offset regexp-match regexp-match-positions) diff --git a/collects/tests/mzscheme/rx.ss b/collects/tests/mzscheme/rx.ss index da23b0c4a3..b7d7be3b81 100644 --- a/collects/tests/mzscheme/rx.ss +++ b/collects/tests/mzscheme/rx.ss @@ -912,6 +912,7 @@ (#"(?<=(foo)a)bar" #"fooabar" (#"bar" #"foo")) (#"(?<=(foo)a)bar" #"bar" #f) (#"(?<=(foo)a)bar" #"foobbar" #f) + (#"a(?<=x|ab)b" #"ab" #f) (#"(?>(\\.\\d\\d[1-9]?))\\d+" #"1.230003938" (#".230003938" #".23")) (#"(?>(\\.\\d\\d[1-9]?))\\d+" #"1.875000282" (#".875000282" #".875")) (#"(?>(\\.\\d\\d[1-9]?))\\d+" #"1.235 " #f) diff --git a/collects/tests/mzscheme/string.ss b/collects/tests/mzscheme/string.ss index 4e900328ea..ce1156766d 100644 --- a/collects/tests/mzscheme/string.ss +++ b/collects/tests/mzscheme/string.ss @@ -180,27 +180,25 @@ ;; test("a","1a2a3"); (t '("1" "2" "3") eof "a" "1a2a3") ;; test("","123"); - (t '("1" "2" "3") eof "" "123") + (t '("" "1" "2" "3" "") eof "" "123") ;; test("","12 34"); - (t '("1" "2" " " "3" "4") eof "" "12 34") + (t '("" "1" "2" " " "3" "4" "") eof "" "12 34") ;; test(" *","123"); - (t '("1" "2" "3") eof " *" "123") + (t '("" "1" "2" "3" "") eof " *" "123") ;; test(" *","12 34"); - (t '("1" "2" "3" "4") eof " *" "12 34") + (t '("" "1" "2" "" "3" "4" "") eof " *" "12 34") ;; test(" *"," 12 34 "); - ;; (t '("" "1" "2" "3" "4") eof " *" " 12 34 ") - ;; again, perl drops the last empty string but we don't - (t '("" "1" "2" "3" "4" "") eof " *" " 12 34 ") + (t '("" "" "1" "2" "" "3" "4" "" "") eof " *" " 12 34 ") ;; test("2|", "1234"); - (t '("1" "3" "4") eof "2|" "1234") + (t '("" "1" "" "3" "4" "") eof "2|" "1234") ;; test("1|", "1234"); - (t '("" "2" "3" "4") eof "1|" "1234") + (t '("" "" "2" "3" "4" "") eof "1|" "1234") ;; test("4|", "1234"); ;; (t '("1" "2" "3") eof "4|" "1234") ;; perl perl drops the last empty string again - (t '("1" "2" "3" "") eof "4|" "1234") + (t '("" "1" "2" "3" "" "") eof "4|" "1234") ;; test("|2", "1234"); - (t '("1" "" "3" "4") eof "|2" "1234") + (t '("" "1" "" "" "3" "4" "") eof "|2" "1234") ;; test("2|3", "1234"); (t '("1" "" "4") eof "2|3" "1234") ;; test("2|3|4", "12345"); @@ -216,55 +214,46 @@ ;; ...and even three in this example (t '("1" "" "" "") eof "2|3|4" "1234") ;; test('$',"123"); - (t '("123") eof "$" "123") + (t '("123" "") eof "$" "123") ;; test('^',"123"); - ;; (t '("123") eof "^" "123") - ;; this is a technicality: perl matches "^" once, but mzscheme matches on - ;; whatever `start' may be; perl is treating it as a beginning-of-line - ;; instead of a beginning-of-string behind your back "since it isn't much - ;; use otherwise" (http://perldoc.perl.org/functions/split.html); but we - ;; don't allow empty matches at the beginning, so a `^' will never match, - ;; and we get the same behavior anyway: - (t '("123") eof "^" "123") + (t '("" "123") eof "^" "123") ;; test('^',"123\n456"); - ;; (t '("123\n" "456") eof "^" "123\n456") - ;; we can get the same behavior as perl's with "(m?:^)": - (t '("123\n" "456") eof "(?m:^)" "123\n456") + (t '("" "123\n" "456") eof "(?m:^)" "123\n456") ;; test("\\b", "123 456"); - (t '("123" " " "456") eof #px"\\b" "123 456") + (t '("" "123" " " "456" "") eof #px"\\b" "123 456") ;; test("^|a", "abc"); - (t '("" "bc") eof "^|a" "abc") + (t '("" "" "bc") eof "^|a" "abc") ;; some tests with bounds (these have no perl equivalences) - (t '("1" "2" " " "3" "4") eof "" "12 34" 0) - (t '("1" "2" " " "3" "4") eof "" "12 34" 0 #f) - (t '("1" "2" " " "3" "4") eof "" "12 34" 0 5) - (t '("" "1" "2" "3" "4" "") eof " *" " 12 34 " 0) - (t '("" "1" "2" "3" "4" "") eof " *" " 12 34 " 0 #f) - (t '("" "1" "2" "3" "4" "") eof " *" " 12 34 " 0 7) - (t '("2" "3") eof " *" "123" 1) - (t '("2" "3" "4") eof " *" "12 34" 1) - (t '("2" "3" "4" "") eof " *" " 12 34 " 2) - (t '("1") "23" "" "123" 0 1) - (t '("2") "3" "" "123" 1 2) - (t '("3") eof "" "123" 2 3) - (t '("1" "2") "3" " *" "123" 0 2) - (t '("2" "3") eof " *" "123" 1 3) + (t '("" "1" "2" " " "3" "4" "") eof "" "12 34" 0) + (t '("" "1" "2" " " "3" "4" "") eof "" "12 34" 0 #f) + (t '("" "1" "2" " " "3" "4" "") eof "" "12 34" 0 5) + (t '("" "" "1" "2" "" "3" "4" "" "") eof " *" " 12 34 " 0) + (t '("" "" "1" "2" "" "3" "4" "" "") eof " *" " 12 34 " 0 #f) + (t '("" "" "1" "2" "" "3" "4" "" "") eof " *" " 12 34 " 0 7) + (t '("" "2" "3" "") eof " *" "123" 1) + (t '("" "2" "" "3" "4" "") eof " *" "12 34" 1) + (t '("" "2" "" "3" "4" "" "") eof " *" " 12 34 " 2) + (t '("" "1" "") "23" "" "123" 0 1) + (t '("" "2" "") "3" "" "123" 1 2) + (t '("" "3" "") eof "" "123" 2 3) + (t '("" "1" "2" "") "3" " *" "123" 0 2) + (t '("" "2" "3" "") eof " *" "123" 1 3) ;; more tests with match* (t regexp-match*) - (t '("" "") eof "" "123") - (t '("" " " "") eof " *" "12 34") - (t '(" " "" " " "" " ") eof " *" " 12 34 ") - (t '("" " " "") " " " *" " 12 34 " 1 6) + (t '("" "" "" "") eof "" "123") + (t '("" "" " " "" "" "") eof " *" "12 34") + (t '(" " "" "" " " "" "" " " "") eof " *" " 12 34 ") + (t '("" "" " " "" "" "") " " " *" " 12 34 " 1 6) (t regexp-match-positions*) - (t '((1 . 1) (2 . 2)) eof "" "123") - (t '((1 . 1) (2 . 3) (4 . 4)) eof " *" "12 34") - (t '((0 . 1) (2 . 2) (3 . 4) (5 . 5) (6 . 7)) eof " *" " 12 34 ") - (t '((2 . 2) (3 . 4) (5 . 5)) " " " *" " 12 34 " 1 6) + (t '((0 . 0) (1 . 1) (2 . 2) (3 . 3)) eof "" "123") + (t '((0 . 0) (1 . 1) (2 . 3) (3 . 3) (4 . 4) (5 . 5)) eof " *" "12 34") + (t '((0 . 1) (1 . 1) (2 . 2) (3 . 4) (4 . 4) (5 . 5) (6 . 7) (7 . 7)) eof " *" " 12 34 ") + (t '((1 . 1) (2 . 2) (3 . 4) (4 . 4) (5 . 5) (6 . 6)) " " " *" " 12 34 " 1 6) (t regexp-match-peek-positions*) - (t '((1 . 1) (2 . 2)) "123" "" "123") - (t '((1 . 1) (2 . 3) (4 . 4)) "12 34" " *" "12 34") - (t '((0 . 1) (2 . 2) (3 . 4) (5 . 5) (6 . 7)) " 12 34 " " *" " 12 34 ") - (t '((2 . 2) (3 . 4) (5 . 5)) " 12 34 " " *" " 12 34 " 1 6) + (t '((0 . 0) (1 . 1) (2 . 2) (3 . 3)) "123" "" "123") + (t '((0 . 0) (1 . 1) (2 . 3) (3 . 3) (4 . 4) (5 . 5)) "12 34" " *" "12 34") + (t '((0 . 1) (1 . 1) (2 . 2) (3 . 4) (4 . 4) (5 . 5) (6 . 7) (7 . 7)) " 12 34 " " *" " 12 34 ") + (t '((1 . 1) (2 . 2) (3 . 4) (4 . 4) (5 . 5) (6 . 6)) " 12 34 " " *" " 12 34 " 1 6) ;; finally, some tests for the match* + split property (t (lambda (rx str) (let ([s (regexp-split rx str)] diff --git a/doc/release-notes/mzscheme/HISTORY.txt b/doc/release-notes/mzscheme/HISTORY.txt index cfffd59212..8104cd3671 100644 --- a/doc/release-notes/mzscheme/HISTORY.txt +++ b/doc/release-notes/mzscheme/HISTORY.txt @@ -1,6 +1,7 @@ -Version 4.2.5.9 +Version 4.2.5.10 regexp-match* et al. now disable ^ matching on all but the first - match + match, and empty matches are allowed in all positions except + immediately after an empty match Version 4.2.5.3 Added chaperones diff --git a/src/mzscheme/src/regexp.c b/src/mzscheme/src/regexp.c index 1fff57d150..b23fb50381 100644 --- a/src/mzscheme/src/regexp.c +++ b/src/mzscheme/src/regexp.c @@ -3420,7 +3420,7 @@ regmatch(Regwork *rw, rxpos prog) if (is - rw->input_min >= no) { rw->input = save - no; if (regmatch(rw, next)) { - if (is == save) { + if (rw->input == save) { /* Match */ if (!t) return 0; found = 1;