diff --git a/collects/scheme/private/string.ss b/collects/scheme/private/string.ss index 2ec85b8de4..51cad2399d 100644 --- a/collects/scheme/private/string.ss +++ b/collects/scheme/private/string.ss @@ -76,9 +76,10 @@ ;; because of the intermediate regexp being recreated [(string? rx) (tweak (lambda (x) x) regexp ->str)] [(bytes? rx) (tweak (lambda (x) x) byte-regexp ->bts)] - [else (raise-type-error 'regexp-tweaker - "regexp of any kind, string, or bytes" - rx)])) + [else (raise-type-error + 'regexp-tweaker + "regexp, byte regexp, string, or byte string" + rx)])) (or (hash-ref t rx #f) (let ([rx* (run-tweak)]) (hash-set! t rx rx*) rx*))))) @@ -107,13 +108,14 @@ ;; Helper macro for the regexp functions below, with some utilities. (define (bstring-length s) (if (bytes? s) (bytes-length s) (string-length s))) - (define (bstring->regexp name pattern) - (cond [(regexp? pattern) pattern] - [(byte-regexp? pattern) pattern] - [(string? pattern) (regexp pattern)] - [(bytes? pattern) (byte-regexp pattern)] - [else (raise-type-error - name "regexp, byte regexp, string, or byte string" pattern)])) + (define no-empty-edge-matches + (make-regexp-tweaker (lambda (rx) (format "(?=.)(?:~a)(?<=.)" rx)))) + (define (bstring->no-edge-regexp name pattern) + (if (or (regexp? pattern) (byte-regexp? pattern) + (string? pattern) (bytes? pattern)) + (no-empty-edge-matches pattern) + (raise-type-error + name "regexp, byte regexp, string, or byte string" pattern))) (define-syntax-rule (regexp-loop name loop start end rx string success-choose failure-k @@ -174,21 +176,6 @@ void))] [end (and end (- end start))] [m (regexp-match rx string 0 end spitout)] - ;; re-match if we get a zero-length match at the - ;; beginning - [m (if (and m ; we have a match - ;; and it's an empty one - (zero? (bstring-length (car m))) - ;; and it's at the beginning - (zero? (if need-leftover? - (file-position spitout) - discarded/leftovers)) - ;; and we still have stuff to match - (if end - (< 0 end) - (not (eof-object? (peek-byte string))))) - (regexp-match rx string 1 end spitout) - m)] [m (and m (car m))] [discarded/leftovers (if need-leftover? (get-output-bytes spitout) @@ -198,70 +185,31 @@ (bstring-length discarded/leftovers) discarded/leftovers) (bstring-length m)))]) - ;; drop matches that are both empty and at the end - (if (and m (or (< 0 (bstring-length m)) - (if end - (< 0 end) - (not (eof-object? (peek-byte string)))))) + (if m (loop (cons (port-success-choose m discarded/leftovers) acc) 0 end) (port-failure-k acc discarded/leftovers))) ;; String/port match, get positions - (let* ([match (if peek? - regexp-match-peek-positions - regexp-match-positions)] - [m (match rx string start end)]) + (let ([m (if peek? + (regexp-match-peek-positions rx string start end) + (regexp-match-positions rx string start end))]) (if (not m) (failure-k acc start end) - (let* ([mstart (caar m)] - [mend (cdar m)] - ;; re-match if we get a zero-length match at the - ;; beginning, and we can continue - [m (if (and (= mstart mend start) - (cond - [end (< start end)] - [len (< start len)] - [(input-port? string) - (not (eof-object? (peek-byte string)))] - [else (error "internal error (str)")])) - (if (or peek? (not (input-port? string))) - (match rx string (add1 start) end) - ;; rematching on a port requires adding `start' - ;; offsets - (let ([m (match rx string 1 end)]) - (if (and m (positive? start)) - (list (cons (+ start (caar m)) - (+ start (cdar m)))) - m))) - m)]) - ;; fail if rematch failed - (if (not m) - (failure-k acc start end) - (let ([mstart (caar m)] - [mend (cdar m)]) - ;; or if we have a zero-length match at the end - (if (and (= mstart mend) - (cond [end (= mend end)] - [len (= mend len)] - [(input-port? string) - (eof-object? - (peek-byte string (if peek? mend 0)))] - [else (error "internal error (str)")])) - (failure-k acc start end) - (if port-success-k - (port-success-k - (lambda (acc new-start new-end) - (loop acc new-start new-end)) - acc start end mstart mend) - (loop (cons (success-choose start mstart mend) acc) - mend end))))))))))))) + (let ([mstart (caar m)] [mend (cdar m)]) + (if port-success-k + (port-success-k + (lambda (acc new-start new-end) + (loop acc new-start new-end)) + acc start end mstart mend) + (loop (cons (success-choose start mstart mend) acc) + mend end)))))))))) ;; Returns all the positions at which the pattern matched. (define (regexp-match-positions* pattern string [start 0] [end #f]) (regexp-loop regexp-match-positions* loop start end - (bstring->regexp 'regexp-match-positions* pattern) string + (bstring->no-edge-regexp 'regexp-match-positions* pattern) string ;; success-choose: (lambda (start mstart mend) (cons mstart mend)) ;; failure-k: @@ -285,7 +233,7 @@ (define (regexp-match-peek-positions* pattern string [start 0] [end #f]) (regexp-loop regexp-match-peek-positions* loop start end - (bstring->regexp 'regexp-match-peek-positions* pattern) string + (bstring->no-edge-regexp 'regexp-match-peek-positions* pattern) string ;; success-choose: (lambda (start mstart mend) (cons mstart mend)) ;; failure-k: @@ -300,7 +248,7 @@ ;; Splits a string into a list by removing any piece which matches ;; the pattern. (define (regexp-split pattern string [start 0] [end #f]) - (define rx (bstring->regexp 'regexp-split pattern)) + (define rx (bstring->no-edge-regexp 'regexp-split pattern)) (define buf (if (and (string? string) (byte-regexp? rx)) (string->bytes/utf-8 string (char->integer #\?)) string)) @@ -322,7 +270,7 @@ ;; Returns all the matches for the pattern in the string. (define (regexp-match* pattern string [start 0] [end #f]) - (define rx (bstring->regexp 'regexp-match* pattern)) + (define rx (bstring->no-edge-regexp 'regexp-match* pattern)) (define buf (if (and (string? string) (byte-regexp? rx)) (string->bytes/utf-8 string (char->integer #\?)) string)) diff --git a/collects/tests/mzscheme/string.ss b/collects/tests/mzscheme/string.ss index 55f00ed806..8f4314bbac 100644 --- a/collects/tests/mzscheme/string.ss +++ b/collects/tests/mzscheme/string.ss @@ -39,6 +39,7 @@ (define (->b x) (cond [(list? x) (map ->b x)] [(string? x) (string->bytes/utf-8 x)] + [(pregexp? x) (byte-pregexp (->b (object-name x)))] [else x])) (define fun* #f) (define t @@ -126,8 +127,8 @@ (t '((0 . 1) (2 . 3) (4 . 5)) "a b c" "[abc]" "a b c" 0 #f) (t '((0 . 1) (2 . 3) (4 . 5)) "a b c" "[abc]" "a b c" 0 5) ;; ---------- tests with zero-length matches ---------- - ;; Many of these tests can be repeated with Perl. To try something - ;; in Perl, put this code in a file: + ;; Many of these tests can be repeated with Perl. To try something in Perl, + ;; put this code in a file: ;; #!/usr/bin/perl ;; sub test { ;; my ($rx,$str) = @_; @words = split /$rx/, $str; @@ -136,15 +137,16 @@ ;; print ") eof \"$rx\" \"$str\")\n"; ;; }; ;; test("[abc]","1a2b3"); - ;; and it will print a test that does what perl is doing. Tests - ;; that differ from Perl have explanations. + ;; and it will print a test that does what perl is doing. Tests that differ + ;; from Perl have explanations. ;; (t regexp-split) ;; test("a","a"); ;; (t '() eof "a" "a") - ;; perl returns an empty list, we return '("" ""), and this is a - ;; difference that is unrelated to dealing with empty matches, - ;; just the way that perl's split throws out some empty matches. + ;; perl returns an empty list, we return '("" ""), and this is a difference + ;; that is unrelated to dealing with empty matches, just the way that + ;; perl's split throws out some empty matches (it throws empty matches at + ;; the end (but not at the beginning for some reason...)) (t '("" "") eof "a" "a") ;; test("3","123"); ;; (t '("12") eof "3" "123") @@ -162,49 +164,51 @@ (t '("1" "2" "3" "4") eof " *" "12 34") ;; test(" *"," 12 34 "); ;; (t '("" "1" "2" "3" "4") eof " *" " 12 34 ") - ;; perl drops the last empty string, we don't -- unrelated to - ;; empty matches (same as the <"a","a"> case above) + ;; again, perl drops the last empty string but we don't (t '("" "1" "2" "3" "4" "") eof " *" " 12 34 ") ;; test("2|", "1234"); (t '("1" "3" "4") eof "2|" "1234") ;; test("1|", "1234"); (t '("" "2" "3" "4") eof "1|" "1234") ;; test("4|", "1234"); - ;; perl drops the last empty string, we don't, same as above + ;; (t '("1" "2" "3") eof "4|" "1234") + ;; perl perl drops the last empty string again (t '("1" "2" "3" "") eof "4|" "1234") ;; test("|2", "1234"); - ;; (t '("1" "" "3" "4") eof "|2" "1234") - ;; perl will find the "2", we can't do that since we'll always - ;; find the empty match first, so it's just like using "" (to do - ;; the perl thing, we'll need a hook into the matcher's C code, or - ;; some way of saying `match this pattern but prefer a non-empty - ;; match if possible') - (t '("1" "2" "3" "4") eof "|2" "1234") + (t '("1" "" "3" "4") eof "|2" "1234") ;; test("2|3", "1234"); (t '("1" "" "4") eof "2|3" "1234") + ;; test("2|3|4", "12345"); + (t '("1" "" "" "5") eof "2|3|4" "12345") ;; test("1|2", "1234"); (t '("" "" "34") eof "1|2" "1234") ;; test("3|4", "1234"); ;; (t '("12") eof "3|4" "1234") - ;; again, perl dumps empty matches at the end, even two + ;; perl perl drops the last empty string again -- even two here (t '("12" "" "") eof "3|4" "1234") + ;; test("2|3|4", "1234"); + ;; (t '("1") eof "2|3|4" "1234") + ;; ...and even three in this example + (t '("1" "" "" "") eof "2|3|4" "1234") ;; test('$',"123"); (t '("123") eof "$" "123") ;; test('^',"123"); ;; (t '("123") eof "^" "123") - ;; this is a technicality: perl matches "^" once, but mzscheme - ;; matches on whatever `start' may be; perl is treating it as a - ;; beginning-of-line instead of a ...-of-string behind your back - ;; "since it isn't much use otherwise" - ;; (http://perldoc.perl.org/functions/split.html); so our correct - ;; test is: - (t '("1" "2" "3") eof "^" "123") - ;; and we can get the same with "(m?:^)": - (t '("123") eof "(m?:^)" "123") + ;; this is a technicality: perl matches "^" once, but mzscheme matches on + ;; whatever `start' may be; perl is treating it as a beginning-of-line + ;; instead of a beginning-of-string behind your back "since it isn't much + ;; use otherwise" (http://perldoc.perl.org/functions/split.html); but we + ;; don't allow empty matches at the beginning, so a `^' will never match, + ;; and we get the same behavior anyway: + (t '("123") eof "^" "123") + ;; test('^',"123\n456"); + ;; (t '("123\n" "456") eof "^" "123\n456") + ;; we can get the same behavior as perl's with "(m?:^)": + (t '("123\n" "456") eof "(?m:^)" "123\n456") + ;; test("\\b", "123 456"); + (t '("123" " " "456") eof #px"\\b" "123 456") ;; test("^|a", "abc"); - ;; (t '("" "bc") eof "^|a" "abc") - ;; same deal here, use "(m?:^)": - (t '("" "bc") eof "(m?:^|a)" "abc") + (t '("" "bc") eof "^|a" "abc") ;; some tests with bounds (these have no perl equivalences) (t '("1" "2" " " "3" "4") eof "" "12 34" 0) (t '("1" "2" " " "3" "4") eof "" "12 34" 0 #f) @@ -244,12 +248,13 @@ (apply (if (string? (car s)) string-append bytes-append) (car s) (append-map list m (cdr s))))))) - (t "12 34" #f " " "12 34") + (t "12 34" #f " " "12 34") (t " 12 34 " #f " " " 12 34 ") - (t "12 34" #f " *" "12 34") + (t "12 34" #f " *" "12 34") (t " 12 34 " #f " *" " 12 34 ") - (t "12 34" #f "" "12 34") - (t " 12 34 " #f "" " 12 34 ")) + (t "12 34" #f "" "12 34") + (t " 12 34 " #f "" " 12 34 ") + ) ;; ---------- string-append* ---------- (let ()