Implemented a `no-empty-edge-matches' function using the new

`make-regexp-tweaker', which produces a regexp that cannot match an
empty string at the beginning or the end of the input.  This:
* simplies a whole bunch of messy and fragile code
* makes `regexp-split' and friends usable with a pattern like #px"\\b"
  (which previously would match in every position, making it explode
  the string to 1-character strings)
* makes it even closer to what perl does, the only way that we produce
  a different result from perl now is that we don't blindly drop empty
  matches at the end like perl (it *does* keep empty matches in the
  beginning though)
The two tests that demonstrated the difference are now changed, and a
bunch of other tests added.

svn: r12575
This commit is contained in:
Eli Barzilay 2008-11-23 05:40:54 +00:00
parent 70e85a62a9
commit 5b9f0aa322
2 changed files with 67 additions and 114 deletions

View File

@ -76,9 +76,10 @@
;; because of the intermediate regexp being recreated
[(string? rx) (tweak (lambda (x) x) regexp ->str)]
[(bytes? rx) (tweak (lambda (x) x) byte-regexp ->bts)]
[else (raise-type-error 'regexp-tweaker
"regexp of any kind, string, or bytes"
rx)]))
[else (raise-type-error
'regexp-tweaker
"regexp, byte regexp, string, or byte string"
rx)]))
(or (hash-ref t rx #f)
(let ([rx* (run-tweak)]) (hash-set! t rx rx*) rx*)))))
@ -107,13 +108,14 @@
;; Helper macro for the regexp functions below, with some utilities.
(define (bstring-length s)
(if (bytes? s) (bytes-length s) (string-length s)))
(define (bstring->regexp name pattern)
(cond [(regexp? pattern) pattern]
[(byte-regexp? pattern) pattern]
[(string? pattern) (regexp pattern)]
[(bytes? pattern) (byte-regexp pattern)]
[else (raise-type-error
name "regexp, byte regexp, string, or byte string" pattern)]))
(define no-empty-edge-matches
(make-regexp-tweaker (lambda (rx) (format "(?=.)(?:~a)(?<=.)" rx))))
(define (bstring->no-edge-regexp name pattern)
(if (or (regexp? pattern) (byte-regexp? pattern)
(string? pattern) (bytes? pattern))
(no-empty-edge-matches pattern)
(raise-type-error
name "regexp, byte regexp, string, or byte string" pattern)))
(define-syntax-rule (regexp-loop
name loop start end rx string
success-choose failure-k
@ -174,21 +176,6 @@
void))]
[end (and end (- end start))]
[m (regexp-match rx string 0 end spitout)]
;; re-match if we get a zero-length match at the
;; beginning
[m (if (and m ; we have a match
;; and it's an empty one
(zero? (bstring-length (car m)))
;; and it's at the beginning
(zero? (if need-leftover?
(file-position spitout)
discarded/leftovers))
;; and we still have stuff to match
(if end
(< 0 end)
(not (eof-object? (peek-byte string)))))
(regexp-match rx string 1 end spitout)
m)]
[m (and m (car m))]
[discarded/leftovers (if need-leftover?
(get-output-bytes spitout)
@ -198,70 +185,31 @@
(bstring-length discarded/leftovers)
discarded/leftovers)
(bstring-length m)))])
;; drop matches that are both empty and at the end
(if (and m (or (< 0 (bstring-length m))
(if end
(< 0 end)
(not (eof-object? (peek-byte string))))))
(if m
(loop (cons (port-success-choose m discarded/leftovers) acc)
0 end)
(port-failure-k acc discarded/leftovers)))
;; String/port match, get positions
(let* ([match (if peek?
regexp-match-peek-positions
regexp-match-positions)]
[m (match rx string start end)])
(let ([m (if peek?
(regexp-match-peek-positions rx string start end)
(regexp-match-positions rx string start end))])
(if (not m)
(failure-k acc start end)
(let* ([mstart (caar m)]
[mend (cdar m)]
;; re-match if we get a zero-length match at the
;; beginning, and we can continue
[m (if (and (= mstart mend start)
(cond
[end (< start end)]
[len (< start len)]
[(input-port? string)
(not (eof-object? (peek-byte string)))]
[else (error "internal error (str)")]))
(if (or peek? (not (input-port? string)))
(match rx string (add1 start) end)
;; rematching on a port requires adding `start'
;; offsets
(let ([m (match rx string 1 end)])
(if (and m (positive? start))
(list (cons (+ start (caar m))
(+ start (cdar m))))
m)))
m)])
;; fail if rematch failed
(if (not m)
(failure-k acc start end)
(let ([mstart (caar m)]
[mend (cdar m)])
;; or if we have a zero-length match at the end
(if (and (= mstart mend)
(cond [end (= mend end)]
[len (= mend len)]
[(input-port? string)
(eof-object?
(peek-byte string (if peek? mend 0)))]
[else (error "internal error (str)")]))
(failure-k acc start end)
(if port-success-k
(port-success-k
(lambda (acc new-start new-end)
(loop acc new-start new-end))
acc start end mstart mend)
(loop (cons (success-choose start mstart mend) acc)
mend end)))))))))))))
(let ([mstart (caar m)] [mend (cdar m)])
(if port-success-k
(port-success-k
(lambda (acc new-start new-end)
(loop acc new-start new-end))
acc start end mstart mend)
(loop (cons (success-choose start mstart mend) acc)
mend end))))))))))
;; Returns all the positions at which the pattern matched.
(define (regexp-match-positions* pattern string [start 0] [end #f])
(regexp-loop
regexp-match-positions* loop start end
(bstring->regexp 'regexp-match-positions* pattern) string
(bstring->no-edge-regexp 'regexp-match-positions* pattern) string
;; success-choose:
(lambda (start mstart mend) (cons mstart mend))
;; failure-k:
@ -285,7 +233,7 @@
(define (regexp-match-peek-positions* pattern string [start 0] [end #f])
(regexp-loop
regexp-match-peek-positions* loop start end
(bstring->regexp 'regexp-match-peek-positions* pattern) string
(bstring->no-edge-regexp 'regexp-match-peek-positions* pattern) string
;; success-choose:
(lambda (start mstart mend) (cons mstart mend))
;; failure-k:
@ -300,7 +248,7 @@
;; Splits a string into a list by removing any piece which matches
;; the pattern.
(define (regexp-split pattern string [start 0] [end #f])
(define rx (bstring->regexp 'regexp-split pattern))
(define rx (bstring->no-edge-regexp 'regexp-split pattern))
(define buf (if (and (string? string) (byte-regexp? rx))
(string->bytes/utf-8 string (char->integer #\?))
string))
@ -322,7 +270,7 @@
;; Returns all the matches for the pattern in the string.
(define (regexp-match* pattern string [start 0] [end #f])
(define rx (bstring->regexp 'regexp-match* pattern))
(define rx (bstring->no-edge-regexp 'regexp-match* pattern))
(define buf (if (and (string? string) (byte-regexp? rx))
(string->bytes/utf-8 string (char->integer #\?))
string))

View File

@ -39,6 +39,7 @@
(define (->b x)
(cond [(list? x) (map ->b x)]
[(string? x) (string->bytes/utf-8 x)]
[(pregexp? x) (byte-pregexp (->b (object-name x)))]
[else x]))
(define fun* #f)
(define t
@ -126,8 +127,8 @@
(t '((0 . 1) (2 . 3) (4 . 5)) "a b c" "[abc]" "a b c" 0 #f)
(t '((0 . 1) (2 . 3) (4 . 5)) "a b c" "[abc]" "a b c" 0 5)
;; ---------- tests with zero-length matches ----------
;; Many of these tests can be repeated with Perl. To try something
;; in Perl, put this code in a file:
;; Many of these tests can be repeated with Perl. To try something in Perl,
;; put this code in a file:
;; #!/usr/bin/perl
;; sub test {
;; my ($rx,$str) = @_; @words = split /$rx/, $str;
@ -136,15 +137,16 @@
;; print ") eof \"$rx\" \"$str\")\n";
;; };
;; test("[abc]","1a2b3");
;; and it will print a test that does what perl is doing. Tests
;; that differ from Perl have explanations.
;; and it will print a test that does what perl is doing. Tests that differ
;; from Perl have explanations.
;;
(t regexp-split)
;; test("a","a");
;; (t '() eof "a" "a")
;; perl returns an empty list, we return '("" ""), and this is a
;; difference that is unrelated to dealing with empty matches,
;; just the way that perl's split throws out some empty matches.
;; perl returns an empty list, we return '("" ""), and this is a difference
;; that is unrelated to dealing with empty matches, just the way that
;; perl's split throws out some empty matches (it throws empty matches at
;; the end (but not at the beginning for some reason...))
(t '("" "") eof "a" "a")
;; test("3","123");
;; (t '("12") eof "3" "123")
@ -162,49 +164,51 @@
(t '("1" "2" "3" "4") eof " *" "12 34")
;; test(" *"," 12 34 ");
;; (t '("" "1" "2" "3" "4") eof " *" " 12 34 ")
;; perl drops the last empty string, we don't -- unrelated to
;; empty matches (same as the <"a","a"> case above)
;; again, perl drops the last empty string but we don't
(t '("" "1" "2" "3" "4" "") eof " *" " 12 34 ")
;; test("2|", "1234");
(t '("1" "3" "4") eof "2|" "1234")
;; test("1|", "1234");
(t '("" "2" "3" "4") eof "1|" "1234")
;; test("4|", "1234");
;; perl drops the last empty string, we don't, same as above
;; (t '("1" "2" "3") eof "4|" "1234")
;; perl perl drops the last empty string again
(t '("1" "2" "3" "") eof "4|" "1234")
;; test("|2", "1234");
;; (t '("1" "" "3" "4") eof "|2" "1234")
;; perl will find the "2", we can't do that since we'll always
;; find the empty match first, so it's just like using "" (to do
;; the perl thing, we'll need a hook into the matcher's C code, or
;; some way of saying `match this pattern but prefer a non-empty
;; match if possible')
(t '("1" "2" "3" "4") eof "|2" "1234")
(t '("1" "" "3" "4") eof "|2" "1234")
;; test("2|3", "1234");
(t '("1" "" "4") eof "2|3" "1234")
;; test("2|3|4", "12345");
(t '("1" "" "" "5") eof "2|3|4" "12345")
;; test("1|2", "1234");
(t '("" "" "34") eof "1|2" "1234")
;; test("3|4", "1234");
;; (t '("12") eof "3|4" "1234")
;; again, perl dumps empty matches at the end, even two
;; perl perl drops the last empty string again -- even two here
(t '("12" "" "") eof "3|4" "1234")
;; test("2|3|4", "1234");
;; (t '("1") eof "2|3|4" "1234")
;; ...and even three in this example
(t '("1" "" "" "") eof "2|3|4" "1234")
;; test('$',"123");
(t '("123") eof "$" "123")
;; test('^',"123");
;; (t '("123") eof "^" "123")
;; this is a technicality: perl matches "^" once, but mzscheme
;; matches on whatever `start' may be; perl is treating it as a
;; beginning-of-line instead of a ...-of-string behind your back
;; "since it isn't much use otherwise"
;; (http://perldoc.perl.org/functions/split.html); so our correct
;; test is:
(t '("1" "2" "3") eof "^" "123")
;; and we can get the same with "(m?:^)":
(t '("123") eof "(m?:^)" "123")
;; this is a technicality: perl matches "^" once, but mzscheme matches on
;; whatever `start' may be; perl is treating it as a beginning-of-line
;; instead of a beginning-of-string behind your back "since it isn't much
;; use otherwise" (http://perldoc.perl.org/functions/split.html); but we
;; don't allow empty matches at the beginning, so a `^' will never match,
;; and we get the same behavior anyway:
(t '("123") eof "^" "123")
;; test('^',"123\n456");
;; (t '("123\n" "456") eof "^" "123\n456")
;; we can get the same behavior as perl's with "(m?:^)":
(t '("123\n" "456") eof "(?m:^)" "123\n456")
;; test("\\b", "123 456");
(t '("123" " " "456") eof #px"\\b" "123 456")
;; test("^|a", "abc");
;; (t '("" "bc") eof "^|a" "abc")
;; same deal here, use "(m?:^)":
(t '("" "bc") eof "(m?:^|a)" "abc")
(t '("" "bc") eof "^|a" "abc")
;; some tests with bounds (these have no perl equivalences)
(t '("1" "2" " " "3" "4") eof "" "12 34" 0)
(t '("1" "2" " " "3" "4") eof "" "12 34" 0 #f)
@ -244,12 +248,13 @@
(apply (if (string? (car s)) string-append bytes-append)
(car s)
(append-map list m (cdr s)))))))
(t "12 34" #f " " "12 34")
(t "12 34" #f " " "12 34")
(t " 12 34 " #f " " " 12 34 ")
(t "12 34" #f " *" "12 34")
(t "12 34" #f " *" "12 34")
(t " 12 34 " #f " *" " 12 34 ")
(t "12 34" #f "" "12 34")
(t " 12 34 " #f "" " 12 34 "))
(t "12 34" #f "" "12 34")
(t " 12 34 " #f "" " 12 34 ")
)
;; ---------- string-append* ----------
(let ()