Implemented a `no-empty-edge-matches' function using the new
`make-regexp-tweaker', which produces a regexp that cannot match an empty string at the beginning or the end of the input. This: * simplies a whole bunch of messy and fragile code * makes `regexp-split' and friends usable with a pattern like #px"\\b" (which previously would match in every position, making it explode the string to 1-character strings) * makes it even closer to what perl does, the only way that we produce a different result from perl now is that we don't blindly drop empty matches at the end like perl (it *does* keep empty matches in the beginning though) The two tests that demonstrated the difference are now changed, and a bunch of other tests added. svn: r12575
This commit is contained in:
parent
70e85a62a9
commit
5b9f0aa322
|
@ -76,9 +76,10 @@
|
||||||
;; because of the intermediate regexp being recreated
|
;; because of the intermediate regexp being recreated
|
||||||
[(string? rx) (tweak (lambda (x) x) regexp ->str)]
|
[(string? rx) (tweak (lambda (x) x) regexp ->str)]
|
||||||
[(bytes? rx) (tweak (lambda (x) x) byte-regexp ->bts)]
|
[(bytes? rx) (tweak (lambda (x) x) byte-regexp ->bts)]
|
||||||
[else (raise-type-error 'regexp-tweaker
|
[else (raise-type-error
|
||||||
"regexp of any kind, string, or bytes"
|
'regexp-tweaker
|
||||||
rx)]))
|
"regexp, byte regexp, string, or byte string"
|
||||||
|
rx)]))
|
||||||
(or (hash-ref t rx #f)
|
(or (hash-ref t rx #f)
|
||||||
(let ([rx* (run-tweak)]) (hash-set! t rx rx*) rx*)))))
|
(let ([rx* (run-tweak)]) (hash-set! t rx rx*) rx*)))))
|
||||||
|
|
||||||
|
@ -107,13 +108,14 @@
|
||||||
;; Helper macro for the regexp functions below, with some utilities.
|
;; Helper macro for the regexp functions below, with some utilities.
|
||||||
(define (bstring-length s)
|
(define (bstring-length s)
|
||||||
(if (bytes? s) (bytes-length s) (string-length s)))
|
(if (bytes? s) (bytes-length s) (string-length s)))
|
||||||
(define (bstring->regexp name pattern)
|
(define no-empty-edge-matches
|
||||||
(cond [(regexp? pattern) pattern]
|
(make-regexp-tweaker (lambda (rx) (format "(?=.)(?:~a)(?<=.)" rx))))
|
||||||
[(byte-regexp? pattern) pattern]
|
(define (bstring->no-edge-regexp name pattern)
|
||||||
[(string? pattern) (regexp pattern)]
|
(if (or (regexp? pattern) (byte-regexp? pattern)
|
||||||
[(bytes? pattern) (byte-regexp pattern)]
|
(string? pattern) (bytes? pattern))
|
||||||
[else (raise-type-error
|
(no-empty-edge-matches pattern)
|
||||||
name "regexp, byte regexp, string, or byte string" pattern)]))
|
(raise-type-error
|
||||||
|
name "regexp, byte regexp, string, or byte string" pattern)))
|
||||||
(define-syntax-rule (regexp-loop
|
(define-syntax-rule (regexp-loop
|
||||||
name loop start end rx string
|
name loop start end rx string
|
||||||
success-choose failure-k
|
success-choose failure-k
|
||||||
|
@ -174,21 +176,6 @@
|
||||||
void))]
|
void))]
|
||||||
[end (and end (- end start))]
|
[end (and end (- end start))]
|
||||||
[m (regexp-match rx string 0 end spitout)]
|
[m (regexp-match rx string 0 end spitout)]
|
||||||
;; re-match if we get a zero-length match at the
|
|
||||||
;; beginning
|
|
||||||
[m (if (and m ; we have a match
|
|
||||||
;; and it's an empty one
|
|
||||||
(zero? (bstring-length (car m)))
|
|
||||||
;; and it's at the beginning
|
|
||||||
(zero? (if need-leftover?
|
|
||||||
(file-position spitout)
|
|
||||||
discarded/leftovers))
|
|
||||||
;; and we still have stuff to match
|
|
||||||
(if end
|
|
||||||
(< 0 end)
|
|
||||||
(not (eof-object? (peek-byte string)))))
|
|
||||||
(regexp-match rx string 1 end spitout)
|
|
||||||
m)]
|
|
||||||
[m (and m (car m))]
|
[m (and m (car m))]
|
||||||
[discarded/leftovers (if need-leftover?
|
[discarded/leftovers (if need-leftover?
|
||||||
(get-output-bytes spitout)
|
(get-output-bytes spitout)
|
||||||
|
@ -198,70 +185,31 @@
|
||||||
(bstring-length discarded/leftovers)
|
(bstring-length discarded/leftovers)
|
||||||
discarded/leftovers)
|
discarded/leftovers)
|
||||||
(bstring-length m)))])
|
(bstring-length m)))])
|
||||||
;; drop matches that are both empty and at the end
|
(if m
|
||||||
(if (and m (or (< 0 (bstring-length m))
|
|
||||||
(if end
|
|
||||||
(< 0 end)
|
|
||||||
(not (eof-object? (peek-byte string))))))
|
|
||||||
(loop (cons (port-success-choose m discarded/leftovers) acc)
|
(loop (cons (port-success-choose m discarded/leftovers) acc)
|
||||||
0 end)
|
0 end)
|
||||||
(port-failure-k acc discarded/leftovers)))
|
(port-failure-k acc discarded/leftovers)))
|
||||||
|
|
||||||
;; String/port match, get positions
|
;; String/port match, get positions
|
||||||
(let* ([match (if peek?
|
(let ([m (if peek?
|
||||||
regexp-match-peek-positions
|
(regexp-match-peek-positions rx string start end)
|
||||||
regexp-match-positions)]
|
(regexp-match-positions rx string start end))])
|
||||||
[m (match rx string start end)])
|
|
||||||
(if (not m)
|
(if (not m)
|
||||||
(failure-k acc start end)
|
(failure-k acc start end)
|
||||||
(let* ([mstart (caar m)]
|
(let ([mstart (caar m)] [mend (cdar m)])
|
||||||
[mend (cdar m)]
|
(if port-success-k
|
||||||
;; re-match if we get a zero-length match at the
|
(port-success-k
|
||||||
;; beginning, and we can continue
|
(lambda (acc new-start new-end)
|
||||||
[m (if (and (= mstart mend start)
|
(loop acc new-start new-end))
|
||||||
(cond
|
acc start end mstart mend)
|
||||||
[end (< start end)]
|
(loop (cons (success-choose start mstart mend) acc)
|
||||||
[len (< start len)]
|
mend end))))))))))
|
||||||
[(input-port? string)
|
|
||||||
(not (eof-object? (peek-byte string)))]
|
|
||||||
[else (error "internal error (str)")]))
|
|
||||||
(if (or peek? (not (input-port? string)))
|
|
||||||
(match rx string (add1 start) end)
|
|
||||||
;; rematching on a port requires adding `start'
|
|
||||||
;; offsets
|
|
||||||
(let ([m (match rx string 1 end)])
|
|
||||||
(if (and m (positive? start))
|
|
||||||
(list (cons (+ start (caar m))
|
|
||||||
(+ start (cdar m))))
|
|
||||||
m)))
|
|
||||||
m)])
|
|
||||||
;; fail if rematch failed
|
|
||||||
(if (not m)
|
|
||||||
(failure-k acc start end)
|
|
||||||
(let ([mstart (caar m)]
|
|
||||||
[mend (cdar m)])
|
|
||||||
;; or if we have a zero-length match at the end
|
|
||||||
(if (and (= mstart mend)
|
|
||||||
(cond [end (= mend end)]
|
|
||||||
[len (= mend len)]
|
|
||||||
[(input-port? string)
|
|
||||||
(eof-object?
|
|
||||||
(peek-byte string (if peek? mend 0)))]
|
|
||||||
[else (error "internal error (str)")]))
|
|
||||||
(failure-k acc start end)
|
|
||||||
(if port-success-k
|
|
||||||
(port-success-k
|
|
||||||
(lambda (acc new-start new-end)
|
|
||||||
(loop acc new-start new-end))
|
|
||||||
acc start end mstart mend)
|
|
||||||
(loop (cons (success-choose start mstart mend) acc)
|
|
||||||
mend end)))))))))))))
|
|
||||||
|
|
||||||
;; Returns all the positions at which the pattern matched.
|
;; Returns all the positions at which the pattern matched.
|
||||||
(define (regexp-match-positions* pattern string [start 0] [end #f])
|
(define (regexp-match-positions* pattern string [start 0] [end #f])
|
||||||
(regexp-loop
|
(regexp-loop
|
||||||
regexp-match-positions* loop start end
|
regexp-match-positions* loop start end
|
||||||
(bstring->regexp 'regexp-match-positions* pattern) string
|
(bstring->no-edge-regexp 'regexp-match-positions* pattern) string
|
||||||
;; success-choose:
|
;; success-choose:
|
||||||
(lambda (start mstart mend) (cons mstart mend))
|
(lambda (start mstart mend) (cons mstart mend))
|
||||||
;; failure-k:
|
;; failure-k:
|
||||||
|
@ -285,7 +233,7 @@
|
||||||
(define (regexp-match-peek-positions* pattern string [start 0] [end #f])
|
(define (regexp-match-peek-positions* pattern string [start 0] [end #f])
|
||||||
(regexp-loop
|
(regexp-loop
|
||||||
regexp-match-peek-positions* loop start end
|
regexp-match-peek-positions* loop start end
|
||||||
(bstring->regexp 'regexp-match-peek-positions* pattern) string
|
(bstring->no-edge-regexp 'regexp-match-peek-positions* pattern) string
|
||||||
;; success-choose:
|
;; success-choose:
|
||||||
(lambda (start mstart mend) (cons mstart mend))
|
(lambda (start mstart mend) (cons mstart mend))
|
||||||
;; failure-k:
|
;; failure-k:
|
||||||
|
@ -300,7 +248,7 @@
|
||||||
;; Splits a string into a list by removing any piece which matches
|
;; Splits a string into a list by removing any piece which matches
|
||||||
;; the pattern.
|
;; the pattern.
|
||||||
(define (regexp-split pattern string [start 0] [end #f])
|
(define (regexp-split pattern string [start 0] [end #f])
|
||||||
(define rx (bstring->regexp 'regexp-split pattern))
|
(define rx (bstring->no-edge-regexp 'regexp-split pattern))
|
||||||
(define buf (if (and (string? string) (byte-regexp? rx))
|
(define buf (if (and (string? string) (byte-regexp? rx))
|
||||||
(string->bytes/utf-8 string (char->integer #\?))
|
(string->bytes/utf-8 string (char->integer #\?))
|
||||||
string))
|
string))
|
||||||
|
@ -322,7 +270,7 @@
|
||||||
|
|
||||||
;; Returns all the matches for the pattern in the string.
|
;; Returns all the matches for the pattern in the string.
|
||||||
(define (regexp-match* pattern string [start 0] [end #f])
|
(define (regexp-match* pattern string [start 0] [end #f])
|
||||||
(define rx (bstring->regexp 'regexp-match* pattern))
|
(define rx (bstring->no-edge-regexp 'regexp-match* pattern))
|
||||||
(define buf (if (and (string? string) (byte-regexp? rx))
|
(define buf (if (and (string? string) (byte-regexp? rx))
|
||||||
(string->bytes/utf-8 string (char->integer #\?))
|
(string->bytes/utf-8 string (char->integer #\?))
|
||||||
string))
|
string))
|
||||||
|
|
|
@ -39,6 +39,7 @@
|
||||||
(define (->b x)
|
(define (->b x)
|
||||||
(cond [(list? x) (map ->b x)]
|
(cond [(list? x) (map ->b x)]
|
||||||
[(string? x) (string->bytes/utf-8 x)]
|
[(string? x) (string->bytes/utf-8 x)]
|
||||||
|
[(pregexp? x) (byte-pregexp (->b (object-name x)))]
|
||||||
[else x]))
|
[else x]))
|
||||||
(define fun* #f)
|
(define fun* #f)
|
||||||
(define t
|
(define t
|
||||||
|
@ -126,8 +127,8 @@
|
||||||
(t '((0 . 1) (2 . 3) (4 . 5)) "a b c" "[abc]" "a b c" 0 #f)
|
(t '((0 . 1) (2 . 3) (4 . 5)) "a b c" "[abc]" "a b c" 0 #f)
|
||||||
(t '((0 . 1) (2 . 3) (4 . 5)) "a b c" "[abc]" "a b c" 0 5)
|
(t '((0 . 1) (2 . 3) (4 . 5)) "a b c" "[abc]" "a b c" 0 5)
|
||||||
;; ---------- tests with zero-length matches ----------
|
;; ---------- tests with zero-length matches ----------
|
||||||
;; Many of these tests can be repeated with Perl. To try something
|
;; Many of these tests can be repeated with Perl. To try something in Perl,
|
||||||
;; in Perl, put this code in a file:
|
;; put this code in a file:
|
||||||
;; #!/usr/bin/perl
|
;; #!/usr/bin/perl
|
||||||
;; sub test {
|
;; sub test {
|
||||||
;; my ($rx,$str) = @_; @words = split /$rx/, $str;
|
;; my ($rx,$str) = @_; @words = split /$rx/, $str;
|
||||||
|
@ -136,15 +137,16 @@
|
||||||
;; print ") eof \"$rx\" \"$str\")\n";
|
;; print ") eof \"$rx\" \"$str\")\n";
|
||||||
;; };
|
;; };
|
||||||
;; test("[abc]","1a2b3");
|
;; test("[abc]","1a2b3");
|
||||||
;; and it will print a test that does what perl is doing. Tests
|
;; and it will print a test that does what perl is doing. Tests that differ
|
||||||
;; that differ from Perl have explanations.
|
;; from Perl have explanations.
|
||||||
;;
|
;;
|
||||||
(t regexp-split)
|
(t regexp-split)
|
||||||
;; test("a","a");
|
;; test("a","a");
|
||||||
;; (t '() eof "a" "a")
|
;; (t '() eof "a" "a")
|
||||||
;; perl returns an empty list, we return '("" ""), and this is a
|
;; perl returns an empty list, we return '("" ""), and this is a difference
|
||||||
;; difference that is unrelated to dealing with empty matches,
|
;; that is unrelated to dealing with empty matches, just the way that
|
||||||
;; just the way that perl's split throws out some empty matches.
|
;; perl's split throws out some empty matches (it throws empty matches at
|
||||||
|
;; the end (but not at the beginning for some reason...))
|
||||||
(t '("" "") eof "a" "a")
|
(t '("" "") eof "a" "a")
|
||||||
;; test("3","123");
|
;; test("3","123");
|
||||||
;; (t '("12") eof "3" "123")
|
;; (t '("12") eof "3" "123")
|
||||||
|
@ -162,49 +164,51 @@
|
||||||
(t '("1" "2" "3" "4") eof " *" "12 34")
|
(t '("1" "2" "3" "4") eof " *" "12 34")
|
||||||
;; test(" *"," 12 34 ");
|
;; test(" *"," 12 34 ");
|
||||||
;; (t '("" "1" "2" "3" "4") eof " *" " 12 34 ")
|
;; (t '("" "1" "2" "3" "4") eof " *" " 12 34 ")
|
||||||
;; perl drops the last empty string, we don't -- unrelated to
|
;; again, perl drops the last empty string but we don't
|
||||||
;; empty matches (same as the <"a","a"> case above)
|
|
||||||
(t '("" "1" "2" "3" "4" "") eof " *" " 12 34 ")
|
(t '("" "1" "2" "3" "4" "") eof " *" " 12 34 ")
|
||||||
;; test("2|", "1234");
|
;; test("2|", "1234");
|
||||||
(t '("1" "3" "4") eof "2|" "1234")
|
(t '("1" "3" "4") eof "2|" "1234")
|
||||||
;; test("1|", "1234");
|
;; test("1|", "1234");
|
||||||
(t '("" "2" "3" "4") eof "1|" "1234")
|
(t '("" "2" "3" "4") eof "1|" "1234")
|
||||||
;; test("4|", "1234");
|
;; test("4|", "1234");
|
||||||
;; perl drops the last empty string, we don't, same as above
|
;; (t '("1" "2" "3") eof "4|" "1234")
|
||||||
|
;; perl perl drops the last empty string again
|
||||||
(t '("1" "2" "3" "") eof "4|" "1234")
|
(t '("1" "2" "3" "") eof "4|" "1234")
|
||||||
;; test("|2", "1234");
|
;; test("|2", "1234");
|
||||||
;; (t '("1" "" "3" "4") eof "|2" "1234")
|
(t '("1" "" "3" "4") eof "|2" "1234")
|
||||||
;; perl will find the "2", we can't do that since we'll always
|
|
||||||
;; find the empty match first, so it's just like using "" (to do
|
|
||||||
;; the perl thing, we'll need a hook into the matcher's C code, or
|
|
||||||
;; some way of saying `match this pattern but prefer a non-empty
|
|
||||||
;; match if possible')
|
|
||||||
(t '("1" "2" "3" "4") eof "|2" "1234")
|
|
||||||
;; test("2|3", "1234");
|
;; test("2|3", "1234");
|
||||||
(t '("1" "" "4") eof "2|3" "1234")
|
(t '("1" "" "4") eof "2|3" "1234")
|
||||||
|
;; test("2|3|4", "12345");
|
||||||
|
(t '("1" "" "" "5") eof "2|3|4" "12345")
|
||||||
;; test("1|2", "1234");
|
;; test("1|2", "1234");
|
||||||
(t '("" "" "34") eof "1|2" "1234")
|
(t '("" "" "34") eof "1|2" "1234")
|
||||||
;; test("3|4", "1234");
|
;; test("3|4", "1234");
|
||||||
;; (t '("12") eof "3|4" "1234")
|
;; (t '("12") eof "3|4" "1234")
|
||||||
;; again, perl dumps empty matches at the end, even two
|
;; perl perl drops the last empty string again -- even two here
|
||||||
(t '("12" "" "") eof "3|4" "1234")
|
(t '("12" "" "") eof "3|4" "1234")
|
||||||
|
;; test("2|3|4", "1234");
|
||||||
|
;; (t '("1") eof "2|3|4" "1234")
|
||||||
|
;; ...and even three in this example
|
||||||
|
(t '("1" "" "" "") eof "2|3|4" "1234")
|
||||||
;; test('$',"123");
|
;; test('$',"123");
|
||||||
(t '("123") eof "$" "123")
|
(t '("123") eof "$" "123")
|
||||||
;; test('^',"123");
|
;; test('^',"123");
|
||||||
;; (t '("123") eof "^" "123")
|
;; (t '("123") eof "^" "123")
|
||||||
;; this is a technicality: perl matches "^" once, but mzscheme
|
;; this is a technicality: perl matches "^" once, but mzscheme matches on
|
||||||
;; matches on whatever `start' may be; perl is treating it as a
|
;; whatever `start' may be; perl is treating it as a beginning-of-line
|
||||||
;; beginning-of-line instead of a ...-of-string behind your back
|
;; instead of a beginning-of-string behind your back "since it isn't much
|
||||||
;; "since it isn't much use otherwise"
|
;; use otherwise" (http://perldoc.perl.org/functions/split.html); but we
|
||||||
;; (http://perldoc.perl.org/functions/split.html); so our correct
|
;; don't allow empty matches at the beginning, so a `^' will never match,
|
||||||
;; test is:
|
;; and we get the same behavior anyway:
|
||||||
(t '("1" "2" "3") eof "^" "123")
|
(t '("123") eof "^" "123")
|
||||||
;; and we can get the same with "(m?:^)":
|
;; test('^',"123\n456");
|
||||||
(t '("123") eof "(m?:^)" "123")
|
;; (t '("123\n" "456") eof "^" "123\n456")
|
||||||
|
;; we can get the same behavior as perl's with "(m?:^)":
|
||||||
|
(t '("123\n" "456") eof "(?m:^)" "123\n456")
|
||||||
|
;; test("\\b", "123 456");
|
||||||
|
(t '("123" " " "456") eof #px"\\b" "123 456")
|
||||||
;; test("^|a", "abc");
|
;; test("^|a", "abc");
|
||||||
;; (t '("" "bc") eof "^|a" "abc")
|
(t '("" "bc") eof "^|a" "abc")
|
||||||
;; same deal here, use "(m?:^)":
|
|
||||||
(t '("" "bc") eof "(m?:^|a)" "abc")
|
|
||||||
;; some tests with bounds (these have no perl equivalences)
|
;; some tests with bounds (these have no perl equivalences)
|
||||||
(t '("1" "2" " " "3" "4") eof "" "12 34" 0)
|
(t '("1" "2" " " "3" "4") eof "" "12 34" 0)
|
||||||
(t '("1" "2" " " "3" "4") eof "" "12 34" 0 #f)
|
(t '("1" "2" " " "3" "4") eof "" "12 34" 0 #f)
|
||||||
|
@ -244,12 +248,13 @@
|
||||||
(apply (if (string? (car s)) string-append bytes-append)
|
(apply (if (string? (car s)) string-append bytes-append)
|
||||||
(car s)
|
(car s)
|
||||||
(append-map list m (cdr s)))))))
|
(append-map list m (cdr s)))))))
|
||||||
(t "12 34" #f " " "12 34")
|
(t "12 34" #f " " "12 34")
|
||||||
(t " 12 34 " #f " " " 12 34 ")
|
(t " 12 34 " #f " " " 12 34 ")
|
||||||
(t "12 34" #f " *" "12 34")
|
(t "12 34" #f " *" "12 34")
|
||||||
(t " 12 34 " #f " *" " 12 34 ")
|
(t " 12 34 " #f " *" " 12 34 ")
|
||||||
(t "12 34" #f "" "12 34")
|
(t "12 34" #f "" "12 34")
|
||||||
(t " 12 34 " #f "" " 12 34 "))
|
(t " 12 34 " #f "" " 12 34 ")
|
||||||
|
)
|
||||||
|
|
||||||
;; ---------- string-append* ----------
|
;; ---------- string-append* ----------
|
||||||
(let ()
|
(let ()
|
||||||
|
|
Loading…
Reference in New Issue
Block a user