Perl-like handling of empty matches in regexp-match*, etc., though without Perl-like filtering of empty strings in the result of regexp-split (PR 10855)

This commit is contained in:
Matthew Flatt 2010-04-14 22:53:14 -04:00
parent 5abf8bb530
commit 4bc155905a
8 changed files with 217 additions and 98 deletions

View File

@ -16,9 +16,12 @@
call-with-input-file
call-with-output-file
with-input-from-file
with-output-to-file)
with-output-to-file
regexp-replace*)
(all-from "private/list.ss")
(all-from "private/string.ss")
(all-from-except "private/string.ss"
-regexp-replace*)
(rename -regexp-replace* regexp-replace*)
identifier?
(all-from "private/stxcase-scheme.ss")
(all-from "private/qqstx.ss")

View File

@ -8,7 +8,8 @@
regexp-match-peek-positions*
regexp-split
regexp-match-exact?
regexp-try-match)
regexp-try-match
-regexp-replace*)
(require (for-syntax "stxcase-scheme.ss"))
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@ -111,8 +112,11 @@
(define (no-empty-edge-matches n)
(make-regexp-tweaker (lambda (rx)
(if (bytes? rx)
(bytes-append #"(?=.)(?:" rx #")(?<=" (make-bytes n (char->integer #\.)) #")")
(format "(?=.)(?:~a)(?<=~a)" rx (make-bytes n (char->integer #\.)))))))
(bytes-append #"(?:"
rx
#")(?<=" (make-bytes n (char->integer #\.)) #")")
(format "(?:~a)(?<=~a)"
rx (make-bytes n (char->integer #\.)))))))
(define-syntax-rule (regexp-loop
name loop start end pattern string
ipre
@ -159,11 +163,11 @@
(format "ending offset index out of range [~a,~a]: " start len)
end))
(reverse
(let loop ([acc '()] [start start] [end end] [ipre ipre] [rx #f] [rx-lb 0])
(let* ([new-rx-lb (add1 (bytes-length ipre))]
[rx (if (= rx-lb new-rx-lb)
rx
((no-empty-edge-matches new-rx-lb) orig-rx))])
(let loop ([acc '()] [start start] [end end] [ipre ipre] [0-ok? #t])
(let* ([rx (if 0-ok?
orig-rx
((no-empty-edge-matches (add1 (bytes-length ipre)))
orig-rx))])
(if (and port-success-choose (input-port? string))
;; Input port match, get string
@ -187,42 +191,47 @@
c))
void))]
[end (and end (- end start))])
(let-values ([(m ipre) (regexp-match/end rx string 0 end spitout ipre
(let-values ([(ms ipre) (regexp-match/end rx
string 0 end spitout ipre
max-lookbehind)])
(let* ([m (and m (car m))]
(let* ([m (and ms (car ms))]
[discarded/leftovers (if need-leftover?
(get-output-bytes spitout)
discarded/leftovers)]
[skipped (if need-leftover?
(bstring-length discarded/leftovers)
discarded/leftovers)]
[got (and m (bstring-length m))]
[end (and end m
(- end (if need-leftover?
(bstring-length discarded/leftovers)
discarded/leftovers)
(bstring-length m)))])
(- end skipped got))])
(if m
(loop (cons (port-success-choose m discarded/leftovers) acc)
0 end ipre
rx new-rx-lb)
(let ([0-ok? (not (zero? got))])
(loop (port-success-choose m discarded/leftovers ms acc)
0 end ipre 0-ok?))
(port-failure-k acc discarded/leftovers)))))
;; String/port match, get positions
(let-values ([(m ipre)
(if peek?
(regexp-match-peek-positions/end rx string start end #f ipre
(regexp-match-peek-positions/end rx
string start end #f ipre
max-lookbehind)
(regexp-match-positions/end rx string start end #f ipre
(regexp-match-positions/end rx
string start end #f ipre
max-lookbehind))])
(if (not m)
(failure-k acc start end)
(let ([mstart (caar m)] [mend (cdar m)])
(let* ([mstart (caar m)]
[mend (cdar m)]
[0-ok? (not (= mstart mend))])
(if port-success-k
(port-success-k
(lambda (acc new-start new-end)
(loop acc new-start new-end ipre rx new-rx-lb))
(loop acc new-start new-end ipre 0-ok?))
acc start end mstart mend)
(loop (cons (success-choose start mstart mend) acc)
mend end ipre rx new-rx-lb)))))))))))
(loop (success-choose start mstart mend m acc)
mend end ipre 0-ok?)))))))))))
;; Returns all the positions at which the pattern matched.
(define (regexp-match-positions* pattern string [start 0] [end #f] [ipre #""])
(regexp-loop
@ -230,7 +239,7 @@
pattern string
ipre
;; success-choose:
(lambda (start mstart mend) (cons mstart mend))
(lambda (start mstart mend ms acc) (cons (cons mstart mend) acc))
;; failure-k:
(lambda (acc start end) acc)
;; port-success-k: need to shift index of rest as reading; cannot
@ -255,7 +264,7 @@
pattern string
ipre
;; success-choose:
(lambda (start mstart mend) (cons mstart mend))
(lambda (start mstart mend ms acc) (cons (cons mstart mend) acc))
;; failure-k:
(lambda (acc start end) acc)
;; port functions: use string case
@ -275,19 +284,133 @@
(define sub (if (bytes? buf) subbytes substring))
(regexp-loop regexp-split loop start end pattern buf ipre
;; success-choose:
(lambda (start mstart mend) (sub buf start mstart))
(lambda (start mstart mend ms acc) (cons (sub buf start mstart) acc))
;; failure-k:
(lambda (acc start end)
(cons (if end (sub buf start end) (sub buf start)) acc))
;; port-success-k:
#f
;; port-success-choose:
(lambda (match-string leftovers) leftovers)
(lambda (match-string leftovers ms acc) (cons leftovers acc))
;; port-failure-k:
(lambda (acc leftover) (if leftover (cons leftover acc) acc))
#t
#f))
;; Like splitting, but insert a replacement between matches
(define -regexp-replace*
(let ([regexp-replace*
(lambda (pattern string orig-replacement [ipre #""])
(define buf (if (and (string? string) (or (byte-regexp? pattern)
(bytes? pattern)))
(string->bytes/utf-8 string (char->integer #\?))
string))
(define sub (if (bytes? buf) subbytes substring))
(define start 0)
(define end #f)
(define needs-string?
(and (or (string? pattern) (regexp? pattern))
(string? string)))
(define replacement
(if (and (not needs-string?) (string? orig-replacement))
(string->bytes/utf-8 orig-replacement)
orig-replacement))
(define (check proc arg)
(let ([v (proc arg)])
(unless (if needs-string?
(string? v)
(bytes? v))
(raise-mismatch-error '|regexp-replace* (calling given filter procedure)|
(if needs-string?
"expected a string result: "
"expected a byte string result: ")
v))
v))
(define rx:sub #rx#"^(?:[^&\\]*[\\][&\\])*[^&\\]*(?:&|[\\](?=[^&\\]|$))")
(define need-replac? (and (not (procedure? replacement))
(regexp-match? rx:sub replacement)))
(define (replac ms str)
(if need-replac?
((if (string? str) bytes->string/utf-8 values)
(apply
bytes-append
(let ([str (if (string? str) (string->bytes/utf-8 str) str)]
[get-match (lambda (n)
(if (n . < . (length ms))
(let* ([p (list-ref ms n)]
[s (if (pair? p)
(sub buf (car p) (cdr p))
p)])
(if (string? s)
(string->bytes/utf-8 s)
s))
#""))])
(let loop ([pos 0])
(let ([m (regexp-match-positions #rx#"[\\&]" str pos)])
(if m
(cons (subbytes str pos (caar m))
(cond
[(equal? (char->integer #\&) (bytes-ref str (caar m)))
(cons (get-match 0) (loop (cdar m)))]
[(= (cdar m) (bytes-length str))
;; \ with no following character
(list (get-match 0))]
[(let ([next (bytes-ref str (cdar m))])
(or (and (equal? (char->integer #\&) next)
#"&")
(and (equal? (char->integer #\\) next)
#"\\")))
=> (lambda (s)
(cons s (loop (add1 (cdar m)))))]
[else
(let ([n (regexp-match #rx#"^[0-9]+" str (cdar m))])
(if n
(cons (get-match (string->number (bytes->string/utf-8 (car n))))
(loop (+ (cdar m) (bytes-length (car n)))))
(cons (get-match 0)
(loop (cdar m)))))]))
(list (subbytes str pos))))))))
str))
(when (or (string? pattern) (bytes? pattern)
(regexp? pattern) (byte-regexp? pattern))
(unless (or (string? string)
(bytes? string))
(raise-type-error 'regexp-replace* "string or byte string" string))
(unless (or (string? replacement)
(bytes? replacement)
(and (procedure? replacement)
(procedure-arity-includes? replacement 1)))
(raise-type-error 'regexp-replace* "string, byte string, or procedure (arity 1)"
replacement))
(when (and needs-string? (bytes? replacement))
(raise-mismatch-error 'regexp-replace*
"cannot replace a string with a byte string: "
replacement)))
(apply
(if (bytes? buf) bytes-append string-append)
(regexp-loop
regexp-replace* loop start end pattern buf ipre
;; success-choose:
(lambda (start mstart mend ms acc) (list* (if (procedure? replacement)
(check
replacement
(sub buf mstart mend))
(replac ms replacement))
(sub buf start mstart)
acc))
;; failure-k:
(lambda (acc start end)
(cons (if end (sub buf start end) (sub buf start)) acc))
;; port-success-k:
#f
;; port-success-choose:
#f
;; port-failure-k:
#f
#t
#f)))])
regexp-replace*))
;; Returns all the matches for the pattern in the string.
(define (regexp-match* pattern string [start 0] [end #f] [ipre #""])
(define buf (if (and (string? string) (or (byte-regexp? pattern)
@ -297,13 +420,13 @@
(define sub (if (bytes? buf) subbytes substring))
(regexp-loop regexp-match* loop start end pattern buf ipre
;; success-choose:
(lambda (start mstart mend) (sub buf mstart mend))
(lambda (start mstart mend ms acc) (cons (sub buf mstart mend) acc))
;; failure-k:
(lambda (acc start end) acc)
;; port-success-k:
#f
;; port-success-choose:
(lambda (match-string leftovers) match-string)
(lambda (match-string leftovers ms acc) (cons match-string acc))
;; port-failure-k:
(lambda (acc leftover) acc)
#f

View File

@ -336,14 +336,13 @@ results for parenthesized sub-patterns in @scheme[pattern] are not
returned.)
The @scheme[pattern] is used in order to find matches, where each
match attempt starts at the end of the last match, and @litchar{$} is
match attempt starts at the end of the last match, and @litchar{^} is
allowed to match the beginning of the input (if @scheme[input-prefix]
is @scheme[#""]) only for the first match. Empty matches are handled
like other matches, returning a zero-length string or byte sequence
(they are more useful in the complementing @scheme[regexp-split]
function), but @scheme[pattern] is restricted from matching an empty
string at the beginning (or right after a previous match) or at the
end.
sequence immediately after an empty match.
If @scheme[input] contains no matches (in the range @scheme[start-pos]
to @scheme[end-pos]), @scheme[null] is returned. Otherwise, each item
@ -355,6 +354,7 @@ port).
@examples[
(regexp-match* #rx"x." "12x4x6")
(regexp-match* #rx"x*" "12x4x6")
]}
@ -639,8 +639,8 @@ strings (if @scheme[pattern] is a string or character regexp and
@scheme[input] is a string) or byte strings (otherwise) from in
@scheme[input] that are separated by matches to
@scheme[pattern]. Adjacent matches are separated with @scheme[""] or
@scheme[#""]. Zero-length matches are treated the same as in
@scheme[regexp-match*], but are more useful in this case.
@scheme[#""]. Zero-length matches are treated the same as for
@scheme[regexp-match*].
If @scheme[input] contains no matches (in the range @scheme[start-pos]
to @scheme[end-pos]), the result is a list containing @scheme[input]'s

View File

@ -1062,13 +1062,15 @@
(test '(#"") regexp-match "$" (open-input-string "123") 3)
(test '(#"") regexp-match-peek "" (open-input-string "123") 3)
(test "1b2b3" regexp-replace* "" "123" "b")
(test "b1b2b3b" regexp-replace* "" "123" "b")
(test "1b23" regexp-replace* "(?=2)" "123" "b")
(test "ax\u03BB" regexp-replace* "" "a\u03BB" "x")
(test "ax\u03BBxb" regexp-replace* "" "a\u03BBb" "x")
(test #"ax\316x\273xb" regexp-replace* #"" "a\u03BBb" #"x")
(test "1=2===3" regexp-replace* "2*" "123" (lambda (s) (string-append "=" s "=")))
(test "1=2===3==4" regexp-replace* "2*" "1234" (lambda (s) (string-append "=" s "=")))
(test "xax\u03BBx" regexp-replace* "" "a\u03BB" "x")
(test "xax\u03BBxbx" regexp-replace* "" "a\u03BBb" "x")
(test #"xax\316x\273xbx" regexp-replace* #"" "a\u03BBb" #"x")
(test "==1=2===3==" regexp-replace* "2*" "123" (lambda (s) (string-append "=" s "=")))
(test "==1=2===3==4==" regexp-replace* "2*" "1234" (lambda (s) (string-append "=" s "=")))
(test "x&b\\ab=cy&w\\aw=z" regexp-replace* #rx"a(.)" "xabcyawz" "\\&\\1\\\\&\\99=")
;; Test weird port offsets:
(define (test-weird-offset regexp-match regexp-match-positions)

View File

@ -912,6 +912,7 @@
(#"(?<=(foo)a)bar" #"fooabar" (#"bar" #"foo"))
(#"(?<=(foo)a)bar" #"bar" #f)
(#"(?<=(foo)a)bar" #"foobbar" #f)
(#"a(?<=x|ab)b" #"ab" #f)
(#"(?>(\\.\\d\\d[1-9]?))\\d+" #"1.230003938" (#".230003938" #".23"))
(#"(?>(\\.\\d\\d[1-9]?))\\d+" #"1.875000282" (#".875000282" #".875"))
(#"(?>(\\.\\d\\d[1-9]?))\\d+" #"1.235 " #f)

View File

@ -180,27 +180,25 @@
;; test("a","1a2a3");
(t '("1" "2" "3") eof "a" "1a2a3")
;; test("","123");
(t '("1" "2" "3") eof "" "123")
(t '("" "1" "2" "3" "") eof "" "123")
;; test("","12 34");
(t '("1" "2" " " "3" "4") eof "" "12 34")
(t '("" "1" "2" " " "3" "4" "") eof "" "12 34")
;; test(" *","123");
(t '("1" "2" "3") eof " *" "123")
(t '("" "1" "2" "3" "") eof " *" "123")
;; test(" *","12 34");
(t '("1" "2" "3" "4") eof " *" "12 34")
(t '("" "1" "2" "" "3" "4" "") eof " *" "12 34")
;; test(" *"," 12 34 ");
;; (t '("" "1" "2" "3" "4") eof " *" " 12 34 ")
;; again, perl drops the last empty string but we don't
(t '("" "1" "2" "3" "4" "") eof " *" " 12 34 ")
(t '("" "" "1" "2" "" "3" "4" "" "") eof " *" " 12 34 ")
;; test("2|", "1234");
(t '("1" "3" "4") eof "2|" "1234")
(t '("" "1" "" "3" "4" "") eof "2|" "1234")
;; test("1|", "1234");
(t '("" "2" "3" "4") eof "1|" "1234")
(t '("" "" "2" "3" "4" "") eof "1|" "1234")
;; test("4|", "1234");
;; (t '("1" "2" "3") eof "4|" "1234")
;; perl perl drops the last empty string again
(t '("1" "2" "3" "") eof "4|" "1234")
(t '("" "1" "2" "3" "" "") eof "4|" "1234")
;; test("|2", "1234");
(t '("1" "" "3" "4") eof "|2" "1234")
(t '("" "1" "" "" "3" "4" "") eof "|2" "1234")
;; test("2|3", "1234");
(t '("1" "" "4") eof "2|3" "1234")
;; test("2|3|4", "12345");
@ -216,55 +214,46 @@
;; ...and even three in this example
(t '("1" "" "" "") eof "2|3|4" "1234")
;; test('$',"123");
(t '("123") eof "$" "123")
(t '("123" "") eof "$" "123")
;; test('^',"123");
;; (t '("123") eof "^" "123")
;; this is a technicality: perl matches "^" once, but mzscheme matches on
;; whatever `start' may be; perl is treating it as a beginning-of-line
;; instead of a beginning-of-string behind your back "since it isn't much
;; use otherwise" (http://perldoc.perl.org/functions/split.html); but we
;; don't allow empty matches at the beginning, so a `^' will never match,
;; and we get the same behavior anyway:
(t '("123") eof "^" "123")
(t '("" "123") eof "^" "123")
;; test('^',"123\n456");
;; (t '("123\n" "456") eof "^" "123\n456")
;; we can get the same behavior as perl's with "(m?:^)":
(t '("123\n" "456") eof "(?m:^)" "123\n456")
(t '("" "123\n" "456") eof "(?m:^)" "123\n456")
;; test("\\b", "123 456");
(t '("123" " " "456") eof #px"\\b" "123 456")
(t '("" "123" " " "456" "") eof #px"\\b" "123 456")
;; test("^|a", "abc");
(t '("" "bc") eof "^|a" "abc")
(t '("" "" "bc") eof "^|a" "abc")
;; some tests with bounds (these have no perl equivalences)
(t '("1" "2" " " "3" "4") eof "" "12 34" 0)
(t '("1" "2" " " "3" "4") eof "" "12 34" 0 #f)
(t '("1" "2" " " "3" "4") eof "" "12 34" 0 5)
(t '("" "1" "2" "3" "4" "") eof " *" " 12 34 " 0)
(t '("" "1" "2" "3" "4" "") eof " *" " 12 34 " 0 #f)
(t '("" "1" "2" "3" "4" "") eof " *" " 12 34 " 0 7)
(t '("2" "3") eof " *" "123" 1)
(t '("2" "3" "4") eof " *" "12 34" 1)
(t '("2" "3" "4" "") eof " *" " 12 34 " 2)
(t '("1") "23" "" "123" 0 1)
(t '("2") "3" "" "123" 1 2)
(t '("3") eof "" "123" 2 3)
(t '("1" "2") "3" " *" "123" 0 2)
(t '("2" "3") eof " *" "123" 1 3)
(t '("" "1" "2" " " "3" "4" "") eof "" "12 34" 0)
(t '("" "1" "2" " " "3" "4" "") eof "" "12 34" 0 #f)
(t '("" "1" "2" " " "3" "4" "") eof "" "12 34" 0 5)
(t '("" "" "1" "2" "" "3" "4" "" "") eof " *" " 12 34 " 0)
(t '("" "" "1" "2" "" "3" "4" "" "") eof " *" " 12 34 " 0 #f)
(t '("" "" "1" "2" "" "3" "4" "" "") eof " *" " 12 34 " 0 7)
(t '("" "2" "3" "") eof " *" "123" 1)
(t '("" "2" "" "3" "4" "") eof " *" "12 34" 1)
(t '("" "2" "" "3" "4" "" "") eof " *" " 12 34 " 2)
(t '("" "1" "") "23" "" "123" 0 1)
(t '("" "2" "") "3" "" "123" 1 2)
(t '("" "3" "") eof "" "123" 2 3)
(t '("" "1" "2" "") "3" " *" "123" 0 2)
(t '("" "2" "3" "") eof " *" "123" 1 3)
;; more tests with match*
(t regexp-match*)
(t '("" "") eof "" "123")
(t '("" " " "") eof " *" "12 34")
(t '(" " "" " " "" " ") eof " *" " 12 34 ")
(t '("" " " "") " " " *" " 12 34 " 1 6)
(t '("" "" "" "") eof "" "123")
(t '("" "" " " "" "" "") eof " *" "12 34")
(t '(" " "" "" " " "" "" " " "") eof " *" " 12 34 ")
(t '("" "" " " "" "" "") " " " *" " 12 34 " 1 6)
(t regexp-match-positions*)
(t '((1 . 1) (2 . 2)) eof "" "123")
(t '((1 . 1) (2 . 3) (4 . 4)) eof " *" "12 34")
(t '((0 . 1) (2 . 2) (3 . 4) (5 . 5) (6 . 7)) eof " *" " 12 34 ")
(t '((2 . 2) (3 . 4) (5 . 5)) " " " *" " 12 34 " 1 6)
(t '((0 . 0) (1 . 1) (2 . 2) (3 . 3)) eof "" "123")
(t '((0 . 0) (1 . 1) (2 . 3) (3 . 3) (4 . 4) (5 . 5)) eof " *" "12 34")
(t '((0 . 1) (1 . 1) (2 . 2) (3 . 4) (4 . 4) (5 . 5) (6 . 7) (7 . 7)) eof " *" " 12 34 ")
(t '((1 . 1) (2 . 2) (3 . 4) (4 . 4) (5 . 5) (6 . 6)) " " " *" " 12 34 " 1 6)
(t regexp-match-peek-positions*)
(t '((1 . 1) (2 . 2)) "123" "" "123")
(t '((1 . 1) (2 . 3) (4 . 4)) "12 34" " *" "12 34")
(t '((0 . 1) (2 . 2) (3 . 4) (5 . 5) (6 . 7)) " 12 34 " " *" " 12 34 ")
(t '((2 . 2) (3 . 4) (5 . 5)) " 12 34 " " *" " 12 34 " 1 6)
(t '((0 . 0) (1 . 1) (2 . 2) (3 . 3)) "123" "" "123")
(t '((0 . 0) (1 . 1) (2 . 3) (3 . 3) (4 . 4) (5 . 5)) "12 34" " *" "12 34")
(t '((0 . 1) (1 . 1) (2 . 2) (3 . 4) (4 . 4) (5 . 5) (6 . 7) (7 . 7)) " 12 34 " " *" " 12 34 ")
(t '((1 . 1) (2 . 2) (3 . 4) (4 . 4) (5 . 5) (6 . 6)) " 12 34 " " *" " 12 34 " 1 6)
;; finally, some tests for the match* + split property
(t (lambda (rx str)
(let ([s (regexp-split rx str)]

View File

@ -1,6 +1,7 @@
Version 4.2.5.9
Version 4.2.5.10
regexp-match* et al. now disable ^ matching on all but the first
match
match, and empty matches are allowed in all positions except
immediately after an empty match
Version 4.2.5.3
Added chaperones

View File

@ -3420,7 +3420,7 @@ regmatch(Regwork *rw, rxpos prog)
if (is - rw->input_min >= no) {
rw->input = save - no;
if (regmatch(rw, next)) {
if (is == save) {
if (rw->input == save) {
/* Match */
if (!t) return 0;
found = 1;