Make regexp-split and relatives work with empty matches like other

regexp packages (e.g., Dorai's pregexp and Emacs).

svn: r8556
This commit is contained in:
Eli Barzilay 2008-02-06 19:19:44 +00:00
parent 6436441ebd
commit 8a17372db3
2 changed files with 101 additions and 76 deletions

View File

@ -88,7 +88,8 @@
(define-syntax regexp-loop (define-syntax regexp-loop
(syntax-rules () (syntax-rules ()
[(regexp-loop name loop start end rx string [(regexp-loop name loop start end rx string
success-k port-success-k failure-k port-failure-k success-choose failure-k
port-success-k port-success-choose port-failure-k
need-leftover? peek?) need-leftover? peek?)
(let ([len (cond [(string? string) (string-length string)] (let ([len (cond [(string? string) (string-length string)]
[(bytes? string) (bytes-length string)] [(bytes? string) (bytes-length string)]
@ -120,7 +121,7 @@
(format "ending offset index out of range [~a,~a]: " start len) (format "ending offset index out of range [~a,~a]: " start len)
end)) end))
(reverse (reverse
(let loop ([acc '()] [start start] [end end]) (let loop ([acc '()] [start start] [end end] [skipped? #f])
(when (and need-leftover? (positive? start) (input-port? string)) (when (and need-leftover? (positive? start) (input-port? string))
;; Skip start chars: ;; Skip start chars:
(let ([s (make-bytes 4096)]) (let ([s (make-bytes 4096)])
@ -130,74 +131,90 @@
s string 0 (min (- start n) 4096))]) s string 0 (min (- start n) 4096))])
(unless (eof-object? m) (loop (+ n m)))))))) (unless (eof-object? m) (loop (+ n m))))))))
(if (and port-success-k (input-port? string)) (if (and port-success-choose (input-port? string))
;; Input port match, get string ;; Input port match, get string
(let ([discarded 0] (let* ([discarded 0]
[leftover-port (and need-leftover? (open-output-bytes))]) [leftover-port (and need-leftover? (open-output-bytes))]
(let ([match [match
(regexp-match (regexp-match
rx string rx string
(if need-leftover? 0 start) (if need-leftover? (if skipped? 1 0) start)
(and end (if need-leftover? (- end start) end)) (and end (if need-leftover?
(if need-leftover? (if skipped? (- end start -1) (- end start))
leftover-port end))
(make-output-port (if need-leftover?
'counter leftover-port
always-evt (make-output-port
(lambda (s start end flush? breakable?) 'counter
(let ([c (- end start)]) always-evt
(set! discarded (+ c discarded)) (lambda (s start end flush? breakable?)
c)) (let ([c (- end start)])
void)))] (set! discarded (+ c discarded))
[leftovers c))
(and need-leftover? void)))]
(if (and (regexp? rx) (string? string)) [leftovers
(get-output-string leftover-port) (and need-leftover?
(get-output-bytes leftover-port)))]) (if (and (regexp? rx) (string? string))
(if match (get-output-string leftover-port)
(port-success-k (get-output-bytes leftover-port)))])
acc (if match
(car match) (let* ([mlen (bstring-length (car match))]
(and end (- end (if need-leftover? [skip? (zero? mlen)])
(+ (bstring-length leftovers) start) (loop (cons (port-success-choose (car match) leftovers) acc)
discarded) (if skip? 1 0)
(bstring-length (car match)))) (and end (- end (if need-leftover?
leftovers) (+ (bstring-length leftovers) start
(port-failure-k acc leftovers)))) (if skipped? 1 0))
discarded)
mlen))
skip?))
(port-failure-k acc leftovers)))
;; String/port match, get positions ;; String/port match, get positions
(let ([match ((if peek? (let ([match ((if peek?
regexp-match-peek-positions regexp-match-peek-positions
regexp-match-positions) regexp-match-positions)
rx string start end)]) rx string start end)]
[start (if skipped? (sub1 start) start)])
(if match (if match
(let ([match-start (caar match)] (let* ([mstart (caar match)]
[match-end (cdar match)]) [mend (cdar match)]
(if (= match-start match-end) [skip? (= mstart mend)])
(error 'name ;; The following two pieces are similar, but not
"pattern matched a zero-length substring: ~e" rx) ;; simple to combine and preserve efficiency
(success-k acc start end match-start match-end))) (define (cont acc end* new-start new-end)
(if skip?
(if (and end* (new-start . >= . end*))
(if failure-k (failure-k acc end* end) acc)
(loop acc (add1 new-start) new-end #t))
(loop acc new-start new-end #f)))
(if port-success-k
(port-success-k
(lambda (acc new-start new-end)
(cont acc (or new-end end len) new-start new-end))
acc start end mstart mend)
(cont (cons (success-choose start mstart mend) acc)
(or end len) mend end)))
(failure-k acc start end)))))))])) (failure-k acc start end)))))))]))
;; Returns all the positions at which the pattern matched. ;; Returns all the positions at which the pattern matched.
(define (regexp-match-positions* pattern string [start 0] [end #f]) (define (regexp-match-positions* pattern string [start 0] [end #f])
(define rx (bstring->regexp 'regexp-match-positions* pattern)) (define rx (bstring->regexp 'regexp-match-positions* pattern))
(regexp-loop regexp-match-positions* loop start end rx string (regexp-loop regexp-match-positions* loop start end rx string
;; success-k: ;; success-choose:
(lambda (acc start end match-start match-end) (lambda (start mstart mend) (cons mstart mend))
(let ([acc (cons (cons match-start match-end) acc)])
(if (input-port? string)
;; Need to shift index of rest as reading, cannot do a
;; tail call without adding another state variable to the loop:
(append (map (lambda (p)
(cons (+ match-end (car p)) (+ match-end (cdr p))))
(loop '() 0 (and end (- end match-end))))
acc)
(loop acc match-end end))))
;; port-success-k: use string case
#f
;; failure-k: ;; failure-k:
(lambda (acc start end) acc) (lambda (acc start end) acc)
;; port-fail-k: use string case ;; port-success-k: need to shift index of rest as reading; cannot
;; do a tail call without adding another state variable to the
;; regexp loop, so this remains inefficient
(and (input-port? string)
(lambda (loop acc start end mstart mend)
(append (map (lambda (p)
(cons (+ mend (car p)) (+ mend (cdr p))))
(loop '() 0 (and end (- end mend))))
(cons (cons mstart mend) acc))))
;; other port functions: use string case
#f
#f #f
#f #f
#f)) #f))
@ -206,14 +223,13 @@
(define (regexp-match-peek-positions* pattern string [start 0] [end #f]) (define (regexp-match-peek-positions* pattern string [start 0] [end #f])
(define rx (bstring->regexp 'regexp-match-peek-positions* pattern)) (define rx (bstring->regexp 'regexp-match-peek-positions* pattern))
(regexp-loop regexp-match-peek-positions* loop start end rx string (regexp-loop regexp-match-peek-positions* loop start end rx string
;; success-k: ;; success-choose:
(lambda (acc start end match-start match-end) (lambda (start mstart mend) (cons mstart mend))
(loop (cons (cons match-start match-end) acc) match-end end))
;; port-success-k: use string case
#f
;; failure-k: ;; failure-k:
(lambda (acc start end) acc) (lambda (acc start end) acc)
;; port-fail-k: use string case ;; port functions: use string case
#f
#f
#f #f
#f #f
#t)) #t))
@ -227,16 +243,16 @@
string)) string))
(define sub (if (bytes? buf) subbytes substring)) (define sub (if (bytes? buf) subbytes substring))
(regexp-loop regexp-split loop start end rx buf (regexp-loop regexp-split loop start end rx buf
;; success-k: ;; success-choose:
(lambda (acc start end match-start match-end) (lambda (start mstart mend) (sub buf start mstart))
(loop (cons (sub buf start match-start) acc) match-end end))
;; port-success-k:
(lambda (acc match-string new-end leftovers)
(loop (cons leftovers acc) 0 new-end))
;; failure-k: ;; failure-k:
(lambda (acc start end) (lambda (acc start end)
(cons (if end (sub buf start end) (sub buf start)) acc)) (cons (if end (sub buf start end) (sub buf start)) acc))
;; port-fail-k ;; port-success-k:
#f
;; port-success-choose:
(lambda (match-string leftovers) leftovers)
;; port-failure-k:
(lambda (acc leftover) (cons leftover acc)) (lambda (acc leftover) (cons leftover acc))
#t #t
#f)) #f))
@ -249,15 +265,15 @@
string)) string))
(define sub (if (bytes? buf) subbytes substring)) (define sub (if (bytes? buf) subbytes substring))
(regexp-loop regexp-match* loop start end rx buf (regexp-loop regexp-match* loop start end rx buf
;; success-k: ;; success-choose:
(lambda (acc start end match-start match-end) (lambda (start mstart mend) (sub buf mstart mend))
(loop (cons (sub buf match-start match-end) acc) match-end end))
;; port-success-k:
(lambda (acc match-string new-end leftovers)
(loop (cons match-string acc) 0 new-end))
;; failure-k: ;; failure-k:
(lambda (acc start end) acc) (lambda (acc start end) acc)
;; port-fail-k: ;; port-success-k:
#f
;; port-success-choose:
(lambda (match-string leftovers) match-string)
;; port-failure-k:
(lambda (acc leftover) acc) (lambda (acc leftover) acc)
#f #f
#f)) #f))

View File

@ -138,6 +138,15 @@
(test '(#"here's " #" " #"u" #"k") regexp-split "[abc]" s 0 #f) (test '(#"here's " #" " #"u" #"k") regexp-split "[abc]" s 0 #f)
(test eof read-char s)) (test eof read-char s))
;; test with zero-length matches
(test '("" "f" "o" "o" "") regexp-split #rx"" "foo")
(test '("" "f" "o" "o" " " "b" "a" "r" "") regexp-split #rx"" "foo bar")
(test '("" "f" "o" "o" "" "b" "a" "r" "") regexp-split #rx" *" "foo bar")
(test '("f" "" "ar") regexp-split #rx"oo| b" "foo bar")
(test '("foo bar" "") regexp-split #rx"$" "foo bar")
;; this doesn't work (like in Emacs) because ^ matches the start pos
;; (test '("" "foo bar") regexp-split #rx"^" "foo bar")
(let ([g->re-test (let ([g->re-test
(lambda (glob . more) (lambda (glob . more)
(let ([re (apply glob->regexp glob more)]) (let ([re (apply glob->regexp glob more)])