regexp: repair match results on large string

This commit is contained in:
Matthew Flatt 2019-01-19 10:04:46 -07:00
parent 74b34c210d
commit b4cc2d849c
4 changed files with 66 additions and 11 deletions

View File

@ -20,7 +20,7 @@
(end-pos . > . max-end)) (end-pos . > . max-end))
(raise-range-error who (raise-range-error who
"byte string" "byte string"
"starting " "ending "
end-pos end-pos
in-value in-value
0 0

View File

@ -51,6 +51,30 @@
(rx:regexp-match "[abc]" in 0 1 discard2 #"") (rx:regexp-match "[abc]" in 0 1 discard2 #"")
(test (get-output-bytes discard2) #" ")) (test (get-output-bytes discard2) #" "))
;; Input streams that are large enough for bytes to be discarded along the way
(test (rx:regexp-match #"(.)x" (open-input-string (string-append (make-string 50000 #\y) "x")))
'(#"yx" #"y"))
(test (rx:regexp-match-positions #"(.)x" (open-input-string (string-append (make-string 50000 #\y) "x")))
'((49999 . 50001) (49999 . 50000)))
(test (rx:regexp-match "(.)x" (string-append (make-string 50000 #\y) "x"))
'("yx" "y"))
(test (rx:regexp-match-positions "(.)x" (string-append (make-string 50000 #\y) "x"))
'((49999 . 50001) (49999 . 50000)))
(test (rx:regexp-match "(.)\u3BC" (string-append (make-string 50000 #\u3BB) "\u3BC"))
'("\u3BB\u3BC" "\u3BB"))
(test (rx:regexp-match-positions "(.)\u3BC" (string-append (make-string 50000 #\y) "\u3BC"))
'((49999 . 50001) (49999 . 50000)))
(test (rx:regexp-match-positions #"<([abc])(>)?" "<a + <b = <c" 3)
'((5 . 7) (6 . 7) #f))
(test (rx:regexp-match-positions "[abc]" " a b c " 2)
'((3 . 4)))
(test (rx:regexp-match-positions "(?m:^.\n)" "a\nb\nc\n" 2 6 #f #"\n")
'((2 . 4)))
(test (regexp-replace* "-" "zero-or-more?" "_")
"zero_or_more?")
;; ---------------------------------------- ;; ----------------------------------------
(define (check rx in N [M (max 1 (quotient N 10))]) (define (check rx in N [M (max 1 (quotient N 10))])

View File

@ -8,6 +8,8 @@
byte-positions->string-positions byte-positions->string-positions
byte-positions->strings byte-positions->strings
byte-index->string-index
add-end-bytes) add-end-bytes)
@ -27,18 +29,19 @@
(define (byte-positions->bytess in ms-pos me-pos state (define (byte-positions->bytess in ms-pos me-pos state
#:delta [delta 0]) #:delta [delta 0])
(cons (subbytes in (+ ms-pos delta) (+ me-pos delta)) (cons (subbytes in (- ms-pos delta) (- me-pos delta))
(if state (if state
(for/list ([p (in-vector state)]) (for/list ([p (in-vector state)])
(and p (and p
(subbytes in (+ (car p) delta) (+ (cdr p) delta)))) (subbytes in (- (car p) delta) (- (cdr p) delta))))
null))) null)))
(define (byte-positions->string-positions bstr-in ms-pos me-pos state (define (byte-positions->string-positions bstr-in ms-pos me-pos state
#:start-offset start-offset #:start-index [start-index 0]
#:start-pos [start-pos 0]) #:delta [delta 0]
#:result-offset [result-offset 0])
(define (string-offset pos) (define (string-offset pos)
(+ start-offset (bytes-utf-8-length bstr-in #\? start-pos pos))) (+ result-offset (bytes-utf-8-length bstr-in #\? start-index (- pos delta))))
(cons (cons (string-offset ms-pos) (string-offset me-pos)) (cons (cons (string-offset ms-pos) (string-offset me-pos))
(if state (if state
(for/list ([p (in-vector state)]) (for/list ([p (in-vector state)])
@ -49,13 +52,32 @@
(define (byte-positions->strings bstr-in ms-pos me-pos state (define (byte-positions->strings bstr-in ms-pos me-pos state
#:delta [delta 0]) #:delta [delta 0])
(cons (bytes->string/utf-8 bstr-in #\? (+ ms-pos delta) (+ me-pos delta)) (cons (bytes->string/utf-8 bstr-in #\? (- ms-pos delta) (- me-pos delta))
(if state (if state
(for/list ([p (in-vector state)]) (for/list ([p (in-vector state)])
(and p (and p
(bytes->string/utf-8 bstr-in #\? (+ (car p) delta) (+ delta (cdr p))))) (bytes->string/utf-8 bstr-in #\? (- (car p) delta) (- (cdr p) delta))))
null))) null)))
(define (byte-index->string-index str pos)
;; We assume that pos is on a code-point boundary in the
;; UTF-8 encoding of str. Find out how many code points
;; are before the index.
(let loop ([lo-pos 0] [lo 0] [hi (min (string-length str)
(* pos 6))])
(cond
[(= lo hi) lo]
[(= (add1 lo) hi)
(if (= lo-pos pos) lo hi)]
[else
(define mid (quotient (+ lo hi) 2))
(define len (string-utf-8-length str lo mid))
(define mid-pos (+ lo-pos len))
(cond
[(= mid-pos pos) mid]
[(mid-pos . > . pos) (loop lo-pos lo mid)]
[else (loop (+ lo-pos len) mid hi)])])))
;; For functions like `regexp-match/end`: ;; For functions like `regexp-match/end`:
(define (add-end-bytes results end-bytes-count bstr me-pos) (define (add-end-bytes results end-bytes-count bstr me-pos)
(if end-bytes-count (if end-bytes-count

View File

@ -236,7 +236,7 @@
(byte-positions->byte-positions ms-pos me-pos state #:delta delta)] (byte-positions->byte-positions ms-pos me-pos state #:delta delta)]
[else [else
(byte-positions->string-positions bstr-in ms-pos me-pos state (byte-positions->string-positions bstr-in ms-pos me-pos state
#:start-offset start-offset)])) #:result-offset start-offset)]))
(add-end-bytes positions end-bytes-count bstr-in me-pos)] (add-end-bytes positions end-bytes-count bstr-in me-pos)]
[(strings) [(strings)
;; If pattern is bytes-based, then results will be bytes instead of strings: ;; If pattern is bytes-based, then results will be bytes instead of strings:
@ -343,9 +343,18 @@
(define delta (- start-offset start-pos)) (define delta (- start-offset start-pos))
(byte-positions->byte-positions ms-pos me-pos state #:delta delta)] (byte-positions->byte-positions ms-pos me-pos state #:delta delta)]
[else [else
;; Some bytes may have been discarded in `lb-in`, and we
;; don't know how many characters those add up to. The
;; starting position `ms-pos` must be on a code-point
;; boundary, and everything from `ms-pos` to `ms-end` must
;; still be in `lb-in`. So, find `ms-pos` in the original
;; string, and take it from there.
(define ms-str-pos (byte-index->string-index in (- ms-pos start-pos)))
(define delta (lazy-bytes-discarded-count lb-in))
(byte-positions->string-positions bstr ms-pos me-pos state (byte-positions->string-positions bstr ms-pos me-pos state
#:start-pos start-pos #:start-index (- ms-pos delta)
#:start-offset start-offset)])) #:delta delta
#:result-offset (+ ms-str-pos start-offset))]))
(add-end-bytes positions end-bytes-count bstr me-pos)] (add-end-bytes positions end-bytes-count bstr me-pos)]
[(strings) [(strings)
;; The byte string may be shifted by discarded bytes, if not ;; The byte string may be shifted by discarded bytes, if not