regexp: repair match results on large string
This commit is contained in:
parent
74b34c210d
commit
b4cc2d849c
|
@ -20,7 +20,7 @@
|
||||||
(end-pos . > . max-end))
|
(end-pos . > . max-end))
|
||||||
(raise-range-error who
|
(raise-range-error who
|
||||||
"byte string"
|
"byte string"
|
||||||
"starting "
|
"ending "
|
||||||
end-pos
|
end-pos
|
||||||
in-value
|
in-value
|
||||||
0
|
0
|
||||||
|
|
|
@ -51,6 +51,30 @@
|
||||||
(rx:regexp-match "[abc]" in 0 1 discard2 #"")
|
(rx:regexp-match "[abc]" in 0 1 discard2 #"")
|
||||||
(test (get-output-bytes discard2) #" "))
|
(test (get-output-bytes discard2) #" "))
|
||||||
|
|
||||||
|
;; Input streams that are large enough for bytes to be discarded along the way
|
||||||
|
(test (rx:regexp-match #"(.)x" (open-input-string (string-append (make-string 50000 #\y) "x")))
|
||||||
|
'(#"yx" #"y"))
|
||||||
|
(test (rx:regexp-match-positions #"(.)x" (open-input-string (string-append (make-string 50000 #\y) "x")))
|
||||||
|
'((49999 . 50001) (49999 . 50000)))
|
||||||
|
(test (rx:regexp-match "(.)x" (string-append (make-string 50000 #\y) "x"))
|
||||||
|
'("yx" "y"))
|
||||||
|
(test (rx:regexp-match-positions "(.)x" (string-append (make-string 50000 #\y) "x"))
|
||||||
|
'((49999 . 50001) (49999 . 50000)))
|
||||||
|
(test (rx:regexp-match "(.)\u3BC" (string-append (make-string 50000 #\u3BB) "\u3BC"))
|
||||||
|
'("\u3BB\u3BC" "\u3BB"))
|
||||||
|
(test (rx:regexp-match-positions "(.)\u3BC" (string-append (make-string 50000 #\y) "\u3BC"))
|
||||||
|
'((49999 . 50001) (49999 . 50000)))
|
||||||
|
|
||||||
|
(test (rx:regexp-match-positions #"<([abc])(>)?" "<a + <b = <c" 3)
|
||||||
|
'((5 . 7) (6 . 7) #f))
|
||||||
|
(test (rx:regexp-match-positions "[abc]" " a b c " 2)
|
||||||
|
'((3 . 4)))
|
||||||
|
(test (rx:regexp-match-positions "(?m:^.\n)" "a\nb\nc\n" 2 6 #f #"\n")
|
||||||
|
'((2 . 4)))
|
||||||
|
|
||||||
|
(test (regexp-replace* "-" "zero-or-more?" "_")
|
||||||
|
"zero_or_more?")
|
||||||
|
|
||||||
;; ----------------------------------------
|
;; ----------------------------------------
|
||||||
|
|
||||||
(define (check rx in N [M (max 1 (quotient N 10))])
|
(define (check rx in N [M (max 1 (quotient N 10))])
|
||||||
|
|
|
@ -8,6 +8,8 @@
|
||||||
|
|
||||||
byte-positions->string-positions
|
byte-positions->string-positions
|
||||||
byte-positions->strings
|
byte-positions->strings
|
||||||
|
|
||||||
|
byte-index->string-index
|
||||||
|
|
||||||
add-end-bytes)
|
add-end-bytes)
|
||||||
|
|
||||||
|
@ -27,18 +29,19 @@
|
||||||
|
|
||||||
(define (byte-positions->bytess in ms-pos me-pos state
|
(define (byte-positions->bytess in ms-pos me-pos state
|
||||||
#:delta [delta 0])
|
#:delta [delta 0])
|
||||||
(cons (subbytes in (+ ms-pos delta) (+ me-pos delta))
|
(cons (subbytes in (- ms-pos delta) (- me-pos delta))
|
||||||
(if state
|
(if state
|
||||||
(for/list ([p (in-vector state)])
|
(for/list ([p (in-vector state)])
|
||||||
(and p
|
(and p
|
||||||
(subbytes in (+ (car p) delta) (+ (cdr p) delta))))
|
(subbytes in (- (car p) delta) (- (cdr p) delta))))
|
||||||
null)))
|
null)))
|
||||||
|
|
||||||
(define (byte-positions->string-positions bstr-in ms-pos me-pos state
|
(define (byte-positions->string-positions bstr-in ms-pos me-pos state
|
||||||
#:start-offset start-offset
|
#:start-index [start-index 0]
|
||||||
#:start-pos [start-pos 0])
|
#:delta [delta 0]
|
||||||
|
#:result-offset [result-offset 0])
|
||||||
(define (string-offset pos)
|
(define (string-offset pos)
|
||||||
(+ start-offset (bytes-utf-8-length bstr-in #\? start-pos pos)))
|
(+ result-offset (bytes-utf-8-length bstr-in #\? start-index (- pos delta))))
|
||||||
(cons (cons (string-offset ms-pos) (string-offset me-pos))
|
(cons (cons (string-offset ms-pos) (string-offset me-pos))
|
||||||
(if state
|
(if state
|
||||||
(for/list ([p (in-vector state)])
|
(for/list ([p (in-vector state)])
|
||||||
|
@ -49,13 +52,32 @@
|
||||||
|
|
||||||
(define (byte-positions->strings bstr-in ms-pos me-pos state
|
(define (byte-positions->strings bstr-in ms-pos me-pos state
|
||||||
#:delta [delta 0])
|
#:delta [delta 0])
|
||||||
(cons (bytes->string/utf-8 bstr-in #\? (+ ms-pos delta) (+ me-pos delta))
|
(cons (bytes->string/utf-8 bstr-in #\? (- ms-pos delta) (- me-pos delta))
|
||||||
(if state
|
(if state
|
||||||
(for/list ([p (in-vector state)])
|
(for/list ([p (in-vector state)])
|
||||||
(and p
|
(and p
|
||||||
(bytes->string/utf-8 bstr-in #\? (+ (car p) delta) (+ delta (cdr p)))))
|
(bytes->string/utf-8 bstr-in #\? (- (car p) delta) (- (cdr p) delta))))
|
||||||
null)))
|
null)))
|
||||||
|
|
||||||
|
(define (byte-index->string-index str pos)
|
||||||
|
;; We assume that pos is on a code-point boundary in the
|
||||||
|
;; UTF-8 encoding of str. Find out how many code points
|
||||||
|
;; are before the index.
|
||||||
|
(let loop ([lo-pos 0] [lo 0] [hi (min (string-length str)
|
||||||
|
(* pos 6))])
|
||||||
|
(cond
|
||||||
|
[(= lo hi) lo]
|
||||||
|
[(= (add1 lo) hi)
|
||||||
|
(if (= lo-pos pos) lo hi)]
|
||||||
|
[else
|
||||||
|
(define mid (quotient (+ lo hi) 2))
|
||||||
|
(define len (string-utf-8-length str lo mid))
|
||||||
|
(define mid-pos (+ lo-pos len))
|
||||||
|
(cond
|
||||||
|
[(= mid-pos pos) mid]
|
||||||
|
[(mid-pos . > . pos) (loop lo-pos lo mid)]
|
||||||
|
[else (loop (+ lo-pos len) mid hi)])])))
|
||||||
|
|
||||||
;; For functions like `regexp-match/end`:
|
;; For functions like `regexp-match/end`:
|
||||||
(define (add-end-bytes results end-bytes-count bstr me-pos)
|
(define (add-end-bytes results end-bytes-count bstr me-pos)
|
||||||
(if end-bytes-count
|
(if end-bytes-count
|
||||||
|
|
|
@ -236,7 +236,7 @@
|
||||||
(byte-positions->byte-positions ms-pos me-pos state #:delta delta)]
|
(byte-positions->byte-positions ms-pos me-pos state #:delta delta)]
|
||||||
[else
|
[else
|
||||||
(byte-positions->string-positions bstr-in ms-pos me-pos state
|
(byte-positions->string-positions bstr-in ms-pos me-pos state
|
||||||
#:start-offset start-offset)]))
|
#:result-offset start-offset)]))
|
||||||
(add-end-bytes positions end-bytes-count bstr-in me-pos)]
|
(add-end-bytes positions end-bytes-count bstr-in me-pos)]
|
||||||
[(strings)
|
[(strings)
|
||||||
;; If pattern is bytes-based, then results will be bytes instead of strings:
|
;; If pattern is bytes-based, then results will be bytes instead of strings:
|
||||||
|
@ -343,9 +343,18 @@
|
||||||
(define delta (- start-offset start-pos))
|
(define delta (- start-offset start-pos))
|
||||||
(byte-positions->byte-positions ms-pos me-pos state #:delta delta)]
|
(byte-positions->byte-positions ms-pos me-pos state #:delta delta)]
|
||||||
[else
|
[else
|
||||||
|
;; Some bytes may have been discarded in `lb-in`, and we
|
||||||
|
;; don't know how many characters those add up to. The
|
||||||
|
;; starting position `ms-pos` must be on a code-point
|
||||||
|
;; boundary, and everything from `ms-pos` to `ms-end` must
|
||||||
|
;; still be in `lb-in`. So, find `ms-pos` in the original
|
||||||
|
;; string, and take it from there.
|
||||||
|
(define ms-str-pos (byte-index->string-index in (- ms-pos start-pos)))
|
||||||
|
(define delta (lazy-bytes-discarded-count lb-in))
|
||||||
(byte-positions->string-positions bstr ms-pos me-pos state
|
(byte-positions->string-positions bstr ms-pos me-pos state
|
||||||
#:start-pos start-pos
|
#:start-index (- ms-pos delta)
|
||||||
#:start-offset start-offset)]))
|
#:delta delta
|
||||||
|
#:result-offset (+ ms-str-pos start-offset))]))
|
||||||
(add-end-bytes positions end-bytes-count bstr me-pos)]
|
(add-end-bytes positions end-bytes-count bstr me-pos)]
|
||||||
[(strings)
|
[(strings)
|
||||||
;; The byte string may be shifted by discarded bytes, if not
|
;; The byte string may be shifted by discarded bytes, if not
|
||||||
|
|
Loading…
Reference in New Issue
Block a user