regexp: repair match results on large string
This commit is contained in:
parent
74b34c210d
commit
b4cc2d849c
|
@ -20,7 +20,7 @@
|
|||
(end-pos . > . max-end))
|
||||
(raise-range-error who
|
||||
"byte string"
|
||||
"starting "
|
||||
"ending "
|
||||
end-pos
|
||||
in-value
|
||||
0
|
||||
|
|
|
@ -51,6 +51,30 @@
|
|||
(rx:regexp-match "[abc]" in 0 1 discard2 #"")
|
||||
(test (get-output-bytes discard2) #" "))
|
||||
|
||||
;; Input streams that are large enough for bytes to be discarded along the way
|
||||
(test (rx:regexp-match #"(.)x" (open-input-string (string-append (make-string 50000 #\y) "x")))
|
||||
'(#"yx" #"y"))
|
||||
(test (rx:regexp-match-positions #"(.)x" (open-input-string (string-append (make-string 50000 #\y) "x")))
|
||||
'((49999 . 50001) (49999 . 50000)))
|
||||
(test (rx:regexp-match "(.)x" (string-append (make-string 50000 #\y) "x"))
|
||||
'("yx" "y"))
|
||||
(test (rx:regexp-match-positions "(.)x" (string-append (make-string 50000 #\y) "x"))
|
||||
'((49999 . 50001) (49999 . 50000)))
|
||||
(test (rx:regexp-match "(.)\u3BC" (string-append (make-string 50000 #\u3BB) "\u3BC"))
|
||||
'("\u3BB\u3BC" "\u3BB"))
|
||||
(test (rx:regexp-match-positions "(.)\u3BC" (string-append (make-string 50000 #\y) "\u3BC"))
|
||||
'((49999 . 50001) (49999 . 50000)))
|
||||
|
||||
(test (rx:regexp-match-positions #"<([abc])(>)?" "<a + <b = <c" 3)
|
||||
'((5 . 7) (6 . 7) #f))
|
||||
(test (rx:regexp-match-positions "[abc]" " a b c " 2)
|
||||
'((3 . 4)))
|
||||
(test (rx:regexp-match-positions "(?m:^.\n)" "a\nb\nc\n" 2 6 #f #"\n")
|
||||
'((2 . 4)))
|
||||
|
||||
(test (regexp-replace* "-" "zero-or-more?" "_")
|
||||
"zero_or_more?")
|
||||
|
||||
;; ----------------------------------------
|
||||
|
||||
(define (check rx in N [M (max 1 (quotient N 10))])
|
||||
|
|
|
@ -9,6 +9,8 @@
|
|||
byte-positions->string-positions
|
||||
byte-positions->strings
|
||||
|
||||
byte-index->string-index
|
||||
|
||||
add-end-bytes)
|
||||
|
||||
(define (byte-positions->byte-positions ms-pos me-pos state
|
||||
|
@ -27,18 +29,19 @@
|
|||
|
||||
(define (byte-positions->bytess in ms-pos me-pos state
|
||||
#:delta [delta 0])
|
||||
(cons (subbytes in (+ ms-pos delta) (+ me-pos delta))
|
||||
(cons (subbytes in (- ms-pos delta) (- me-pos delta))
|
||||
(if state
|
||||
(for/list ([p (in-vector state)])
|
||||
(and p
|
||||
(subbytes in (+ (car p) delta) (+ (cdr p) delta))))
|
||||
(subbytes in (- (car p) delta) (- (cdr p) delta))))
|
||||
null)))
|
||||
|
||||
(define (byte-positions->string-positions bstr-in ms-pos me-pos state
|
||||
#:start-offset start-offset
|
||||
#:start-pos [start-pos 0])
|
||||
#:start-index [start-index 0]
|
||||
#:delta [delta 0]
|
||||
#:result-offset [result-offset 0])
|
||||
(define (string-offset pos)
|
||||
(+ start-offset (bytes-utf-8-length bstr-in #\? start-pos pos)))
|
||||
(+ result-offset (bytes-utf-8-length bstr-in #\? start-index (- pos delta))))
|
||||
(cons (cons (string-offset ms-pos) (string-offset me-pos))
|
||||
(if state
|
||||
(for/list ([p (in-vector state)])
|
||||
|
@ -49,13 +52,32 @@
|
|||
|
||||
(define (byte-positions->strings bstr-in ms-pos me-pos state
|
||||
#:delta [delta 0])
|
||||
(cons (bytes->string/utf-8 bstr-in #\? (+ ms-pos delta) (+ me-pos delta))
|
||||
(cons (bytes->string/utf-8 bstr-in #\? (- ms-pos delta) (- me-pos delta))
|
||||
(if state
|
||||
(for/list ([p (in-vector state)])
|
||||
(and p
|
||||
(bytes->string/utf-8 bstr-in #\? (+ (car p) delta) (+ delta (cdr p)))))
|
||||
(bytes->string/utf-8 bstr-in #\? (- (car p) delta) (- (cdr p) delta))))
|
||||
null)))
|
||||
|
||||
(define (byte-index->string-index str pos)
|
||||
;; We assume that pos is on a code-point boundary in the
|
||||
;; UTF-8 encoding of str. Find out how many code points
|
||||
;; are before the index.
|
||||
(let loop ([lo-pos 0] [lo 0] [hi (min (string-length str)
|
||||
(* pos 6))])
|
||||
(cond
|
||||
[(= lo hi) lo]
|
||||
[(= (add1 lo) hi)
|
||||
(if (= lo-pos pos) lo hi)]
|
||||
[else
|
||||
(define mid (quotient (+ lo hi) 2))
|
||||
(define len (string-utf-8-length str lo mid))
|
||||
(define mid-pos (+ lo-pos len))
|
||||
(cond
|
||||
[(= mid-pos pos) mid]
|
||||
[(mid-pos . > . pos) (loop lo-pos lo mid)]
|
||||
[else (loop (+ lo-pos len) mid hi)])])))
|
||||
|
||||
;; For functions like `regexp-match/end`:
|
||||
(define (add-end-bytes results end-bytes-count bstr me-pos)
|
||||
(if end-bytes-count
|
||||
|
|
|
@ -236,7 +236,7 @@
|
|||
(byte-positions->byte-positions ms-pos me-pos state #:delta delta)]
|
||||
[else
|
||||
(byte-positions->string-positions bstr-in ms-pos me-pos state
|
||||
#:start-offset start-offset)]))
|
||||
#:result-offset start-offset)]))
|
||||
(add-end-bytes positions end-bytes-count bstr-in me-pos)]
|
||||
[(strings)
|
||||
;; If pattern is bytes-based, then results will be bytes instead of strings:
|
||||
|
@ -343,9 +343,18 @@
|
|||
(define delta (- start-offset start-pos))
|
||||
(byte-positions->byte-positions ms-pos me-pos state #:delta delta)]
|
||||
[else
|
||||
;; Some bytes may have been discarded in `lb-in`, and we
|
||||
;; don't know how many characters those add up to. The
|
||||
;; starting position `ms-pos` must be on a code-point
|
||||
;; boundary, and everything from `ms-pos` to `ms-end` must
|
||||
;; still be in `lb-in`. So, find `ms-pos` in the original
|
||||
;; string, and take it from there.
|
||||
(define ms-str-pos (byte-index->string-index in (- ms-pos start-pos)))
|
||||
(define delta (lazy-bytes-discarded-count lb-in))
|
||||
(byte-positions->string-positions bstr ms-pos me-pos state
|
||||
#:start-pos start-pos
|
||||
#:start-offset start-offset)]))
|
||||
#:start-index (- ms-pos delta)
|
||||
#:delta delta
|
||||
#:result-offset (+ ms-str-pos start-offset))]))
|
||||
(add-end-bytes positions end-bytes-count bstr me-pos)]
|
||||
[(strings)
|
||||
;; The byte string may be shifted by discarded bytes, if not
|
||||
|
|
Loading…
Reference in New Issue
Block a user