diff --git a/racket/src/io/common/check.rkt b/racket/src/io/common/check.rkt index 7c0c5af920..aa0294f451 100644 --- a/racket/src/io/common/check.rkt +++ b/racket/src/io/common/check.rkt @@ -20,7 +20,7 @@ (end-pos . > . max-end)) (raise-range-error who "byte string" - "starting " + "ending " end-pos in-value 0 diff --git a/racket/src/regexp/demo.rkt b/racket/src/regexp/demo.rkt index 5780283617..3089217198 100644 --- a/racket/src/regexp/demo.rkt +++ b/racket/src/regexp/demo.rkt @@ -51,6 +51,30 @@ (rx:regexp-match "[abc]" in 0 1 discard2 #"") (test (get-output-bytes discard2) #" ")) +;; Input streams that are large enough for bytes to be discarded along the way +(test (rx:regexp-match #"(.)x" (open-input-string (string-append (make-string 50000 #\y) "x"))) + '(#"yx" #"y")) +(test (rx:regexp-match-positions #"(.)x" (open-input-string (string-append (make-string 50000 #\y) "x"))) + '((49999 . 50001) (49999 . 50000))) +(test (rx:regexp-match "(.)x" (string-append (make-string 50000 #\y) "x")) + '("yx" "y")) +(test (rx:regexp-match-positions "(.)x" (string-append (make-string 50000 #\y) "x")) + '((49999 . 50001) (49999 . 50000))) +(test (rx:regexp-match "(.)\u3BC" (string-append (make-string 50000 #\u3BB) "\u3BC")) + '("\u3BB\u3BC" "\u3BB")) +(test (rx:regexp-match-positions "(.)\u3BC" (string-append (make-string 50000 #\y) "\u3BC")) + '((49999 . 50001) (49999 . 50000))) + +(test (rx:regexp-match-positions #"<([abc])(>)?" "string-positions byte-positions->strings + + byte-index->string-index add-end-bytes) @@ -27,18 +29,19 @@ (define (byte-positions->bytess in ms-pos me-pos state #:delta [delta 0]) - (cons (subbytes in (+ ms-pos delta) (+ me-pos delta)) + (cons (subbytes in (- ms-pos delta) (- me-pos delta)) (if state (for/list ([p (in-vector state)]) (and p - (subbytes in (+ (car p) delta) (+ (cdr p) delta)))) + (subbytes in (- (car p) delta) (- (cdr p) delta)))) null))) (define (byte-positions->string-positions bstr-in ms-pos me-pos state - #:start-offset start-offset - #:start-pos [start-pos 0]) + #:start-index [start-index 0] + #:delta [delta 0] + #:result-offset [result-offset 0]) (define (string-offset pos) - (+ start-offset (bytes-utf-8-length bstr-in #\? start-pos pos))) + (+ result-offset (bytes-utf-8-length bstr-in #\? start-index (- pos delta)))) (cons (cons (string-offset ms-pos) (string-offset me-pos)) (if state (for/list ([p (in-vector state)]) @@ -49,13 +52,32 @@ (define (byte-positions->strings bstr-in ms-pos me-pos state #:delta [delta 0]) - (cons (bytes->string/utf-8 bstr-in #\? (+ ms-pos delta) (+ me-pos delta)) + (cons (bytes->string/utf-8 bstr-in #\? (- ms-pos delta) (- me-pos delta)) (if state (for/list ([p (in-vector state)]) (and p - (bytes->string/utf-8 bstr-in #\? (+ (car p) delta) (+ delta (cdr p))))) + (bytes->string/utf-8 bstr-in #\? (- (car p) delta) (- (cdr p) delta)))) null))) +(define (byte-index->string-index str pos) + ;; We assume that pos is on a code-point boundary in the + ;; UTF-8 encoding of str. Find out how many code points + ;; are before the index. + (let loop ([lo-pos 0] [lo 0] [hi (min (string-length str) + (* pos 6))]) + (cond + [(= lo hi) lo] + [(= (add1 lo) hi) + (if (= lo-pos pos) lo hi)] + [else + (define mid (quotient (+ lo hi) 2)) + (define len (string-utf-8-length str lo mid)) + (define mid-pos (+ lo-pos len)) + (cond + [(= mid-pos pos) mid] + [(mid-pos . > . pos) (loop lo-pos lo mid)] + [else (loop (+ lo-pos len) mid hi)])]))) + ;; For functions like `regexp-match/end`: (define (add-end-bytes results end-bytes-count bstr me-pos) (if end-bytes-count diff --git a/racket/src/regexp/match/main.rkt b/racket/src/regexp/match/main.rkt index 2d1ed2290d..56315f4a83 100644 --- a/racket/src/regexp/match/main.rkt +++ b/racket/src/regexp/match/main.rkt @@ -236,7 +236,7 @@ (byte-positions->byte-positions ms-pos me-pos state #:delta delta)] [else (byte-positions->string-positions bstr-in ms-pos me-pos state - #:start-offset start-offset)])) + #:result-offset start-offset)])) (add-end-bytes positions end-bytes-count bstr-in me-pos)] [(strings) ;; If pattern is bytes-based, then results will be bytes instead of strings: @@ -343,9 +343,18 @@ (define delta (- start-offset start-pos)) (byte-positions->byte-positions ms-pos me-pos state #:delta delta)] [else + ;; Some bytes may have been discarded in `lb-in`, and we + ;; don't know how many characters those add up to. The + ;; starting position `ms-pos` must be on a code-point + ;; boundary, and everything from `ms-pos` to `ms-end` must + ;; still be in `lb-in`. So, find `ms-pos` in the original + ;; string, and take it from there. + (define ms-str-pos (byte-index->string-index in (- ms-pos start-pos))) + (define delta (lazy-bytes-discarded-count lb-in)) (byte-positions->string-positions bstr ms-pos me-pos state - #:start-pos start-pos - #:start-offset start-offset)])) + #:start-index (- ms-pos delta) + #:delta delta + #:result-offset (+ ms-str-pos start-offset))])) (add-end-bytes positions end-bytes-count bstr me-pos)] [(strings) ;; The byte string may be shifted by discarded bytes, if not