cs & regexp: fix regexp-match/end on large strings

When a string is large enough, its conversion to bytes is internally
streamed, and `regexp-match/end` did not get the match-ending bytes
correctly.

Closes #3684
This commit is contained in:
Matthew Flatt 2021-02-14 09:38:17 -07:00
parent 0541fe3b54
commit b9ef307b30
3 changed files with 103 additions and 7 deletions

View File

@ -1884,4 +1884,88 @@
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Check that `regexp-match/end` produces the right suffix
;; when a string to convert is large enough that its
;; conversion is internally streamed
(for ([N (in-list '(100 1000 10000 100000))])
  ;; Each subject is N copies of `a` followed by "!b". The lookahead
  ;; `(?=b)` matches an empty span just before the `b`, so the expected
  ;; match position is N+1 and the byte preceding the match end is `!`.
  (let* ([a-byte (char->integer #\a)]
         [subject-bytes (bytes-append (make-bytes N a-byte) #"!b")]
         [subject-string (string-append (make-string N #\a) "!b")]
         [byte-rx (byte-pregexp #"(?=b)")]
         [char-rx (pregexp "(?=b)")]
         [hit (add1 N)]
         [expected-positions (list (cons hit hit))]
         ;; Expected trailing bytes when N end bytes are requested
         [long-tail (bytes-append (make-bytes (sub1 N) a-byte) #"!")])

    ;; Match content, all optional arguments defaulted
    (test-values (list '(#"") #"!")
                 (lambda () (regexp-match/end byte-rx subject-bytes)))
    (test-values (list '(#"") #"!")
                 (lambda () (regexp-match/end byte-rx subject-string)))
    (test-values (list '("") #"!")
                 (lambda () (regexp-match/end char-rx subject-string)))

    ;; Match positions, all optional arguments defaulted
    (test-values (list expected-positions #"!")
                 (lambda () (regexp-match-positions/end byte-rx subject-bytes)))
    (test-values (list expected-positions #"!")
                 (lambda () (regexp-match-positions/end byte-rx subject-string)))
    (test-values (list expected-positions #"!")
                 (lambda () (regexp-match-positions/end char-rx subject-string)))

    ;; Match content, with explicit start/end/output-port and an input prefix
    (test-values (list '(#"") #"!")
                 (lambda ()
                   (regexp-match/end byte-rx subject-bytes 0 #f #f #"prefix")))
    (test-values (list '(#"") #"!")
                 (lambda ()
                   (regexp-match/end byte-rx subject-string 0 #f #f #"prefix")))
    (test-values (list '("") #"!")
                 (lambda ()
                   (regexp-match/end char-rx subject-string 0 #f #f #"prefix")))

    ;; Match positions, with explicit start/end/output-port and an input prefix
    (test-values (list expected-positions #"!")
                 (lambda ()
                   (regexp-match-positions/end byte-rx subject-bytes 0 #f #f #"prefix")))
    (test-values (list expected-positions #"!")
                 (lambda ()
                   (regexp-match-positions/end byte-rx subject-string 0 #f #f #"prefix")))
    (test-values (list expected-positions #"!")
                 (lambda ()
                   (regexp-match-positions/end char-rx subject-string 0 #f #f #"prefix")))

    ;; Match positions, additionally requesting N end bytes instead of one
    (test-values (list expected-positions long-tail)
                 (lambda ()
                   (regexp-match-positions/end byte-rx subject-bytes 0 #f #f #"prefix" N)))
    (test-values (list expected-positions long-tail)
                 (lambda ()
                   (regexp-match-positions/end byte-rx subject-string 0 #f #f #"prefix" N)))
    (test-values (list expected-positions long-tail)
                 (lambda ()
                   (regexp-match-positions/end char-rx subject-string 0 #f #f #"prefix" N)))))
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(report-errs)

View File

@ -8810,8 +8810,14 @@
start-offset_0
0)))
(let ((max-lookbehind_0
(rx:regexp-max-lookbehind
rx_0)))
(let ((app_0
(rx:regexp-max-lookbehind
rx_0)))
(max
app_0
(if end-bytes-count9_0
end-bytes-count9_0
0)))))
(let ((max-peek_0
(if (input-port? in_0)
(if (not
@ -8997,7 +9003,10 @@
positions_0
end-bytes-count9_0
bstr_0
me-pos_0)))
(-
me-pos_0
(lazy-bytes-discarded-count
lb-in_0)))))
(if (eq?
tmp_0
'strings)
@ -9032,7 +9041,9 @@
bytes/strings_0
end-bytes-count9_0
bstr_0
me-pos_0))))
(-
me-pos_0
delta_0)))))
(void))))))
(write/consume-skipped_0))))
(args

View File

@ -281,7 +281,8 @@
;; Create a lazy string from the port:
(define lb-in (make-lazy-bytes port-in (if peek? start-offset 0) prefix
peek? immediate-only? progress-evt
out (rx:regexp-max-lookbehind rx)
out (max (rx:regexp-max-lookbehind rx)
(or end-bytes-count 0))
(and (input-port? in)
(not (eq? 'eof end-offset))
(- end-offset start-offset))))
@ -355,7 +356,7 @@
#:start-index (- ms-pos delta)
#:delta delta
#:result-offset (+ ms-str-pos start-offset))]))
(add-end-bytes positions end-bytes-count bstr me-pos)]
(add-end-bytes positions end-bytes-count bstr (- me-pos (lazy-bytes-discarded-count lb-in)))]
[(strings)
;; The byte string may be shifted by discarded bytes, if not
;; in `peek?` mode
@ -368,7 +369,7 @@
(byte-positions->bytess bstr ms-pos me-pos state #:delta delta)]
[else
(byte-positions->strings bstr ms-pos me-pos state #:delta delta)]))
(add-end-bytes bytes/strings end-bytes-count bstr me-pos)])
(add-end-bytes bytes/strings end-bytes-count bstr (- me-pos delta))])
;; Now, write and consume port content:
(write/consume-skipped))]))