From b9ef307b306f65aed4ea6157234c2fbb86dabfe0 Mon Sep 17 00:00:00 2001 From: Matthew Flatt Date: Sun, 14 Feb 2021 09:38:17 -0700 Subject: [PATCH] cs & regexp: fix `regexp-match/end` on large strings When a string is large enough, its conversion to bytes is internally streamed, and `regexp-match/end` did not get the match-ending bytes correctly. Closes #3684 --- pkgs/racket-test-core/tests/racket/rx.rktl | 84 ++++++++++++++++++++++ racket/src/cs/schemified/regexp.scm | 19 +++-- racket/src/regexp/match/main.rkt | 7 +- 3 files changed, 103 insertions(+), 7 deletions(-) diff --git a/pkgs/racket-test-core/tests/racket/rx.rktl b/pkgs/racket-test-core/tests/racket/rx.rktl index 077bd52915..302d101d53 100644 --- a/pkgs/racket-test-core/tests/racket/rx.rktl +++ b/pkgs/racket-test-core/tests/racket/rx.rktl @@ -1884,4 +1884,88 @@ ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Check that `regexp-match/end` produces the right suffix +;; when a string to convert is large enough that its +;; conversion is internally streamed +(for ([N (in-list '(100 1000 10000 100000))]) + (test-values (list '(#"") #"!") + (lambda () + (regexp-match/end (byte-pregexp #"(?=b)") + (bytes-append (make-bytes N (char->integer #\a)) #"!b")))) + (test-values (list '(#"") #"!") + (lambda () + (regexp-match/end (byte-pregexp #"(?=b)") + (string-append (make-string N #\a) "!b")))) + (test-values (list '("") #"!") + (lambda () + (regexp-match/end (pregexp "(?=b)") + (string-append (make-string N #\a) "!b")))) + + (test-values (list (list (cons (add1 N) (add1 N))) #"!") + (lambda () + (regexp-match-positions/end (byte-pregexp #"(?=b)") + (bytes-append (make-bytes N (char->integer #\a)) #"!b")))) + + (test-values (list (list (cons (add1 N) (add1 N))) #"!") + (lambda () + (regexp-match-positions/end (byte-pregexp #"(?=b)") + (string-append (make-string N #\a) "!b")))) + (test-values (list (list (cons (add1 N) (add1 N))) #"!") + (lambda () + (regexp-match-positions/end (pregexp "(?=b)") + (string-append (make-string N #\a) "!b")))) + + (test-values (list '(#"") #"!") + (lambda () + (regexp-match/end (byte-pregexp #"(?=b)") + (bytes-append (make-bytes N (char->integer #\a)) #"!b") + 0 #f #f #"prefix"))) + (test-values (list '(#"") #"!") + (lambda () + (regexp-match/end (byte-pregexp #"(?=b)") + (string-append (make-string N #\a) "!b") + 0 #f #f #"prefix"))) + (test-values (list '("") #"!") + (lambda () + (regexp-match/end (pregexp "(?=b)") + (string-append (make-string N #\a) "!b") + 0 #f #f #"prefix"))) + + (test-values (list (list (cons (add1 N) (add1 N))) #"!") + (lambda () + (regexp-match-positions/end (byte-pregexp #"(?=b)") + (bytes-append (make-bytes N (char->integer #\a)) #"!b") + 0 #f #f #"prefix"))) + (test-values (list (list (cons (add1 N) (add1 N))) #"!") + (lambda () + (regexp-match-positions/end (byte-pregexp #"(?=b)") + (string-append (make-string N #\a) "!b") + 0 #f #f #"prefix"))) + (test-values (list (list (cons (add1 N) (add1 N))) #"!") + (lambda () + (regexp-match-positions/end (pregexp "(?=b)") + (string-append (make-string N #\a) "!b") + 0 #f #f #"prefix"))) + + (test-values (list (list (cons (add1 N) (add1 N))) + (bytes-append (make-bytes (sub1 N) (char->integer #\a)) #"!")) + (lambda () + (regexp-match-positions/end (byte-pregexp #"(?=b)") + (bytes-append (make-bytes N (char->integer #\a)) #"!b") + 0 #f #f #"prefix" N))) + (test-values (list (list (cons (add1 N) (add1 N))) + (bytes-append (make-bytes (sub1 N) (char->integer #\a)) #"!")) + (lambda () + (regexp-match-positions/end (byte-pregexp #"(?=b)") + (string-append (make-string N #\a) "!b") + 0 #f #f #"prefix" N))) + (test-values (list (list (cons (add1 N) (add1 N))) + (bytes-append (make-bytes (sub1 N) (char->integer #\a)) #"!")) + (lambda () + (regexp-match-positions/end (pregexp "(?=b)") + (string-append (make-string N #\a) "!b") + 0 #f #f #"prefix" N)))) + +;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + (report-errs) diff --git a/racket/src/cs/schemified/regexp.scm b/racket/src/cs/schemified/regexp.scm index 52aa249e38..7322b8af07 100644 --- a/racket/src/cs/schemified/regexp.scm +++ b/racket/src/cs/schemified/regexp.scm @@ -8810,8 +8810,14 @@ start-offset_0 0))) (let ((max-lookbehind_0 - (rx:regexp-max-lookbehind - rx_0))) + (let ((app_0 + (rx:regexp-max-lookbehind + rx_0))) + (max + app_0 + (if end-bytes-count9_0 + end-bytes-count9_0 + 0))))) (let ((max-peek_0 (if (input-port? in_0) (if (not @@ -8997,7 +9003,10 @@ positions_0 end-bytes-count9_0 bstr_0 - me-pos_0))) + (- + me-pos_0 + (lazy-bytes-discarded-count + lb-in_0))))) (if (eq? tmp_0 'strings) @@ -9032,7 +9041,9 @@ bytes/strings_0 end-bytes-count9_0 bstr_0 - me-pos_0)))) + (- + me-pos_0 + delta_0))))) (void)))))) (write/consume-skipped_0)))) (args diff --git a/racket/src/regexp/match/main.rkt b/racket/src/regexp/match/main.rkt index 776c2c0f6b..be1f5f13ae 100644 --- a/racket/src/regexp/match/main.rkt +++ b/racket/src/regexp/match/main.rkt @@ -281,7 +281,8 @@ ;; Create a lazy string from the port: (define lb-in (make-lazy-bytes port-in (if peek? start-offset 0) prefix peek? immediate-only? progress-evt - out (rx:regexp-max-lookbehind rx) + out (max (rx:regexp-max-lookbehind rx) + (or end-bytes-count 0)) (and (input-port? in) (not (eq? 'eof end-offset)) (- end-offset start-offset)))) @@ -355,7 +356,7 @@ #:start-index (- ms-pos delta) #:delta delta #:result-offset (+ ms-str-pos start-offset))])) - (add-end-bytes positions end-bytes-count bstr me-pos)] + (add-end-bytes positions end-bytes-count bstr (- me-pos (lazy-bytes-discarded-count lb-in)))] [(strings) ;; The byte string may be shifted by discarded bytes, if not ;; in `peek?` mode @@ -368,7 +369,7 @@ (byte-positions->bytess bstr ms-pos me-pos state #:delta delta)] [else (byte-positions->strings bstr ms-pos me-pos state #:delta delta)])) - (add-end-bytes bytes/strings end-bytes-count bstr me-pos)]) + (add-end-bytes bytes/strings end-bytes-count bstr (- me-pos delta))]) ;; Now, write and consume port content: (write/consume-skipped))]))