From 6c8ba483a1705d664a1cb1720b50bea1f0fa5c8e Mon Sep 17 00:00:00 2001 From: Matthew Flatt Date: Wed, 16 Jan 2019 19:01:35 -0700 Subject: [PATCH] regexp: fix consumption of bytes from a port --- racket/src/regexp/demo.rkt | 11 ++++++++ racket/src/regexp/match/lazy-bytes.rkt | 38 ++++++++++++++++++-------- racket/src/regexp/match/main.rkt | 25 +++++++++-------- 3 files changed, 52 insertions(+), 22 deletions(-) diff --git a/racket/src/regexp/demo.rkt b/racket/src/regexp/demo.rkt index 40b8f9ae20..5780283617 100644 --- a/racket/src/regexp/demo.rkt +++ b/racket/src/regexp/demo.rkt @@ -40,6 +40,17 @@ (test (rx:regexp-match rx:.n in 0 #f #f #"\n") '(#"b\n")) (test (rx:regexp-match rx:.n in 0 #f #f #"\n") '(#"c\n"))) +(let () + (define in (open-input-bytes #" a b c ")) + + (define discard (open-output-bytes)) + (rx:regexp-match "[abc]" in 0 3 discard #"") + (test (get-output-bytes discard) #" ") + + (define discard2 (open-output-bytes)) + (rx:regexp-match "[abc]" in 0 1 discard2 #"") + (test (get-output-bytes discard2) #" ")) + ;; ---------------------------------------- (define (check rx in N [M (max 1 (quotient N 10))]) diff --git a/racket/src/regexp/match/lazy-bytes.rkt b/racket/src/regexp/match/lazy-bytes.rkt index e5432c60dd..afdd36357b 100644 --- a/racket/src/regexp/match/lazy-bytes.rkt +++ b/racket/src/regexp/match/lazy-bytes.rkt @@ -23,16 +23,19 @@ out ; output hold discarded bytes; implies `(not peek?)` max-lookbehind ; bytes before current counter to preserve, if `out` [failed? #:mutable] ; set to #t if `progress-evt` fires or read blocks - [discarded-count #:mutable])) ; bytes discarded, if not `peek?` + [discarded-count #:mutable] ; bytes discarded, if not `peek?` + max-peek)) ; maximum number of bytes to peek or #f (define (make-lazy-bytes in skip-amt prefix peek? immediate-only? progress-evt - out max-lookbehind) + out max-lookbehind + max-peek) (define len (bytes-length prefix)) (lazy-bytes prefix len in skip-amt len peek? immediate-only? progress-evt out max-lookbehind - #f 0)) + #f 0 + max-peek)) (define (lazy-bytes-before-end? s pos end) (and (or (not (exact-integer? end)) @@ -54,9 +57,9 @@ ;; then flush unneeded bytes... ;; The promise is that we won't ask for bytes before ;; `pos` minus the `max-lookbehind` - (define pos (min given-pos (lazy-bytes-end s))) (when force? - (lazy-bytes-before-end? s pos 'eof)) + (lazy-bytes-before-end? s given-pos 'eof)) + (define pos (min given-pos (lazy-bytes-end s))) (when (and (lazy-bytes? s) (not (lazy-bytes-peek? s))) (define discarded-count (lazy-bytes-discarded-count s)) @@ -80,7 +83,9 @@ [else (min amt (- prefix-len discarded-count))]) ;; To amount to discard: amt)) - (bytes-copy! bstr 0 bstr amt (- (lazy-bytes-end s) discarded-count)) + (define copy-end (- (lazy-bytes-end s) discarded-count)) + (unless (= amt copy-end) + (bytes-copy! bstr 0 bstr amt copy-end)) (set-lazy-bytes-discarded-count! s (+ amt discarded-count))))) ;; ---------------------------------------- @@ -116,9 +121,20 @@ (set-lazy-bytes-end! s (+ n len discarded-count)) #t])] [else - ;; We're going to need a bigger byte string - (define bstr2 (make-bytes (max 32 (* 2 (bytes-length bstr))))) - (bytes-copy! bstr2 0 bstr 0 len) - (set-lazy-bytes-bstr! s bstr2) - (get-more-bytes! s)])] + (define max-peek (lazy-bytes-max-peek s)) + (define prefix-len (and max-peek (lazy-bytes-prefix-len s))) + (cond + [(and max-peek + (len . >= . (- (+ max-peek prefix-len) discarded-count))) + ;; Not allowed to read any more + #f] + [else + ;; We're going to need a bigger byte string + (define bstr2 (make-bytes (let ([sz (max 32 (* 2 (bytes-length bstr)))]) + (if max-peek + (min sz (- (+ prefix-len max-peek) discarded-count)) + sz)))) + (bytes-copy! bstr2 0 bstr 0 len) + (set-lazy-bytes-bstr! s bstr2) + (get-more-bytes! s)])])] [else #f])) diff --git a/racket/src/regexp/match/main.rkt b/racket/src/regexp/match/main.rkt index 60c2dd6918..118d631438 100644 --- a/racket/src/regexp/match/main.rkt +++ b/racket/src/regexp/match/main.rkt @@ -281,7 +281,10 @@ ;; Create a lazy string from the port: (define lb-in (make-lazy-bytes port-in (if peek? start-offset 0) prefix peek? immediate-only? progress-evt - out (rx:regexp-max-lookbehind rx))) + out (rx:regexp-max-lookbehind rx) + (and (input-port? in) + (not (eq? 'eof end-offset)) + (- end-offset start-offset)))) (define end-pos (if (eq? 'eof end-offset) 'eof (+ start-pos @@ -303,17 +306,22 @@ (when (not peek?) (cond [ms-pos - (when (or out (input-port? in)) + (when out ;; Flush bytes before match: - (lazy-bytes-advance! lb-in ms-pos #t) + (lazy-bytes-advance! lb-in ms-pos #t)) + (when (input-port? in) ;; Consume bytes that correspond to match: (copy-port-bytes port-in #f (- me-pos prefix-len)))] [(eq? end-pos 'eof) - ;; copy all remaining bytes from input to output + ;; Copy all remaining bytes from input to output (copy-port-bytes port-in out #f)] [else - (when (or out (input-port? in)) - (lazy-bytes-advance! lb-in end-pos #t))]))) + (when out + ;; Copy all bytes to output + (lazy-bytes-advance! lb-in end-pos #t)) + (when (input-port? in) + ;; Consume all bytes + (copy-port-bytes port-in #f (- end-pos start-pos)))]))) (begin0 @@ -322,11 +330,6 @@ (not (lazy-bytes-failed? lb-in)) mode) [(#f) - (when (and (not peek?) - any-bytes-left? - (input-port? in)) - ;; Consume non-matching bytes - (copy-port-bytes port-in out (if (eq? 'eof end-offset) #f end-offset))) (add-end-bytes #f end-bytes-count #f #f)] [(?) #t] [(positions)