diff --git a/racket/src/io/port/bytes-input.rkt b/racket/src/io/port/bytes-input.rkt index 3e76cb3519..b359646fca 100644 --- a/racket/src/io/port/bytes-input.rkt +++ b/racket/src/io/port/bytes-input.rkt @@ -23,7 +23,8 @@ do-read-byte/core-port) (module+ internal - (provide do-read-bytes!)) + (provide do-read-bytes! + peek-byte/core-port)) ;; ---------------------------------------- @@ -132,11 +133,14 @@ (check who input-port? orig-in) (check who exact-nonnegative-integer? skip-k) (let ([in (->core-input-port orig-in)]) - (define peek-byte (and (zero? skip-k) - (core-input-port-peek-byte in))) - (cond - [peek-byte (do-peek-byte who peek-byte in orig-in)] - [else (peek-byte-via-bytes in skip-k #:special-ok? #f)]))) + (peek-byte/core-port in skip-k))) + +(define/who (peek-byte/core-port in skip-k) + (define peek-byte (and (zero? skip-k) + (core-input-port-peek-byte in))) + (cond + [peek-byte (do-peek-byte who peek-byte in)] + [else (peek-byte-via-bytes in skip-k #:special-ok? #f)])) (define/who (peek-bytes amt skip-k [in (current-input-port)]) (check who exact-nonnegative-integer? amt) diff --git a/racket/src/io/port/read-and-peek.rkt b/racket/src/io/port/read-and-peek.rkt index eb47c855bf..3d46922528 100644 --- a/racket/src/io/port/read-and-peek.rkt +++ b/racket/src/io/port/read-and-peek.rkt @@ -213,7 +213,7 @@ v)) ;; Use a `peek-byte` shortcut -(define (do-peek-byte who peek-byte in orig-in) +(define (do-peek-byte who peek-byte in) (let loop () (start-atomic) (prepare-change in) diff --git a/racket/src/io/port/special-input.rkt b/racket/src/io/port/special-input.rkt index ceddc49bf7..87c6942647 100644 --- a/racket/src/io/port/special-input.rkt +++ b/racket/src/io/port/special-input.rkt @@ -41,7 +41,7 @@ (let ([in (->core-input-port orig-in)]) (define peek-byte (core-input-port-peek-byte in)) (cond - [peek-byte (do-peek-byte who peek-byte in orig-in)] + [peek-byte (do-peek-byte who peek-byte in)] [else (extract-special-value (peek-byte-via-bytes in skip-k #:progress-evt progress-evt) in source-name skip-k diff --git a/racket/src/io/port/string-input.rkt b/racket/src/io/port/string-input.rkt index 9a235bae9c..f7cb7fcf9b 100644 --- a/racket/src/io/port/string-input.rkt +++ b/racket/src/io/port/string-input.rkt @@ -197,24 +197,7 @@ ;; A shortcut to implement `read-char` in terms of a port-specific ;; `read-byte`: (define (read-char-via-read-byte who in read-byte #:special-ok? [special-ok? #t]) - (define b - (let loop () - (start-atomic) - (prepare-change in) - (check-not-closed who in) - (define b (read-byte (core-port-self in))) - (cond - [(fixnum? b) - (port-count-byte! in b) - (end-atomic) - b] - [(eof-object? b) - (end-atomic) - eof] - [else - (end-atomic) - (sync b) - (loop)]))) + (define b (do-read-byte who read-byte in)) (cond [(eof-object? b) b] [else @@ -222,48 +205,38 @@ [(b . fx< . 128) (integer->char b)] [else ;; UTF-8 decoding... May need to peek bytes to discover - ;; whether the decoding will work (in which case this wasn't - ;; much of a shortcut) - (define bstr (bytes b)) - (define str (make-string 1)) - (define-values (used-bytes got-chars state) - (utf-8-decode! bstr 0 1 - #f 0 #f - #:abort-mode 'state)) + ;; whether the decoding will work + (define-values (accum remaining state) + (utf-8-decode-byte b 0 0)) (cond [(eq? state 'error) ;; This happens if the byte is a UTF-8 continuation byte #\uFFFD] [else - ;; Need to peek ahead - (let loop ([skip-k 0] [state state]) - (define v (peek-some-bytes! who in bstr 0 1 skip-k #:copy-bstr? #f #:special-ok? special-ok?)) + ;; Need to peek ahead; don't consume any more bytes until + ;; complete, and consume only the already-consumed byte + ;; if there's a decoding error + (let loop ([skip-k 0] [accum accum] [remaining remaining]) + (define b (peek-byte/core-port in skip-k)) (cond - [(or (eof-object? v) - (procedure? v)) - ;; Already-consumed byte is an error byte + [(eof-object? b) + ;; Already-consumed byte is consume as an error byte #\uFFFD] [else - (define-values (used-bytes got-chars new-state) - (utf-8-decode! bstr 0 1 - str 0 1 - #:state state - #:error-char #\uFFFD - #:abort-mode 'state)) + (define-values (next-accum next-remaining state) + (utf-8-decode-byte b accum remaining)) (cond - [(fx= got-chars 1) - (define actually-used-bytes (fx+ skip-k used-bytes)) - (cond - [(fx= actually-used-bytes 0) (void)] - [(fx= actually-used-bytes 1) (do-read-byte who read-byte in)] - [else - (define finish-bstr (if (actually-used-bytes . fx<= . (bytes-length bstr)) - bstr - (make-bytes actually-used-bytes))) - (do-read-bytes! who in finish-bstr 0 actually-used-bytes)]) - (string-ref str 0)] + [(eq? state 'complete) + ;; Consume used bytes + (let loop ([skip-k skip-k]) + (do-read-byte who read-byte in) + (unless (fx= 0 skip-k) + (loop (fx- skip-k 1)))) + (integer->char next-accum)] + [(eq? state 'error) + #\uFFFD] [else - (loop (fx+ skip-k 1) new-state)])]))])])])) + (loop (fx+ 1 skip-k) next-accum next-remaining)])]))])])])) ;; ---------------------------------------- diff --git a/racket/src/io/string/utf-8-decode.rkt b/racket/src/io/string/utf-8-decode.rkt index 26c1bdf9d2..5034e5f507 100644 --- a/racket/src/io/string/utf-8-decode.rkt +++ b/racket/src/io/string/utf-8-decode.rkt @@ -3,7 +3,9 @@ (provide utf-8-decode! utf-8-max-aborts-amt - + + utf-8-decode-byte + utf-8-state? utf-8-state-pending-amt) @@ -57,6 +59,21 @@ ;; Iterate through the given byte string (let loop ([i in-start] [j out-start] [base-i base-i] [accum accum] [remaining remaining]) + ;; Shared handling for success: + (define (complete accum) + (when out-str (string-set! out-str j (integer->char accum))) + (define next-j (fx+ j 1)) + (define next-i (fx+ i 1)) + (cond + [(and out-end (fx= next-j out-end)) + (values (fx- next-i in-start) + (fx- next-j out-start) + (if (fx= next-i in-end) + 'complete + 'continues))] + [else + (loop next-i next-j next-i 0 0)])) + ;; Shared handling for encoding failures: (define (encoding-failure) (cond @@ -76,20 +93,6 @@ (fx- j out-start) 'error)])) - ;; Shared handling for decoding success: - (define (continue) - (define next-j (fx+ j 1)) - (define next-i (fx+ i 1)) - (cond - [(and out-end (fx= next-j out-end)) - (values (fx- next-i in-start) - (fx- next-j out-start) - (if (fx= next-i in-end) - 'complete - 'continues))] - [else - (loop next-i next-j next-i 0 0)])) - ;; Dispatch on byte: (cond [(fx= i in-end) @@ -116,74 +119,101 @@ (encoding-failure)] [else (define b (bytes-ref in-bstr i)) - (cond - [(b . fx< . 128) - (cond - [(fx= remaining 0) - ;; Found ASCII - (when out-str (string-set! out-str j (integer->char b))) - (continue)] - [else - ;; We were accumulating bytes for an encoding, and - ;; the encoding didn't complete - (encoding-failure)])] + (utf-8-decode-byte/inline b accum remaining + complete + (lambda (accum remaining) + (loop (fx+ i 1) j i accum remaining)) + (lambda (accum remaining) + (loop (fx+ i 1) j base-i accum remaining)) + encoding-failure)]))) + +(define-syntax-rule (utf-8-decode-byte/inline b accum remaining + complete-k + init-continue-k + next-continue-k + error-k) + (cond + [(b . fx< . 128) + (cond + [(fx= remaining 0) + ;; Found ASCII + (complete-k b)] [else - ;; Encoding... + ;; We were accumulating bytes for an encoding, and + ;; the encoding didn't complete + (error-k)])] + [else + ;; Encoding... + (cond + [(fx= #b10000000 (fxand b #b11000000)) + ;; A continuation byte (cond - [(fx= #b10000000 (fxand b #b11000000)) - ;; A continuation byte - (cond - [(fx= remaining 0) - ;; We weren't continuing - (encoding-failure)] - [else - (define next (fxand b #b00111111)) - (define next-accum (fxior (fxlshift accum 6) next)) - (cond - [(fx= 1 remaining) - (cond - [(and (next-accum . fx> . 127) - (next-accum . fx<= . #x10FFFF) - (not (and (next-accum . fx>= . #xD800) - (next-accum . fx<= . #xDFFF)))) - (when out-str (string-set! out-str j (integer->char next-accum))) - (continue)] - [else - ;; Not a valid character - (encoding-failure)])] - [(and (fx= 2 remaining) - (next-accum . fx<= . #b11111)) - ;; A shorter byte sequence would work, so this is an - ;; encoding mistae. - (encoding-failure)] - [(and (fx= 3 remaining) - (next-accum . fx<= . #b1111)) - ;; A shorter byte sequence would work - (encoding-failure)] - [else - ;; Continue an encoding. - (loop (fx+ i 1) j base-i next-accum (fx- remaining 1))])])] - [(not (fx= remaining 0)) - ;; Trying to start a new encoding while one is in - ;; progress - (encoding-failure)] - [(fx= #b11000000 (fxand b #b11100000)) - ;; Start a two-byte encoding - (define accum (fxand b #b11111)) - ;; If `accum` is zero, that's an encoding mistake, - ;; because a shorted byte sequence would work. + [(fx= remaining 0) + ;; We weren't continuing + (error-k)] + [else + (define next (fxand b #b00111111)) + (define next-accum (fxior (fxlshift accum 6) next)) (cond - [(fx= accum 0) (encoding-failure)] - [else (loop (fx+ i 1) j i accum 1)])] - [(fx= #b11100000 (fxand b #b11110000)) - ;; Start a three-byte encoding - (define accum (fxand b #b1111)) - (loop (fx+ i 1) j i accum 2)] - [(fx= #b11110000 (fxand b #b11111000)) - ;; Start a four-byte encoding - (define accum (fxand b #b111)) - (loop (fx+ i 1) j i accum 3)] - [else - ;; Five- or six-byte encodings don't produce valid - ;; characters - (encoding-failure)])])]))) + [(fx= 1 remaining) + (cond + [(and (next-accum . fx> . 127) + (next-accum . fx<= . #x10FFFF) + (not (and (next-accum . fx>= . #xD800) + (next-accum . fx<= . #xDFFF)))) + (complete-k next-accum)] + [else + ;; Not a valid character + (error-k)])] + [(and (fx= 2 remaining) + (next-accum . fx<= . #b11111)) + ;; A shorter byte sequence would work, so this is an + ;; encoding mistae. + (error-k)] + [(and (fx= 3 remaining) + (next-accum . fx<= . #b1111)) + ;; A shorter byte sequence would work + (error-k)] + [else + ;; Continue an encoding + (next-continue-k next-accum (fx- remaining 1))])])] + [(not (fx= remaining 0)) + ;; Trying to start a new encoding while one is in + ;; progress + (error-k)] + [(fx= #b11000000 (fxand b #b11100000)) + ;; Start a two-byte encoding + (define accum (fxand b #b11111)) + ;; If `accum` is zero, that's an encoding mistake, + ;; because a shorter byte sequence would work. + (cond + [(fx= accum 0) (error-k)] + [else (init-continue-k accum 1)])] + [(fx= #b11100000 (fxand b #b11110000)) + ;; Start a three-byte encoding + (define accum (fxand b #b1111)) + (init-continue-k accum 2)] + [(fx= #b11110000 (fxand b #b11111000)) + ;; Start a four-byte encoding + (define accum (fxand b #b111)) + (init-continue-k accum 3)] + [else + ;; Five- or six-byte encodings don't produce valid + ;; characters + (error-k)])])) + +;; Takes a byte and a decoding state and returns +;; one of +;; - (values code-point 0 'complete) +;; - (values #f 0 'error) +;; - (values new-accum new-remaining 'continues) +(define (utf-8-decode-byte b accum remaining) + (utf-8-decode-byte/inline b accum remaining + (lambda (accum) + (values accum 0 'complete)) + (lambda (accum remaining) + (values accum remaining 'continues)) + (lambda (accum remaining) + (values accum remaining 'continues)) + (lambda () + (values #f 0 'error))))