io: faster read-char

This commit is contained in:
Matthew Flatt 2019-02-03 14:46:02 -08:00
parent 9823cbba4d
commit af24a0318f
5 changed files with 148 additions and 141 deletions

View File

@ -23,7 +23,8 @@
do-read-byte/core-port) do-read-byte/core-port)
(module+ internal (module+ internal
(provide do-read-bytes!)) (provide do-read-bytes!
peek-byte/core-port))
;; ---------------------------------------- ;; ----------------------------------------
@ -132,11 +133,14 @@
(check who input-port? orig-in) (check who input-port? orig-in)
(check who exact-nonnegative-integer? skip-k) (check who exact-nonnegative-integer? skip-k)
(let ([in (->core-input-port orig-in)]) (let ([in (->core-input-port orig-in)])
(define peek-byte (and (zero? skip-k) (peek-byte/core-port in skip-k)))
(core-input-port-peek-byte in)))
(cond (define/who (peek-byte/core-port in skip-k)
[peek-byte (do-peek-byte who peek-byte in orig-in)] (define peek-byte (and (zero? skip-k)
[else (peek-byte-via-bytes in skip-k #:special-ok? #f)]))) (core-input-port-peek-byte in)))
(cond
[peek-byte (do-peek-byte who peek-byte in)]
[else (peek-byte-via-bytes in skip-k #:special-ok? #f)]))
(define/who (peek-bytes amt skip-k [in (current-input-port)]) (define/who (peek-bytes amt skip-k [in (current-input-port)])
(check who exact-nonnegative-integer? amt) (check who exact-nonnegative-integer? amt)

View File

@ -213,7 +213,7 @@
v)) v))
;; Use a `peek-byte` shortcut ;; Use a `peek-byte` shortcut
(define (do-peek-byte who peek-byte in orig-in) (define (do-peek-byte who peek-byte in)
(let loop () (let loop ()
(start-atomic) (start-atomic)
(prepare-change in) (prepare-change in)

View File

@ -41,7 +41,7 @@
(let ([in (->core-input-port orig-in)]) (let ([in (->core-input-port orig-in)])
(define peek-byte (core-input-port-peek-byte in)) (define peek-byte (core-input-port-peek-byte in))
(cond (cond
[peek-byte (do-peek-byte who peek-byte in orig-in)] [peek-byte (do-peek-byte who peek-byte in)]
[else [else
(extract-special-value (peek-byte-via-bytes in skip-k #:progress-evt progress-evt) (extract-special-value (peek-byte-via-bytes in skip-k #:progress-evt progress-evt)
in source-name skip-k in source-name skip-k

View File

@ -197,24 +197,7 @@
;; A shortcut to implement `read-char` in terms of a port-specific ;; A shortcut to implement `read-char` in terms of a port-specific
;; `read-byte`: ;; `read-byte`:
(define (read-char-via-read-byte who in read-byte #:special-ok? [special-ok? #t]) (define (read-char-via-read-byte who in read-byte #:special-ok? [special-ok? #t])
(define b (define b (do-read-byte who read-byte in))
(let loop ()
(start-atomic)
(prepare-change in)
(check-not-closed who in)
(define b (read-byte (core-port-self in)))
(cond
[(fixnum? b)
(port-count-byte! in b)
(end-atomic)
b]
[(eof-object? b)
(end-atomic)
eof]
[else
(end-atomic)
(sync b)
(loop)])))
(cond (cond
[(eof-object? b) b] [(eof-object? b) b]
[else [else
@ -222,48 +205,38 @@
[(b . fx< . 128) (integer->char b)] [(b . fx< . 128) (integer->char b)]
[else [else
;; UTF-8 decoding... May need to peek bytes to discover ;; UTF-8 decoding... May need to peek bytes to discover
;; whether the decoding will work (in which case this wasn't ;; whether the decoding will work
;; much of a shortcut) (define-values (accum remaining state)
(define bstr (bytes b)) (utf-8-decode-byte b 0 0))
(define str (make-string 1))
(define-values (used-bytes got-chars state)
(utf-8-decode! bstr 0 1
#f 0 #f
#:abort-mode 'state))
(cond (cond
[(eq? state 'error) [(eq? state 'error)
;; This happens if the byte is a UTF-8 continuation byte ;; This happens if the byte is a UTF-8 continuation byte
#\uFFFD] #\uFFFD]
[else [else
;; Need to peek ahead ;; Need to peek ahead; don't consume any more bytes until
(let loop ([skip-k 0] [state state]) ;; complete, and consume only the already-consumed byte
(define v (peek-some-bytes! who in bstr 0 1 skip-k #:copy-bstr? #f #:special-ok? special-ok?)) ;; if there's a decoding error
(let loop ([skip-k 0] [accum accum] [remaining remaining])
(define b (peek-byte/core-port in skip-k))
(cond (cond
[(or (eof-object? v) [(eof-object? b)
(procedure? v)) ;; Already-consumed byte is consume as an error byte
;; Already-consumed byte is an error byte
#\uFFFD] #\uFFFD]
[else [else
(define-values (used-bytes got-chars new-state) (define-values (next-accum next-remaining state)
(utf-8-decode! bstr 0 1 (utf-8-decode-byte b accum remaining))
str 0 1
#:state state
#:error-char #\uFFFD
#:abort-mode 'state))
(cond (cond
[(fx= got-chars 1) [(eq? state 'complete)
(define actually-used-bytes (fx+ skip-k used-bytes)) ;; Consume used bytes
(cond (let loop ([skip-k skip-k])
[(fx= actually-used-bytes 0) (void)] (do-read-byte who read-byte in)
[(fx= actually-used-bytes 1) (do-read-byte who read-byte in)] (unless (fx= 0 skip-k)
[else (loop (fx- skip-k 1))))
(define finish-bstr (if (actually-used-bytes . fx<= . (bytes-length bstr)) (integer->char next-accum)]
bstr [(eq? state 'error)
(make-bytes actually-used-bytes))) #\uFFFD]
(do-read-bytes! who in finish-bstr 0 actually-used-bytes)])
(string-ref str 0)]
[else [else
(loop (fx+ skip-k 1) new-state)])]))])])])) (loop (fx+ 1 skip-k) next-accum next-remaining)])]))])])]))
;; ---------------------------------------- ;; ----------------------------------------

View File

@ -3,7 +3,9 @@
(provide utf-8-decode! (provide utf-8-decode!
utf-8-max-aborts-amt utf-8-max-aborts-amt
utf-8-decode-byte
utf-8-state? utf-8-state?
utf-8-state-pending-amt) utf-8-state-pending-amt)
@ -57,6 +59,21 @@
;; Iterate through the given byte string ;; Iterate through the given byte string
(let loop ([i in-start] [j out-start] [base-i base-i] [accum accum] [remaining remaining]) (let loop ([i in-start] [j out-start] [base-i base-i] [accum accum] [remaining remaining])
;; Shared handling for success:
(define (complete accum)
(when out-str (string-set! out-str j (integer->char accum)))
(define next-j (fx+ j 1))
(define next-i (fx+ i 1))
(cond
[(and out-end (fx= next-j out-end))
(values (fx- next-i in-start)
(fx- next-j out-start)
(if (fx= next-i in-end)
'complete
'continues))]
[else
(loop next-i next-j next-i 0 0)]))
;; Shared handling for encoding failures: ;; Shared handling for encoding failures:
(define (encoding-failure) (define (encoding-failure)
(cond (cond
@ -76,20 +93,6 @@
(fx- j out-start) (fx- j out-start)
'error)])) 'error)]))
;; Shared handling for decoding success:
(define (continue)
(define next-j (fx+ j 1))
(define next-i (fx+ i 1))
(cond
[(and out-end (fx= next-j out-end))
(values (fx- next-i in-start)
(fx- next-j out-start)
(if (fx= next-i in-end)
'complete
'continues))]
[else
(loop next-i next-j next-i 0 0)]))
;; Dispatch on byte: ;; Dispatch on byte:
(cond (cond
[(fx= i in-end) [(fx= i in-end)
@ -116,74 +119,101 @@
(encoding-failure)] (encoding-failure)]
[else [else
(define b (bytes-ref in-bstr i)) (define b (bytes-ref in-bstr i))
(cond (utf-8-decode-byte/inline b accum remaining
[(b . fx< . 128) complete
(cond (lambda (accum remaining)
[(fx= remaining 0) (loop (fx+ i 1) j i accum remaining))
;; Found ASCII (lambda (accum remaining)
(when out-str (string-set! out-str j (integer->char b))) (loop (fx+ i 1) j base-i accum remaining))
(continue)] encoding-failure)])))
[else
;; We were accumulating bytes for an encoding, and (define-syntax-rule (utf-8-decode-byte/inline b accum remaining
;; the encoding didn't complete complete-k
(encoding-failure)])] init-continue-k
next-continue-k
error-k)
(cond
[(b . fx< . 128)
(cond
[(fx= remaining 0)
;; Found ASCII
(complete-k b)]
[else [else
;; Encoding... ;; We were accumulating bytes for an encoding, and
;; the encoding didn't complete
(error-k)])]
[else
;; Encoding...
(cond
[(fx= #b10000000 (fxand b #b11000000))
;; A continuation byte
(cond (cond
[(fx= #b10000000 (fxand b #b11000000)) [(fx= remaining 0)
;; A continuation byte ;; We weren't continuing
(cond (error-k)]
[(fx= remaining 0) [else
;; We weren't continuing (define next (fxand b #b00111111))
(encoding-failure)] (define next-accum (fxior (fxlshift accum 6) next))
[else
(define next (fxand b #b00111111))
(define next-accum (fxior (fxlshift accum 6) next))
(cond
[(fx= 1 remaining)
(cond
[(and (next-accum . fx> . 127)
(next-accum . fx<= . #x10FFFF)
(not (and (next-accum . fx>= . #xD800)
(next-accum . fx<= . #xDFFF))))
(when out-str (string-set! out-str j (integer->char next-accum)))
(continue)]
[else
;; Not a valid character
(encoding-failure)])]
[(and (fx= 2 remaining)
(next-accum . fx<= . #b11111))
;; A shorter byte sequence would work, so this is an
;; encoding mistae.
(encoding-failure)]
[(and (fx= 3 remaining)
(next-accum . fx<= . #b1111))
;; A shorter byte sequence would work
(encoding-failure)]
[else
;; Continue an encoding.
(loop (fx+ i 1) j base-i next-accum (fx- remaining 1))])])]
[(not (fx= remaining 0))
;; Trying to start a new encoding while one is in
;; progress
(encoding-failure)]
[(fx= #b11000000 (fxand b #b11100000))
;; Start a two-byte encoding
(define accum (fxand b #b11111))
;; If `accum` is zero, that's an encoding mistake,
;; because a shorted byte sequence would work.
(cond (cond
[(fx= accum 0) (encoding-failure)] [(fx= 1 remaining)
[else (loop (fx+ i 1) j i accum 1)])] (cond
[(fx= #b11100000 (fxand b #b11110000)) [(and (next-accum . fx> . 127)
;; Start a three-byte encoding (next-accum . fx<= . #x10FFFF)
(define accum (fxand b #b1111)) (not (and (next-accum . fx>= . #xD800)
(loop (fx+ i 1) j i accum 2)] (next-accum . fx<= . #xDFFF))))
[(fx= #b11110000 (fxand b #b11111000)) (complete-k next-accum)]
;; Start a four-byte encoding [else
(define accum (fxand b #b111)) ;; Not a valid character
(loop (fx+ i 1) j i accum 3)] (error-k)])]
[else [(and (fx= 2 remaining)
;; Five- or six-byte encodings don't produce valid (next-accum . fx<= . #b11111))
;; characters ;; A shorter byte sequence would work, so this is an
(encoding-failure)])])]))) ;; encoding mistae.
(error-k)]
[(and (fx= 3 remaining)
(next-accum . fx<= . #b1111))
;; A shorter byte sequence would work
(error-k)]
[else
;; Continue an encoding
(next-continue-k next-accum (fx- remaining 1))])])]
[(not (fx= remaining 0))
;; Trying to start a new encoding while one is in
;; progress
(error-k)]
[(fx= #b11000000 (fxand b #b11100000))
;; Start a two-byte encoding
(define accum (fxand b #b11111))
;; If `accum` is zero, that's an encoding mistake,
;; because a shorter byte sequence would work.
(cond
[(fx= accum 0) (error-k)]
[else (init-continue-k accum 1)])]
[(fx= #b11100000 (fxand b #b11110000))
;; Start a three-byte encoding
(define accum (fxand b #b1111))
(init-continue-k accum 2)]
[(fx= #b11110000 (fxand b #b11111000))
;; Start a four-byte encoding
(define accum (fxand b #b111))
(init-continue-k accum 3)]
[else
;; Five- or six-byte encodings don't produce valid
;; characters
(error-k)])]))
;; Takes a byte and a decoding state and returns
;; one of
;; - (values code-point 0 'complete)
;; - (values #f 0 'error)
;; - (values new-accum new-remaining 'continues)
(define (utf-8-decode-byte b accum remaining)
(utf-8-decode-byte/inline b accum remaining
(lambda (accum)
(values accum 0 'complete))
(lambda (accum remaining)
(values accum remaining 'continues))
(lambda (accum remaining)
(values accum remaining 'continues))
(lambda ()
(values #f 0 'error))))