io: faster read-char

This commit is contained in:
Matthew Flatt 2019-02-03 14:46:02 -08:00
parent 9823cbba4d
commit af24a0318f
5 changed files with 148 additions and 141 deletions

View File

@ -23,7 +23,8 @@
do-read-byte/core-port)
(module+ internal
(provide do-read-bytes!))
(provide do-read-bytes!
peek-byte/core-port))
;; ----------------------------------------
@ -132,11 +133,14 @@
(check who input-port? orig-in)
(check who exact-nonnegative-integer? skip-k)
(let ([in (->core-input-port orig-in)])
(define peek-byte (and (zero? skip-k)
(core-input-port-peek-byte in)))
(cond
[peek-byte (do-peek-byte who peek-byte in orig-in)]
[else (peek-byte-via-bytes in skip-k #:special-ok? #f)])))
(peek-byte/core-port in skip-k)))
(define/who (peek-byte/core-port in skip-k)
(define peek-byte (and (zero? skip-k)
(core-input-port-peek-byte in)))
(cond
[peek-byte (do-peek-byte who peek-byte in)]
[else (peek-byte-via-bytes in skip-k #:special-ok? #f)]))
(define/who (peek-bytes amt skip-k [in (current-input-port)])
(check who exact-nonnegative-integer? amt)

View File

@ -213,7 +213,7 @@
v))
;; Use a `peek-byte` shortcut
(define (do-peek-byte who peek-byte in orig-in)
(define (do-peek-byte who peek-byte in)
(let loop ()
(start-atomic)
(prepare-change in)

View File

@ -41,7 +41,7 @@
(let ([in (->core-input-port orig-in)])
(define peek-byte (core-input-port-peek-byte in))
(cond
[peek-byte (do-peek-byte who peek-byte in orig-in)]
[peek-byte (do-peek-byte who peek-byte in)]
[else
(extract-special-value (peek-byte-via-bytes in skip-k #:progress-evt progress-evt)
in source-name skip-k

View File

@ -197,24 +197,7 @@
;; A shortcut to implement `read-char` in terms of a port-specific
;; `read-byte`:
(define (read-char-via-read-byte who in read-byte #:special-ok? [special-ok? #t])
(define b
(let loop ()
(start-atomic)
(prepare-change in)
(check-not-closed who in)
(define b (read-byte (core-port-self in)))
(cond
[(fixnum? b)
(port-count-byte! in b)
(end-atomic)
b]
[(eof-object? b)
(end-atomic)
eof]
[else
(end-atomic)
(sync b)
(loop)])))
(define b (do-read-byte who read-byte in))
(cond
[(eof-object? b) b]
[else
@ -222,48 +205,38 @@
[(b . fx< . 128) (integer->char b)]
[else
;; UTF-8 decoding... May need to peek bytes to discover
;; whether the decoding will work (in which case this wasn't
;; much of a shortcut)
(define bstr (bytes b))
(define str (make-string 1))
(define-values (used-bytes got-chars state)
(utf-8-decode! bstr 0 1
#f 0 #f
#:abort-mode 'state))
;; whether the decoding will work
(define-values (accum remaining state)
(utf-8-decode-byte b 0 0))
(cond
[(eq? state 'error)
;; This happens if the byte is a UTF-8 continuation byte
#\uFFFD]
[else
;; Need to peek ahead
(let loop ([skip-k 0] [state state])
(define v (peek-some-bytes! who in bstr 0 1 skip-k #:copy-bstr? #f #:special-ok? special-ok?))
;; Need to peek ahead; don't consume any more bytes until
;; complete, and consume only the already-consumed byte
;; if there's a decoding error
(let loop ([skip-k 0] [accum accum] [remaining remaining])
(define b (peek-byte/core-port in skip-k))
(cond
[(or (eof-object? v)
(procedure? v))
;; Already-consumed byte is an error byte
[(eof-object? b)
;; Already-consumed byte is consume as an error byte
#\uFFFD]
[else
(define-values (used-bytes got-chars new-state)
(utf-8-decode! bstr 0 1
str 0 1
#:state state
#:error-char #\uFFFD
#:abort-mode 'state))
(define-values (next-accum next-remaining state)
(utf-8-decode-byte b accum remaining))
(cond
[(fx= got-chars 1)
(define actually-used-bytes (fx+ skip-k used-bytes))
(cond
[(fx= actually-used-bytes 0) (void)]
[(fx= actually-used-bytes 1) (do-read-byte who read-byte in)]
[else
(define finish-bstr (if (actually-used-bytes . fx<= . (bytes-length bstr))
bstr
(make-bytes actually-used-bytes)))
(do-read-bytes! who in finish-bstr 0 actually-used-bytes)])
(string-ref str 0)]
[(eq? state 'complete)
;; Consume used bytes
(let loop ([skip-k skip-k])
(do-read-byte who read-byte in)
(unless (fx= 0 skip-k)
(loop (fx- skip-k 1))))
(integer->char next-accum)]
[(eq? state 'error)
#\uFFFD]
[else
(loop (fx+ skip-k 1) new-state)])]))])])]))
(loop (fx+ 1 skip-k) next-accum next-remaining)])]))])])]))
;; ----------------------------------------

View File

@ -3,7 +3,9 @@
(provide utf-8-decode!
utf-8-max-aborts-amt
utf-8-decode-byte
utf-8-state?
utf-8-state-pending-amt)
@ -57,6 +59,21 @@
;; Iterate through the given byte string
(let loop ([i in-start] [j out-start] [base-i base-i] [accum accum] [remaining remaining])
;; Shared handling for success:
(define (complete accum)
(when out-str (string-set! out-str j (integer->char accum)))
(define next-j (fx+ j 1))
(define next-i (fx+ i 1))
(cond
[(and out-end (fx= next-j out-end))
(values (fx- next-i in-start)
(fx- next-j out-start)
(if (fx= next-i in-end)
'complete
'continues))]
[else
(loop next-i next-j next-i 0 0)]))
;; Shared handling for encoding failures:
(define (encoding-failure)
(cond
@ -76,20 +93,6 @@
(fx- j out-start)
'error)]))
;; Shared handling for decoding success:
(define (continue)
(define next-j (fx+ j 1))
(define next-i (fx+ i 1))
(cond
[(and out-end (fx= next-j out-end))
(values (fx- next-i in-start)
(fx- next-j out-start)
(if (fx= next-i in-end)
'complete
'continues))]
[else
(loop next-i next-j next-i 0 0)]))
;; Dispatch on byte:
(cond
[(fx= i in-end)
@ -116,74 +119,101 @@
(encoding-failure)]
[else
(define b (bytes-ref in-bstr i))
(cond
[(b . fx< . 128)
(cond
[(fx= remaining 0)
;; Found ASCII
(when out-str (string-set! out-str j (integer->char b)))
(continue)]
[else
;; We were accumulating bytes for an encoding, and
;; the encoding didn't complete
(encoding-failure)])]
(utf-8-decode-byte/inline b accum remaining
complete
(lambda (accum remaining)
(loop (fx+ i 1) j i accum remaining))
(lambda (accum remaining)
(loop (fx+ i 1) j base-i accum remaining))
encoding-failure)])))
(define-syntax-rule (utf-8-decode-byte/inline b accum remaining
complete-k
init-continue-k
next-continue-k
error-k)
(cond
[(b . fx< . 128)
(cond
[(fx= remaining 0)
;; Found ASCII
(complete-k b)]
[else
;; Encoding...
;; We were accumulating bytes for an encoding, and
;; the encoding didn't complete
(error-k)])]
[else
;; Encoding...
(cond
[(fx= #b10000000 (fxand b #b11000000))
;; A continuation byte
(cond
[(fx= #b10000000 (fxand b #b11000000))
;; A continuation byte
(cond
[(fx= remaining 0)
;; We weren't continuing
(encoding-failure)]
[else
(define next (fxand b #b00111111))
(define next-accum (fxior (fxlshift accum 6) next))
(cond
[(fx= 1 remaining)
(cond
[(and (next-accum . fx> . 127)
(next-accum . fx<= . #x10FFFF)
(not (and (next-accum . fx>= . #xD800)
(next-accum . fx<= . #xDFFF))))
(when out-str (string-set! out-str j (integer->char next-accum)))
(continue)]
[else
;; Not a valid character
(encoding-failure)])]
[(and (fx= 2 remaining)
(next-accum . fx<= . #b11111))
;; A shorter byte sequence would work, so this is an
;; encoding mistae.
(encoding-failure)]
[(and (fx= 3 remaining)
(next-accum . fx<= . #b1111))
;; A shorter byte sequence would work
(encoding-failure)]
[else
;; Continue an encoding.
(loop (fx+ i 1) j base-i next-accum (fx- remaining 1))])])]
[(not (fx= remaining 0))
;; Trying to start a new encoding while one is in
;; progress
(encoding-failure)]
[(fx= #b11000000 (fxand b #b11100000))
;; Start a two-byte encoding
(define accum (fxand b #b11111))
;; If `accum` is zero, that's an encoding mistake,
;; because a shorted byte sequence would work.
[(fx= remaining 0)
;; We weren't continuing
(error-k)]
[else
(define next (fxand b #b00111111))
(define next-accum (fxior (fxlshift accum 6) next))
(cond
[(fx= accum 0) (encoding-failure)]
[else (loop (fx+ i 1) j i accum 1)])]
[(fx= #b11100000 (fxand b #b11110000))
;; Start a three-byte encoding
(define accum (fxand b #b1111))
(loop (fx+ i 1) j i accum 2)]
[(fx= #b11110000 (fxand b #b11111000))
;; Start a four-byte encoding
(define accum (fxand b #b111))
(loop (fx+ i 1) j i accum 3)]
[else
;; Five- or six-byte encodings don't produce valid
;; characters
(encoding-failure)])])])))
[(fx= 1 remaining)
(cond
[(and (next-accum . fx> . 127)
(next-accum . fx<= . #x10FFFF)
(not (and (next-accum . fx>= . #xD800)
(next-accum . fx<= . #xDFFF))))
(complete-k next-accum)]
[else
;; Not a valid character
(error-k)])]
[(and (fx= 2 remaining)
(next-accum . fx<= . #b11111))
;; A shorter byte sequence would work, so this is an
;; encoding mistae.
(error-k)]
[(and (fx= 3 remaining)
(next-accum . fx<= . #b1111))
;; A shorter byte sequence would work
(error-k)]
[else
;; Continue an encoding
(next-continue-k next-accum (fx- remaining 1))])])]
[(not (fx= remaining 0))
;; Trying to start a new encoding while one is in
;; progress
(error-k)]
[(fx= #b11000000 (fxand b #b11100000))
;; Start a two-byte encoding
(define accum (fxand b #b11111))
;; If `accum` is zero, that's an encoding mistake,
;; because a shorter byte sequence would work.
(cond
[(fx= accum 0) (error-k)]
[else (init-continue-k accum 1)])]
[(fx= #b11100000 (fxand b #b11110000))
;; Start a three-byte encoding
(define accum (fxand b #b1111))
(init-continue-k accum 2)]
[(fx= #b11110000 (fxand b #b11111000))
;; Start a four-byte encoding
(define accum (fxand b #b111))
(init-continue-k accum 3)]
[else
;; Five- or six-byte encodings don't produce valid
;; characters
(error-k)])]))
;; Takes a byte and a decoding state and returns
;; one of
;; - (values code-point 0 'complete)
;; - (values #f 0 'error)
;; - (values new-accum new-remaining 'continues)
(define (utf-8-decode-byte b accum remaining)
(utf-8-decode-byte/inline b accum remaining
(lambda (accum)
(values accum 0 'complete))
(lambda (accum remaining)
(values accum remaining 'continues))
(lambda (accum remaining)
(values accum remaining 'continues))
(lambda ()
(values #f 0 'error))))