io: faster read-char
This commit is contained in:
parent
9823cbba4d
commit
af24a0318f
|
@ -23,7 +23,8 @@
|
|||
do-read-byte/core-port)
|
||||
|
||||
(module+ internal
|
||||
(provide do-read-bytes!))
|
||||
(provide do-read-bytes!
|
||||
peek-byte/core-port))
|
||||
|
||||
;; ----------------------------------------
|
||||
|
||||
|
@ -132,11 +133,14 @@
|
|||
(check who input-port? orig-in)
|
||||
(check who exact-nonnegative-integer? skip-k)
|
||||
(let ([in (->core-input-port orig-in)])
|
||||
(define peek-byte (and (zero? skip-k)
|
||||
(core-input-port-peek-byte in)))
|
||||
(cond
|
||||
[peek-byte (do-peek-byte who peek-byte in orig-in)]
|
||||
[else (peek-byte-via-bytes in skip-k #:special-ok? #f)])))
|
||||
(peek-byte/core-port in skip-k)))
|
||||
|
||||
(define/who (peek-byte/core-port in skip-k)
|
||||
(define peek-byte (and (zero? skip-k)
|
||||
(core-input-port-peek-byte in)))
|
||||
(cond
|
||||
[peek-byte (do-peek-byte who peek-byte in)]
|
||||
[else (peek-byte-via-bytes in skip-k #:special-ok? #f)]))
|
||||
|
||||
(define/who (peek-bytes amt skip-k [in (current-input-port)])
|
||||
(check who exact-nonnegative-integer? amt)
|
||||
|
|
|
@ -213,7 +213,7 @@
|
|||
v))
|
||||
|
||||
;; Use a `peek-byte` shortcut
|
||||
(define (do-peek-byte who peek-byte in orig-in)
|
||||
(define (do-peek-byte who peek-byte in)
|
||||
(let loop ()
|
||||
(start-atomic)
|
||||
(prepare-change in)
|
||||
|
|
|
@ -41,7 +41,7 @@
|
|||
(let ([in (->core-input-port orig-in)])
|
||||
(define peek-byte (core-input-port-peek-byte in))
|
||||
(cond
|
||||
[peek-byte (do-peek-byte who peek-byte in orig-in)]
|
||||
[peek-byte (do-peek-byte who peek-byte in)]
|
||||
[else
|
||||
(extract-special-value (peek-byte-via-bytes in skip-k #:progress-evt progress-evt)
|
||||
in source-name skip-k
|
||||
|
|
|
@ -197,24 +197,7 @@
|
|||
;; A shortcut to implement `read-char` in terms of a port-specific
|
||||
;; `read-byte`:
|
||||
(define (read-char-via-read-byte who in read-byte #:special-ok? [special-ok? #t])
|
||||
(define b
|
||||
(let loop ()
|
||||
(start-atomic)
|
||||
(prepare-change in)
|
||||
(check-not-closed who in)
|
||||
(define b (read-byte (core-port-self in)))
|
||||
(cond
|
||||
[(fixnum? b)
|
||||
(port-count-byte! in b)
|
||||
(end-atomic)
|
||||
b]
|
||||
[(eof-object? b)
|
||||
(end-atomic)
|
||||
eof]
|
||||
[else
|
||||
(end-atomic)
|
||||
(sync b)
|
||||
(loop)])))
|
||||
(define b (do-read-byte who read-byte in))
|
||||
(cond
|
||||
[(eof-object? b) b]
|
||||
[else
|
||||
|
@ -222,48 +205,38 @@
|
|||
[(b . fx< . 128) (integer->char b)]
|
||||
[else
|
||||
;; UTF-8 decoding... May need to peek bytes to discover
|
||||
;; whether the decoding will work (in which case this wasn't
|
||||
;; much of a shortcut)
|
||||
(define bstr (bytes b))
|
||||
(define str (make-string 1))
|
||||
(define-values (used-bytes got-chars state)
|
||||
(utf-8-decode! bstr 0 1
|
||||
#f 0 #f
|
||||
#:abort-mode 'state))
|
||||
;; whether the decoding will work
|
||||
(define-values (accum remaining state)
|
||||
(utf-8-decode-byte b 0 0))
|
||||
(cond
|
||||
[(eq? state 'error)
|
||||
;; This happens if the byte is a UTF-8 continuation byte
|
||||
#\uFFFD]
|
||||
[else
|
||||
;; Need to peek ahead
|
||||
(let loop ([skip-k 0] [state state])
|
||||
(define v (peek-some-bytes! who in bstr 0 1 skip-k #:copy-bstr? #f #:special-ok? special-ok?))
|
||||
;; Need to peek ahead; don't consume any more bytes until
|
||||
;; complete, and consume only the already-consumed byte
|
||||
;; if there's a decoding error
|
||||
(let loop ([skip-k 0] [accum accum] [remaining remaining])
|
||||
(define b (peek-byte/core-port in skip-k))
|
||||
(cond
|
||||
[(or (eof-object? v)
|
||||
(procedure? v))
|
||||
;; Already-consumed byte is an error byte
|
||||
[(eof-object? b)
|
||||
;; Already-consumed byte is consume as an error byte
|
||||
#\uFFFD]
|
||||
[else
|
||||
(define-values (used-bytes got-chars new-state)
|
||||
(utf-8-decode! bstr 0 1
|
||||
str 0 1
|
||||
#:state state
|
||||
#:error-char #\uFFFD
|
||||
#:abort-mode 'state))
|
||||
(define-values (next-accum next-remaining state)
|
||||
(utf-8-decode-byte b accum remaining))
|
||||
(cond
|
||||
[(fx= got-chars 1)
|
||||
(define actually-used-bytes (fx+ skip-k used-bytes))
|
||||
(cond
|
||||
[(fx= actually-used-bytes 0) (void)]
|
||||
[(fx= actually-used-bytes 1) (do-read-byte who read-byte in)]
|
||||
[else
|
||||
(define finish-bstr (if (actually-used-bytes . fx<= . (bytes-length bstr))
|
||||
bstr
|
||||
(make-bytes actually-used-bytes)))
|
||||
(do-read-bytes! who in finish-bstr 0 actually-used-bytes)])
|
||||
(string-ref str 0)]
|
||||
[(eq? state 'complete)
|
||||
;; Consume used bytes
|
||||
(let loop ([skip-k skip-k])
|
||||
(do-read-byte who read-byte in)
|
||||
(unless (fx= 0 skip-k)
|
||||
(loop (fx- skip-k 1))))
|
||||
(integer->char next-accum)]
|
||||
[(eq? state 'error)
|
||||
#\uFFFD]
|
||||
[else
|
||||
(loop (fx+ skip-k 1) new-state)])]))])])]))
|
||||
(loop (fx+ 1 skip-k) next-accum next-remaining)])]))])])]))
|
||||
|
||||
;; ----------------------------------------
|
||||
|
||||
|
|
|
@ -3,7 +3,9 @@
|
|||
|
||||
(provide utf-8-decode!
|
||||
utf-8-max-aborts-amt
|
||||
|
||||
|
||||
utf-8-decode-byte
|
||||
|
||||
utf-8-state?
|
||||
utf-8-state-pending-amt)
|
||||
|
||||
|
@ -57,6 +59,21 @@
|
|||
;; Iterate through the given byte string
|
||||
(let loop ([i in-start] [j out-start] [base-i base-i] [accum accum] [remaining remaining])
|
||||
|
||||
;; Shared handling for success:
|
||||
(define (complete accum)
|
||||
(when out-str (string-set! out-str j (integer->char accum)))
|
||||
(define next-j (fx+ j 1))
|
||||
(define next-i (fx+ i 1))
|
||||
(cond
|
||||
[(and out-end (fx= next-j out-end))
|
||||
(values (fx- next-i in-start)
|
||||
(fx- next-j out-start)
|
||||
(if (fx= next-i in-end)
|
||||
'complete
|
||||
'continues))]
|
||||
[else
|
||||
(loop next-i next-j next-i 0 0)]))
|
||||
|
||||
;; Shared handling for encoding failures:
|
||||
(define (encoding-failure)
|
||||
(cond
|
||||
|
@ -76,20 +93,6 @@
|
|||
(fx- j out-start)
|
||||
'error)]))
|
||||
|
||||
;; Shared handling for decoding success:
|
||||
(define (continue)
|
||||
(define next-j (fx+ j 1))
|
||||
(define next-i (fx+ i 1))
|
||||
(cond
|
||||
[(and out-end (fx= next-j out-end))
|
||||
(values (fx- next-i in-start)
|
||||
(fx- next-j out-start)
|
||||
(if (fx= next-i in-end)
|
||||
'complete
|
||||
'continues))]
|
||||
[else
|
||||
(loop next-i next-j next-i 0 0)]))
|
||||
|
||||
;; Dispatch on byte:
|
||||
(cond
|
||||
[(fx= i in-end)
|
||||
|
@ -116,74 +119,101 @@
|
|||
(encoding-failure)]
|
||||
[else
|
||||
(define b (bytes-ref in-bstr i))
|
||||
(cond
|
||||
[(b . fx< . 128)
|
||||
(cond
|
||||
[(fx= remaining 0)
|
||||
;; Found ASCII
|
||||
(when out-str (string-set! out-str j (integer->char b)))
|
||||
(continue)]
|
||||
[else
|
||||
;; We were accumulating bytes for an encoding, and
|
||||
;; the encoding didn't complete
|
||||
(encoding-failure)])]
|
||||
(utf-8-decode-byte/inline b accum remaining
|
||||
complete
|
||||
(lambda (accum remaining)
|
||||
(loop (fx+ i 1) j i accum remaining))
|
||||
(lambda (accum remaining)
|
||||
(loop (fx+ i 1) j base-i accum remaining))
|
||||
encoding-failure)])))
|
||||
|
||||
(define-syntax-rule (utf-8-decode-byte/inline b accum remaining
|
||||
complete-k
|
||||
init-continue-k
|
||||
next-continue-k
|
||||
error-k)
|
||||
(cond
|
||||
[(b . fx< . 128)
|
||||
(cond
|
||||
[(fx= remaining 0)
|
||||
;; Found ASCII
|
||||
(complete-k b)]
|
||||
[else
|
||||
;; Encoding...
|
||||
;; We were accumulating bytes for an encoding, and
|
||||
;; the encoding didn't complete
|
||||
(error-k)])]
|
||||
[else
|
||||
;; Encoding...
|
||||
(cond
|
||||
[(fx= #b10000000 (fxand b #b11000000))
|
||||
;; A continuation byte
|
||||
(cond
|
||||
[(fx= #b10000000 (fxand b #b11000000))
|
||||
;; A continuation byte
|
||||
(cond
|
||||
[(fx= remaining 0)
|
||||
;; We weren't continuing
|
||||
(encoding-failure)]
|
||||
[else
|
||||
(define next (fxand b #b00111111))
|
||||
(define next-accum (fxior (fxlshift accum 6) next))
|
||||
(cond
|
||||
[(fx= 1 remaining)
|
||||
(cond
|
||||
[(and (next-accum . fx> . 127)
|
||||
(next-accum . fx<= . #x10FFFF)
|
||||
(not (and (next-accum . fx>= . #xD800)
|
||||
(next-accum . fx<= . #xDFFF))))
|
||||
(when out-str (string-set! out-str j (integer->char next-accum)))
|
||||
(continue)]
|
||||
[else
|
||||
;; Not a valid character
|
||||
(encoding-failure)])]
|
||||
[(and (fx= 2 remaining)
|
||||
(next-accum . fx<= . #b11111))
|
||||
;; A shorter byte sequence would work, so this is an
|
||||
;; encoding mistae.
|
||||
(encoding-failure)]
|
||||
[(and (fx= 3 remaining)
|
||||
(next-accum . fx<= . #b1111))
|
||||
;; A shorter byte sequence would work
|
||||
(encoding-failure)]
|
||||
[else
|
||||
;; Continue an encoding.
|
||||
(loop (fx+ i 1) j base-i next-accum (fx- remaining 1))])])]
|
||||
[(not (fx= remaining 0))
|
||||
;; Trying to start a new encoding while one is in
|
||||
;; progress
|
||||
(encoding-failure)]
|
||||
[(fx= #b11000000 (fxand b #b11100000))
|
||||
;; Start a two-byte encoding
|
||||
(define accum (fxand b #b11111))
|
||||
;; If `accum` is zero, that's an encoding mistake,
|
||||
;; because a shorted byte sequence would work.
|
||||
[(fx= remaining 0)
|
||||
;; We weren't continuing
|
||||
(error-k)]
|
||||
[else
|
||||
(define next (fxand b #b00111111))
|
||||
(define next-accum (fxior (fxlshift accum 6) next))
|
||||
(cond
|
||||
[(fx= accum 0) (encoding-failure)]
|
||||
[else (loop (fx+ i 1) j i accum 1)])]
|
||||
[(fx= #b11100000 (fxand b #b11110000))
|
||||
;; Start a three-byte encoding
|
||||
(define accum (fxand b #b1111))
|
||||
(loop (fx+ i 1) j i accum 2)]
|
||||
[(fx= #b11110000 (fxand b #b11111000))
|
||||
;; Start a four-byte encoding
|
||||
(define accum (fxand b #b111))
|
||||
(loop (fx+ i 1) j i accum 3)]
|
||||
[else
|
||||
;; Five- or six-byte encodings don't produce valid
|
||||
;; characters
|
||||
(encoding-failure)])])])))
|
||||
[(fx= 1 remaining)
|
||||
(cond
|
||||
[(and (next-accum . fx> . 127)
|
||||
(next-accum . fx<= . #x10FFFF)
|
||||
(not (and (next-accum . fx>= . #xD800)
|
||||
(next-accum . fx<= . #xDFFF))))
|
||||
(complete-k next-accum)]
|
||||
[else
|
||||
;; Not a valid character
|
||||
(error-k)])]
|
||||
[(and (fx= 2 remaining)
|
||||
(next-accum . fx<= . #b11111))
|
||||
;; A shorter byte sequence would work, so this is an
|
||||
;; encoding mistae.
|
||||
(error-k)]
|
||||
[(and (fx= 3 remaining)
|
||||
(next-accum . fx<= . #b1111))
|
||||
;; A shorter byte sequence would work
|
||||
(error-k)]
|
||||
[else
|
||||
;; Continue an encoding
|
||||
(next-continue-k next-accum (fx- remaining 1))])])]
|
||||
[(not (fx= remaining 0))
|
||||
;; Trying to start a new encoding while one is in
|
||||
;; progress
|
||||
(error-k)]
|
||||
[(fx= #b11000000 (fxand b #b11100000))
|
||||
;; Start a two-byte encoding
|
||||
(define accum (fxand b #b11111))
|
||||
;; If `accum` is zero, that's an encoding mistake,
|
||||
;; because a shorter byte sequence would work.
|
||||
(cond
|
||||
[(fx= accum 0) (error-k)]
|
||||
[else (init-continue-k accum 1)])]
|
||||
[(fx= #b11100000 (fxand b #b11110000))
|
||||
;; Start a three-byte encoding
|
||||
(define accum (fxand b #b1111))
|
||||
(init-continue-k accum 2)]
|
||||
[(fx= #b11110000 (fxand b #b11111000))
|
||||
;; Start a four-byte encoding
|
||||
(define accum (fxand b #b111))
|
||||
(init-continue-k accum 3)]
|
||||
[else
|
||||
;; Five- or six-byte encodings don't produce valid
|
||||
;; characters
|
||||
(error-k)])]))
|
||||
|
||||
;; Takes a byte and a decoding state and returns
|
||||
;; one of
|
||||
;; - (values code-point 0 'complete)
|
||||
;; - (values #f 0 'error)
|
||||
;; - (values new-accum new-remaining 'continues)
|
||||
(define (utf-8-decode-byte b accum remaining)
|
||||
(utf-8-decode-byte/inline b accum remaining
|
||||
(lambda (accum)
|
||||
(values accum 0 'complete))
|
||||
(lambda (accum remaining)
|
||||
(values accum remaining 'continues))
|
||||
(lambda (accum remaining)
|
||||
(values accum remaining 'continues))
|
||||
(lambda ()
|
||||
(values #f 0 'error))))
|
||||
|
|
Loading…
Reference in New Issue
Block a user