From d197e1b8f4e71f8b6bb57b3c59be9ca2a13745f8 Mon Sep 17 00:00:00 2001 From: Matthew Flatt Date: Wed, 13 Feb 2019 07:38:18 -0700 Subject: [PATCH] io: adjust `read-[bytes-]line` to use direct buffer --- racket/src/io/port/line-input.rkt | 76 +++++++++++++++------------ racket/src/io/port/read-and-peek.rkt | 47 ++++++++++++++++- racket/src/io/string/convert.rkt | 25 ++------- racket/src/io/string/utf-8-decode.rkt | 39 +++++++++++++- 4 files changed, 129 insertions(+), 58 deletions(-) diff --git a/racket/src/io/port/line-input.rkt b/racket/src/io/port/line-input.rkt index f473232fb1..799f0a7225 100644 --- a/racket/src/io/port/line-input.rkt +++ b/racket/src/io/port/line-input.rkt @@ -19,7 +19,7 @@ make-string string-set! string-copy! substring read-a-char peek-a-char - as-char) + as-char direct-string?) (define/who (read-line [orig-in (current-input-port)] [mode 'linefeed]) (define in (->core-input-port orig-in who)) (check who ok-mode? #:contract ok-mode-str mode) @@ -27,47 +27,53 @@ (define cr? (case mode [(return any any-one) #t] [else #f])) (define lf? (case mode [(linefeed any any-one) #t] [else #f])) (define crlf? (case mode [(return-linefeed any) #t] [else #f])) - (define init-len 32) - (let loop ([str (make-string init-len)] [len init-len] [pos 0]) - (define ch (read-a-char 'read-line in)) - (define (keep-char) - (if (pos . fx< . len) - (begin - (string-set! str pos ch) - (loop str len (fx+ pos 1))) - (let* ([new-len (fx* len 2)] - [new-str (make-string new-len)]) - (string-copy! new-str 0 str 0) - (string-set! new-str pos ch) - (loop new-str new-len (fx+ pos 1))))) - (cond - [(eof-object? ch) - (if (fx= pos 0) - eof - (substring str 0 pos))] - [(and (or cr? crlf?) - (eqv? ch (as-char #\return))) - (cond - [(and crlf? - (eqv? (peek-a-char 'read-line in 0) (as-char #\linefeed))) - (read-a-char 'read-line in) - (substring str 0 pos)] - [cr? - (substring str 0 pos)] - [else (keep-char)])] - [(and lf? - (eqv? ch (as-char #\newline))) - (substring str 0 pos)] - [else (keep-char)])))) + (cond + [(maybe-read-a-line in cr? lf? crlf? direct-string?) + => (lambda (r) r)] + [else + (define init-len 32) + (let loop ([str (make-string init-len)] [len init-len] [pos 0]) + (define ch (read-a-char 'read-line in)) + (define (keep-char) + (if (pos . fx< . len) + (begin + (string-set! str pos ch) + (loop str len (fx+ pos 1))) + (let* ([new-len (fx* len 2)] + [new-str (make-string new-len)]) + (string-copy! new-str 0 str 0) + (string-set! new-str pos ch) + (loop new-str new-len (fx+ pos 1))))) + (cond + [(eof-object? ch) + (if (fx= pos 0) + eof + (substring str 0 pos))] + [(and (or cr? crlf?) + (eqv? ch (as-char #\return))) + (cond + [(and crlf? + (eqv? (peek-a-char 'read-line in 0) (as-char #\linefeed))) + (read-a-char 'read-line in) + (substring str 0 pos)] + [cr? + (substring str 0 pos)] + [else (keep-char)])] + [(and lf? + (eqv? ch (as-char #\newline))) + (substring str 0 pos)] + [else (keep-char)]))]))) (define-read-line read-line make-string string-set! string-copy! substring read-a-char peek-a-char - values) + values + #t) (define-read-line read-bytes-line make-bytes bytes-set! bytes-copy! subbytes read-a-byte peek-a-byte - char->integer) + char->integer + #f) diff --git a/racket/src/io/port/read-and-peek.rkt b/racket/src/io/port/read-and-peek.rkt index 375edbdfa9..988c9ec21a 100644 --- a/racket/src/io/port/read-and-peek.rkt +++ b/racket/src/io/port/read-and-peek.rkt @@ -3,6 +3,7 @@ "../common/internal-error.rkt" "../common/class.rkt" "../host/thread.rkt" + "../string/utf-8-decode.rkt" "port.rkt" "input-port.rkt" "count.rkt" @@ -15,7 +16,9 @@ read-a-byte read-byte-via-bytes peek-a-byte - peek-byte-via-bytes) + peek-byte-via-bytes + + maybe-read-a-line) ;; Read up to `(- end start)` bytes, producing at least a ;; single by unless `zero-ok?` is true. The result is @@ -259,3 +262,45 @@ (if (eq? v 1) (bytes-ref bstr 0) v)) + +;; ---------------------------------------- + +;; Tries to read a line from the fast-path buffer +(define (maybe-read-a-line in cr? lf? crlf? as-string?) + (start-atomic) + (define buffer (core-port-buffer in)) + (define bstr (direct-bstr buffer)) + (define pos (direct-pos buffer)) + (define end (fxmin (direct-end buffer) + ;; limit atomicity + (fx+ pos 4096))) + (define (finish end read-end) + (set-direct-pos! buffer read-end) + (when (core-port-count in) + (port-count! in (fx- read-end pos) bstr pos)) + (define result + (if as-string? + (a-bytes->string/utf-8 bstr pos end) + (subbytes bstr pos end))) + (end-atomic) + result) + (let loop ([i pos]) + (cond + [(fx= i end) + (end-atomic) + #f] + [else + (define b (bytes-ref bstr i)) + (cond + [(and lf? (eqv? b (char->integer #\linefeed))) + (finish i (fx+ i 1))] + [(and (or cr? crlf?) (eqv? b (char->integer #\return))) + (cond + [(and crlf? + ((fx+ i 1) . fx< . end) + (eqv? (bytes-ref bstr (fx+ i 1)) (char->integer #\linefeed))) + (finish i (fx+ i 2))] + [cr? + (finish i (fx+ i 1))] + [else (loop (fx+ i 1))])] + [else (loop (fx+ i 1))])]))) diff --git a/racket/src/io/string/convert.rkt b/racket/src/io/string/convert.rkt index 082154b3e5..df61d52d3e 100644 --- a/racket/src/io/string/convert.rkt +++ b/racket/src/io/string/convert.rkt @@ -40,25 +40,10 @@ (check who exact-nonnegative-integer? start) (check who exact-nonnegative-integer? end) (check-range who start end (bytes-length bstr) bstr) - ;; Measure result string: - (define-values (used-bytes got-chars state) - (utf-8-decode! bstr start end - #f 0 #f - #:error-char err-char - #:abort-mode 'error)) - (cond - [(eq? state 'error) (if just-length? - #f - (raise-encoding-error who bstr start end))] - [just-length? got-chars] - [else - ;; Create result string: - (define str (make-string got-chars)) - (utf-8-decode! bstr start end - str 0 #f - #:error-char err-char - #:abort-mode 'error) - str])) + (or (a-bytes->string/utf-8 bstr start end err-char #:just-length? just-length?) + (if just-length? + #f + (raise-encoding-error who bstr start end)))) (define/who (bytes->string/utf-8 bstr [err-char #f] [start 0] [end (and (bytes? bstr) (bytes-length bstr))]) @@ -86,7 +71,7 @@ (check-range who start end (bytes-length bstr) bstr) ;; First, decode `skip` items: (define-values (initial-used-bytes initial-got-chars state) - (if (zero? skip) + (if (eqv? skip 0) (values 0 0 (if (= start end) 'complete 'continues)) (utf-8-decode! bstr start end #f 0 skip diff --git a/racket/src/io/string/utf-8-decode.rkt b/racket/src/io/string/utf-8-decode.rkt index 5034e5f507..c40b4948ca 100644 --- a/racket/src/io/string/utf-8-decode.rkt +++ b/racket/src/io/string/utf-8-decode.rkt @@ -7,7 +7,9 @@ utf-8-decode-byte utf-8-state? - utf-8-state-pending-amt) + utf-8-state-pending-amt + + a-bytes->string/utf-8) ;; The maximum number of characters that might not be consumed ;; by a conversion at the tail of a byte string, assuming that @@ -23,7 +25,7 @@ ;; and further decoding reveals that earlier bytes were in error. ;; ;; The `abort-mode` argument determines what to do when reaching the end of the input -;; and an encoding needs more ytes: +;; and an encoding needs more bytes: ;; * 'error : treat the bytes as encoding errors ;; * 'aborts : report 'aborts ;; * 'state : return a value that encapsulates the state, so another call can continue @@ -217,3 +219,36 @@ (values accum remaining 'continues)) (lambda () (values #f 0 'error)))) + +;; ---------------------------------------- + +(define (a-bytes->string/utf-8 bstr start end [err-char #\uFFFD] #:just-length? [just-length? #f]) + ;; Shortcut for all-ASCII: + (cond + [(for/and ([i (in-range start end)]) + ((bytes-ref bstr i) . fx< . 128)) + (cond + [just-length? (fx- end start)] + [else + (define str (make-string (fx- end start))) + (for ([i (in-range start end)]) + (string-set! str (fx- i start) (integer->char (bytes-ref bstr i)))) + str])] + [else + ;; Measure result string: + (define-values (used-bytes got-chars state) + (utf-8-decode! bstr start end + #f 0 #f + #:error-char err-char + #:abort-mode 'error)) + (cond + [(eq? state 'error) #f] + [just-length? got-chars] + [else + ;; Create result string: + (define str (make-string got-chars)) + (utf-8-decode! bstr start end + str 0 #f + #:error-char err-char + #:abort-mode 'error) + str])]))