From ef43f26f6240985385b7b62e539950b542bdda51 Mon Sep 17 00:00:00 2001 From: Matthew Flatt Date: Mon, 3 May 2021 07:25:22 -0600 Subject: [PATCH] net/base64: speed up encoding and decoding, especially on byte strings Base64 encoding and decoding are common and simple enough to specialize for byte strings, where they can be much faster. --- pkgs/net-doc/net/scribblings/base64.scrbl | 21 ++- pkgs/net-test/tests/net/base64.rkt | 44 ++++- racket/collects/net/base64.rkt | 218 +++++++++++++++++----- 3 files changed, 218 insertions(+), 65 deletions(-) diff --git a/pkgs/net-doc/net/scribblings/base64.scrbl b/pkgs/net-doc/net/scribblings/base64.scrbl index a3a8920558..31a2a582c1 100644 --- a/pkgs/net-doc/net/scribblings/base64.scrbl +++ b/pkgs/net-doc/net/scribblings/base64.scrbl @@ -8,13 +8,17 @@ utilities for Base 64 (MIME-standard) encoding and decoding.} @section[#:tag "base64-procs"]{Functions} -@defproc[(base64-encode [bstr bytes?] [newline-bstr bytes? #"\r\n"]) bytes?]{ +@defproc[(base64-encode [bstr bytes?] [newline any/c #"\r\n"]) bytes?]{ Consumes a byte string and returns its Base 64 encoding as a new byte string. The returned string is broken into 72-byte lines separated by -@racket[newline-bstr], which defaults to a CRLF combination, and the +@racket[newline], which defaults to a CRLF combination, and the result always ends with a @racket[newline-bstr] unless the -input is empty.} +input is empty. + +Although @racket[newline] is intended to be a byte string, it can be +any value (possibly with a performance penalty), and it is converted +to a byte string using @racket[display].} @defproc[(base64-decode [bstr bytes?]) bytes?]{ @@ -25,15 +29,18 @@ string.} @defproc[(base64-encode-stream [in input-port?] [out output-port?] - [newline-bstr bytes? #"\n"]) + [newline any/c #"\n"]) void?]{ Reads bytes from @racket[in] and writes the encoded result to @racket[out], breaking the output into 72-character lines separated by -@racket[newline-bstr], and ending with @racket[newline-bstr] unless -the input stream is empty. Note that the default @racket[newline-bstr] +@racket[newline], and ending with @racket[newline] unless +the input stream is empty. Note that the default @racket[newline] is just @racket[#"\n"], not @racket[#"\r\n"]. The procedure returns when -it encounters an end-of-file from @racket[in].} +it encounters an end-of-file from @racket[in]. + +Although @racket[newline] is intended to be a byte string, it can be +any value, and it is written using @racket[display].} @defproc[(base64-decode-stream [in input-port?] [out output-port?]) diff --git a/pkgs/net-test/tests/net/base64.rkt b/pkgs/net-test/tests/net/base64.rkt index a2a9a78506..215bf4f86c 100644 --- a/pkgs/net-test/tests/net/base64.rkt +++ b/pkgs/net-test/tests/net/base64.rkt @@ -2,12 +2,40 @@ (require net/base64) (define (test expect f . args) - (unless (equal? expect (apply f args)) - (error "fail"))) + (define got (apply f args)) + (unless (equal? expect got) + (error "fail" expect got))) -(test #"" base64-encode #"") -(test #"" base64-encode #"" #"<>") -(test #"V2h5IGRvIGF4ZSBtdXJkZXJlcnMgb25seSBhdHRhY2sKV2hlbiB5b3UncmUgcGFydGlhbGx5\r\nIG51ZGUKT3IgeW91J3JlIHRha2luZyBhIGJhdGg=\r\n" - base64-encode #"Why do axe murderers only attack\nWhen you're partially nude\nOr you're taking a bath") -(test #"V2h5IGRvIGF4ZSBtdXJkZXJlcnMgb25seSBhdHRhY2sKV2hlbiB5b3UncmUgcGFydGlhbGx5<>IG51ZGUKT3IgeW91J3JlIHRha2luZyBhIGJhdGg=<>" - base64-encode #"Why do axe murderers only attack\nWhen you're partially nude\nOr you're taking a bath" #"<>") +(for ([base64-encode (list base64-encode + (lambda (bstr [linesep #"\r\n"]) + (define out (open-output-bytes)) + (base64-encode-stream (open-input-bytes bstr) out linesep) + (get-output-bytes out)))] + [base64-decode (list base64-decode + (lambda (bstr) + (define out (open-output-bytes)) + (base64-decode-stream (open-input-bytes bstr) out) + (get-output-bytes out)))]) + (test #"" base64-encode #"") + (test #"" base64-encode #"" #"<>") + (test #"" base64-decode #"") + (test #"" base64-decode #"a") + (test #"i" base64-decode #"ab") + (test #"i\267" base64-decode #"abc") + (test #"i\267" base64-decode #"abc=d") + (test #"eA==\n" base64-encode #"x" #"\n") + (test #"eHk=\n" base64-encode #"xy" #"\n") + (test #"eHl6\n" base64-encode #"xyz" #"\n") + (test #"eHh4\n" base64-encode #"xxx" #"\n") + (test #"eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4\n" + base64-encode #"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" #"\n") + (test #"eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4\neA==\n" + base64-encode #"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" #"\n") + (test #"V2h5IGRvIGF4ZSBtdXJkZXJlcnMgb25seSBhdHRhY2sKV2hlbiB5b3UncmUgcGFydGlhbGx5\r\nIG51ZGUKT3IgeW91J3JlIHRha2luZyBhIGJhdGg=\r\n" + base64-encode #"Why do axe murderers only attack\nWhen you're partially nude\nOr you're taking a bath") + (test #"V2h5IGRvIGF4ZSBtdXJkZXJlcnMgb25seSBhdHRhY2sKV2hlbiB5b3UncmUgcGFydGlhbGx5<>IG51ZGUKT3IgeW91J3JlIHRha2luZyBhIGJhdGg=<>" + base64-encode #"Why do axe murderers only attack\nWhen you're partially nude\nOr you're taking a bath" #"<>") + (test #"V2h5IGRvIGF4ZSBtdXJkZXJlcnMgb25seSBhdHRhY2sKV2hlbiB5b3UncmUgcGFydGlhbGx5_X_IG51ZGUKT3IgeW91J3JlIHRha2luZyBhIGJhdGg=_X_" + base64-encode #"Why do axe murderers only attack\nWhen you're partially nude\nOr you're taking a bath" '_X_) + (test #"Why do axe murderers only attack\nWhen you're partially nude\nOr you're taking a bath" + base64-decode #"V2h5IGRvIGF4ZSBtdXJkZXJlcnMgb25seSBhdHRhY2sKV2hlbiB5b3UncmUgcGFydGlhbGx5\r\nIG51ZGUKT3IgeW91J3JlIHRha2luZyBhIGJhdGg=\r\n")) diff --git a/racket/collects/net/base64.rkt b/racket/collects/net/base64.rkt index 04ee7c855c..c3adb8dc1e 100644 --- a/racket/collects/net/base64.rkt +++ b/racket/collects/net/base64.rkt @@ -1,4 +1,9 @@ #lang racket/base +(require racket/fixnum) + +;; Unsafe mode can be worth a factor of 2 to 4 +;; on byte-string encoding/decoding: +(#%declare #:unsafe) (provide base64-encode-stream base64-decode-stream @@ -16,66 +21,179 @@ (vector-set! db n i)) (values (vector->immutable-vector bd) (vector->immutable-vector db)))) -(define =byte (bytes-ref #"=" 0)) -(define ones - (vector->immutable-vector - (list->vector (for/list ([i (in-range 9)]) (sub1 (arithmetic-shift 1 i)))))) - (define (base64-decode-stream in out) + (unless (input-port? in) (raise-argument-error 'base64-decode-stream "input-port?" in)) + (unless (output-port? out) (raise-argument-error 'base64-decode-stream "output-port?" out)) (let loop ([data 0] [bits 0]) - (if (>= bits 8) - (let ([bits (- bits 8)]) - (write-byte (arithmetic-shift data (- bits)) out) - (loop (bitwise-and data (vector*-ref ones bits)) bits)) - (let ([c (read-byte in)]) - (unless (or (eof-object? c) (eq? c =byte)) - (let ([v (vector*-ref base64-digit c)]) - (if v - (loop (+ (arithmetic-shift data 6) v) (+ bits 6)) - (loop data bits)))))))) + (cond + [(bits . fx>= . 8) + (let ([bits (fx- bits 8)]) + (write-byte (fxrshift data bits) out) + (loop (fxand data (fx- (fxlshift 1 bits) 1)) bits))] + [else + (define c (read-byte in)) + (unless (or (eof-object? c) (fx= c (char->integer #\=))) + (let ([v (vector*-ref base64-digit c)]) + (if v + (loop (fx+ (fxlshift data 6) v) (fx+ bits 6)) + (loop data bits))))]))) (define (base64-encode-stream in out [linesep #"\n"]) - ;; each set of three input bytes turns into four output bytes + (unless (input-port? in) (raise-argument-error 'base64-encode-stream "input-port?" in)) + (unless (output-port? out) (raise-argument-error 'base64-encode-stream "output-port?" out)) + ;; Each set of three input bytes turns into four output bytes. + ;; It might be nice to actually write the bytes as a 4-byte string, + ;; but this way preserves fine-grained streaming. (define (o! c) (write-byte (vector*-ref digit-base64 c) out)) (define (fill!) (write-byte (char->integer #\=) out)) (define (line!) (display linesep out)) (let loop ([width 0]) (define b1 (read-byte in)) - (unless (eof-object? b1) - (let ([width (if (eqv? width 72) - (begin - (display linesep out) - 0) - width)]) - (o! (arithmetic-shift b1 -2)) - (define b2 (read-byte in)) - (cond - [(eof-object? b2) - (o! (arithmetic-shift (bitwise-and b1 #b11) 4)) - (fill!) - (fill!) - (line!)] - [else - (o! (bitwise-ior (arithmetic-shift (bitwise-and b1 #b11) 4) - (arithmetic-shift b2 -4))) - (define b3 (read-byte in)) - (cond - [(eof-object? b3) - (o! (arithmetic-shift (bitwise-and b2 #b1111) 2)) - (fill!) - (line!)] - [else - (o! (bitwise-ior (arithmetic-shift (bitwise-and b2 #b1111) 2) - (arithmetic-shift b3 -6))) - (o! (bitwise-and b3 #b111111)) - (loop (+ width 4))])]))))) + (cond + [(eof-object? b1) + (unless (eqv? width 0) + (line!))] + [else + (o! (fxrshift b1 2)) + (define b2 (read-byte in)) + (cond + [(eof-object? b2) + (o! (fxlshift (fxand b1 #b11) 4)) + (fill!) + (fill!) + (line!)] + [else + (o! (fxior (fxlshift (fxand b1 #b11) 4) + (fxrshift b2 4))) + (define b3 (read-byte in)) + (cond + [(eof-object? b3) + (o! (fxlshift (fxand b2 #b1111) 2)) + (fill!) + (line!)] + [else + (o! (fxior (fxlshift (fxand b2 #b1111) 2) + (fxrshift b3 6))) + (o! (fxand b3 #b111111)) + (let ([width (if (eqv? width 68) + (begin + (display linesep out) + 0) + (fx+ width 4))]) + (loop width))])])]))) + +;; ---------------------------------------- (define (base64-decode src) - (let ([s (open-output-bytes)]) - (base64-decode-stream (open-input-bytes src) s) - (get-output-bytes s))) + (unless (bytes? src) (raise-argument-error 'base64-decode "bytes?" src)) + ;; Loop through bytes to handle non-encoding characters and stop at `=` + (define-values (src-len in-len) + (let loop ([i 0] [len 0]) + (cond + [(fx= i (bytes-length src)) (values i len)] + [else + (define c (bytes-ref src i)) + (cond + [(fx= c (char->integer #\=)) (values i len)] + [(vector*-ref base64-digit c) (loop (fx+ i 1) (fx+ len 1))] + [else (loop (fx+ i 1) len)])]))) + (define out-len (fx+ (fx* (fxrshift in-len 2) 3) + (fxmax 0 (fx- (fxand in-len 3) 1)))) + (define out (make-bytes out-len)) + (let loop1 ([i 0] [j 0]) + (unless (fx= i src-len) + (define c1 (bytes-ref src i)) + (let ([v1 (vector*-ref base64-digit c1)] + [i (fx+ i 1)]) + (cond + [(not v1) (loop1 i j)] + [else + (let loop2 ([i i] [j j]) + (unless (fx= i src-len) + (define c2 (bytes-ref src i)) + (let ([v2 (vector*-ref base64-digit c2)] + [i (fx+ i 1)]) + (cond + [(not v2) (loop2 i j)] + [else + (bytes-set! out j (fxior (fxlshift v1 2) + (fxrshift v2 4))) + (let loop3 ([i i] [j (fx+ j 1)]) + (unless (fx= i src-len) + (define c3 (bytes-ref src i)) + (let ([v3 (vector*-ref base64-digit c3)] + [i (fx+ i 1)]) + (cond + [(not v3) (loop3 i j)] + [else + (bytes-set! out j (fxior (fxlshift (fxand v2 #b1111) 4) + (fxrshift v3 2))) + (let loop4 ([i i] [j (fx+ j 1)]) + (unless (fx= i src-len) + (define c4 (bytes-ref src i)) + (let ([v4 (vector*-ref base64-digit c4)] + [i (fx+ i 1)]) + (cond + [(not v4) (loop4 i j)] + [else + (bytes-set! out j (fxior (fxlshift (fxand v3 #b11) 6) + v4)) + (loop1 i (fx+ j 1))]))))]))))]))))])))) + out) (define (base64-encode src [linesep #"\r\n"]) - (let ([s (open-output-bytes)]) - (base64-encode-stream (open-input-bytes src) s linesep) - (get-output-bytes s))) + (unless (bytes? src) (raise-argument-error 'base64-encode "bytes?" src)) + (cond + [(and (bytes? src) (bytes? linesep)) + (define in-len (bytes-length src)) + (cond + [(eqv? 0 in-len) #""] + [else + (define out-payload-len (fx* (fxquotient (fx+ in-len 2) 3) 4)) + (define out-len (fx+ out-payload-len + (fx* (fxquotient (fx+ out-payload-len 71) 72) + (bytes-length linesep)))) + (define out (make-bytes out-len (char->integer #\=))) + (define (out! j c) (bytes-set! out j (vector*-ref digit-base64 c))) + (let loop ([i 0] [j 0] [width 0]) + (cond + [((fx+ i 3) . fx<= . in-len) + (define b1 (bytes-ref src i)) + (define b2 (bytes-ref src (fx+ i 1))) + (define b3 (bytes-ref src (fx+ i 2))) + (out! j (fxrshift b1 2)) + (out! (fx+ j 1) (fxior (fxlshift (fxand b1 #b11) 4) + (fxrshift b2 4))) + (out! (fx+ j 2) (fxior (fxlshift (fxand b2 #b1111) 2) + (fxrshift b3 6))) + (out! (fx+ j 3) (fxand b3 #b111111)) + (let ([width (fx+ width 4)] + [i (fx+ i 3)] + [j (fx+ j 4)]) + (cond + [(and (eqv? width 72) + (i . fx< . in-len)) + (bytes-copy! out j linesep) + (loop i (fx+ j (bytes-length linesep)) 0)] + [else + (loop i j width)]))] + [((fx+ i 2) . fx<= . in-len) + (define b1 (bytes-ref src i)) + (define b2 (bytes-ref src (fx+ i 1))) + (out! j (fxrshift b1 2)) + (out! (fx+ j 1) (fxior (fxlshift (fxand b1 #b11) 4) + (fxrshift b2 4))) + (out! (fx+ j 2) (fxlshift (fxand b2 #b1111) 2)) + (bytes-copy! out (fx+ j 4) linesep)] + [((fx+ i 1) . fx<= . in-len) + (define b1 (bytes-ref src i)) + (out! j (fxrshift b1 2)) + (out! (fx+ j 1) (fxlshift (fxand b1 #b11) 4)) + (bytes-copy! out (fx+ j 4) linesep)] + [else + (bytes-copy! out j linesep)])) + out])] + [else + (let ([s (open-output-bytes)]) + (base64-encode-stream (open-input-bytes src) s linesep) + (get-output-bytes s))]))