net/base64: speed up encoding and decoding, especially on byte strings

Base64 encoding and decoding are common and simple enough to
specialize for byte strings, where they can be much faster.
This commit is contained in:
Matthew Flatt 2021-05-03 07:25:22 -06:00
parent d20292b80c
commit ef43f26f62
3 changed files with 218 additions and 65 deletions

View File

@ -8,13 +8,17 @@ utilities for Base 64 (MIME-standard) encoding and decoding.}
@section[#:tag "base64-procs"]{Functions}
@defproc[(base64-encode [bstr bytes?] [newline-bstr bytes? #"\r\n"]) bytes?]{
@defproc[(base64-encode [bstr bytes?] [newline any/c #"\r\n"]) bytes?]{
Consumes a byte string and returns its Base 64 encoding as a new byte
string. The returned string is broken into 72-byte lines separated by
@racket[newline-bstr], which defaults to a CRLF combination, and the
@racket[newline], which defaults to a CRLF combination, and the
result always ends with a @racket[newline-bstr] unless the
input is empty.}
input is empty.
Although @racket[newline] is intended to be a byte string, it can be
any value (possibly with a performance penalty), and it is converted
to a byte string using @racket[display].}
@defproc[(base64-decode [bstr bytes?]) bytes?]{
@ -25,15 +29,18 @@ string.}
@defproc[(base64-encode-stream [in input-port?]
[out output-port?]
[newline-bstr bytes? #"\n"])
[newline any/c #"\n"])
void?]{
Reads bytes from @racket[in] and writes the encoded result to
@racket[out], breaking the output into 72-character lines separated by
@racket[newline-bstr], and ending with @racket[newline-bstr] unless
the input stream is empty. Note that the default @racket[newline-bstr]
@racket[newline], and ending with @racket[newline] unless
the input stream is empty. Note that the default @racket[newline]
is just @racket[#"\n"], not @racket[#"\r\n"]. The procedure returns when
it encounters an end-of-file from @racket[in].}
it encounters an end-of-file from @racket[in].
Although @racket[newline] is intended to be a byte string, it can be
any value, and it is written using @racket[display].}
@defproc[(base64-decode-stream [in input-port?]
[out output-port?])

View File

@ -2,12 +2,40 @@
(require net/base64)
(define (test expect f . args)
(unless (equal? expect (apply f args))
(error "fail")))
(define got (apply f args))
(unless (equal? expect got)
(error "fail" expect got)))
(for ([base64-encode (list base64-encode
(lambda (bstr [linesep #"\r\n"])
(define out (open-output-bytes))
(base64-encode-stream (open-input-bytes bstr) out linesep)
(get-output-bytes out)))]
[base64-decode (list base64-decode
(lambda (bstr)
(define out (open-output-bytes))
(base64-decode-stream (open-input-bytes bstr) out)
(get-output-bytes out)))])
(test #"" base64-encode #"")
(test #"" base64-encode #"" #"<>")
(test #"" base64-decode #"")
(test #"" base64-decode #"a")
(test #"i" base64-decode #"ab")
(test #"i\267" base64-decode #"abc")
(test #"i\267" base64-decode #"abc=d")
(test #"eA==\n" base64-encode #"x" #"\n")
(test #"eHk=\n" base64-encode #"xy" #"\n")
(test #"eHl6\n" base64-encode #"xyz" #"\n")
(test #"eHh4\n" base64-encode #"xxx" #"\n")
(test #"eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4\n"
base64-encode #"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" #"\n")
(test #"eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4\neA==\n"
base64-encode #"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" #"\n")
(test #"V2h5IGRvIGF4ZSBtdXJkZXJlcnMgb25seSBhdHRhY2sKV2hlbiB5b3UncmUgcGFydGlhbGx5\r\nIG51ZGUKT3IgeW91J3JlIHRha2luZyBhIGJhdGg=\r\n"
base64-encode #"Why do axe murderers only attack\nWhen you're partially nude\nOr you're taking a bath")
(test #"V2h5IGRvIGF4ZSBtdXJkZXJlcnMgb25seSBhdHRhY2sKV2hlbiB5b3UncmUgcGFydGlhbGx5<>IG51ZGUKT3IgeW91J3JlIHRha2luZyBhIGJhdGg=<>"
base64-encode #"Why do axe murderers only attack\nWhen you're partially nude\nOr you're taking a bath" #"<>")
(test #"V2h5IGRvIGF4ZSBtdXJkZXJlcnMgb25seSBhdHRhY2sKV2hlbiB5b3UncmUgcGFydGlhbGx5_X_IG51ZGUKT3IgeW91J3JlIHRha2luZyBhIGJhdGg=_X_"
base64-encode #"Why do axe murderers only attack\nWhen you're partially nude\nOr you're taking a bath" '_X_)
(test #"Why do axe murderers only attack\nWhen you're partially nude\nOr you're taking a bath"
base64-decode #"V2h5IGRvIGF4ZSBtdXJkZXJlcnMgb25seSBhdHRhY2sKV2hlbiB5b3UncmUgcGFydGlhbGx5\r\nIG51ZGUKT3IgeW91J3JlIHRha2luZyBhIGJhdGg=\r\n"))

View File

@ -1,4 +1,9 @@
#lang racket/base
(require racket/fixnum)
;; Unsafe mode can be worth a factor of 2 to 4
;; on byte-string encoding/decoding:
(#%declare #:unsafe)
(provide base64-encode-stream
base64-decode-stream
@ -16,66 +21,179 @@
(vector-set! db n i))
(values (vector->immutable-vector bd) (vector->immutable-vector db))))
(define =byte (bytes-ref #"=" 0))
(define ones
(vector->immutable-vector
(list->vector (for/list ([i (in-range 9)]) (sub1 (arithmetic-shift 1 i))))))
(define (base64-decode-stream in out)
(unless (input-port? in) (raise-argument-error 'base64-decode-stream "input-port?" in))
(unless (output-port? out) (raise-argument-error 'base64-decode-stream "output-port?" out))
(let loop ([data 0] [bits 0])
(if (>= bits 8)
(let ([bits (- bits 8)])
(write-byte (arithmetic-shift data (- bits)) out)
(loop (bitwise-and data (vector*-ref ones bits)) bits))
(let ([c (read-byte in)])
(unless (or (eof-object? c) (eq? c =byte))
(cond
[(bits . fx>= . 8)
(let ([bits (fx- bits 8)])
(write-byte (fxrshift data bits) out)
(loop (fxand data (fx- (fxlshift 1 bits) 1)) bits))]
[else
(define c (read-byte in))
(unless (or (eof-object? c) (fx= c (char->integer #\=)))
(let ([v (vector*-ref base64-digit c)])
(if v
(loop (+ (arithmetic-shift data 6) v) (+ bits 6))
(loop data bits))))))))
(loop (fx+ (fxlshift data 6) v) (fx+ bits 6))
(loop data bits))))])))
(define (base64-encode-stream in out [linesep #"\n"])
;; each set of three input bytes turns into four output bytes
(unless (input-port? in) (raise-argument-error 'base64-encode-stream "input-port?" in))
(unless (output-port? out) (raise-argument-error 'base64-encode-stream "output-port?" out))
;; Each set of three input bytes turns into four output bytes.
;; It might be nice to actually write the bytes as a 4-byte string,
;; but this way preserves fine-grained streaming.
(define (o! c) (write-byte (vector*-ref digit-base64 c) out))
(define (fill!) (write-byte (char->integer #\=) out))
(define (line!) (display linesep out))
(let loop ([width 0])
(define b1 (read-byte in))
(unless (eof-object? b1)
(let ([width (if (eqv? width 72)
(begin
(display linesep out)
0)
width)])
(o! (arithmetic-shift b1 -2))
(cond
[(eof-object? b1)
(unless (eqv? width 0)
(line!))]
[else
(o! (fxrshift b1 2))
(define b2 (read-byte in))
(cond
[(eof-object? b2)
(o! (arithmetic-shift (bitwise-and b1 #b11) 4))
(o! (fxlshift (fxand b1 #b11) 4))
(fill!)
(fill!)
(line!)]
[else
(o! (bitwise-ior (arithmetic-shift (bitwise-and b1 #b11) 4)
(arithmetic-shift b2 -4)))
(o! (fxior (fxlshift (fxand b1 #b11) 4)
(fxrshift b2 4)))
(define b3 (read-byte in))
(cond
[(eof-object? b3)
(o! (arithmetic-shift (bitwise-and b2 #b1111) 2))
(o! (fxlshift (fxand b2 #b1111) 2))
(fill!)
(line!)]
[else
(o! (bitwise-ior (arithmetic-shift (bitwise-and b2 #b1111) 2)
(arithmetic-shift b3 -6)))
(o! (bitwise-and b3 #b111111))
(loop (+ width 4))])])))))
(o! (fxior (fxlshift (fxand b2 #b1111) 2)
(fxrshift b3 6)))
(o! (fxand b3 #b111111))
(let ([width (if (eqv? width 68)
(begin
(display linesep out)
0)
(fx+ width 4))])
(loop width))])])])))
;; ----------------------------------------
(define (base64-decode src)
(let ([s (open-output-bytes)])
(base64-decode-stream (open-input-bytes src) s)
(get-output-bytes s)))
(unless (bytes? src) (raise-argument-error 'base64-decode "bytes?" src))
;; Loop through bytes to handle non-encoding characters and stop at `=`
(define-values (src-len in-len)
(let loop ([i 0] [len 0])
(cond
[(fx= i (bytes-length src)) (values i len)]
[else
(define c (bytes-ref src i))
(cond
[(fx= c (char->integer #\=)) (values i len)]
[(vector*-ref base64-digit c) (loop (fx+ i 1) (fx+ len 1))]
[else (loop (fx+ i 1) len)])])))
(define out-len (fx+ (fx* (fxrshift in-len 2) 3)
(fxmax 0 (fx- (fxand in-len 3) 1))))
(define out (make-bytes out-len))
(let loop1 ([i 0] [j 0])
(unless (fx= i src-len)
(define c1 (bytes-ref src i))
(let ([v1 (vector*-ref base64-digit c1)]
[i (fx+ i 1)])
(cond
[(not v1) (loop1 i j)]
[else
(let loop2 ([i i] [j j])
(unless (fx= i src-len)
(define c2 (bytes-ref src i))
(let ([v2 (vector*-ref base64-digit c2)]
[i (fx+ i 1)])
(cond
[(not v2) (loop2 i j)]
[else
(bytes-set! out j (fxior (fxlshift v1 2)
(fxrshift v2 4)))
(let loop3 ([i i] [j (fx+ j 1)])
(unless (fx= i src-len)
(define c3 (bytes-ref src i))
(let ([v3 (vector*-ref base64-digit c3)]
[i (fx+ i 1)])
(cond
[(not v3) (loop3 i j)]
[else
(bytes-set! out j (fxior (fxlshift (fxand v2 #b1111) 4)
(fxrshift v3 2)))
(let loop4 ([i i] [j (fx+ j 1)])
(unless (fx= i src-len)
(define c4 (bytes-ref src i))
(let ([v4 (vector*-ref base64-digit c4)]
[i (fx+ i 1)])
(cond
[(not v4) (loop4 i j)]
[else
(bytes-set! out j (fxior (fxlshift (fxand v3 #b11) 6)
v4))
(loop1 i (fx+ j 1))]))))]))))]))))]))))
out)
(define (base64-encode src [linesep #"\r\n"])
(unless (bytes? src) (raise-argument-error 'base64-encode "bytes?" src))
(cond
[(and (bytes? src) (bytes? linesep))
(define in-len (bytes-length src))
(cond
[(eqv? 0 in-len) #""]
[else
(define out-payload-len (fx* (fxquotient (fx+ in-len 2) 3) 4))
(define out-len (fx+ out-payload-len
(fx* (fxquotient (fx+ out-payload-len 71) 72)
(bytes-length linesep))))
(define out (make-bytes out-len (char->integer #\=)))
(define (out! j c) (bytes-set! out j (vector*-ref digit-base64 c)))
(let loop ([i 0] [j 0] [width 0])
(cond
[((fx+ i 3) . fx<= . in-len)
(define b1 (bytes-ref src i))
(define b2 (bytes-ref src (fx+ i 1)))
(define b3 (bytes-ref src (fx+ i 2)))
(out! j (fxrshift b1 2))
(out! (fx+ j 1) (fxior (fxlshift (fxand b1 #b11) 4)
(fxrshift b2 4)))
(out! (fx+ j 2) (fxior (fxlshift (fxand b2 #b1111) 2)
(fxrshift b3 6)))
(out! (fx+ j 3) (fxand b3 #b111111))
(let ([width (fx+ width 4)]
[i (fx+ i 3)]
[j (fx+ j 4)])
(cond
[(and (eqv? width 72)
(i . fx< . in-len))
(bytes-copy! out j linesep)
(loop i (fx+ j (bytes-length linesep)) 0)]
[else
(loop i j width)]))]
[((fx+ i 2) . fx<= . in-len)
(define b1 (bytes-ref src i))
(define b2 (bytes-ref src (fx+ i 1)))
(out! j (fxrshift b1 2))
(out! (fx+ j 1) (fxior (fxlshift (fxand b1 #b11) 4)
(fxrshift b2 4)))
(out! (fx+ j 2) (fxlshift (fxand b2 #b1111) 2))
(bytes-copy! out (fx+ j 4) linesep)]
[((fx+ i 1) . fx<= . in-len)
(define b1 (bytes-ref src i))
(out! j (fxrshift b1 2))
(out! (fx+ j 1) (fxlshift (fxand b1 #b11) 4))
(bytes-copy! out (fx+ j 4) linesep)]
[else
(bytes-copy! out j linesep)]))
out])]
[else
(let ([s (open-output-bytes)])
(base64-encode-stream (open-input-bytes src) s linesep)
(get-output-bytes s)))
(get-output-bytes s))]))