commit
6feb946842
|
@ -1,49 +1,77 @@
|
||||||
#lang racket/base
|
#lang racket/base
|
||||||
(require net/base64 net/qp racket/string)
|
(require net/base64 net/qp)
|
||||||
|
|
||||||
(provide encode-for-header decode-for-header generalize-encoding)
|
(provide encode-for-header decode-for-header generalize-encoding)
|
||||||
|
|
||||||
(define re:ascii #rx"^[\u0-\u7F]*$")
|
(define re:non-ascii #rx"[^\u0-\u7F]")
|
||||||
|
|
||||||
(define (encode-for-header s)
|
(define (encode-for-header s)
|
||||||
(if (regexp-match? re:ascii s)
|
(cond [(not (regexp-match? re:non-ascii s)) s]
|
||||||
s
|
[(not (regexp-match? #rx"[\r\n]" s)) (encode-line-for-header s)] ; speed
|
||||||
(let ([l (regexp-split #rx"\r\n" s)])
|
[else (regexp-replace* #rx"[^\r\n]+" s encode-line-for-header)]))
|
||||||
(apply string-append
|
|
||||||
(map encode-line-for-header l)))))
|
;; Note: the following two encoder wrappers remove newlines from the
|
||||||
|
;; encoded strings. This avoids producing invalid strings, but it's not
|
||||||
|
;; complete: rfc2047 (section 2) specifies that encoded words should not
|
||||||
|
;; be longer than 75 characters, and longer words should be split for
|
||||||
|
;; encoding with a separator of CRLF SPACE between them. The problem is
|
||||||
|
;; that doing this properly requires changing the encoders to get a
|
||||||
|
;; length limit and have them return also the leftover unencoded string.
|
||||||
|
;; Instead of doing all of that, do something simpler: if the string to
|
||||||
|
;; be encoded is longer than 70 characters, then split it. (This is
|
||||||
|
;; done in `encode-line-for-header' below.) It's possible to get longer
|
||||||
|
;; encodings with this, but it seems that sendmail's limit on line
|
||||||
|
;; lengths is sufficiently larger that it works fine in practice. (BTW,
|
||||||
|
;; when sendmail gets lines that are too long it splits them with the
|
||||||
|
;; dreaded "!\n ", and it looks like there is no sane way to avoid that
|
||||||
|
;; behavior -- so splitting the input is needed.)
|
||||||
|
|
||||||
|
(define (base64-encode-header s)
|
||||||
|
(regexp-replace* #rx#"[\r\n]+" (base64-encode s) #""))
|
||||||
|
|
||||||
|
(define (qp-encode-header s)
|
||||||
|
;; rfc2047 (section 4.2) calls this "Q encoding", which is different
|
||||||
|
;; from the usual QP encoding: encode underlines and question marks,
|
||||||
|
;; and replace spaces by underlines; also remove soft-newlines.
|
||||||
|
(regexp-replace* #rx#"[ ?_]"
|
||||||
|
(regexp-replace* #rx#"=\r?\n" (qp-encode s) #"")
|
||||||
|
(λ (b)
|
||||||
|
(case (bytes-ref b 0)
|
||||||
|
[(32) #"_"] ; " "
|
||||||
|
[(63) #"=3F"] ; "?"
|
||||||
|
[(95) #"=5F"] ; "_"
|
||||||
|
[else (error 'qp-encode-header "internal error")]))))
|
||||||
|
|
||||||
(define (encode-line-for-header s)
|
(define (encode-line-for-header s)
|
||||||
(define (loop s string->bytes charset encode encoding)
|
(define (do-encode s string->bytes charset encode encoding)
|
||||||
;; Find ASCII (and no "=") prefix before a space
|
(let loop ([s s])
|
||||||
(let ([m (regexp-match #rx"^([\u0-\u3c\u3e-\u7F]* )(.*)$" s)])
|
|
||||||
(if m
|
|
||||||
(string-append
|
|
||||||
(cadr m)
|
|
||||||
(loop (caddr m) string->bytes charset encode encoding))
|
|
||||||
;; Find ASCII (and no "=") suffix after a space
|
|
||||||
(let ([m (regexp-match #rx"^(.*?)( [\u0-\u3c\u3e-\u7F]*)$" s)])
|
|
||||||
(if m
|
|
||||||
(string-append
|
|
||||||
(loop (cadr m) string->bytes charset encode encoding)
|
|
||||||
(caddr m))
|
|
||||||
(format "=?~a?~a?~a?="
|
|
||||||
charset encoding
|
|
||||||
(regexp-replace* #rx#"[\r\n]+$"
|
|
||||||
(encode (string->bytes s))
|
|
||||||
#"")))))))
|
|
||||||
(cond
|
(cond
|
||||||
[(regexp-match? re:ascii s)
|
;; Find ASCII (and no "=") prefix before a space
|
||||||
;; ASCII - do nothing
|
[(regexp-match #rx"^([\u0-\u3c\u3e-\u7F]* )(.*)$" s)
|
||||||
s]
|
=> (λ (m) (string-append (cadr m) (loop (caddr m))))]
|
||||||
[(regexp-match? #rx"[^\u0-\uFF]" s)
|
;; Find ASCII (and no "=") suffix after a space
|
||||||
;; Not Latin-1, so use UTF-8
|
[(regexp-match #rx"^(.*?)( [\u0-\u3c\u3e-\u7F]*)$" s)
|
||||||
(loop s string->bytes/utf-8 "UTF-8" base64-encode "B")]
|
=> (λ (m) (string-append (loop (cadr m)) (caddr m)))]
|
||||||
[else
|
[else
|
||||||
|
;; Split lines longer than 70 chars and re-assemble (see above
|
||||||
|
;; comment). Note that the proper separator should use CRLF,
|
||||||
|
;; but we're sending this to a sendmail process that will take
|
||||||
|
;; care of that level.
|
||||||
|
(let loop ([bytes (string->bytes s)])
|
||||||
|
(if ((bytes-length bytes) . > . 70)
|
||||||
|
(string-append (loop (subbytes bytes 0 70))
|
||||||
|
"\n "
|
||||||
|
(loop (subbytes bytes 70)))
|
||||||
|
(format "=?~a?~a?~a?=" charset encoding (encode bytes))))])))
|
||||||
|
(cond
|
||||||
|
;; ASCII - do nothing
|
||||||
|
[(not (regexp-match? re:non-ascii s)) s]
|
||||||
|
;; Not Latin-1, so use UTF-8
|
||||||
|
[(regexp-match? #rx"[^\u0-\uFF]" s)
|
||||||
|
(do-encode s string->bytes/utf-8 "UTF-8" base64-encode-header "B")]
|
||||||
;; use Latin-1
|
;; use Latin-1
|
||||||
(loop s string->bytes/latin-1 "ISO-8859-1"
|
[else
|
||||||
(lambda (s)
|
(do-encode s string->bytes/latin-1 "ISO-8859-1" qp-encode-header "Q")]))
|
||||||
(regexp-replace #rx#" " (qp-encode s) #"_"))
|
|
||||||
"Q")]))
|
|
||||||
|
|
||||||
;; ----------------------------------------
|
;; ----------------------------------------
|
||||||
|
|
||||||
|
@ -68,8 +96,7 @@
|
||||||
(if m
|
(if m
|
||||||
(let ([s ((if (member (cadddr m) '(#"q" #"Q"))
|
(let ([s ((if (member (cadddr m) '(#"q" #"Q"))
|
||||||
;; quoted-printable, with special _ handling
|
;; quoted-printable, with special _ handling
|
||||||
(lambda (x)
|
(λ (x) (qp-decode (regexp-replace* #rx#"_" x #" ")))
|
||||||
(qp-decode (regexp-replace* #rx#"_" x #" ")))
|
|
||||||
;; base64:
|
;; base64:
|
||||||
base64-decode)
|
base64-decode)
|
||||||
(cadddr (cdr m)))]
|
(cadddr (cdr m)))]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user