diff --git a/collects/net/unihead.rkt b/collects/net/unihead.rkt index 1cc821c1c1..7dd39246d8 100644 --- a/collects/net/unihead.rkt +++ b/collects/net/unihead.rkt @@ -10,37 +10,68 @@ [(not (regexp-match? #rx"\r\n" s)) (encode-line-for-header s)] ; speed [else (regexp-replace* #rx"[^\r\n]+" s encode-line-for-header)])) +;; Note: the following two encoder wrappers remove newlines from the +;; encoded strings. This avoids producing invalid strings, but it's not +;; complete: rfc2047 (section 2) specifies that encoded words should not +;; be longer than 75 characters, and longer words should be split for +;; encoding with a separator of CRLF SPACE between them. The problem is +;; that doing this properly requires changing the encoders to get a +;; length limit and have them return also the leftover unencoded string. +;; Instead of doing all of that, do something simpler: if the string to +;; be encoded is longer than 70 characters, then split it. (This is +;; done in `encode-line-for-header' below.) It's possible to get longer +;; encodings with this, but it seems that sendmail's limit on line +;; lengths is sufficiently larger that it works fine in practice. (BTW, +;; when sendmail gets lines that are too long it splits them with the +;; dreaded "!\n ", and it looks like there is no sane way to avoid that +;; behavior -- so splitting the input is needed.) + +(define (base64-encode-header s) + (regexp-replace* #rx#"[\r\n]+" (base64-encode s) #"")) + +(define (qp-encode-header s) + ;; rfc2047 (section 4.2) calls this "Q encoding", which is different + ;; from the usual QP encoding: encode underlines and question marks, + ;; and replace spaces by underlines; also remove soft-newlines. + (regexp-replace* #rx#"[ ?_]" + (regexp-replace* #rx#"=\r?\n" (qp-encode s) #"") + (λ (b) + (case (bytes-ref b 0) + [(32) #"_"] ; " " + [(63) #"=3F"] ; "?" + [(95) #"=5F"] ; "_" + [else (error 'qp-encode-header "internal error")])))) + (define (encode-line-for-header s) - (define (loop s string->bytes charset encode encoding) - ;; Find ASCII (and no "=") prefix before a space - (let ([m (regexp-match #rx"^([\u0-\u3c\u3e-\u7F]* )(.*)$" s)]) - (if m - (string-append - (cadr m) - (loop (caddr m) string->bytes charset encode encoding)) + (define (do-encode s string->bytes charset encode encoding) + (let loop ([s s]) + (cond + ;; Find ASCII (and no "=") prefix before a space + [(regexp-match #rx"^([\u0-\u3c\u3e-\u7F]* )(.*)$" s) + => (λ (m) (string-append (cadr m) (loop (caddr m))))] ;; Find ASCII (and no "=") suffix after a space - (let ([m (regexp-match #rx"^(.*?)( [\u0-\u3c\u3e-\u7F]*)$" s)]) - (if m - (string-append - (loop (cadr m) string->bytes charset encode encoding) - (caddr m)) - (format "=?~a?~a?~a?=" - charset encoding - (regexp-replace* #rx#"[\r\n]+$" - (encode (string->bytes s)) - #""))))))) + [(regexp-match #rx"^(.*?)( [\u0-\u3c\u3e-\u7F]*)$" s) + => (λ (m) (string-append (loop (cadr m)) (caddr m)))] + [else + ;; Split lines longer than 70 chars and re-assemble (see above + ;; comment). Note that the proper separator should use CRLF, + ;; but we're sending this to a sendmail process that will take + ;; care of that level. + (let loop ([bytes (string->bytes s)]) + (if ((bytes-length bytes) . > . 70) + (string-append (loop (subbytes bytes 0 70)) + "\n " + (loop (subbytes bytes 70))) + (format "=?~a?~a?~a?=" charset encoding (encode bytes))))]))) (cond ;; ASCII - do nothing [(not (regexp-match? re:non-ascii s)) s] ;; Not Latin-1, so use UTF-8 [(regexp-match? #rx"[^\u0-\uFF]" s) - (loop s string->bytes/utf-8 "UTF-8" base64-encode "B")] + (do-encode s string->bytes/utf-8 "UTF-8" base64-encode-header "B")] ;; use Latin-1 [else - (loop s string->bytes/latin-1 "ISO-8859-1" - (lambda (s) - (regexp-replace #rx#" " (qp-encode s) #"_")) - "Q")])) + (do-encode s string->bytes/latin-1 "ISO-8859-1" qp-encode-header "Q")])) ;; ---------------------------------------- @@ -65,8 +96,7 @@ (if m (let ([s ((if (member (cadddr m) '(#"q" #"Q")) ;; quoted-printable, with special _ handling - (lambda (x) - (qp-decode (regexp-replace* #rx#"_" x #" "))) + (λ (x) (qp-decode (regexp-replace* #rx#"_" x #" "))) ;; base64: base64-decode) (cadddr (cdr m)))]