More fixes and improvements to `net/unihead'.

* In base64 encoding remove all newlines from the encoded result, avoids
  getting an invalid result.

* In qp encoding:
  - replace all spaces by underlines, not just the first (looks like a
    typo in the previous code)
  - encode "?"s and "_"s too, as required for this encoding
  - remove soft newlines (again, avoid an invalid result)

* Use `regexp-replace*' to encode the parts between the lines.  Besides
  making the code simpler, it fixes a bug in the previous code where
  multiple lines would each get encoded and the results concatenated
  without the newlines or any other whitespace.

* When the string to be encoded is longer than 70 characters, split and
  encode the sub-parts, then concatenate the encodings with a "\n "
  separator.  This is done as a poor attempt to follow the line length
  limits specified in rfc2047, or more concretely, to avoid sendmail's
  "!\n " splitting.
This commit is contained in:
Eli Barzilay 2011-12-21 13:35:58 -05:00
parent 188c65661d
commit ed8e3367d4

View File

@ -10,37 +10,68 @@
[(not (regexp-match? #rx"\r\n" s)) (encode-line-for-header s)] ; speed
[else (regexp-replace* #rx"[^\r\n]+" s encode-line-for-header)]))
;; Note: the following two encoder wrappers remove newlines from the
;; encoded strings. This avoids producing invalid strings, but it's not
;; complete: rfc2047 (section 2) specifies that encoded words should not
;; be longer than 75 characters, and longer words should be split for
;; encoding with a separator of CRLF SPACE between them. The problem is
;; that doing this properly requires changing the encoders to get a
;; length limit and have them return also the leftover unencoded string.
;; Instead of doing all of that, do something simpler: if the string to
;; be encoded is longer than 70 characters, then split it. (This is
;; done in `encode-line-for-header' below.) It's possible to get longer
;; encodings with this, but it seems that sendmail's limit on line
;; lengths is sufficiently larger that it works fine in practice. (BTW,
;; when sendmail gets lines that are too long it splits them with the
;; dreaded "!\n ", and it looks like there is no sane way to avoid that
;; behavior -- so splitting the input is needed.)
(define (base64-encode-header s)
(regexp-replace* #rx#"[\r\n]+" (base64-encode s) #""))
(define (qp-encode-header s)
;; rfc2047 (section 4.2) calls this "Q encoding", which is different
;; from the usual QP encoding: encode underlines and question marks,
;; and replace spaces by underlines; also remove soft-newlines.
(regexp-replace* #rx#"[ ?_]"
(regexp-replace* #rx#"=\r?\n" (qp-encode s) #"")
(λ (b)
(case (bytes-ref b 0)
[(32) #"_"] ; " "
[(63) #"=3F"] ; "?"
[(95) #"=5F"] ; "_"
[else (error 'qp-encode-header "internal error")]))))
(define (encode-line-for-header s)
(define (loop s string->bytes charset encode encoding)
(define (do-encode s string->bytes charset encode encoding)
(let loop ([s s])
(cond
;; Find ASCII (and no "=") prefix before a space
(let ([m (regexp-match #rx"^([\u0-\u3c\u3e-\u7F]* )(.*)$" s)])
(if m
(string-append
(cadr m)
(loop (caddr m) string->bytes charset encode encoding))
[(regexp-match #rx"^([\u0-\u3c\u3e-\u7F]* )(.*)$" s)
=> (λ (m) (string-append (cadr m) (loop (caddr m))))]
;; Find ASCII (and no "=") suffix after a space
(let ([m (regexp-match #rx"^(.*?)( [\u0-\u3c\u3e-\u7F]*)$" s)])
(if m
(string-append
(loop (cadr m) string->bytes charset encode encoding)
(caddr m))
(format "=?~a?~a?~a?="
charset encoding
(regexp-replace* #rx#"[\r\n]+$"
(encode (string->bytes s))
#"")))))))
[(regexp-match #rx"^(.*?)( [\u0-\u3c\u3e-\u7F]*)$" s)
=> (λ (m) (string-append (loop (cadr m)) (caddr m)))]
[else
;; Split lines longer than 70 chars and re-assemble (see above
;; comment). Note that the proper separator should use CRLF,
;; but we're sending this to a sendmail process that will take
;; care of that level.
(let loop ([bytes (string->bytes s)])
(if ((bytes-length bytes) . > . 70)
(string-append (loop (subbytes bytes 0 70))
"\n "
(loop (subbytes bytes 70)))
(format "=?~a?~a?~a?=" charset encoding (encode bytes))))])))
(cond
;; ASCII - do nothing
[(not (regexp-match? re:non-ascii s)) s]
;; Not Latin-1, so use UTF-8
[(regexp-match? #rx"[^\u0-\uFF]" s)
(loop s string->bytes/utf-8 "UTF-8" base64-encode "B")]
(do-encode s string->bytes/utf-8 "UTF-8" base64-encode-header "B")]
;; use Latin-1
[else
(loop s string->bytes/latin-1 "ISO-8859-1"
(lambda (s)
(regexp-replace #rx#" " (qp-encode s) #"_"))
"Q")]))
(do-encode s string->bytes/latin-1 "ISO-8859-1" qp-encode-header "Q")]))
;; ----------------------------------------
@ -65,8 +96,7 @@
(if m
(let ([s ((if (member (cadddr m) '(#"q" #"Q"))
;; quoted-printable, with special _ handling
(lambda (x)
(qp-decode (regexp-replace* #rx#"_" x #" ")))
(λ (x) (qp-decode (regexp-replace* #rx#"_" x #" ")))
;; base64:
base64-decode)
(cadddr (cdr m)))]