racket/collects/net/unihead.rkt

#lang racket/base
(require net/base64 net/qp)

(provide encode-for-header decode-for-header generalize-encoding)

(define re:non-ascii #rx"[^\u0-\u7F]")

(define (encode-for-header s)
  (cond [(not (regexp-match? re:non-ascii s)) s]
        [(not (regexp-match? #rx"[\r\n]" s)) (encode-line-for-header s)] ; speed
        [else (regexp-replace* #rx"[^\r\n]+" s encode-line-for-header)]))

;; Note: the following two encoder wrappers remove newlines from the
;; encoded strings.  This avoids producing invalid strings, but it's not
;; complete: rfc2047 (section 2) specifies that encoded words should not
;; be longer than 75 characters, and longer words should be split for
;; encoding with a separator of CRLF SPACE between them.  The problem is
;; that doing this properly requires changing the encoders to get a
;; length limit and have them return also the leftover unencoded string.
;; Instead of doing all of that, do something simpler: if the string to
;; be encoded is longer than 70 characters, then split it.  (This is
;; done in `encode-line-for-header' below.)  It's possible to get longer
;; encodings with this, but it seems that sendmail's limit on line
;; lengths is sufficiently larger that it works fine in practice.  (BTW,
;; when sendmail gets lines that are too long it splits them with the
;; dreaded "!\n ", and it looks like there is no sane way to avoid that
;; behavior -- so splitting the input is needed.)

(define (base64-encode-header s)
  (regexp-replace* #rx#"[\r\n]+" (base64-encode s) #""))

(define (qp-encode-header s)
  ;; rfc2047 (section 4.2) calls this "Q encoding", which is different
  ;; from the usual QP encoding: encode underlines and question marks,
  ;; and replace spaces by underlines; also remove soft-newlines.
  (regexp-replace* #rx#"[ ?_]"
                   (regexp-replace* #rx#"=\r?\n" (qp-encode s) #"")
                   (λ (b)
                     (case (bytes-ref b 0)
                       [(32) #"_"]   ; " "
                       [(63) #"=3F"] ; "?"
                       [(95) #"=5F"] ; "_"
                       [else (error 'qp-encode-header "internal error")]))))

(define (encode-line-for-header s)
  (define (do-encode s string->bytes charset encode encoding)
    (let loop ([s s])
      (cond
        ;; Find ASCII (and no "=") prefix before a space
        [(regexp-match #rx"^([\u0-\u3c\u3e-\u7F]* )(.*)$" s)
         => (λ (m) (string-append (cadr m) (loop (caddr m))))]
        ;; Find ASCII (and no "=") suffix after a space
        [(regexp-match #rx"^(.*?)( [\u0-\u3c\u3e-\u7F]*)$" s)
         => (λ (m) (string-append (loop (cadr m)) (caddr m)))]
        [else
         ;; Split lines longer than 70 chars and re-assemble (see above
         ;; comment).  Note that the proper separator should use CRLF,
         ;; but we're sending this to a sendmail process that will take
         ;; care of that level.
         (let loop ([bytes (string->bytes s)])
           (if ((bytes-length bytes) . > . 70)
             (string-append (loop (subbytes bytes 0 70))
                            "\n "
                            (loop (subbytes bytes 70)))
             (format "=?~a?~a?~a?=" charset encoding (encode bytes))))])))
  (cond
    ;; ASCII - do nothing
    [(not (regexp-match? re:non-ascii s)) s]
    ;; Not Latin-1, so use UTF-8
    [(regexp-match? #rx"[^\u0-\uFF]" s)
     (do-encode s string->bytes/utf-8 "UTF-8" base64-encode-header "B")]
    ;; use Latin-1
    [else
     (do-encode s string->bytes/latin-1 "ISO-8859-1" qp-encode-header "Q")]))

;; ----------------------------------------

(define re:encoded #rx#"^(.*?)=[?]([^?]+)[?]([qQbB])[?](.*?)[?]=(.*)$")

(define (generalize-encoding encoding)
  ;; Treat Latin-1 as Windows-1252 and also threat GB and GB2312
  ;; as GBK, because some mailers are broken.
  (cond [(or (regexp-match? #rx#"^(?i:iso-8859-1)$" encoding)
             (regexp-match? #rx#"^(?i:us-ascii)$" encoding))
         (if (bytes? encoding) #"WINDOWS-1252" "WINDOWS-1252")]
        [(regexp-match? #rx#"^(?i:gb(?:2312)?)$" encoding)
         (if (bytes? encoding) #"GBK" "GBK")]
        [(regexp-match? #rx#"^(?i:ks_c_5601-1987)$" encoding)
         (if (bytes? encoding) #"CP949" "CP949")]
        [else encoding]))

(define (decode-for-header s)
  (and s
       (let ([m (regexp-match re:encoded
                              (string->bytes/latin-1 s (char->integer #\?)))])
         (if m
           (let ([s ((if (member (cadddr m) '(#"q" #"Q"))
                       ;; quoted-printable, with special _ handling
                       (λ (x) (qp-decode (regexp-replace* #rx#"_" x #" ")))
                       ;; base64:
                       base64-decode)
                     (cadddr (cdr m)))]
                 [encoding (caddr m)])
             (string-append
              (decode-for-header (bytes->string/latin-1 (cadr m)))
              (let ([encoding (generalize-encoding encoding)])
                (cond
                  [(regexp-match? #rx#"^(?i:utf-8)$" encoding)
                   (bytes->string/utf-8 s #\?)]
                  [else (let ([c (bytes-open-converter
                                  (bytes->string/latin-1 encoding)
                                  "UTF-8")])
                          (if c
                            (let-values ([(r got status)
                                          (bytes-convert c s)])
                              (bytes-close-converter c)
                              (if (eq? status 'complete)
                                (bytes->string/utf-8 r #\?)
                                (bytes->string/latin-1 s)))
                            (bytes->string/latin-1 s)))]))
              (let ([rest (cadddr (cddr m))])
                (let ([rest
                       ;; A CR-LF-space-encoding sequence means that we
                       ;; should drop the space.
                       (if (and (> (bytes-length rest) 4)
                                (= 13 (bytes-ref rest 0))
                                (= 10 (bytes-ref rest 1))
                                (= 32 (bytes-ref rest 2))
                                (let ([m (regexp-match-positions
                                          re:encoded rest)])
                                  (and m (= (caaddr m) 5))))
                         (subbytes rest 3)
                         rest)])
                  (decode-for-header (bytes->string/latin-1 rest))))))
           s))))