From e0eb397a87daa67f3500caef41729a1f8fa2fa65 Mon Sep 17 00:00:00 2001 From: Matthew Flatt Date: Fri, 30 Apr 2004 15:40:34 +0000 Subject: [PATCH] . original commit: 59f7f8c9a04c804ebefefaf969a9e017bd29e970 --- collects/net/unihead.ss | 108 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 collects/net/unihead.ss diff --git a/collects/net/unihead.ss b/collects/net/unihead.ss new file mode 100644 index 0000000000..20405d98b0 --- /dev/null +++ b/collects/net/unihead.ss @@ -0,0 +1,108 @@ +(module unihead mzscheme + (require (lib "base64.ss" "net") + (lib "qp.ss" "net") + (lib "string.ss")) + + (provide encode-for-header + decode-for-header + latin-1->windows-1252) + + (define re:ascii #rx"^[\u0-\u7F]*$") + + (define (encode-for-header s) + (if (regexp-match re:ascii s) + s + (let ([l (regexp-split #rx"\r\n" s)]) + (apply string-append + (map encode-line-for-header l))))) + + (define (encode-line-for-header s) + (define (loop s string->bytes charset encode encoding) + ;; Find ASCII (and no "=") prefix before a space + (let ([m (regexp-match #rx"^([\u0-\u3c\u3e-\u7F]* )(.*)$" s)]) + (if m + (string-append + (cadr m) + (loop (caddr m) string->bytes charset encode encoding)) + ;; Find ASCII (and no "=") suffix after a space + (let ([m (regexp-match #rx"^(.*?)( [\u0-\u3c\u3e-\u7F]*)$" s)]) + (if m + (string-append + (loop (cadr m) string->bytes charset encode encoding) + (caddr m)) + (format "=?~a?~a?~a?=" + charset encoding + (regexp-replace* #rx#"[\r\n]+$" + (encode (string->bytes s)) + #""))))))) + (cond + [(regexp-match re:ascii s) + ;; ASCII - do nothing + s] + [(regexp-match #rx"[^\u0-\uFF]" s) + ;; Not Latin-1, so use UTF-8 + (loop s string->bytes/utf-8 "UTF-8" base64-encode "B")] + [else + ;; use Latin-1 + (loop s string->bytes/latin-1 "ISO-8859-1" + (lambda (s) + (regexp-replace #rx#" " (qp-encode s) #"_")) + "Q")])) + + ;; ---------------------------------------- + + (define re:iso #rx#"[iI][sS][oO]-8859-1") + (define re:utf-8 #rx#"[uU][tT][fF]-8") + + (define re:encoded #rx#"^(.*?)=[?]([^?]+)[?]([qQbB])[?](.*?)[?]=(.*)$") + + (define (latin-1->windows-1252 encoding) + (if (regexp-match re:iso encoding) + ;; Treat Latin-1 as Windows-1252, because + ;; some mailers are broken. The only difference + ;; is control chaarcters (which are technically + ;; not mapped in Latin-1, anyway). + (if (bytes? encoding) + #"WINDOWS-1252" + "WINDOWS-1252") + encoding)) + + (define (decode-for-header s) + (and s + (let ([m (regexp-match re:encoded (string->bytes/latin-1 s (char->integer #\?)))]) + (if m + (let ([s ((if (member (cadddr m) '(#"q" #"Q")) + ;; quoted-printable, with special _ handling + (lambda (x) + (qp-decode (regexp-replace* #rx#"_" x #" "))) + ;; base64: + base64-decode) + (cadddr (cdr m)))] + [encoding (caddr m)]) + (string-append + (decode-for-header (bytes->string/latin-1 (cadr m))) + (let ([encoding (latin-1->windows-1252 encoding)]) + (cond + [(regexp-match re:utf-8 encoding) (bytes->string/utf-8 s #\?)] + [else (let ([c (bytes-open-converter (bytes->string/latin-1 encoding) "UTF-8")]) + (if c + (let-values ([(r got status) (bytes-convert c s)]) + (bytes-close-converter c) + (if (eq? status 'complete) + (bytes->string/utf-8 r #\?) + (bytes->string/latin-1 s))) + (bytes->string/latin-1 s)))])) + (let ([rest (cadddr (cddr m))]) + (let ([rest + ;; A CR-LF-space-encoding sequence means that we should + ;; drop the space. + (if (and (> (bytes-length rest) 4) + (= 13 (bytes-ref rest 0)) + (= 10 (bytes-ref rest 1)) + (= 32 (bytes-ref rest 2)) + (let ([m (regexp-match-positions re:encoded rest)]) + (and m (= (caaddr m) 5)))) + (subbytes rest 3) + rest)]) + (decode-for-header (bytes->string/latin-1 rest)))))) + s)))))