Fixes and improvements to `net/unihead'.

* Use `re:non-ascii' to search for a single non-ASCII character instead of
  matching the whole string against `re:ascii' => faster.

* Treat either CR or LF as a line separator, not just the CRLF sequence.

* Use `regexp-replace*' to encode the text between line breaks (i.e., each
  line) in place, leaving the line-break characters untouched.  Besides
  making the code simpler, this fixes a bug in the previous code where
  multiple lines would each get encoded and the results concatenated
  without the newlines or any other whitespace.

original commit: 188c65661d
This commit is contained in:
Eli Barzilay 2011-12-21 03:13:34 -05:00
parent 42503882a8
commit b232bb25ac

View File

@ -1,16 +1,14 @@
#lang racket/base #lang racket/base
(require net/base64 net/qp racket/string) (require net/base64 net/qp)
(provide encode-for-header decode-for-header generalize-encoding) (provide encode-for-header decode-for-header generalize-encoding)
(define re:ascii #rx"^[\u0-\u7F]*$") (define re:non-ascii #rx"[^\u0-\u7F]")
(define (encode-for-header s) (define (encode-for-header s)
(if (regexp-match? re:ascii s) (cond [(not (regexp-match? re:non-ascii s)) s]
s [(not (regexp-match? #rx"\r\n" s)) (encode-line-for-header s)] ; speed
(let ([l (regexp-split #rx"\r\n" s)]) [else (regexp-replace* #rx"[^\r\n]+" s encode-line-for-header)]))
(apply string-append
(map encode-line-for-header l)))))
(define (encode-line-for-header s) (define (encode-line-for-header s)
(define (loop s string->bytes charset encode encoding) (define (loop s string->bytes charset encode encoding)
@ -32,14 +30,13 @@
(encode (string->bytes s)) (encode (string->bytes s))
#""))))))) #"")))))))
(cond (cond
[(regexp-match? re:ascii s)
;; ASCII - do nothing ;; ASCII - do nothing
s] [(not (regexp-match? re:non-ascii s)) s]
[(regexp-match? #rx"[^\u0-\uFF]" s)
;; Not Latin-1, so use UTF-8 ;; Not Latin-1, so use UTF-8
[(regexp-match? #rx"[^\u0-\uFF]" s)
(loop s string->bytes/utf-8 "UTF-8" base64-encode "B")] (loop s string->bytes/utf-8 "UTF-8" base64-encode "B")]
[else
;; use Latin-1 ;; use Latin-1
[else
(loop s string->bytes/latin-1 "ISO-8859-1" (loop s string->bytes/latin-1 "ISO-8859-1"
(lambda (s) (lambda (s)
(regexp-replace #rx#" " (qp-encode s) #"_")) (regexp-replace #rx#" " (qp-encode s) #"_"))