better use of regexps

svn: r4931

original commit: a024df75fb
This commit is contained in:
Eli Barzilay 2006-11-23 04:44:25 +00:00
commit 4d097b7bfc

View File

@ -10,7 +10,7 @@
(define re:ascii #rx"^[\u0-\u7F]*$") (define re:ascii #rx"^[\u0-\u7F]*$")
(define (encode-for-header s) (define (encode-for-header s)
(if (regexp-match re:ascii s) (if (regexp-match? re:ascii s)
s s
(let ([l (regexp-split #rx"\r\n" s)]) (let ([l (regexp-split #rx"\r\n" s)])
(apply string-append (apply string-append
@ -36,10 +36,10 @@
(encode (string->bytes s)) (encode (string->bytes s))
#""))))))) #"")))))))
(cond (cond
[(regexp-match re:ascii s) [(regexp-match? re:ascii s)
;; ASCII - do nothing ;; ASCII - do nothing
s] s]
[(regexp-match #rx"[^\u0-\uFF]" s) [(regexp-match? #rx"[^\u0-\uFF]" s)
;; Not Latin-1, so use UTF-8 ;; Not Latin-1, so use UTF-8
(loop s string->bytes/utf-8 "UTF-8" base64-encode "B")] (loop s string->bytes/utf-8 "UTF-8" base64-encode "B")]
[else [else
@ -51,36 +51,30 @@
;; ---------------------------------------- ;; ----------------------------------------
(define re:us-ascii #rx#"^[uS][sS]-[aA][sS][cC][iI][iI]$") (define re:us-ascii #rx#"^(?i:us-ascii)$")
(define re:iso #rx#"^[iI][sS][oO]-8859-1$") (define re:iso #rx#"^(?i:iso-8859-1)$")
(define re:gb #rx#"^[gG][bB](2312)?$") (define re:gb #rx#"^(?i:gb(?:2312)?)$")
(define re:ks_c #rx#"^[kK][sS]_[cC]_5601-1987$") (define re:ks_c #rx#"^(?i:ks_c_5601-1987)$")
(define re:utf-8 #rx#"^[uU][tT][fF]-8$") (define re:utf-8 #rx#"^(?i:utf-8)$")
(define re:encoded #rx#"^(.*?)=[?]([^?]+)[?]([qQbB])[?](.*?)[?]=(.*)$") (define re:encoded #rx#"^(.*?)=[?]([^?]+)[?]([qQbB])[?](.*?)[?]=(.*)$")
(define (generalize-encoding encoding) (define (generalize-encoding encoding)
;; Treat Latin-1 as Windows-1252 and also threat GB and GB2312 ;; Treat Latin-1 as Windows-1252 and also threat GB and GB2312
;; as GBK, because some mailers are broken. ;; as GBK, because some mailers are broken.
(cond (cond [(or (regexp-match? re:iso encoding)
[(or (regexp-match re:iso encoding) (regexp-match? re:us-ascii encoding))
(regexp-match re:us-ascii encoding)) (if (bytes? encoding) #"WINDOWS-1252" "WINDOWS-1252")]
(if (bytes? encoding) [(regexp-match? re:gb encoding)
#"WINDOWS-1252" (if (bytes? encoding) #"GBK" "GBK")]
"WINDOWS-1252")] [(regexp-match? re:ks_c encoding)
[(regexp-match re:gb encoding) (if (bytes? encoding) #"CP949" "CP949")]
(if (bytes? encoding) [else encoding]))
#"GBK"
"GBK")]
[(regexp-match re:ks_c encoding)
(if (bytes? encoding)
#"CP949"
"CP949")]
[else encoding]))
(define (decode-for-header s) (define (decode-for-header s)
(and s (and s
(let ([m (regexp-match re:encoded (string->bytes/latin-1 s (char->integer #\?)))]) (let ([m (regexp-match re:encoded
(string->bytes/latin-1 s (char->integer #\?)))])
(if m (if m
(let ([s ((if (member (cadddr m) '(#"q" #"Q")) (let ([s ((if (member (cadddr m) '(#"q" #"Q"))
;; quoted-printable, with special _ handling ;; quoted-printable, with special _ handling
@ -94,10 +88,13 @@
(decode-for-header (bytes->string/latin-1 (cadr m))) (decode-for-header (bytes->string/latin-1 (cadr m)))
(let ([encoding (generalize-encoding encoding)]) (let ([encoding (generalize-encoding encoding)])
(cond (cond
[(regexp-match re:utf-8 encoding) (bytes->string/utf-8 s #\?)] [(regexp-match? re:utf-8 encoding)
[else (let ([c (bytes-open-converter (bytes->string/latin-1 encoding) "UTF-8")]) (bytes->string/utf-8 s #\?)]
[else (let ([c (bytes-open-converter
(bytes->string/latin-1 encoding) "UTF-8")])
(if c (if c
(let-values ([(r got status) (bytes-convert c s)]) (let-values ([(r got status)
(bytes-convert c s)])
(bytes-close-converter c) (bytes-close-converter c)
(if (eq? status 'complete) (if (eq? status 'complete)
(bytes->string/utf-8 r #\?) (bytes->string/utf-8 r #\?)
@ -105,13 +102,14 @@
(bytes->string/latin-1 s)))])) (bytes->string/latin-1 s)))]))
(let ([rest (cadddr (cddr m))]) (let ([rest (cadddr (cddr m))])
(let ([rest (let ([rest
;; A CR-LF-space-encoding sequence means that we should ;; A CR-LF-space-encoding sequence means that we
;; drop the space. ;; should drop the space.
(if (and (> (bytes-length rest) 4) (if (and (> (bytes-length rest) 4)
(= 13 (bytes-ref rest 0)) (= 13 (bytes-ref rest 0))
(= 10 (bytes-ref rest 1)) (= 10 (bytes-ref rest 1))
(= 32 (bytes-ref rest 2)) (= 32 (bytes-ref rest 2))
(let ([m (regexp-match-positions re:encoded rest)]) (let ([m (regexp-match-positions
re:encoded rest)])
(and m (= (caaddr m) 5)))) (and m (= (caaddr m) 5))))
(subbytes rest 3) (subbytes rest 3)
rest)]) rest)])