Minor typo in recent new regexp.

original commit: 3fcd3c04d5
2011-12-28 21:06:22 -05:00 · 2011-12-28 21:06:22 -05:00 · 6feb946842
commit 6feb946842
parent 83d3a6d984 97f880fb94
1 changed files with 63 additions and 36 deletions
--- a/collects/net/unihead.rkt
+++ b/collects/net/unihead.rkt
@ -1,49 +1,77 @@
 #lang racket/base
-(require net/base64 net/qp racket/string)
+(require net/base64 net/qp)

 (provide encode-for-header decode-for-header generalize-encoding)

-(define re:ascii #rx"^[\u0-\u7F]*$")
+(define re:non-ascii #rx"[^\u0-\u7F]")

 (define (encode-for-header s)
-  (if (regexp-match? re:ascii s)
-    s
-    (let ([l (regexp-split #rx"\r\n" s)])
-      (apply string-append
-             (map encode-line-for-header l)))))
+  (cond [(not (regexp-match? re:non-ascii s)) s]
+        [(not (regexp-match? #rx"[\r\n]" s)) (encode-line-for-header s)] ; speed
+        [else (regexp-replace* #rx"[^\r\n]+" s encode-line-for-header)]))
+
+;; Note: the following two encoder wrappers remove newlines from the
+;; encoded strings.  This avoids producing invalid strings, but it's not
+;; complete: rfc2047 (section 2) specifies that encoded words should not
+;; be longer than 75 characters, and longer words should be split for
+;; encoding with a separator of CRLF SPACE between them.  The problem is
+;; that doing this properly requires changing the encoders to get a
+;; length limit and have them return also the leftover unencoded string.
+;; Instead of doing all of that, do something simpler: if the string to
+;; be encoded is longer than 70 characters, then split it.  (This is
+;; done in `encode-line-for-header' below.)  It's possible to get longer
+;; encodings with this, but it seems that sendmail's limit on line
+;; lengths is sufficiently larger that it works fine in practice.  (BTW,
+;; when sendmail gets lines that are too long it splits them with the
+;; dreaded "!\n ", and it looks like there is no sane way to avoid that
+;; behavior -- so splitting the input is needed.)
+
+(define (base64-encode-header s)
+  (regexp-replace* #rx#"[\r\n]+" (base64-encode s) #""))
+
+(define (qp-encode-header s)
+  ;; rfc2047 (section 4.2) calls this "Q encoding", which is different
+  ;; from the usual QP encoding: encode underlines and question marks,
+  ;; and replace spaces by underlines; also remove soft-newlines.
+  (regexp-replace* #rx#"[ ?_]"
+                   (regexp-replace* #rx#"=\r?\n" (qp-encode s) #"")
+                   (λ (b)
+                     (case (bytes-ref b 0)
+                       [(32) #"_"]   ; " "
+                       [(63) #"=3F"] ; "?"
+                       [(95) #"=5F"] ; "_"
+                       [else (error 'qp-encode-header "internal error")]))))

 (define (encode-line-for-header s)
-  (define (loop s string->bytes charset encode encoding)
-    ;; Find ASCII (and no "=") prefix before a space
-    (let ([m (regexp-match #rx"^([\u0-\u3c\u3e-\u7F]* )(.*)$" s)])
-      (if m
-        (string-append
-         (cadr m)
-         (loop (caddr m) string->bytes charset encode encoding))
+  (define (do-encode s string->bytes charset encode encoding)
+    (let loop ([s s])
+      (cond
+        ;; Find ASCII (and no "=") prefix before a space
+        [(regexp-match #rx"^([\u0-\u3c\u3e-\u7F]* )(.*)$" s)
+         => (λ (m) (string-append (cadr m) (loop (caddr m))))]
        ;; Find ASCII (and no "=") suffix after a space
-        (let ([m (regexp-match #rx"^(.*?)( [\u0-\u3c\u3e-\u7F]*)$" s)])
-          (if m
-            (string-append
-             (loop (cadr m) string->bytes charset encode encoding)
-             (caddr m))
-            (format "=?~a?~a?~a?="
-                    charset encoding
-                    (regexp-replace* #rx#"[\r\n]+$"
-                                     (encode (string->bytes s))
-                                     #"")))))))
+        [(regexp-match #rx"^(.*?)( [\u0-\u3c\u3e-\u7F]*)$" s)
+         => (λ (m) (string-append (loop (cadr m)) (caddr m)))]
+        [else
+         ;; Split lines longer than 70 chars and re-assemble (see above
+         ;; comment).  Note that the proper separator should use CRLF,
+         ;; but we're sending this to a sendmail process that will take
+         ;; care of that level.
+         (let loop ([bytes (string->bytes s)])
+           (if ((bytes-length bytes) . > . 70)
+             (string-append (loop (subbytes bytes 0 70))
+                            "\n "
+                            (loop (subbytes bytes 70)))
+             (format "=?~a?~a?~a?=" charset encoding (encode bytes))))])))
  (cond
-    [(regexp-match? re:ascii s)
-     ;; ASCII - do nothing
-     s]
+    ;; ASCII - do nothing
+    [(not (regexp-match? re:non-ascii s)) s]
+    ;; Not Latin-1, so use UTF-8
    [(regexp-match? #rx"[^\u0-\uFF]" s)
-     ;; Not Latin-1, so use UTF-8
-     (loop s string->bytes/utf-8 "UTF-8" base64-encode "B")]
+     (do-encode s string->bytes/utf-8 "UTF-8" base64-encode-header "B")]
+    ;; use Latin-1
    [else
-     ;; use Latin-1
-     (loop s string->bytes/latin-1 "ISO-8859-1"
-           (lambda (s)
-             (regexp-replace #rx#" " (qp-encode s) #"_"))
-           "Q")]))
+     (do-encode s string->bytes/latin-1 "ISO-8859-1" qp-encode-header "Q")]))

 ;; ----------------------------------------

@ -68,8 +96,7 @@
         (if m
           (let ([s ((if (member (cadddr m) '(#"q" #"Q"))
                       ;; quoted-printable, with special _ handling
-                       (lambda (x)
-                         (qp-decode (regexp-replace* #rx#"_" x #" ")))
+                       (λ (x) (qp-decode (regexp-replace* #rx#"_" x #" ")))
                       ;; base64:
                       base64-decode)
                     (cadddr (cdr m)))]