From 29beae55c1ce911d38ba9279f65a1c53ff8c553c Mon Sep 17 00:00:00 2001
From: Eli Barzilay <eli@barzilay.org>
Date: Wed, 23 May 2012 15:37:06 -0400
Subject: [PATCH] Redo `string-trim' and `string-normalize-spaces'.

This is following the mailing list discussion.  In addition get
`string-join' more in-line with these by making its `sep' argument
default to a space.
---
 collects/racket/string.rkt                   | 86 ++++++++++++--------
 collects/scribblings/reference/strings.scrbl | 64 +++++++++------
 collects/tests/racket/string.rktl            | 35 ++++++--
 3 files changed, 120 insertions(+), 65 deletions(-)

diff --git a/collects/racket/string.rkt b/collects/racket/string.rkt
index 9e90fb3ee1..6cae70ddb9 100644
--- a/collects/racket/string.rkt
+++ b/collects/racket/string.rkt
@@ -12,7 +12,7 @@
 
 (require (only-in racket/list add-between))
 
-(define (string-join strs sep)
+(define (string-join strs [sep " "])
   (cond [(not (and (list? strs) (andmap string? strs)))
          (raise-type-error 'string-join "list-of-strings" strs)]
         [(not (string? sep))
@@ -21,43 +21,57 @@
         [(null? (cdr strs)) (car strs)]
         [else (apply string-append (add-between strs sep))]))
 
+;; Utilities for the functions below
+(define none (gensym))
+(define get-rxs
+  (let ([t (make-weak-hasheq)] [t+ (make-weak-hasheq)])
+    (let ([spaces '(#px"\\s+" #px"^\\s+" #px"\\s+$")])
+      (hash-set! t none spaces)
+      (hash-set! t+ none spaces))
+    (λ (who rx +?)
+      (hash-ref! (if +? t+ t) rx
+        (λ () (let* ([s (cond [(string? rx) (regexp-quote rx)]
+                              [(regexp? rx) (object-name rx)]
+                              [else (raise-type-error
+                                     who "string-or-regexp" rx)])]
+                     [s (if +? (string-append "(?:" s ")+") s)]
+                     [^s (string-append "^" s)]
+                     [s$ (string-append s "$")])
+                (if (pregexp? rx)
+                  (list (pregexp s) (pregexp ^s) (pregexp s$))
+                  (list (regexp  s) (regexp  ^s) (regexp  s$)))))))))
+
+;; returns start+end positions, #f when no trimming should happen
+(define (internal-trim who str sep l? r? rxs)
+  (unless (string? str) (raise-type-error who "string" str))
+  (define l
+    (and l? (let ([p (regexp-match-positions (car rxs) str)])
+              (and p (let ([p (cdar p)]) (and (> p 0) p))))))
+  (define r
+    (and r? (let ([p (regexp-match-positions (cadr rxs) str)])
+              (and p (let ([p (caar p)])
+                       (and (< p (string-length str))
+                            (if (and l (> l p)) l p)))))))
+  (values l r))
+
 ;; See http://en.wikipedia.org/wiki/Trimming_(computer_programming) for a nice
 ;; overview of popular names etc for these functions;
 ;; http://blog.stevenlevithan.com/archives/faster-trim-javascript for some ways
 ;; to implement trimming.
-(define (string-trim str [rx #px"\\s+"]
-                     #:left? [left? #t] #:right? [right? #t])
-  (unless (string? str) (raise-type-error 'string-trim "string" str))
-  (unless (regexp? rx)  (raise-type-error 'string-trim "regexp" rx))
-  (define len (string-length str))
-  (if (zero? len)
-    str
-    (let* ([start (if (and left? (regexp-match? rx (substring str 0 1)))
-                    (cdar (regexp-match-positions rx str))
-                    0)]
-           [end (and right? (< start len)
-                     (regexp-match? rx (substring str (- len 1)))
-                     (for/or ([i (in-range (- len 2) (- start 1) -1)])
-                       (and (not (regexp-match? rx (substring str i (add1 i))))
-                            (add1 i))))])
-      (if (and (not start) (not end))
-        str
-        (substring str (or start 0) (or end len))))))
+(define (string-trim str [sep none]
+                     #:left? [l? #t] #:right? [r? #t] #:repeat? [+? #f])
+  (define rxs (get-rxs 'string-trim sep +?))
+  (define-values [l r] (internal-trim 'string-trim str sep l? r? (cdr rxs)))
+  (cond [(and l r) (substring str l r)]
+        [l         (substring str l)]
+        [r         (substring str 0 r)]
+        [else      str]))
 
-(define (string-normalize-spaces str [rx #px"\\s+"]
-                                 #:space [space " "] #:trim? [trim? #t])
-  (define ps (regexp-match-positions* rx str))
-  (if (null? ps)
-    str
-    (let ([drop-first? (and trim? (zero? (caar ps)))]
-          [len (string-length str)])
-      (let loop ([ps (if drop-first? (cdr ps) ps)]
-                 [i  (if drop-first? (cdar ps) 0)]
-                 [r  '()])
-        (if (or (null? ps) (and trim? (= len (cdar ps))))
-          (apply string-append
-                 (reverse (cons (substring str i (if (null? ps) len (caar ps)))
-                                r)))
-          (loop (cdr ps)
-                (cdar ps)
-                (list* space (substring str i (caar ps)) r)))))))
+(define (string-normalize-spaces str [sep none] [space " "]
+                                 #:trim? [trim? #t] #:repeat? [+? #f])
+  (define rxs (get-rxs 'string-normalize-spaces sep +?))
+  (define-values [l r]
+    (if trim?
+      (internal-trim 'string-normalize-spaces str sep #t #t (cdr rxs))
+      (values #f #f)))
+  (string-join (regexp-split (car rxs) str (or l 0) r) space))
diff --git a/collects/scribblings/reference/strings.scrbl b/collects/scribblings/reference/strings.scrbl
index 20daa2336b..fe78cd2755 100644
--- a/collects/scribblings/reference/strings.scrbl
+++ b/collects/scribblings/reference/strings.scrbl
@@ -389,47 +389,65 @@ one between @racket[list] and @racket[list*].
                                      '("Alpha" "Beta" "Gamma")))))
 ]}
 
-@defproc[(string-join [strs (listof string?)] [sep string?]) string?]{
+@defproc[(string-join [strs (listof string?)] [sep string? " "]) string?]{
 
 Appends the strings in @racket[strs], inserting @racket[sep] between
 each pair of strings in @racket[strs].
 
 @mz-examples[#:eval string-eval
- (string-join '("one" "two" "three" "four") " potato ")
+  (string-join '("one" "two" "three" "four"))
+  (string-join '("one" "two" "three" "four") ", ")
+  (string-join '("one" "two" "three" "four") " potato ")
 ]}
 
-@defproc[(string-trim [str string?] [rx regexp? #px"\\s+"]
-                      [#:left? left? any/c #t] [#:right? right? any/c #t])
+@; *********************************************************************
+@; Meta note: these functions are intended to be newbie-friendly, so I'm
+@; intentionally starting the descriptions with a short senstence that
+@; describes the default behavior instead of diving straight to a
+@; precise description.
+
+@defproc[(string-trim [str string?]
+                      [sep (or/c string? regexp?) #px"\\s+"]
+                      [#:left? left? any/c #t]
+                      [#:right? right? any/c #t]
+                      [#:repeat? repeat? any/c #f])
          string?]{
 
-Trims the input @racket[str] by removing prefix and suffix matches of
-@racket[rx].  Use @racket[#:left?] or @racket[#:right?] to suppress
-trimming one of these sides.
+Trims the input @racket[str] by removing prefix and suffix whitespaces.
 
-The @racket[rx] regexp should match a whole (non-empty) sequence of
-spaces and should not rely on surrounding context.  This means that it
-should usually end with a @litchar{+}, and that it should not use
-@litchar{^}, @litchar{$}, or other lookaheads and lookbacks.  (The
-regexp is expected to both identify a whole sequence of spaces, and
-match on a non-empty part of such a sequence.)
+The optional @racket[sep] argument can be specified as either a string
+or a (p)regexp to remove a different prefix/suffix; a string is matched
+as-is.  Use @racket[#:left?] or @racket[#:right?] to suppress trimming
+one of these sides.  When @racket[repeat?] is @racket[#f] (the default),
+only one match is removed from each side, but when it is true any number
+of matches is trimmed.  (Note that with a regexp separator you can use
+@litchar{+} instead.)
 
 @mz-examples[#:eval string-eval
- (string-trim "  foo bar  baz \r\n\t")
+  (string-trim "  foo bar  baz \r\n\t")
+  (string-trim "  foo bar  baz \r\n\t" " " #:repeat? #t)
+  (string-trim "aaaxaayaa" "aa")
 ]}
 
-@defproc[(string-normalize-spaces
-          [str string?] [rx regexp? #px"\\s+"]
-          [#:space space string? " "] [#:trim? trim? any/c #t])
+@defproc[(string-normalize-spaces [str string?]
+                                  [sep (or/c string? regexp?) #px"\\s+"]
+                                  [space string? " "]
+                                  [#:trim? trim? any/c #t]
+                                  [#:repeat? repeat? any/c #f])
          string?]{
 
-Normalizes spaces (matching @racket[rx]) in the input @racket[str] by
-replacing them with @racket[space].  In the default configuration, this
-will replace any sequence of whitespaces by a single space character.
-In addition, prefix and suffix spaces are trimmed if @racket[trim?] is
-true, otherwise they get normalized too.
+Normalizes spaces in the input @racket[str] by trimming it (using
+@racket[string-trim]) and replacing all whitespace sequences in the
+result with a single space.
+
+Similarly to @racket[string-trim], @racket[sep] can be given as a string
+or a (p)regexp, and @racket[repeat?] controls matching repeated
+sequences.  In addition, you can specify @racket[space] for an alternate
+space replacement.  @racket[trim?] determines whether trimming is done
+(the default).
 
 @mz-examples[#:eval string-eval
- (string-normalize-spaces "  foo bar  baz \r\n\t")
+  (string-normalize-spaces "  foo bar  baz \r\n\t")
 ]}
 
 
diff --git a/collects/tests/racket/string.rktl b/collects/tests/racket/string.rktl
index d673da1f94..602ee40c2e 100644
--- a/collects/tests/racket/string.rktl
+++ b/collects/tests/racket/string.rktl
@@ -367,11 +367,11 @@
 
 ;; ---------- string-join ----------
 (let ()
-  (test ""      string-join '()            " ")
-  (test ""      string-join '("")          " ")
-  (test " "     string-join '("" "")       " ")
-  (test "x"     string-join '("x")         " ")
-  (test "x y"   string-join '("x" "y")     " ")
+  (test ""      string-join '())
+  (test ""      string-join '(""))
+  (test " "     string-join '("" ""))
+  (test "x"     string-join '("x"))
+  (test "x y"   string-join '("x" "y"))
   (test "x y z" string-join '("x" "y" "z") " ")
   (test "x,y,z" string-join '("x" "y" "z") ","))
 
@@ -405,6 +405,29 @@
   (test "\t x \t" string-trim " \t x \t " #px" +")
   (test "  x"     string-trim "  x  " #:left? #f)
   (test "x  "     string-trim "  x  " #:right? #f)
-  (test "  x  "   string-trim "  x  " #:left? #f #:right? #f))
+  (test "  x  "   string-trim "  x  " #:left? #f #:right? #f)
+  (for* ([i+e '([""       ""  ""]
+                ["a"      "a" "a"]
+                ["aa"     ""  ""]
+                ["aaa"    ""  ""] ; weird case
+                ["aaaa"   ""  ""]
+                ["aaaaa"  "a" ""]
+                ["aa-aa"  "-" "-"]
+                ["aaaaaa" "aa" ""]
+                ["aa--aa" "--" "--"])]
+         [sep '("aa" #rx"aa" #px"aa")])
+    (define input     (car   i+e))
+    (define expected  (cadr  i+e))
+    (define expected+ (caddr i+e))
+    (test expected  string-trim input sep)
+    (test expected+ string-trim input sep #:repeat? #t))
+  ;; this is obvious, but...
+  (test "" string-trim "abaaba" "aba")
+  ;; ...this is a version of the above weird case: it's questionable whether
+  ;; this should return "" or "ba" (could also be "ab"), but it seems sensible
+  ;; to do this (I haven't seen any existing trimmers that make any relevant
+  ;; decision on this)
+  (test "" string-trim "ababa" "aba")
+  )
 
 (report-errs)