Redo string-trim' and string-normalize-spaces'.

This is following the mailing list discussion. In addition get `string-join' more in-line with these by making its `sep' argument default to a space.
2012-05-23 15:37:06 -04:00 · 2012-05-23 15:37:06 -04:00 · 29beae55c1
commit 29beae55c1
parent abf9223203
3 changed files with 120 additions and 65 deletions
--- a/collects/racket/string.rkt
+++ b/collects/racket/string.rkt
@ -12,7 +12,7 @@
 (require (only-in racket/list add-between))
-(define (string-join strs sep)
+(define (string-join strs [sep " "])
  (cond [(not (and (list? strs) (andmap string? strs)))
         (raise-type-error 'string-join "list-of-strings" strs)]
        [(not (string? sep))
@ -21,43 +21,57 @@
        [(null? (cdr strs)) (car strs)]
        [else (apply string-append (add-between strs sep))]))
 ;; Utilities for the functions below
 (define none (gensym))
 (define get-rxs
  (let ([t (make-weak-hasheq)] [t+ (make-weak-hasheq)])
    (let ([spaces '(#px"\\s+" #px"^\\s+" #px"\\s+$")])
      (hash-set! t none spaces)
      (hash-set! t+ none spaces))
    (λ (who rx +?)
      (hash-ref! (if +? t+ t) rx
        (λ () (let* ([s (cond [(string? rx) (regexp-quote rx)]
                              [(regexp? rx) (object-name rx)]
                              [else (raise-type-error
                                     who "string-or-regexp" rx)])]
                     [s (if +? (string-append "(?:" s ")+") s)]
                     [^s (string-append "^" s)]
                     [s$ (string-append s "$")])
                (if (pregexp? rx)
                  (list (pregexp s) (pregexp ^s) (pregexp s$))
                  (list (regexp  s) (regexp  ^s) (regexp  s$)))))))))
 ;; returns start+end positions, #f when no trimming should happen
 (define (internal-trim who str sep l? r? rxs)
  (unless (string? str) (raise-type-error who "string" str))
  (define l
    (and l? (let ([p (regexp-match-positions (car rxs) str)])
              (and p (let ([p (cdar p)]) (and (> p 0) p))))))
  (define r
    (and r? (let ([p (regexp-match-positions (cadr rxs) str)])
              (and p (let ([p (caar p)])
                       (and (< p (string-length str))
                            (if (and l (> l p)) l p)))))))
  (values l r))
 ;; See http://en.wikipedia.org/wiki/Trimming_(computer_programming) for a nice
 ;; overview of popular names etc for these functions;
 ;; http://blog.stevenlevithan.com/archives/faster-trim-javascript for some ways
 ;; to implement trimming.
-(define (string-trim str [rx #px"\\s+"]
+(define (string-trim str [sep none]
-                     #:left? [left? #t] #:right? [right? #t])
+                     #:left? [l? #t] #:right? [r? #t] #:repeat? [+? #f])
-  (unless (string? str) (raise-type-error 'string-trim "string" str))
+  (define rxs (get-rxs 'string-trim sep +?))
-  (unless (regexp? rx)  (raise-type-error 'string-trim "regexp" rx))
+  (define-values [l r] (internal-trim 'string-trim str sep l? r? (cdr rxs)))
-  (define len (string-length str))
+  (cond [(and l r) (substring str l r)]
-  (if (zero? len)
+        [l         (substring str l)]
-    str
+        [r         (substring str 0 r)]
-    (let* ([start (if (and left? (regexp-match? rx (substring str 0 1)))
+        [else      str]))
                    (cdar (regexp-match-positions rx str))
                    0)]
           [end (and right? (< start len)
                     (regexp-match? rx (substring str (- len 1)))
                     (for/or ([i (in-range (- len 2) (- start 1) -1)])
                       (and (not (regexp-match? rx (substring str i (add1 i))))
                            (add1 i))))])
      (if (and (not start) (not end))
        str
        (substring str (or start 0) (or end len))))))
-(define (string-normalize-spaces str [rx #px"\\s+"]
+(define (string-normalize-spaces str [sep none] [space " "]
-                                 #:space [space " "] #:trim? [trim? #t])
+                                 #:trim? [trim? #t] #:repeat? [+? #f])
-  (define ps (regexp-match-positions* rx str))
+  (define rxs (get-rxs 'string-normalize-spaces sep +?))
-  (if (null? ps)
+  (define-values [l r]
-    str
+    (if trim?
-    (let ([drop-first? (and trim? (zero? (caar ps)))]
+      (internal-trim 'string-normalize-spaces str sep #t #t (cdr rxs))
-          [len (string-length str)])
+      (values #f #f)))
-      (let loop ([ps (if drop-first? (cdr ps) ps)]
+  (string-join (regexp-split (car rxs) str (or l 0) r) space))
                 [i  (if drop-first? (cdar ps) 0)]
                 [r  '()])
        (if (or (null? ps) (and trim? (= len (cdar ps))))
          (apply string-append
                 (reverse (cons (substring str i (if (null? ps) len (caar ps)))
                                r)))
          (loop (cdr ps)
                (cdar ps)
                (list* space (substring str i (caar ps)) r)))))))
--- a/collects/scribblings/reference/strings.scrbl
+++ b/collects/scribblings/reference/strings.scrbl
@ -389,44 +389,62 @@ one between @racket[list] and @racket[list*].
                                     '("Alpha" "Beta" "Gamma")))))
 ]}
-@defproc[(string-join [strs (listof string?)] [sep string?]) string?]{
+@defproc[(string-join [strs (listof string?)] [sep string? " "]) string?]{
 Appends the strings in @racket[strs], inserting @racket[sep] between
 each pair of strings in @racket[strs].
@mz-examples[#:eval string-eval
  (string-join '("one" "two" "three" "four"))
  (string-join '("one" "two" "three" "four") ", ")
  (string-join '("one" "two" "three" "four") " potato ")
 ]}
-@defproc[(string-trim [str string?] [rx regexp? #px"\\s+"]
+@; *********************************************************************
-                      [#:left? left? any/c #t] [#:right? right? any/c #t])
+@; Meta note: these functions are intended to be newbie-friendly, so I'm
@; intentionally starting the descriptions with a short senstence that
@; describes the default behavior instead of diving straight to a
@; precise description.
@defproc[(string-trim [str string?]
                      [sep (or/c string? regexp?) #px"\\s+"]
                      [#:left? left? any/c #t]
                      [#:right? right? any/c #t]
                      [#:repeat? repeat? any/c #f])
         string?]{
-Trims the input @racket[str] by removing prefix and suffix matches of
+Trims the input @racket[str] by removing prefix and suffix whitespaces.
@racket[rx].  Use @racket[#:left?] or @racket[#:right?] to suppress
 trimming one of these sides.
-The @racket[rx] regexp should match a whole (non-empty) sequence of
+The optional @racket[sep] argument can be specified as either a string
-spaces and should not rely on surrounding context.  This means that it
+or a (p)regexp to remove a different prefix/suffix; a string is matched
-should usually end with a @litchar{+}, and that it should not use
+as-is.  Use @racket[#:left?] or @racket[#:right?] to suppress trimming
-@litchar{^}, @litchar{$}, or other lookaheads and lookbacks.  (The
+one of these sides.  When @racket[repeat?] is @racket[#f] (the default),
-regexp is expected to both identify a whole sequence of spaces, and
+only one match is removed from each side, but when it is true any number
-match on a non-empty part of such a sequence.)
+of matches is trimmed.  (Note that with a regexp separator you can use
@litchar{+} instead.)
@mz-examples[#:eval string-eval
  (string-trim "  foo bar  baz \r\n\t")
  (string-trim "  foo bar  baz \r\n\t" " " #:repeat? #t)
  (string-trim "aaaxaayaa" "aa")
 ]}
-@defproc[(string-normalize-spaces
+@defproc[(string-normalize-spaces [str string?]
-          [str string?] [rx regexp? #px"\\s+"]
+                                  [sep (or/c string? regexp?) #px"\\s+"]
-          [#:space space string? " "] [#:trim? trim? any/c #t])
+                                  [space string? " "]
                                  [#:trim? trim? any/c #t]
                                  [#:repeat? repeat? any/c #f])
         string?]{
-Normalizes spaces (matching @racket[rx]) in the input @racket[str] by
+Normalizes spaces in the input @racket[str] by trimming it (using
-replacing them with @racket[space].  In the default configuration, this
+@racket[string-trim]) and replacing all whitespace sequences in the
-will replace any sequence of whitespaces by a single space character.
+result with a single space.
-In addition, prefix and suffix spaces are trimmed if @racket[trim?] is
+
-true, otherwise they get normalized too.
+Similarly to @racket[string-trim], @racket[sep] can be given as a string
 or a (p)regexp, and @racket[repeat?] controls matching repeated
 sequences.  In addition, you can specify @racket[space] for an alternate
 space replacement.  @racket[trim?] determines whether trimming is done
 (the default).
@mz-examples[#:eval string-eval
  (string-normalize-spaces "  foo bar  baz \r\n\t")
--- a/collects/tests/racket/string.rktl
+++ b/collects/tests/racket/string.rktl
@ -367,11 +367,11 @@
 ;; ---------- string-join ----------
 (let ()
-  (test ""      string-join '()            " ")
+  (test ""      string-join '())
-  (test ""      string-join '("")          " ")
+  (test ""      string-join '(""))
-  (test " "     string-join '("" "")       " ")
+  (test " "     string-join '("" ""))
-  (test "x"     string-join '("x")         " ")
+  (test "x"     string-join '("x"))
-  (test "x y"   string-join '("x" "y")     " ")
+  (test "x y"   string-join '("x" "y"))
  (test "x y z" string-join '("x" "y" "z") " ")
  (test "x,y,z" string-join '("x" "y" "z") ","))
@ -405,6 +405,29 @@
  (test "\t x \t" string-trim " \t x \t " #px" +")
  (test "  x"     string-trim "  x  " #:left? #f)
  (test "x  "     string-trim "  x  " #:right? #f)
-  (test "  x  "   string-trim "  x  " #:left? #f #:right? #f))
+  (test "  x  "   string-trim "  x  " #:left? #f #:right? #f)
  (for* ([i+e '([""       ""  ""]
                ["a"      "a" "a"]
                ["aa"     ""  ""]
                ["aaa"    ""  ""] ; weird case
                ["aaaa"   ""  ""]
                ["aaaaa"  "a" ""]
                ["aa-aa"  "-" "-"]
                ["aaaaaa" "aa" ""]
                ["aa--aa" "--" "--"])]
         [sep '("aa" #rx"aa" #px"aa")])
    (define input     (car   i+e))
    (define expected  (cadr  i+e))
    (define expected+ (caddr i+e))
    (test expected  string-trim input sep)
    (test expected+ string-trim input sep #:repeat? #t))
  ;; this is obvious, but...
  (test "" string-trim "abaaba" "aba")
  ;; ...this is a version of the above weird case: it's questionable whether
  ;; this should return "" or "ba" (could also be "ab"), but it seems sensible
  ;; to do this (I haven't seen any existing trimmers that make any relevant
  ;; decision on this)
  (test "" string-trim "ababa" "aba")
  )
 (report-errs)