Add string-trim' and string-normalize-spaces'.

2012-04-18 15:19:36 -04:00 · 2012-04-18 15:19:36 -04:00 · 6f215759ae
commit 6f215759ae
parent e2e2a1e3e1
3 changed files with 129 additions and 21 deletions
--- a/collects/racket/string.rkt
+++ b/collects/racket/string.rkt
@ -1,6 +1,6 @@
-#lang scheme/base
+#lang racket/base

-(provide string-append* string-join)
+(provide string-append* string-join string-trim string-normalize-spaces)

 (define string-append*
  (case-lambda [(strs) (apply string-append strs)] ; optimize common case
@ -20,3 +20,44 @@
        [(null? strs) ""]
        [(null? (cdr strs)) (car strs)]
        [else (apply string-append (add-between strs sep))]))
+
+;; See http://en.wikipedia.org/wiki/Trimming_(computer_programming) for a nice
+;; overview of popular names etc for these functions;
+;; http://blog.stevenlevithan.com/archives/faster-trim-javascript for some ways
+;; to implement trimming.
+(define (string-trim str [rx #px"\\s+"]
+                     #:left? [left? #t] #:right? [right? #t])
+  (unless (string? str) (raise-type-error 'string-trim "string" str))
+  (unless (regexp? rx)  (raise-type-error 'string-trim "regexp" rx))
+  (define len (string-length str))
+  (if (zero? len)
+    str
+    (let* ([start (if (and left? (regexp-match? rx (substring str 0 1)))
+                    (cdar (regexp-match-positions rx str))
+                    0)]
+           [end (and right? (< start len)
+                     (regexp-match? rx (substring str (- len 1)))
+                     (for/or ([i (in-range (- len 2) (- start 1) -1)])
+                       (and (not (regexp-match? rx (substring str i (add1 i))))
+                            (add1 i))))])
+      (if (and (not start) (not end))
+        str
+        (substring str (or start 0) (or end len))))))
+
+(define (string-normalize-spaces str [rx #px"\\s+"]
+                                 #:space [space " "] #:trim? [trim? #t])
+  (define ps (regexp-match-positions* rx str))
+  (if (null? ps)
+    str
+    (let ([drop-first? (and trim? (zero? (caar ps)))]
+          [len (string-length str)])
+      (let loop ([ps (if drop-first? (cdr ps) ps)]
+                 [i  (if drop-first? (cdar ps) 0)]
+                 [r  '()])
+        (if (or (null? ps) (and trim? (= len (cdar ps))))
+          (apply string-append
+                 (reverse (cons (substring str i (if (null? ps) len (caar ps)))
+                                r)))
+          (loop (cdr ps)
+                (cdar ps)
+                (list* space (substring str i (caar ps)) r)))))))
--- a/collects/scribblings/reference/strings.scrbl
+++ b/collects/scribblings/reference/strings.scrbl
@ -398,4 +398,38 @@ each pair of strings in @racket[strs].
 (string-join '("one" "two" "three" "four") " potato ")
 ]}

+@defproc[(string-trim [str string?] [rx regexp? #px"\\s+"]
+                      [#:left? left? any/c #t] [#:right? right? any/c #t])
+         string?]{
+
+Trims the input @racket[str] by removing prefix and suffix matches of
+@racket[rx].  Use @racket[#:left?] or @racket[#:right?] to suppress
+trimming one of these sides.
+
+The @racket[rx] regexp should match a whole (non-empty) sequence of
+spaces and should not rely on surrounding context.  This means that it
+should usually end with a @litchar{+}, and that it should not use
+@litchar{^}, @litchar{$}, or other lookaheads and lookbacks.  (The
+regexp is expected to both identify a whole sequence of spaces, and
+match on a non-empty part of such a sequence.)
+
+@mz-examples[#:eval string-eval
+ (string-trim "  foo bar  baz \r\n\t")
+]}
+
+@defproc[(string-normalize-spaces
+          [str string?] [rx regexp? #px"\\s+"]
+          [#:space space string? " "] [#:trim? trim? any/c #t])
+         string?]{
+
+Normalizes spaces (matching @racket[rx]) in the input @racket[str] by
+replacing them with @racket[space].  In the default configuration, this
+will replace any sequence of whitespaces by a single space character.
+In addition, prefix and suffix spaces are trimmed if @racket[trim?] is
+true, otherwise they get normalized too.
+
+@mz-examples[#:eval string-eval
+ (string-normalize-spaces "  foo bar  baz \r\n\t")
+]}
+
@close-eval[string-eval]
--- a/collects/tests/racket/string.rktl
+++ b/collects/tests/racket/string.rktl
@ -337,25 +337,6 @@
  (t " 12 34 " #f "" " 12 34 ")
  )

-;; ---------- string-append* ----------
-(let ()
-  (test ""           string-append* '())
-  (test ""           string-append* '(""))
-  (test ""           string-append* '("" ""))
-  (test "0123456789" string-append* '("0123456789"))
-  (test "0123456789" string-append* "0123456789" '())
-  (test "0123456789" string-append* "0123456789" '(""))
-  (test "0123456789" string-append* "0123456789" '("" ""))
-  (test "0123456789" string-append* "01234567" '("8" "9")))
-
-;; ---------- string-join ----------
-(let ()
-  (test ""    string-join '() " ")
-  (test ""    string-join '("") " ")
-  (test " "   string-join '("" "") " ")
-  (test "x y" string-join '("x" "y") " ")
-  (test "x"   string-join '("x") " "))
-
 ;; String splitting can take longer than byte-string splitting,
 ;;  but it should have the same computational complexity.
 (let ()
@ -373,5 +354,57 @@
        (and ((* 100 (- bcpu bgc)) . < . (- scpu sgc))
             "suspiciously long time for regexp string split")))

+;; ---------- string-append* ----------
+(let ([t (λ (x . xs) (test ))])
+  (test ""           string-append* '())
+  (test ""           string-append* '(""))
+  (test ""           string-append* '("" ""))
+  (test "0123456789" string-append* '("0123456789"))
+  (test "0123456789" string-append* "0123456789" '())
+  (test "0123456789" string-append* "0123456789" '(""))
+  (test "0123456789" string-append* "0123456789" '("" ""))
+  (test "0123456789" string-append* "01234567" '("8" "9")))
+
+;; ---------- string-join ----------
+(let ()
+  (test ""      string-join '()            " ")
+  (test ""      string-join '("")          " ")
+  (test " "     string-join '("" "")       " ")
+  (test "x"     string-join '("x")         " ")
+  (test "x y"   string-join '("x" "y")     " ")
+  (test "x y z" string-join '("x" "y" "z") " ")
+  (test "x,y,z" string-join '("x" "y" "z") ","))
+
+;; ---------- string-trim & string-normalize-spaces ----------
+(let ()
+  (define spaces '("" " " "  " "\r" "\r\n\t "))
+  (define ++ string-append)
+  (define-syntax-rule (with-spaces id E ...)
+    (for ([id (in-list spaces)]) E ...))
+  (define (both result arg)
+    (test result string-trim arg)
+    (test result string-normalize-spaces arg))
+  (define (norm s) (if (equal? "" s) s " "))
+  (with-spaces s1
+    (both "" s1)
+    (with-spaces s2
+      (both "x" (++ s1 "x" s2))
+      (both "xx" (++ s1 "xx" s2))
+      (with-spaces s3
+        (test (++ "x" s3 "x") string-trim (++ s1 "x" s3 "x" s2))
+        (test (++ "x" (norm s3) "x")
+              string-normalize-spaces (++ s1 "x" s3 "x" s2))
+        (test (++ (norm s1) "x" (norm s3) "x" (norm s2))
+              string-normalize-spaces (++ s1 "x" s3 "x" s2) #:trim? #f)
+        (with-spaces s4
+          (test (++ "x" (norm s3) "y" (norm s4) "z")
+                string-normalize-spaces (++ s1 "x" s3 "y" s4 "z" s2))
+          (test (++ (norm s1) "x" (norm s3) "y" (norm s4) "z" (norm s2))
+                string-normalize-spaces (++ s1 "x" s3 "y" s4 "z" s2)
+                                        #:trim? #f)))))
+  (test "\t x \t" string-trim " \t x \t " #px" +")
+  (test "  x"     string-trim "  x  " #:left? #f)
+  (test "x  "     string-trim "  x  " #:right? #f)
+  (test "  x  "   string-trim "  x  " #:left? #f #:right? #f))

 (report-errs)