From 29beae55c1ce911d38ba9279f65a1c53ff8c553c Mon Sep 17 00:00:00 2001 From: Eli Barzilay Date: Wed, 23 May 2012 15:37:06 -0400 Subject: [PATCH] Redo `string-trim' and `string-normalize-spaces'. This is following the mailing list discussion. In addition get `string-join' more in-line with these by making its `sep' argument default to a space. --- collects/racket/string.rkt | 86 ++++++++++++-------- collects/scribblings/reference/strings.scrbl | 64 +++++++++------ collects/tests/racket/string.rktl | 35 ++++++-- 3 files changed, 120 insertions(+), 65 deletions(-) diff --git a/collects/racket/string.rkt b/collects/racket/string.rkt index 9e90fb3ee1..6cae70ddb9 100644 --- a/collects/racket/string.rkt +++ b/collects/racket/string.rkt @@ -12,7 +12,7 @@ (require (only-in racket/list add-between)) -(define (string-join strs sep) +(define (string-join strs [sep " "]) (cond [(not (and (list? strs) (andmap string? strs))) (raise-type-error 'string-join "list-of-strings" strs)] [(not (string? sep)) @@ -21,43 +21,57 @@ [(null? (cdr strs)) (car strs)] [else (apply string-append (add-between strs sep))])) +;; Utilities for the functions below +(define none (gensym)) +(define get-rxs + (let ([t (make-weak-hasheq)] [t+ (make-weak-hasheq)]) + (let ([spaces '(#px"\\s+" #px"^\\s+" #px"\\s+$")]) + (hash-set! t none spaces) + (hash-set! t+ none spaces)) + (λ (who rx +?) + (hash-ref! (if +? t+ t) rx + (λ () (let* ([s (cond [(string? rx) (regexp-quote rx)] + [(regexp? rx) (object-name rx)] + [else (raise-type-error + who "string-or-regexp" rx)])] + [s (if +? (string-append "(?:" s ")+") s)] + [^s (string-append "^" s)] + [s$ (string-append s "$")]) + (if (pregexp? rx) + (list (pregexp s) (pregexp ^s) (pregexp s$)) + (list (regexp s) (regexp ^s) (regexp s$))))))))) + +;; returns start+end positions, #f when no trimming should happen +(define (internal-trim who str sep l? r? rxs) + (unless (string? str) (raise-type-error who "string" str)) + (define l + (and l? (let ([p (regexp-match-positions (car rxs) str)]) + (and p (let ([p (cdar p)]) (and (> p 0) p)))))) + (define r + (and r? (let ([p (regexp-match-positions (cadr rxs) str)]) + (and p (let ([p (caar p)]) + (and (< p (string-length str)) + (if (and l (> l p)) l p))))))) + (values l r)) + ;; See http://en.wikipedia.org/wiki/Trimming_(computer_programming) for a nice ;; overview of popular names etc for these functions; ;; http://blog.stevenlevithan.com/archives/faster-trim-javascript for some ways ;; to implement trimming. -(define (string-trim str [rx #px"\\s+"] - #:left? [left? #t] #:right? [right? #t]) - (unless (string? str) (raise-type-error 'string-trim "string" str)) - (unless (regexp? rx) (raise-type-error 'string-trim "regexp" rx)) - (define len (string-length str)) - (if (zero? len) - str - (let* ([start (if (and left? (regexp-match? rx (substring str 0 1))) - (cdar (regexp-match-positions rx str)) - 0)] - [end (and right? (< start len) - (regexp-match? rx (substring str (- len 1))) - (for/or ([i (in-range (- len 2) (- start 1) -1)]) - (and (not (regexp-match? rx (substring str i (add1 i)))) - (add1 i))))]) - (if (and (not start) (not end)) - str - (substring str (or start 0) (or end len)))))) +(define (string-trim str [sep none] + #:left? [l? #t] #:right? [r? #t] #:repeat? [+? #f]) + (define rxs (get-rxs 'string-trim sep +?)) + (define-values [l r] (internal-trim 'string-trim str sep l? r? (cdr rxs))) + (cond [(and l r) (substring str l r)] + [l (substring str l)] + [r (substring str 0 r)] + [else str])) -(define (string-normalize-spaces str [rx #px"\\s+"] - #:space [space " "] #:trim? [trim? #t]) - (define ps (regexp-match-positions* rx str)) - (if (null? ps) - str - (let ([drop-first? (and trim? (zero? (caar ps)))] - [len (string-length str)]) - (let loop ([ps (if drop-first? (cdr ps) ps)] - [i (if drop-first? (cdar ps) 0)] - [r '()]) - (if (or (null? ps) (and trim? (= len (cdar ps)))) - (apply string-append - (reverse (cons (substring str i (if (null? ps) len (caar ps))) - r))) - (loop (cdr ps) - (cdar ps) - (list* space (substring str i (caar ps)) r))))))) +(define (string-normalize-spaces str [sep none] [space " "] + #:trim? [trim? #t] #:repeat? [+? #f]) + (define rxs (get-rxs 'string-normalize-spaces sep +?)) + (define-values [l r] + (if trim? + (internal-trim 'string-normalize-spaces str sep #t #t (cdr rxs)) + (values #f #f))) + (string-join (regexp-split (car rxs) str (or l 0) r) space)) diff --git a/collects/scribblings/reference/strings.scrbl b/collects/scribblings/reference/strings.scrbl index 20daa2336b..fe78cd2755 100644 --- a/collects/scribblings/reference/strings.scrbl +++ b/collects/scribblings/reference/strings.scrbl @@ -389,47 +389,65 @@ one between @racket[list] and @racket[list*]. '("Alpha" "Beta" "Gamma"))))) ]} -@defproc[(string-join [strs (listof string?)] [sep string?]) string?]{ +@defproc[(string-join [strs (listof string?)] [sep string? " "]) string?]{ Appends the strings in @racket[strs], inserting @racket[sep] between each pair of strings in @racket[strs]. @mz-examples[#:eval string-eval - (string-join '("one" "two" "three" "four") " potato ") + (string-join '("one" "two" "three" "four")) + (string-join '("one" "two" "three" "four") ", ") + (string-join '("one" "two" "three" "four") " potato ") ]} -@defproc[(string-trim [str string?] [rx regexp? #px"\\s+"] - [#:left? left? any/c #t] [#:right? right? any/c #t]) +@; ********************************************************************* +@; Meta note: these functions are intended to be newbie-friendly, so I'm +@; intentionally starting the descriptions with a short senstence that +@; describes the default behavior instead of diving straight to a +@; precise description. + +@defproc[(string-trim [str string?] + [sep (or/c string? regexp?) #px"\\s+"] + [#:left? left? any/c #t] + [#:right? right? any/c #t] + [#:repeat? repeat? any/c #f]) string?]{ -Trims the input @racket[str] by removing prefix and suffix matches of -@racket[rx]. Use @racket[#:left?] or @racket[#:right?] to suppress -trimming one of these sides. +Trims the input @racket[str] by removing prefix and suffix whitespaces. -The @racket[rx] regexp should match a whole (non-empty) sequence of -spaces and should not rely on surrounding context. This means that it -should usually end with a @litchar{+}, and that it should not use -@litchar{^}, @litchar{$}, or other lookaheads and lookbacks. (The -regexp is expected to both identify a whole sequence of spaces, and -match on a non-empty part of such a sequence.) +The optional @racket[sep] argument can be specified as either a string +or a (p)regexp to remove a different prefix/suffix; a string is matched +as-is. Use @racket[#:left?] or @racket[#:right?] to suppress trimming +one of these sides. When @racket[repeat?] is @racket[#f] (the default), +only one match is removed from each side, but when it is true any number +of matches is trimmed. (Note that with a regexp separator you can use +@litchar{+} instead.) @mz-examples[#:eval string-eval - (string-trim " foo bar baz \r\n\t") + (string-trim " foo bar baz \r\n\t") + (string-trim " foo bar baz \r\n\t" " " #:repeat? #t) + (string-trim "aaaxaayaa" "aa") ]} -@defproc[(string-normalize-spaces - [str string?] [rx regexp? #px"\\s+"] - [#:space space string? " "] [#:trim? trim? any/c #t]) +@defproc[(string-normalize-spaces [str string?] + [sep (or/c string? regexp?) #px"\\s+"] + [space string? " "] + [#:trim? trim? any/c #t] + [#:repeat? repeat? any/c #f]) string?]{ -Normalizes spaces (matching @racket[rx]) in the input @racket[str] by -replacing them with @racket[space]. In the default configuration, this -will replace any sequence of whitespaces by a single space character. -In addition, prefix and suffix spaces are trimmed if @racket[trim?] is -true, otherwise they get normalized too. +Normalizes spaces in the input @racket[str] by trimming it (using +@racket[string-trim]) and replacing all whitespace sequences in the +result with a single space. + +Similarly to @racket[string-trim], @racket[sep] can be given as a string +or a (p)regexp, and @racket[repeat?] controls matching repeated +sequences. In addition, you can specify @racket[space] for an alternate +space replacement. @racket[trim?] determines whether trimming is done +(the default). @mz-examples[#:eval string-eval - (string-normalize-spaces " foo bar baz \r\n\t") + (string-normalize-spaces " foo bar baz \r\n\t") ]} diff --git a/collects/tests/racket/string.rktl b/collects/tests/racket/string.rktl index d673da1f94..602ee40c2e 100644 --- a/collects/tests/racket/string.rktl +++ b/collects/tests/racket/string.rktl @@ -367,11 +367,11 @@ ;; ---------- string-join ---------- (let () - (test "" string-join '() " ") - (test "" string-join '("") " ") - (test " " string-join '("" "") " ") - (test "x" string-join '("x") " ") - (test "x y" string-join '("x" "y") " ") + (test "" string-join '()) + (test "" string-join '("")) + (test " " string-join '("" "")) + (test "x" string-join '("x")) + (test "x y" string-join '("x" "y")) (test "x y z" string-join '("x" "y" "z") " ") (test "x,y,z" string-join '("x" "y" "z") ",")) @@ -405,6 +405,29 @@ (test "\t x \t" string-trim " \t x \t " #px" +") (test " x" string-trim " x " #:left? #f) (test "x " string-trim " x " #:right? #f) - (test " x " string-trim " x " #:left? #f #:right? #f)) + (test " x " string-trim " x " #:left? #f #:right? #f) + (for* ([i+e '(["" "" ""] + ["a" "a" "a"] + ["aa" "" ""] + ["aaa" "" ""] ; weird case + ["aaaa" "" ""] + ["aaaaa" "a" ""] + ["aa-aa" "-" "-"] + ["aaaaaa" "aa" ""] + ["aa--aa" "--" "--"])] + [sep '("aa" #rx"aa" #px"aa")]) + (define input (car i+e)) + (define expected (cadr i+e)) + (define expected+ (caddr i+e)) + (test expected string-trim input sep) + (test expected+ string-trim input sep #:repeat? #t)) + ;; this is obvious, but... + (test "" string-trim "abaaba" "aba") + ;; ...this is a version of the above weird case: it's questionable whether + ;; this should return "" or "ba" (could also be "ab"), but it seems sensible + ;; to do this (I haven't seen any existing trimmers that make any relevant + ;; decision on this) + (test "" string-trim "ababa" "aba") + ) (report-errs)