Redo string-trim' and string-normalize-spaces'.

This is following the mailing list discussion.  In addition get
`string-join' more in-line with these by making its `sep' argument
default to a space.
This commit is contained in:
Eli Barzilay 2012-05-23 15:37:06 -04:00
parent abf9223203
commit 29beae55c1
3 changed files with 120 additions and 65 deletions

View File

@ -12,7 +12,7 @@
(require (only-in racket/list add-between)) (require (only-in racket/list add-between))
(define (string-join strs sep) (define (string-join strs [sep " "])
(cond [(not (and (list? strs) (andmap string? strs))) (cond [(not (and (list? strs) (andmap string? strs)))
(raise-type-error 'string-join "list-of-strings" strs)] (raise-type-error 'string-join "list-of-strings" strs)]
[(not (string? sep)) [(not (string? sep))
@ -21,43 +21,57 @@
[(null? (cdr strs)) (car strs)] [(null? (cdr strs)) (car strs)]
[else (apply string-append (add-between strs sep))])) [else (apply string-append (add-between strs sep))]))
;; Utilities for the functions below
(define none (gensym))
(define get-rxs
(let ([t (make-weak-hasheq)] [t+ (make-weak-hasheq)])
(let ([spaces '(#px"\\s+" #px"^\\s+" #px"\\s+$")])
(hash-set! t none spaces)
(hash-set! t+ none spaces))
(λ (who rx +?)
(hash-ref! (if +? t+ t) rx
(λ () (let* ([s (cond [(string? rx) (regexp-quote rx)]
[(regexp? rx) (object-name rx)]
[else (raise-type-error
who "string-or-regexp" rx)])]
[s (if +? (string-append "(?:" s ")+") s)]
[^s (string-append "^" s)]
[s$ (string-append s "$")])
(if (pregexp? rx)
(list (pregexp s) (pregexp ^s) (pregexp s$))
(list (regexp s) (regexp ^s) (regexp s$)))))))))
;; returns start+end positions, #f when no trimming should happen
(define (internal-trim who str sep l? r? rxs)
(unless (string? str) (raise-type-error who "string" str))
(define l
(and l? (let ([p (regexp-match-positions (car rxs) str)])
(and p (let ([p (cdar p)]) (and (> p 0) p))))))
(define r
(and r? (let ([p (regexp-match-positions (cadr rxs) str)])
(and p (let ([p (caar p)])
(and (< p (string-length str))
(if (and l (> l p)) l p)))))))
(values l r))
;; See http://en.wikipedia.org/wiki/Trimming_(computer_programming) for a nice ;; See http://en.wikipedia.org/wiki/Trimming_(computer_programming) for a nice
;; overview of popular names etc for these functions; ;; overview of popular names etc for these functions;
;; http://blog.stevenlevithan.com/archives/faster-trim-javascript for some ways ;; http://blog.stevenlevithan.com/archives/faster-trim-javascript for some ways
;; to implement trimming. ;; to implement trimming.
(define (string-trim str [rx #px"\\s+"] (define (string-trim str [sep none]
#:left? [left? #t] #:right? [right? #t]) #:left? [l? #t] #:right? [r? #t] #:repeat? [+? #f])
(unless (string? str) (raise-type-error 'string-trim "string" str)) (define rxs (get-rxs 'string-trim sep +?))
(unless (regexp? rx) (raise-type-error 'string-trim "regexp" rx)) (define-values [l r] (internal-trim 'string-trim str sep l? r? (cdr rxs)))
(define len (string-length str)) (cond [(and l r) (substring str l r)]
(if (zero? len) [l (substring str l)]
str [r (substring str 0 r)]
(let* ([start (if (and left? (regexp-match? rx (substring str 0 1))) [else str]))
(cdar (regexp-match-positions rx str))
0)]
[end (and right? (< start len)
(regexp-match? rx (substring str (- len 1)))
(for/or ([i (in-range (- len 2) (- start 1) -1)])
(and (not (regexp-match? rx (substring str i (add1 i))))
(add1 i))))])
(if (and (not start) (not end))
str
(substring str (or start 0) (or end len))))))
(define (string-normalize-spaces str [rx #px"\\s+"] (define (string-normalize-spaces str [sep none] [space " "]
#:space [space " "] #:trim? [trim? #t]) #:trim? [trim? #t] #:repeat? [+? #f])
(define ps (regexp-match-positions* rx str)) (define rxs (get-rxs 'string-normalize-spaces sep +?))
(if (null? ps) (define-values [l r]
str (if trim?
(let ([drop-first? (and trim? (zero? (caar ps)))] (internal-trim 'string-normalize-spaces str sep #t #t (cdr rxs))
[len (string-length str)]) (values #f #f)))
(let loop ([ps (if drop-first? (cdr ps) ps)] (string-join (regexp-split (car rxs) str (or l 0) r) space))
[i (if drop-first? (cdar ps) 0)]
[r '()])
(if (or (null? ps) (and trim? (= len (cdar ps))))
(apply string-append
(reverse (cons (substring str i (if (null? ps) len (caar ps)))
r)))
(loop (cdr ps)
(cdar ps)
(list* space (substring str i (caar ps)) r)))))))

View File

@ -389,44 +389,62 @@ one between @racket[list] and @racket[list*].
'("Alpha" "Beta" "Gamma"))))) '("Alpha" "Beta" "Gamma")))))
]} ]}
@defproc[(string-join [strs (listof string?)] [sep string?]) string?]{ @defproc[(string-join [strs (listof string?)] [sep string? " "]) string?]{
Appends the strings in @racket[strs], inserting @racket[sep] between Appends the strings in @racket[strs], inserting @racket[sep] between
each pair of strings in @racket[strs]. each pair of strings in @racket[strs].
@mz-examples[#:eval string-eval @mz-examples[#:eval string-eval
(string-join '("one" "two" "three" "four"))
(string-join '("one" "two" "three" "four") ", ")
(string-join '("one" "two" "three" "four") " potato ") (string-join '("one" "two" "three" "four") " potato ")
]} ]}
@defproc[(string-trim [str string?] [rx regexp? #px"\\s+"] @; *********************************************************************
[#:left? left? any/c #t] [#:right? right? any/c #t]) @; Meta note: these functions are intended to be newbie-friendly, so I'm
@; intentionally starting the descriptions with a short senstence that
@; describes the default behavior instead of diving straight to a
@; precise description.
@defproc[(string-trim [str string?]
[sep (or/c string? regexp?) #px"\\s+"]
[#:left? left? any/c #t]
[#:right? right? any/c #t]
[#:repeat? repeat? any/c #f])
string?]{ string?]{
Trims the input @racket[str] by removing prefix and suffix matches of Trims the input @racket[str] by removing prefix and suffix whitespaces.
@racket[rx]. Use @racket[#:left?] or @racket[#:right?] to suppress
trimming one of these sides.
The @racket[rx] regexp should match a whole (non-empty) sequence of The optional @racket[sep] argument can be specified as either a string
spaces and should not rely on surrounding context. This means that it or a (p)regexp to remove a different prefix/suffix; a string is matched
should usually end with a @litchar{+}, and that it should not use as-is. Use @racket[#:left?] or @racket[#:right?] to suppress trimming
@litchar{^}, @litchar{$}, or other lookaheads and lookbacks. (The one of these sides. When @racket[repeat?] is @racket[#f] (the default),
regexp is expected to both identify a whole sequence of spaces, and only one match is removed from each side, but when it is true any number
match on a non-empty part of such a sequence.) of matches is trimmed. (Note that with a regexp separator you can use
@litchar{+} instead.)
@mz-examples[#:eval string-eval @mz-examples[#:eval string-eval
(string-trim " foo bar baz \r\n\t") (string-trim " foo bar baz \r\n\t")
(string-trim " foo bar baz \r\n\t" " " #:repeat? #t)
(string-trim "aaaxaayaa" "aa")
]} ]}
@defproc[(string-normalize-spaces @defproc[(string-normalize-spaces [str string?]
[str string?] [rx regexp? #px"\\s+"] [sep (or/c string? regexp?) #px"\\s+"]
[#:space space string? " "] [#:trim? trim? any/c #t]) [space string? " "]
[#:trim? trim? any/c #t]
[#:repeat? repeat? any/c #f])
string?]{ string?]{
Normalizes spaces (matching @racket[rx]) in the input @racket[str] by Normalizes spaces in the input @racket[str] by trimming it (using
replacing them with @racket[space]. In the default configuration, this @racket[string-trim]) and replacing all whitespace sequences in the
will replace any sequence of whitespaces by a single space character. result with a single space.
In addition, prefix and suffix spaces are trimmed if @racket[trim?] is
true, otherwise they get normalized too. Similarly to @racket[string-trim], @racket[sep] can be given as a string
or a (p)regexp, and @racket[repeat?] controls matching repeated
sequences. In addition, you can specify @racket[space] for an alternate
space replacement. @racket[trim?] determines whether trimming is done
(the default).
@mz-examples[#:eval string-eval @mz-examples[#:eval string-eval
(string-normalize-spaces " foo bar baz \r\n\t") (string-normalize-spaces " foo bar baz \r\n\t")

View File

@ -367,11 +367,11 @@
;; ---------- string-join ---------- ;; ---------- string-join ----------
(let () (let ()
(test "" string-join '() " ") (test "" string-join '())
(test "" string-join '("") " ") (test "" string-join '(""))
(test " " string-join '("" "") " ") (test " " string-join '("" ""))
(test "x" string-join '("x") " ") (test "x" string-join '("x"))
(test "x y" string-join '("x" "y") " ") (test "x y" string-join '("x" "y"))
(test "x y z" string-join '("x" "y" "z") " ") (test "x y z" string-join '("x" "y" "z") " ")
(test "x,y,z" string-join '("x" "y" "z") ",")) (test "x,y,z" string-join '("x" "y" "z") ","))
@ -405,6 +405,29 @@
(test "\t x \t" string-trim " \t x \t " #px" +") (test "\t x \t" string-trim " \t x \t " #px" +")
(test " x" string-trim " x " #:left? #f) (test " x" string-trim " x " #:left? #f)
(test "x " string-trim " x " #:right? #f) (test "x " string-trim " x " #:right? #f)
(test " x " string-trim " x " #:left? #f #:right? #f)) (test " x " string-trim " x " #:left? #f #:right? #f)
(for* ([i+e '(["" "" ""]
["a" "a" "a"]
["aa" "" ""]
["aaa" "" ""] ; weird case
["aaaa" "" ""]
["aaaaa" "a" ""]
["aa-aa" "-" "-"]
["aaaaaa" "aa" ""]
["aa--aa" "--" "--"])]
[sep '("aa" #rx"aa" #px"aa")])
(define input (car i+e))
(define expected (cadr i+e))
(define expected+ (caddr i+e))
(test expected string-trim input sep)
(test expected+ string-trim input sep #:repeat? #t))
;; this is obvious, but...
(test "" string-trim "abaaba" "aba")
;; ...this is a version of the above weird case: it's questionable whether
;; this should return "" or "ba" (could also be "ab"), but it seems sensible
;; to do this (I haven't seen any existing trimmers that make any relevant
;; decision on this)
(test "" string-trim "ababa" "aba")
)
(report-errs) (report-errs)