Redo string-trim' and
string-normalize-spaces'.
This is following the mailing list discussion. In addition get `string-join' more in-line with these by making its `sep' argument default to a space.
This commit is contained in:
parent
abf9223203
commit
29beae55c1
|
@ -12,7 +12,7 @@
|
||||||
|
|
||||||
(require (only-in racket/list add-between))
|
(require (only-in racket/list add-between))
|
||||||
|
|
||||||
(define (string-join strs sep)
|
(define (string-join strs [sep " "])
|
||||||
(cond [(not (and (list? strs) (andmap string? strs)))
|
(cond [(not (and (list? strs) (andmap string? strs)))
|
||||||
(raise-type-error 'string-join "list-of-strings" strs)]
|
(raise-type-error 'string-join "list-of-strings" strs)]
|
||||||
[(not (string? sep))
|
[(not (string? sep))
|
||||||
|
@ -21,43 +21,57 @@
|
||||||
[(null? (cdr strs)) (car strs)]
|
[(null? (cdr strs)) (car strs)]
|
||||||
[else (apply string-append (add-between strs sep))]))
|
[else (apply string-append (add-between strs sep))]))
|
||||||
|
|
||||||
|
;; Utilities for the functions below
|
||||||
|
(define none (gensym))
|
||||||
|
(define get-rxs
|
||||||
|
(let ([t (make-weak-hasheq)] [t+ (make-weak-hasheq)])
|
||||||
|
(let ([spaces '(#px"\\s+" #px"^\\s+" #px"\\s+$")])
|
||||||
|
(hash-set! t none spaces)
|
||||||
|
(hash-set! t+ none spaces))
|
||||||
|
(λ (who rx +?)
|
||||||
|
(hash-ref! (if +? t+ t) rx
|
||||||
|
(λ () (let* ([s (cond [(string? rx) (regexp-quote rx)]
|
||||||
|
[(regexp? rx) (object-name rx)]
|
||||||
|
[else (raise-type-error
|
||||||
|
who "string-or-regexp" rx)])]
|
||||||
|
[s (if +? (string-append "(?:" s ")+") s)]
|
||||||
|
[^s (string-append "^" s)]
|
||||||
|
[s$ (string-append s "$")])
|
||||||
|
(if (pregexp? rx)
|
||||||
|
(list (pregexp s) (pregexp ^s) (pregexp s$))
|
||||||
|
(list (regexp s) (regexp ^s) (regexp s$)))))))))
|
||||||
|
|
||||||
|
;; returns start+end positions, #f when no trimming should happen
|
||||||
|
(define (internal-trim who str sep l? r? rxs)
|
||||||
|
(unless (string? str) (raise-type-error who "string" str))
|
||||||
|
(define l
|
||||||
|
(and l? (let ([p (regexp-match-positions (car rxs) str)])
|
||||||
|
(and p (let ([p (cdar p)]) (and (> p 0) p))))))
|
||||||
|
(define r
|
||||||
|
(and r? (let ([p (regexp-match-positions (cadr rxs) str)])
|
||||||
|
(and p (let ([p (caar p)])
|
||||||
|
(and (< p (string-length str))
|
||||||
|
(if (and l (> l p)) l p)))))))
|
||||||
|
(values l r))
|
||||||
|
|
||||||
;; See http://en.wikipedia.org/wiki/Trimming_(computer_programming) for a nice
|
;; See http://en.wikipedia.org/wiki/Trimming_(computer_programming) for a nice
|
||||||
;; overview of popular names etc for these functions;
|
;; overview of popular names etc for these functions;
|
||||||
;; http://blog.stevenlevithan.com/archives/faster-trim-javascript for some ways
|
;; http://blog.stevenlevithan.com/archives/faster-trim-javascript for some ways
|
||||||
;; to implement trimming.
|
;; to implement trimming.
|
||||||
(define (string-trim str [rx #px"\\s+"]
|
(define (string-trim str [sep none]
|
||||||
#:left? [left? #t] #:right? [right? #t])
|
#:left? [l? #t] #:right? [r? #t] #:repeat? [+? #f])
|
||||||
(unless (string? str) (raise-type-error 'string-trim "string" str))
|
(define rxs (get-rxs 'string-trim sep +?))
|
||||||
(unless (regexp? rx) (raise-type-error 'string-trim "regexp" rx))
|
(define-values [l r] (internal-trim 'string-trim str sep l? r? (cdr rxs)))
|
||||||
(define len (string-length str))
|
(cond [(and l r) (substring str l r)]
|
||||||
(if (zero? len)
|
[l (substring str l)]
|
||||||
str
|
[r (substring str 0 r)]
|
||||||
(let* ([start (if (and left? (regexp-match? rx (substring str 0 1)))
|
[else str]))
|
||||||
(cdar (regexp-match-positions rx str))
|
|
||||||
0)]
|
|
||||||
[end (and right? (< start len)
|
|
||||||
(regexp-match? rx (substring str (- len 1)))
|
|
||||||
(for/or ([i (in-range (- len 2) (- start 1) -1)])
|
|
||||||
(and (not (regexp-match? rx (substring str i (add1 i))))
|
|
||||||
(add1 i))))])
|
|
||||||
(if (and (not start) (not end))
|
|
||||||
str
|
|
||||||
(substring str (or start 0) (or end len))))))
|
|
||||||
|
|
||||||
(define (string-normalize-spaces str [rx #px"\\s+"]
|
(define (string-normalize-spaces str [sep none] [space " "]
|
||||||
#:space [space " "] #:trim? [trim? #t])
|
#:trim? [trim? #t] #:repeat? [+? #f])
|
||||||
(define ps (regexp-match-positions* rx str))
|
(define rxs (get-rxs 'string-normalize-spaces sep +?))
|
||||||
(if (null? ps)
|
(define-values [l r]
|
||||||
str
|
(if trim?
|
||||||
(let ([drop-first? (and trim? (zero? (caar ps)))]
|
(internal-trim 'string-normalize-spaces str sep #t #t (cdr rxs))
|
||||||
[len (string-length str)])
|
(values #f #f)))
|
||||||
(let loop ([ps (if drop-first? (cdr ps) ps)]
|
(string-join (regexp-split (car rxs) str (or l 0) r) space))
|
||||||
[i (if drop-first? (cdar ps) 0)]
|
|
||||||
[r '()])
|
|
||||||
(if (or (null? ps) (and trim? (= len (cdar ps))))
|
|
||||||
(apply string-append
|
|
||||||
(reverse (cons (substring str i (if (null? ps) len (caar ps)))
|
|
||||||
r)))
|
|
||||||
(loop (cdr ps)
|
|
||||||
(cdar ps)
|
|
||||||
(list* space (substring str i (caar ps)) r)))))))
|
|
||||||
|
|
|
@ -389,44 +389,62 @@ one between @racket[list] and @racket[list*].
|
||||||
'("Alpha" "Beta" "Gamma")))))
|
'("Alpha" "Beta" "Gamma")))))
|
||||||
]}
|
]}
|
||||||
|
|
||||||
@defproc[(string-join [strs (listof string?)] [sep string?]) string?]{
|
@defproc[(string-join [strs (listof string?)] [sep string? " "]) string?]{
|
||||||
|
|
||||||
Appends the strings in @racket[strs], inserting @racket[sep] between
|
Appends the strings in @racket[strs], inserting @racket[sep] between
|
||||||
each pair of strings in @racket[strs].
|
each pair of strings in @racket[strs].
|
||||||
|
|
||||||
@mz-examples[#:eval string-eval
|
@mz-examples[#:eval string-eval
|
||||||
|
(string-join '("one" "two" "three" "four"))
|
||||||
|
(string-join '("one" "two" "three" "four") ", ")
|
||||||
(string-join '("one" "two" "three" "four") " potato ")
|
(string-join '("one" "two" "three" "four") " potato ")
|
||||||
]}
|
]}
|
||||||
|
|
||||||
@defproc[(string-trim [str string?] [rx regexp? #px"\\s+"]
|
@; *********************************************************************
|
||||||
[#:left? left? any/c #t] [#:right? right? any/c #t])
|
@; Meta note: these functions are intended to be newbie-friendly, so I'm
|
||||||
|
@; intentionally starting the descriptions with a short senstence that
|
||||||
|
@; describes the default behavior instead of diving straight to a
|
||||||
|
@; precise description.
|
||||||
|
|
||||||
|
@defproc[(string-trim [str string?]
|
||||||
|
[sep (or/c string? regexp?) #px"\\s+"]
|
||||||
|
[#:left? left? any/c #t]
|
||||||
|
[#:right? right? any/c #t]
|
||||||
|
[#:repeat? repeat? any/c #f])
|
||||||
string?]{
|
string?]{
|
||||||
|
|
||||||
Trims the input @racket[str] by removing prefix and suffix matches of
|
Trims the input @racket[str] by removing prefix and suffix whitespaces.
|
||||||
@racket[rx]. Use @racket[#:left?] or @racket[#:right?] to suppress
|
|
||||||
trimming one of these sides.
|
|
||||||
|
|
||||||
The @racket[rx] regexp should match a whole (non-empty) sequence of
|
The optional @racket[sep] argument can be specified as either a string
|
||||||
spaces and should not rely on surrounding context. This means that it
|
or a (p)regexp to remove a different prefix/suffix; a string is matched
|
||||||
should usually end with a @litchar{+}, and that it should not use
|
as-is. Use @racket[#:left?] or @racket[#:right?] to suppress trimming
|
||||||
@litchar{^}, @litchar{$}, or other lookaheads and lookbacks. (The
|
one of these sides. When @racket[repeat?] is @racket[#f] (the default),
|
||||||
regexp is expected to both identify a whole sequence of spaces, and
|
only one match is removed from each side, but when it is true any number
|
||||||
match on a non-empty part of such a sequence.)
|
of matches is trimmed. (Note that with a regexp separator you can use
|
||||||
|
@litchar{+} instead.)
|
||||||
|
|
||||||
@mz-examples[#:eval string-eval
|
@mz-examples[#:eval string-eval
|
||||||
(string-trim " foo bar baz \r\n\t")
|
(string-trim " foo bar baz \r\n\t")
|
||||||
|
(string-trim " foo bar baz \r\n\t" " " #:repeat? #t)
|
||||||
|
(string-trim "aaaxaayaa" "aa")
|
||||||
]}
|
]}
|
||||||
|
|
||||||
@defproc[(string-normalize-spaces
|
@defproc[(string-normalize-spaces [str string?]
|
||||||
[str string?] [rx regexp? #px"\\s+"]
|
[sep (or/c string? regexp?) #px"\\s+"]
|
||||||
[#:space space string? " "] [#:trim? trim? any/c #t])
|
[space string? " "]
|
||||||
|
[#:trim? trim? any/c #t]
|
||||||
|
[#:repeat? repeat? any/c #f])
|
||||||
string?]{
|
string?]{
|
||||||
|
|
||||||
Normalizes spaces (matching @racket[rx]) in the input @racket[str] by
|
Normalizes spaces in the input @racket[str] by trimming it (using
|
||||||
replacing them with @racket[space]. In the default configuration, this
|
@racket[string-trim]) and replacing all whitespace sequences in the
|
||||||
will replace any sequence of whitespaces by a single space character.
|
result with a single space.
|
||||||
In addition, prefix and suffix spaces are trimmed if @racket[trim?] is
|
|
||||||
true, otherwise they get normalized too.
|
Similarly to @racket[string-trim], @racket[sep] can be given as a string
|
||||||
|
or a (p)regexp, and @racket[repeat?] controls matching repeated
|
||||||
|
sequences. In addition, you can specify @racket[space] for an alternate
|
||||||
|
space replacement. @racket[trim?] determines whether trimming is done
|
||||||
|
(the default).
|
||||||
|
|
||||||
@mz-examples[#:eval string-eval
|
@mz-examples[#:eval string-eval
|
||||||
(string-normalize-spaces " foo bar baz \r\n\t")
|
(string-normalize-spaces " foo bar baz \r\n\t")
|
||||||
|
|
|
@ -367,11 +367,11 @@
|
||||||
|
|
||||||
;; ---------- string-join ----------
|
;; ---------- string-join ----------
|
||||||
(let ()
|
(let ()
|
||||||
(test "" string-join '() " ")
|
(test "" string-join '())
|
||||||
(test "" string-join '("") " ")
|
(test "" string-join '(""))
|
||||||
(test " " string-join '("" "") " ")
|
(test " " string-join '("" ""))
|
||||||
(test "x" string-join '("x") " ")
|
(test "x" string-join '("x"))
|
||||||
(test "x y" string-join '("x" "y") " ")
|
(test "x y" string-join '("x" "y"))
|
||||||
(test "x y z" string-join '("x" "y" "z") " ")
|
(test "x y z" string-join '("x" "y" "z") " ")
|
||||||
(test "x,y,z" string-join '("x" "y" "z") ","))
|
(test "x,y,z" string-join '("x" "y" "z") ","))
|
||||||
|
|
||||||
|
@ -405,6 +405,29 @@
|
||||||
(test "\t x \t" string-trim " \t x \t " #px" +")
|
(test "\t x \t" string-trim " \t x \t " #px" +")
|
||||||
(test " x" string-trim " x " #:left? #f)
|
(test " x" string-trim " x " #:left? #f)
|
||||||
(test "x " string-trim " x " #:right? #f)
|
(test "x " string-trim " x " #:right? #f)
|
||||||
(test " x " string-trim " x " #:left? #f #:right? #f))
|
(test " x " string-trim " x " #:left? #f #:right? #f)
|
||||||
|
(for* ([i+e '(["" "" ""]
|
||||||
|
["a" "a" "a"]
|
||||||
|
["aa" "" ""]
|
||||||
|
["aaa" "" ""] ; weird case
|
||||||
|
["aaaa" "" ""]
|
||||||
|
["aaaaa" "a" ""]
|
||||||
|
["aa-aa" "-" "-"]
|
||||||
|
["aaaaaa" "aa" ""]
|
||||||
|
["aa--aa" "--" "--"])]
|
||||||
|
[sep '("aa" #rx"aa" #px"aa")])
|
||||||
|
(define input (car i+e))
|
||||||
|
(define expected (cadr i+e))
|
||||||
|
(define expected+ (caddr i+e))
|
||||||
|
(test expected string-trim input sep)
|
||||||
|
(test expected+ string-trim input sep #:repeat? #t))
|
||||||
|
;; this is obvious, but...
|
||||||
|
(test "" string-trim "abaaba" "aba")
|
||||||
|
;; ...this is a version of the above weird case: it's questionable whether
|
||||||
|
;; this should return "" or "ba" (could also be "ab"), but it seems sensible
|
||||||
|
;; to do this (I haven't seen any existing trimmers that make any relevant
|
||||||
|
;; decision on this)
|
||||||
|
(test "" string-trim "ababa" "aba")
|
||||||
|
)
|
||||||
|
|
||||||
(report-errs)
|
(report-errs)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user