From 6f215759ae01354dc9df22413e792da1a9693fb0 Mon Sep 17 00:00:00 2001 From: Eli Barzilay Date: Wed, 18 Apr 2012 15:19:36 -0400 Subject: [PATCH] Add `string-trim' and `string-normalize-spaces'. --- collects/racket/string.rkt | 45 ++++++++++++- collects/scribblings/reference/strings.scrbl | 34 ++++++++++ collects/tests/racket/string.rktl | 71 ++++++++++++++------ 3 files changed, 129 insertions(+), 21 deletions(-) diff --git a/collects/racket/string.rkt b/collects/racket/string.rkt index d1cee90a32..a98f8c97ec 100644 --- a/collects/racket/string.rkt +++ b/collects/racket/string.rkt @@ -1,6 +1,6 @@ -#lang scheme/base +#lang racket/base -(provide string-append* string-join) +(provide string-append* string-join string-trim string-normalize-spaces) (define string-append* (case-lambda [(strs) (apply string-append strs)] ; optimize common case @@ -20,3 +20,44 @@ [(null? strs) ""] [(null? (cdr strs)) (car strs)] [else (apply string-append (add-between strs sep))])) + +;; See http://en.wikipedia.org/wiki/Trimming_(computer_programming) for a nice +;; overview of popular names etc for these functions; +;; http://blog.stevenlevithan.com/archives/faster-trim-javascript for some ways +;; to implement trimming. +(define (string-trim str [rx #px"\\s+"] + #:left? [left? #t] #:right? [right? #t]) + (unless (string? str) (raise-type-error 'string-trim "string" str)) + (unless (regexp? rx) (raise-type-error 'string-trim "regexp" rx)) + (define len (string-length str)) + (if (zero? len) + str + (let* ([start (if (and left? (regexp-match? rx (substring str 0 1))) + (cdar (regexp-match-positions rx str)) + 0)] + [end (and right? (< start len) + (regexp-match? rx (substring str (- len 1))) + (for/or ([i (in-range (- len 2) (- start 1) -1)]) + (and (not (regexp-match? rx (substring str i (add1 i)))) + (add1 i))))]) + (if (and (not start) (not end)) + str + (substring str (or start 0) (or end len)))))) + +(define (string-normalize-spaces str [rx #px"\\s+"] + #:space [space " "] #:trim? [trim? #t]) + (define ps (regexp-match-positions* rx str)) + (if (null? ps) + str + (let ([drop-first? (and trim? (zero? (caar ps)))] + [len (string-length str)]) + (let loop ([ps (if drop-first? (cdr ps) ps)] + [i (if drop-first? (cdar ps) 0)] + [r '()]) + (if (or (null? ps) (and trim? (= len (cdar ps)))) + (apply string-append + (reverse (cons (substring str i (if (null? ps) len (caar ps))) + r))) + (loop (cdr ps) + (cdar ps) + (list* space (substring str i (caar ps)) r))))))) diff --git a/collects/scribblings/reference/strings.scrbl b/collects/scribblings/reference/strings.scrbl index dfc9526c77..929ef21a16 100644 --- a/collects/scribblings/reference/strings.scrbl +++ b/collects/scribblings/reference/strings.scrbl @@ -398,4 +398,38 @@ each pair of strings in @racket[strs]. (string-join '("one" "two" "three" "four") " potato ") ]} +@defproc[(string-trim [str string?] [rx regexp? #px"\\s+"] + [#:left? left? any/c #t] [#:right? right? any/c #t]) + string?]{ + +Trims the input @racket[str] by removing prefix and suffix matches of +@racket[rx]. Use @racket[#:left?] or @racket[#:right?] to suppress +trimming one of these sides. + +The @racket[rx] regexp should match a whole (non-empty) sequence of +spaces and should not rely on surrounding context. This means that it +should usually end with a @litchar{+}, and that it should not use +@litchar{^}, @litchar{$}, or other lookaheads and lookbacks. (The +regexp is expected to both identify a whole sequence of spaces, and +match on a non-empty part of such a sequence.) + +@mz-examples[#:eval string-eval + (string-trim " foo bar baz \r\n\t") +]} + +@defproc[(string-normalize-spaces + [str string?] [rx regexp? #px"\\s+"] + [#:space space string? " "] [#:trim? trim? any/c #t]) + string?]{ + +Normalizes spaces (matching @racket[rx]) in the input @racket[str] by +replacing them with @racket[space]. In the default configuration, this +will replace any sequence of whitespaces by a single space character. +In addition, prefix and suffix spaces are trimmed if @racket[trim?] is +true, otherwise they get normalized too. + +@mz-examples[#:eval string-eval + (string-normalize-spaces " foo bar baz \r\n\t") +]} + @close-eval[string-eval] diff --git a/collects/tests/racket/string.rktl b/collects/tests/racket/string.rktl index ac41eb5317..d673da1f94 100644 --- a/collects/tests/racket/string.rktl +++ b/collects/tests/racket/string.rktl @@ -337,25 +337,6 @@ (t " 12 34 " #f "" " 12 34 ") ) -;; ---------- string-append* ---------- -(let () - (test "" string-append* '()) - (test "" string-append* '("")) - (test "" string-append* '("" "")) - (test "0123456789" string-append* '("0123456789")) - (test "0123456789" string-append* "0123456789" '()) - (test "0123456789" string-append* "0123456789" '("")) - (test "0123456789" string-append* "0123456789" '("" "")) - (test "0123456789" string-append* "01234567" '("8" "9"))) - -;; ---------- string-join ---------- -(let () - (test "" string-join '() " ") - (test "" string-join '("") " ") - (test " " string-join '("" "") " ") - (test "x y" string-join '("x" "y") " ") - (test "x" string-join '("x") " ")) - ;; String splitting can take longer than byte-string splitting, ;; but it should have the same computational complexity. (let () @@ -373,5 +354,57 @@ (and ((* 100 (- bcpu bgc)) . < . (- scpu sgc)) "suspiciously long time for regexp string split"))) +;; ---------- string-append* ---------- +(let ([t (λ (x . xs) (test ))]) + (test "" string-append* '()) + (test "" string-append* '("")) + (test "" string-append* '("" "")) + (test "0123456789" string-append* '("0123456789")) + (test "0123456789" string-append* "0123456789" '()) + (test "0123456789" string-append* "0123456789" '("")) + (test "0123456789" string-append* "0123456789" '("" "")) + (test "0123456789" string-append* "01234567" '("8" "9"))) + +;; ---------- string-join ---------- +(let () + (test "" string-join '() " ") + (test "" string-join '("") " ") + (test " " string-join '("" "") " ") + (test "x" string-join '("x") " ") + (test "x y" string-join '("x" "y") " ") + (test "x y z" string-join '("x" "y" "z") " ") + (test "x,y,z" string-join '("x" "y" "z") ",")) + +;; ---------- string-trim & string-normalize-spaces ---------- +(let () + (define spaces '("" " " " " "\r" "\r\n\t ")) + (define ++ string-append) + (define-syntax-rule (with-spaces id E ...) + (for ([id (in-list spaces)]) E ...)) + (define (both result arg) + (test result string-trim arg) + (test result string-normalize-spaces arg)) + (define (norm s) (if (equal? "" s) s " ")) + (with-spaces s1 + (both "" s1) + (with-spaces s2 + (both "x" (++ s1 "x" s2)) + (both "xx" (++ s1 "xx" s2)) + (with-spaces s3 + (test (++ "x" s3 "x") string-trim (++ s1 "x" s3 "x" s2)) + (test (++ "x" (norm s3) "x") + string-normalize-spaces (++ s1 "x" s3 "x" s2)) + (test (++ (norm s1) "x" (norm s3) "x" (norm s2)) + string-normalize-spaces (++ s1 "x" s3 "x" s2) #:trim? #f) + (with-spaces s4 + (test (++ "x" (norm s3) "y" (norm s4) "z") + string-normalize-spaces (++ s1 "x" s3 "y" s4 "z" s2)) + (test (++ (norm s1) "x" (norm s3) "y" (norm s4) "z" (norm s2)) + string-normalize-spaces (++ s1 "x" s3 "y" s4 "z" s2) + #:trim? #f))))) + (test "\t x \t" string-trim " \t x \t " #px" +") + (test " x" string-trim " x " #:left? #f) + (test "x " string-trim " x " #:right? #f) + (test " x " string-trim " x " #:left? #f #:right? #f)) (report-errs)