From 816b9a818f804d22b9206c923ab90f1eb3217aae Mon Sep 17 00:00:00 2001 From: Matthew Flatt Date: Thu, 26 Jun 2014 17:56:39 +0100 Subject: [PATCH] reference & guide: improve explanation of regexp performance Explain better that regexp values can be used more efficiently than strings or byte strings as patterns. Adjust all examples to use regexp values instead of strings. --- .../scribblings/guide/performance.scrbl | 31 ++++++++++++ .../scribblings/reference/regexps.scrbl | 48 ++++++++++--------- 2 files changed, 56 insertions(+), 23 deletions(-) diff --git a/pkgs/racket-pkgs/racket-doc/scribblings/guide/performance.scrbl b/pkgs/racket-pkgs/racket-doc/scribblings/guide/performance.scrbl index f1f5c5c5c5..26418eaed7 100644 --- a/pkgs/racket-pkgs/racket-doc/scribblings/guide/performance.scrbl +++ b/pkgs/racket-pkgs/racket-doc/scribblings/guide/performance.scrbl @@ -356,6 +356,37 @@ Beware that, as ``unsafe'' in the library and function names suggest, misusing the exports of @racketmodname[racket/unsafe/ops] can lead to crashes or memory corruption. +@; ---------------------------------------------------------------------- + +@section[#:tag "regexp-perf"]{Regular Expression Performance} + +When a string or byte string is provided to a function like +@racket[regexp-match], then the string is internally compiled into +a @tech{regexp} value. Instead of supplying a string or byte string +multiple times as a pattern for matching, compile the pattern once to +a @tech{regexp} value using @racket[regexp], @racket[byte-regexp], +@racket[pregexp], or @racket[byte-pregexp]. In place of a constant +string or byte string, write a constant @tech{regexp} using an +@litchar{#rx} or @litchar{#px} prefix. + +@racketblock[ +(define (slow-matcher str) + (regexp-match? "[0-9]+" str)) + +(define (fast-matcher str) + (regexp-match? #rx"[0-9]+" str)) + +(define (make-slow-matcher pattern-str) + (lambda (str) + (regexp-match? 
pattern-str str))) + +(define (make-fast-matcher pattern-str) + (define pattern-rx (regexp pattern-str)) + (lambda (str) + (regexp-match? pattern-rx str))) +] + + @; ---------------------------------------------------------------------- @section[#:tag "gc-perf"]{Memory Management} diff --git a/pkgs/racket-pkgs/racket-doc/scribblings/reference/regexps.scrbl b/pkgs/racket-pkgs/racket-doc/scribblings/reference/regexps.scrbl index bb7b7cc286..5a218ddc1f 100644 --- a/pkgs/racket-pkgs/racket-doc/scribblings/reference/regexps.scrbl +++ b/pkgs/racket-pkgs/racket-doc/scribblings/reference/regexps.scrbl @@ -43,25 +43,27 @@ port, it matches UTF-8 encodings (see @secref["encodings"]) of matching character streams; if a byte regexp is used with a character string, it matches bytes in the UTF-8 encoding of the string. -Regular expressions can be compiled into a @deftech{regexp value} for -repeated matches. The @racket[regexp] and @racket[byte-regexp] -procedures convert a string or byte string (respectively) into a -regexp value using one syntax of regular expressions that is most -compatible to @exec{egrep}. The @racket[pregexp] and -@racket[byte-pregexp] procedures produce a regexp value using a -slightly different syntax of regular expressions that is more -compatible with Perl. +A regular expression that is represented as a string or byte string +can be compiled to a @deftech{regexp value}, which can be used more +efficiently by functions such as @racket[regexp-match] than the +string or byte string form. The @racket[regexp] and +@racket[byte-regexp] procedures convert a string or byte string +(respectively) into a regexp value using a syntax of regular +expressions that is most compatible with @exec{egrep}. The +@racket[pregexp] and @racket[byte-pregexp] procedures produce a regexp +value using a slightly different syntax of regular expressions that is +more compatible with Perl. -Two regular expressions are @racket[equal?] 
if they have the same +Two @tech{regexp values} are @racket[equal?] if they have the same source, use the same pattern language, and are both character regexps or both byte regexps. -A literal or printed regular expression starts with @litchar{#rx} or -@litchar{#px}. @see-read-print["regexp"]{regular expressions} Regular -expressions produced by the default reader are @tech{interned} in +A literal or printed @tech{regexp value} starts with @litchar{#rx} or +@litchar{#px}. @see-read-print["regexp"]{regular expressions} Regexp +values produced by the default reader are @tech{interned} in @racket[read-syntax] mode. -The internal size of a regexp value is limited to 32 kilobytes; this +The internal size of a @tech{regexp value} is limited to 32 kilobytes; this limit roughly corresponds to a source string with 32,000 literal characters or 5,000 operators. @@ -850,10 +852,10 @@ before the @litchar{\}. For example, the Racket constant @racket["\\1"] is @litchar{\1}. @examples[ -(regexp-replace "mi" "mi casa" "su") -(regexp-replace "mi" "mi casa" string-upcase) -(regexp-replace "([Mm])i ([a-zA-Z]*)" "Mi Casa" "\\1y \\2") -(regexp-replace "([Mm])i ([a-zA-Z]*)" "mi cerveza Mi Mi Mi" +(regexp-replace #rx"mi" "mi casa" "su") +(regexp-replace #rx"mi" "mi casa" string-upcase) +(regexp-replace #rx"([Mm])i ([a-zA-Z]*)" "Mi Casa" "\\1y \\2") +(regexp-replace #rx"([Mm])i ([a-zA-Z]*)" "mi cerveza Mi Mi Mi" "\\1y \\2") (regexp-replace #rx"x" "12x4x6" "\\\\") (display (regexp-replace #rx"x" "12x4x6" "\\\\")) @@ -882,9 +884,9 @@ a portion of @racket[input] for matching; the default is the entire string or the stream up to an end-of-file. 
@examples[ -(regexp-replace* "([Mm])i ([a-zA-Z]*)" "mi cerveza Mi Mi Mi" +(regexp-replace* #rx"([Mm])i ([a-zA-Z]*)" "mi cerveza Mi Mi Mi" "\\1y \\2") -(regexp-replace* "([Mm])i ([a-zA-Z]*)" "mi cerveza Mi Mi Mi" +(regexp-replace* #rx"([Mm])i ([a-zA-Z]*)" "mi cerveza Mi Mi Mi" (lambda (all one two) (string-append (string-downcase one) "y" (string-upcase two)))) @@ -908,9 +910,9 @@ order, so later replacements can apply to previous insertions. @examples[ (regexp-replaces "zero-or-more?" - '([#rx"-" "_"] [#rx"(.*)\\?$" "is_\\1"])) + '([#rx"-" "_"] [#rx"(.*)\\?$" "is_\\1"])) (regexp-replaces "zero-or-more?" - '(["e" "o"] ["o" "oo"])) + '([#rx"e" "o"] [#rx"o" "oo"])) ]} @defproc*[([(regexp-replace-quote [str string?]) string?] @@ -923,6 +925,6 @@ Concretely, every @litchar{\} and @litchar{&} in @racket[str] or @racket[bstr] is protected by a quoting @litchar{\}. @examples[ -(regexp-replace "UT" "Go UT!" "A&M") -(regexp-replace "UT" "Go UT!" (regexp-replace-quote "A&M")) +(regexp-replace #rx"UT" "Go UT!" "A&M") +(regexp-replace #rx"UT" "Go UT!" (regexp-replace-quote "A&M")) ]}