From 800ea98525a1a642d484174688ae0d676846e03e Mon Sep 17 00:00:00 2001 From: Gustavo Massaccesi Date: Sat, 30 Mar 2019 16:20:15 -0300 Subject: [PATCH] use KMP to avoid quadratic time in string-contains? --- .../racket-test-core/tests/racket/string.rktl | 62 +++++++- racket/collects/racket/string.rkt | 132 ++++++++++++++++-- 2 files changed, 183 insertions(+), 11 deletions(-) diff --git a/pkgs/racket-test-core/tests/racket/string.rktl b/pkgs/racket-test-core/tests/racket/string.rktl index fb9a6a06a9..c7f8106216 100644 --- a/pkgs/racket-test-core/tests/racket/string.rktl +++ b/pkgs/racket-test-core/tests/racket/string.rktl @@ -3,7 +3,8 @@ (Section 'string) -(require racket/string) +(require racket/string + (submod racket/string private)) ;; ---------- real->decimal-string ---------- (test "0." real->decimal-string 0 0) @@ -536,10 +537,65 @@ (test #t string-contains? "racket" "cket") (test #t string-contains? "racket" "acke") (test #t string-contains? "racket" "t") - (test #t string-contains? "racket" "") (test #f string-contains? "racket" "b") (test #f string-contains? "racket" "R") + (test #f string-contains? "RACKET" "r") (test #f string-contains? "racket" "kc") - (test #f string-contains? "racket" "racketr")) + (test #f string-contains? "racket" "racketr") + (test #t string-contains? "racket" "") + (test #t string-contains? "" "") + (test #f string-contains? "" "racket") + (test #f string-contains? "racket" "a..e") + (test #t string-contains? "ra..et" "a..e") + ; string-contains? sometimes uses diferent code paths for short and long string, + ; so add some long test too. + (test #t string-contains? "racket012345678901234567890123456789012345678901234567890123456789racket" + "racket012345678901234567890123456789012345678901234567890123456789racket") + (test #t string-contains? "racket012345678901234567890123456789012345678901234567890123456789racket" + "racket01234567890123456789") + (test #t string-contains? "racket012345678901234567890123456789012345678901234567890123456789racket" + "01234567890123456789racket") + (test #t string-contains? "racket012345678901234567890123456789012345678901234567890123456789racket" + "012345678901234567890123456789") + (test #f string-contains? "racket012345678901234567890123456789012345678901234567890123456789racket" + "racket01234567890123456789racket") + (test #t string-contains? "racket0123456789012345678901234567890123456789aaaaaaaaaaaaaaaaaaaaaaaaa" + "aaaaaaaaaaaaaaaaaaaaaaaaa") + (test #t string-contains? "aaaaaaaaaaaaaaaaaaaaaaaaa0123456789012345678901234567890123456789racket" + "aaaaaaaaaaaaaaaaaaaaaaaaa") + (test #f string-contains? "aaaaaaaaaaaaaaaaaaaaaaaa_012345678901234567890_aaaaaaaaaaaaaaaaaaaaaaaa" + "aaaaaaaaaaaaaaaaaaaaaaaaa") + (test #t string-contains? "aaaaaaaaaaaaaaaaaaaaaaaaa012345678901234567890_aaaaaaaaaaaaaaaaaaaaaaaa" + "aaaaaaaaaaaaaaaaaaaaaaaaa") + (test #t string-contains? "aaaaaaaaaaaaaaaaaaaaaaaa_012345678901234567890aaaaaaaaaaaaaaaaaaaaaaaaa" + "aaaaaaaaaaaaaaaaaaaaaaaaa") + (test #t string-contains? "1234567890aaaaa123456789012345678901234567890aaaaa1234567890123456789012345678901234567890" + "1234567890123456789012345678901234567890") + (test #f string-contains? "1234567890aaaaa123456789012345678901234567890aaaaa123456789012345678901234567890aaaa" + "1234567890123456789012345678901234567890") + (test #f string-contains? "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy") + (test #f string-contains? "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + "xxxxxxxxxxxxxxxxxxxxxxxxy") + (test #f string-contains? "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + "yxxxxxxxxxxxxxxxxxxxxxxxx") + (test #f string-contains? "xxxxxxxxxxxxxxxxxxxxxxxxyxxxxxxxxxxxxxxxxxxxxxxxxyxxxxxxxxxxxxxxxxxxxxx" + "xxxxxxxxxxxxxxxxxxxxxxxxx")) + +;; ---------- build-kmp-table ---------- + +; The tables have one more coefficient than the usual tables, +; that can be used to get or count all the matches. +(let () + (test #(#f 0 0 0 0 0 0) build-kmp-table "racket") + (test #(#f #f #f #f #f #f 5) build-kmp-table "rrrrrr") + (test #(#f 0 #f 0 #f 0 4) build-kmp-table "ababab") + (test #(#f 0 0 #f 0 0 #f 0 0 6) build-kmp-table "abcabcabc") + (test #(#f 0 0 #f 0 0 #f 0 0 6) build-kmp-table "abbabbabb") + (test #(#f #f 1 #f #f 1 #f #f 1 6) build-kmp-table "aabaabaab") + (test #(#f #f 1 0 #f #f 1 0 #f #f 1 0 8 ) build-kmp-table "aabbaabbaabb") + ; examples from https://en.wikipedia.org/wiki/Knuth–Morris–Pratt_algorithm + (test #(#f 0 0 0 #f 0 2 0) build-kmp-table "abcdabd") + (test #(#f 0 #f 1 #f 0 #f 3 2 0) build-kmp-table "abacababc")) (report-errs) diff --git a/racket/collects/racket/string.rkt b/racket/collects/racket/string.rkt index d01adc4102..8a71c47554 100644 --- a/racket/collects/racket/string.rkt +++ b/racket/collects/racket/string.rkt @@ -11,6 +11,10 @@ string-suffix? string-contains?) +(module+ private + (provide build-kmp-table)) + + (define string-append* (case-lambda [(strs) (apply string-append strs)] ; optimize common cases [(s1 strs) (apply string-append s1 strs)] @@ -174,6 +178,71 @@ (and (char=? (string-ref str i+o) (string-ref suffix i)) (loop (add1 i+o) (add1 i))))))) +;; string-contains? uses a variant of the Knuth-Morris-Pratt string search +;; algorithm. It start with a direct search without the KMP table, until +;; it finds a partial match that long enough (currently 4 characters). +;; The table is also skiped when there are only a few characters left +;; to test (currently 4 characters). +;; The first time a partial match is found, it buils a small partial table +;; and if it is necesay it builds the complete table. +;; The KMP table has one more element as usual, that is unused for now. +;; The last coefficient can be used to continue the search in case a +;; match is found and we want to get all the matches or count them. +(define (build-kmp-table sub) + (define L (string-length sub)) + (if (> L 0) + (build-kmp-table/partial sub L #f) + (make-vector 1 #f))) + +(define (build-kmp-table/partial/first sub pos) + (define L (string-length sub)) + (if (> L 0) + (build-kmp-table/partial sub (min L (* 2 pos)) #f) + (make-vector 1 #f))) + +(define (build-kmp-table/partial/next sub pos prev) + (define L (string-length sub)) + (if (> L 0) + (build-kmp-table/partial sub L prev) + (make-vector 1 #f))) + +(define (build-kmp-table/partial sub vL prev) + (define L (string-length sub)) + (define pos (if prev + (sub1 (vector-length prev)) + 1)) + (define cnd (if prev + (unbox (vector-ref prev (sub1 (vector-length prev)))) + 0)) + (define t (make-vector (add1 vL) #f)) + (when prev + (vector-copy! t 0 prev)) + + (let loop ([pos pos] [cnd cnd]) + (cond + [(= pos L) + (vector-set! t pos cnd) + t] + [(= pos vL) + (vector-set! t pos (box cnd)) + t] + [(char=? (string-ref sub pos) + (string-ref sub cnd)) + (vector-set! t pos (vector-ref t cnd)) + (loop (add1 pos) (add1 cnd))] + [else + (vector-set! t pos cnd) + (let loop2 ([pos pos] + [cnd (vector-ref t cnd)]) + (cond + [(not cnd) + (loop (add1 pos) 0)] + [(char=? (string-ref sub pos) + (string-ref sub cnd)) + (loop (add1 pos) (add1 cnd))] + [else + (loop2 pos (vector-ref t cnd))]))]))) + (define (string-contains? str sub) (unless (string? str) (raise-argument-error 'string-contains? "string?" str)) @@ -182,12 +251,59 @@ (define L1 (string-length str)) (define L2 (string-length sub)) (define d (- L1 L2)) - (or (zero? L2) - (let loop ([start 0]) + (define d-4 (- d 4)) + + (or (= L2 0) + (let loop ([start 0] + [offset 0]) + (define start+offset (+ start offset)) (and (<= start d) - (or (let loop2 ([offset 0]) - (or (= offset L2) - (and (char=? (string-ref str (+ start offset)) - (string-ref sub offset)) - (loop2 (add1 offset))))) - (loop (add1 start))))))) + (or (= offset L2) + (cond + [(char=? (string-ref sub offset) + (string-ref str start+offset)) + (loop start (add1 offset))] + [(or (<= offset 4) (>= start d-4)) + (loop (add1 start) 0)] + [else + (define t (build-kmp-table/partial/first sub offset)) + (define skip (vector-ref t offset)) + (cond + [skip + (string-contains?/table str sub (- start+offset skip) skip t)] + [else + (string-contains?/table str sub (add1 start+offset) 0 t)])])))))) + +(define (string-contains?/table str sub start offset t) + (define L1 (string-length str)) + (define L2 (string-length sub)) + (define d (- L1 L2)) + + (or (= L2 0) + (let loop ([start start] + [offset offset] + [t t] + [tL (sub1 (vector-length t))]) + (define start+offset (+ start offset)) + (and (<= start d) + (or (= offset L2) + (cond + [(char=? (string-ref sub offset) + (string-ref str start+offset)) + (loop start (add1 offset) t tL)] + [(< offset tL) + (define skip (vector-ref t offset)) + (cond + [skip + (loop (- start+offset skip) skip t tL)] + [else + (loop (add1 start+offset) 0 t tL)])] + [else + (let* ([t (build-kmp-table/partial/next sub offset t)] + [tL (sub1 (vector-length t))] + [skip (vector-ref t offset)]) + (cond + [skip + (loop (- start+offset skip) skip t tL)] + [else + (loop (add1 start+offset) 0 t tL)]))]))))))