use KMP to avoid quadratic time in string-contains?

This commit is contained in:
Gustavo Massaccesi 2019-03-30 16:20:15 -03:00
parent a8d5a4f2f4
commit 800ea98525
2 changed files with 183 additions and 11 deletions

View File

@ -3,7 +3,8 @@
(Section 'string)
(require racket/string)
(require racket/string
(submod racket/string private))
;; ---------- real->decimal-string ----------
(test "0." real->decimal-string 0 0)
@ -536,10 +537,65 @@
(test #t string-contains? "racket" "cket")
(test #t string-contains? "racket" "acke")
(test #t string-contains? "racket" "t")
(test #t string-contains? "racket" "")
(test #f string-contains? "racket" "b")
(test #f string-contains? "racket" "R")
(test #f string-contains? "RACKET" "r")
(test #f string-contains? "racket" "kc")
(test #f string-contains? "racket" "racketr"))
(test #f string-contains? "racket" "racketr")
(test #t string-contains? "racket" "")
(test #t string-contains? "" "")
(test #f string-contains? "" "racket")
(test #f string-contains? "racket" "a..e")
(test #t string-contains? "ra..et" "a..e")
; string-contains? sometimes uses diferent code paths for short and long string,
; so add some long test too.
(test #t string-contains? "racket012345678901234567890123456789012345678901234567890123456789racket"
"racket012345678901234567890123456789012345678901234567890123456789racket")
(test #t string-contains? "racket012345678901234567890123456789012345678901234567890123456789racket"
"racket01234567890123456789")
(test #t string-contains? "racket012345678901234567890123456789012345678901234567890123456789racket"
"01234567890123456789racket")
(test #t string-contains? "racket012345678901234567890123456789012345678901234567890123456789racket"
"012345678901234567890123456789")
(test #f string-contains? "racket012345678901234567890123456789012345678901234567890123456789racket"
"racket01234567890123456789racket")
(test #t string-contains? "racket0123456789012345678901234567890123456789aaaaaaaaaaaaaaaaaaaaaaaaa"
"aaaaaaaaaaaaaaaaaaaaaaaaa")
(test #t string-contains? "aaaaaaaaaaaaaaaaaaaaaaaaa0123456789012345678901234567890123456789racket"
"aaaaaaaaaaaaaaaaaaaaaaaaa")
(test #f string-contains? "aaaaaaaaaaaaaaaaaaaaaaaa_012345678901234567890_aaaaaaaaaaaaaaaaaaaaaaaa"
"aaaaaaaaaaaaaaaaaaaaaaaaa")
(test #t string-contains? "aaaaaaaaaaaaaaaaaaaaaaaaa012345678901234567890_aaaaaaaaaaaaaaaaaaaaaaaa"
"aaaaaaaaaaaaaaaaaaaaaaaaa")
(test #t string-contains? "aaaaaaaaaaaaaaaaaaaaaaaa_012345678901234567890aaaaaaaaaaaaaaaaaaaaaaaaa"
"aaaaaaaaaaaaaaaaaaaaaaaaa")
(test #t string-contains? "1234567890aaaaa123456789012345678901234567890aaaaa1234567890123456789012345678901234567890"
"1234567890123456789012345678901234567890")
(test #f string-contains? "1234567890aaaaa123456789012345678901234567890aaaaa123456789012345678901234567890aaaa"
"1234567890123456789012345678901234567890")
(test #f string-contains? "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
"yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy")
(test #f string-contains? "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
"xxxxxxxxxxxxxxxxxxxxxxxxy")
(test #f string-contains? "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
"yxxxxxxxxxxxxxxxxxxxxxxxx")
(test #f string-contains? "xxxxxxxxxxxxxxxxxxxxxxxxyxxxxxxxxxxxxxxxxxxxxxxxxyxxxxxxxxxxxxxxxxxxxxx"
"xxxxxxxxxxxxxxxxxxxxxxxxx"))
;; ---------- build-kmp-table ----------
; The tables have one more coefficient than the usual tables,
; that can be used to get or count all the matches.
(let ()
(test #(#f 0 0 0 0 0 0) build-kmp-table "racket")
(test #(#f #f #f #f #f #f 5) build-kmp-table "rrrrrr")
(test #(#f 0 #f 0 #f 0 4) build-kmp-table "ababab")
(test #(#f 0 0 #f 0 0 #f 0 0 6) build-kmp-table "abcabcabc")
(test #(#f 0 0 #f 0 0 #f 0 0 6) build-kmp-table "abbabbabb")
(test #(#f #f 1 #f #f 1 #f #f 1 6) build-kmp-table "aabaabaab")
(test #(#f #f 1 0 #f #f 1 0 #f #f 1 0 8 ) build-kmp-table "aabbaabbaabb")
; examples from https://en.wikipedia.org/wiki/KnuthMorrisPratt_algorithm
(test #(#f 0 0 0 #f 0 2 0) build-kmp-table "abcdabd")
(test #(#f 0 #f 1 #f 0 #f 3 2 0) build-kmp-table "abacababc"))
(report-errs)

View File

@ -11,6 +11,10 @@
string-suffix?
string-contains?)
(module+ private
(provide build-kmp-table))
(define string-append*
(case-lambda [(strs) (apply string-append strs)] ; optimize common cases
[(s1 strs) (apply string-append s1 strs)]
@ -174,6 +178,71 @@
(and (char=? (string-ref str i+o) (string-ref suffix i))
(loop (add1 i+o) (add1 i)))))))
;; string-contains? uses a variant of the Knuth-Morris-Pratt string search
;; algorithm. It start with a direct search without the KMP table, until
;; it finds a partial match that long enough (currently 4 characters).
;; The table is also skiped when there are only a few characters left
;; to test (currently 4 characters).
;; The first time a partial match is found, it buils a small partial table
;; and if it is necesay it builds the complete table.
;; The KMP table has one more element as usual, that is unused for now.
;; The last coefficient can be used to continue the search in case a
;; match is found and we want to get all the matches or count them.
(define (build-kmp-table sub)
(define L (string-length sub))
(if (> L 0)
(build-kmp-table/partial sub L #f)
(make-vector 1 #f)))
(define (build-kmp-table/partial/first sub pos)
(define L (string-length sub))
(if (> L 0)
(build-kmp-table/partial sub (min L (* 2 pos)) #f)
(make-vector 1 #f)))
(define (build-kmp-table/partial/next sub pos prev)
(define L (string-length sub))
(if (> L 0)
(build-kmp-table/partial sub L prev)
(make-vector 1 #f)))
(define (build-kmp-table/partial sub vL prev)
(define L (string-length sub))
(define pos (if prev
(sub1 (vector-length prev))
1))
(define cnd (if prev
(unbox (vector-ref prev (sub1 (vector-length prev))))
0))
(define t (make-vector (add1 vL) #f))
(when prev
(vector-copy! t 0 prev))
(let loop ([pos pos] [cnd cnd])
(cond
[(= pos L)
(vector-set! t pos cnd)
t]
[(= pos vL)
(vector-set! t pos (box cnd))
t]
[(char=? (string-ref sub pos)
(string-ref sub cnd))
(vector-set! t pos (vector-ref t cnd))
(loop (add1 pos) (add1 cnd))]
[else
(vector-set! t pos cnd)
(let loop2 ([pos pos]
[cnd (vector-ref t cnd)])
(cond
[(not cnd)
(loop (add1 pos) 0)]
[(char=? (string-ref sub pos)
(string-ref sub cnd))
(loop (add1 pos) (add1 cnd))]
[else
(loop2 pos (vector-ref t cnd))]))])))
(define (string-contains? str sub)
(unless (string? str)
(raise-argument-error 'string-contains? "string?" str))
@ -182,12 +251,59 @@
(define L1 (string-length str))
(define L2 (string-length sub))
(define d (- L1 L2))
(or (zero? L2)
(let loop ([start 0])
(define d-4 (- d 4))
(or (= L2 0)
(let loop ([start 0]
[offset 0])
(define start+offset (+ start offset))
(and (<= start d)
(or (let loop2 ([offset 0])
(or (= offset L2)
(and (char=? (string-ref str (+ start offset))
(string-ref sub offset))
(loop2 (add1 offset)))))
(loop (add1 start)))))))
(or (= offset L2)
(cond
[(char=? (string-ref sub offset)
(string-ref str start+offset))
(loop start (add1 offset))]
[(or (<= offset 4) (>= start d-4))
(loop (add1 start) 0)]
[else
(define t (build-kmp-table/partial/first sub offset))
(define skip (vector-ref t offset))
(cond
[skip
(string-contains?/table str sub (- start+offset skip) skip t)]
[else
(string-contains?/table str sub (add1 start+offset) 0 t)])]))))))
(define (string-contains?/table str sub start offset t)
(define L1 (string-length str))
(define L2 (string-length sub))
(define d (- L1 L2))
(or (= L2 0)
(let loop ([start start]
[offset offset]
[t t]
[tL (sub1 (vector-length t))])
(define start+offset (+ start offset))
(and (<= start d)
(or (= offset L2)
(cond
[(char=? (string-ref sub offset)
(string-ref str start+offset))
(loop start (add1 offset) t tL)]
[(< offset tL)
(define skip (vector-ref t offset))
(cond
[skip
(loop (- start+offset skip) skip t tL)]
[else
(loop (add1 start+offset) 0 t tL)])]
[else
(let* ([t (build-kmp-table/partial/next sub offset t)]
[tL (sub1 (vector-length t))]
[skip (vector-ref t offset)])
(cond
[skip
(loop (- start+offset skip) skip t tL)]
[else
(loop (add1 start+offset) 0 t tL)]))]))))))