use KMP to avoid quadratic time in string-contains?
This commit is contained in:
parent
a8d5a4f2f4
commit
800ea98525
|
@ -3,7 +3,8 @@
|
|||
|
||||
(Section 'string)
|
||||
|
||||
(require racket/string)
|
||||
(require racket/string
|
||||
(submod racket/string private))
|
||||
|
||||
;; ---------- real->decimal-string ----------
|
||||
(test "0." real->decimal-string 0 0)
|
||||
|
@ -536,10 +537,65 @@
|
|||
(test #t string-contains? "racket" "cket")
|
||||
(test #t string-contains? "racket" "acke")
|
||||
(test #t string-contains? "racket" "t")
|
||||
(test #t string-contains? "racket" "")
|
||||
(test #f string-contains? "racket" "b")
|
||||
(test #f string-contains? "racket" "R")
|
||||
(test #f string-contains? "RACKET" "r")
|
||||
(test #f string-contains? "racket" "kc")
|
||||
(test #f string-contains? "racket" "racketr"))
|
||||
(test #f string-contains? "racket" "racketr")
|
||||
(test #t string-contains? "racket" "")
|
||||
(test #t string-contains? "" "")
|
||||
(test #f string-contains? "" "racket")
|
||||
(test #f string-contains? "racket" "a..e")
|
||||
(test #t string-contains? "ra..et" "a..e")
|
||||
; string-contains? sometimes uses diferent code paths for short and long string,
|
||||
; so add some long test too.
|
||||
(test #t string-contains? "racket012345678901234567890123456789012345678901234567890123456789racket"
|
||||
"racket012345678901234567890123456789012345678901234567890123456789racket")
|
||||
(test #t string-contains? "racket012345678901234567890123456789012345678901234567890123456789racket"
|
||||
"racket01234567890123456789")
|
||||
(test #t string-contains? "racket012345678901234567890123456789012345678901234567890123456789racket"
|
||||
"01234567890123456789racket")
|
||||
(test #t string-contains? "racket012345678901234567890123456789012345678901234567890123456789racket"
|
||||
"012345678901234567890123456789")
|
||||
(test #f string-contains? "racket012345678901234567890123456789012345678901234567890123456789racket"
|
||||
"racket01234567890123456789racket")
|
||||
(test #t string-contains? "racket0123456789012345678901234567890123456789aaaaaaaaaaaaaaaaaaaaaaaaa"
|
||||
"aaaaaaaaaaaaaaaaaaaaaaaaa")
|
||||
(test #t string-contains? "aaaaaaaaaaaaaaaaaaaaaaaaa0123456789012345678901234567890123456789racket"
|
||||
"aaaaaaaaaaaaaaaaaaaaaaaaa")
|
||||
(test #f string-contains? "aaaaaaaaaaaaaaaaaaaaaaaa_012345678901234567890_aaaaaaaaaaaaaaaaaaaaaaaa"
|
||||
"aaaaaaaaaaaaaaaaaaaaaaaaa")
|
||||
(test #t string-contains? "aaaaaaaaaaaaaaaaaaaaaaaaa012345678901234567890_aaaaaaaaaaaaaaaaaaaaaaaa"
|
||||
"aaaaaaaaaaaaaaaaaaaaaaaaa")
|
||||
(test #t string-contains? "aaaaaaaaaaaaaaaaaaaaaaaa_012345678901234567890aaaaaaaaaaaaaaaaaaaaaaaaa"
|
||||
"aaaaaaaaaaaaaaaaaaaaaaaaa")
|
||||
(test #t string-contains? "1234567890aaaaa123456789012345678901234567890aaaaa1234567890123456789012345678901234567890"
|
||||
"1234567890123456789012345678901234567890")
|
||||
(test #f string-contains? "1234567890aaaaa123456789012345678901234567890aaaaa123456789012345678901234567890aaaa"
|
||||
"1234567890123456789012345678901234567890")
|
||||
(test #f string-contains? "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
|
||||
"yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy")
|
||||
(test #f string-contains? "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
|
||||
"xxxxxxxxxxxxxxxxxxxxxxxxy")
|
||||
(test #f string-contains? "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
|
||||
"yxxxxxxxxxxxxxxxxxxxxxxxx")
|
||||
(test #f string-contains? "xxxxxxxxxxxxxxxxxxxxxxxxyxxxxxxxxxxxxxxxxxxxxxxxxyxxxxxxxxxxxxxxxxxxxxx"
|
||||
"xxxxxxxxxxxxxxxxxxxxxxxxx"))
|
||||
|
||||
;; ---------- build-kmp-table ----------
|
||||
|
||||
; The tables have one more coefficient than the usual tables,
|
||||
; that can be used to get or count all the matches.
|
||||
(let ()
|
||||
(test #(#f 0 0 0 0 0 0) build-kmp-table "racket")
|
||||
(test #(#f #f #f #f #f #f 5) build-kmp-table "rrrrrr")
|
||||
(test #(#f 0 #f 0 #f 0 4) build-kmp-table "ababab")
|
||||
(test #(#f 0 0 #f 0 0 #f 0 0 6) build-kmp-table "abcabcabc")
|
||||
(test #(#f 0 0 #f 0 0 #f 0 0 6) build-kmp-table "abbabbabb")
|
||||
(test #(#f #f 1 #f #f 1 #f #f 1 6) build-kmp-table "aabaabaab")
|
||||
(test #(#f #f 1 0 #f #f 1 0 #f #f 1 0 8 ) build-kmp-table "aabbaabbaabb")
|
||||
; examples from https://en.wikipedia.org/wiki/Knuth–Morris–Pratt_algorithm
|
||||
(test #(#f 0 0 0 #f 0 2 0) build-kmp-table "abcdabd")
|
||||
(test #(#f 0 #f 1 #f 0 #f 3 2 0) build-kmp-table "abacababc"))
|
||||
|
||||
(report-errs)
|
||||
|
|
|
@ -11,6 +11,10 @@
|
|||
string-suffix?
|
||||
string-contains?)
|
||||
|
||||
(module+ private
|
||||
(provide build-kmp-table))
|
||||
|
||||
|
||||
(define string-append*
|
||||
(case-lambda [(strs) (apply string-append strs)] ; optimize common cases
|
||||
[(s1 strs) (apply string-append s1 strs)]
|
||||
|
@ -174,6 +178,71 @@
|
|||
(and (char=? (string-ref str i+o) (string-ref suffix i))
|
||||
(loop (add1 i+o) (add1 i)))))))
|
||||
|
||||
;; string-contains? uses a variant of the Knuth-Morris-Pratt string search
|
||||
;; algorithm. It start with a direct search without the KMP table, until
|
||||
;; it finds a partial match that long enough (currently 4 characters).
|
||||
;; The table is also skiped when there are only a few characters left
|
||||
;; to test (currently 4 characters).
|
||||
;; The first time a partial match is found, it buils a small partial table
|
||||
;; and if it is necesay it builds the complete table.
|
||||
;; The KMP table has one more element as usual, that is unused for now.
|
||||
;; The last coefficient can be used to continue the search in case a
|
||||
;; match is found and we want to get all the matches or count them.
|
||||
(define (build-kmp-table sub)
|
||||
(define L (string-length sub))
|
||||
(if (> L 0)
|
||||
(build-kmp-table/partial sub L #f)
|
||||
(make-vector 1 #f)))
|
||||
|
||||
(define (build-kmp-table/partial/first sub pos)
|
||||
(define L (string-length sub))
|
||||
(if (> L 0)
|
||||
(build-kmp-table/partial sub (min L (* 2 pos)) #f)
|
||||
(make-vector 1 #f)))
|
||||
|
||||
(define (build-kmp-table/partial/next sub pos prev)
|
||||
(define L (string-length sub))
|
||||
(if (> L 0)
|
||||
(build-kmp-table/partial sub L prev)
|
||||
(make-vector 1 #f)))
|
||||
|
||||
(define (build-kmp-table/partial sub vL prev)
|
||||
(define L (string-length sub))
|
||||
(define pos (if prev
|
||||
(sub1 (vector-length prev))
|
||||
1))
|
||||
(define cnd (if prev
|
||||
(unbox (vector-ref prev (sub1 (vector-length prev))))
|
||||
0))
|
||||
(define t (make-vector (add1 vL) #f))
|
||||
(when prev
|
||||
(vector-copy! t 0 prev))
|
||||
|
||||
(let loop ([pos pos] [cnd cnd])
|
||||
(cond
|
||||
[(= pos L)
|
||||
(vector-set! t pos cnd)
|
||||
t]
|
||||
[(= pos vL)
|
||||
(vector-set! t pos (box cnd))
|
||||
t]
|
||||
[(char=? (string-ref sub pos)
|
||||
(string-ref sub cnd))
|
||||
(vector-set! t pos (vector-ref t cnd))
|
||||
(loop (add1 pos) (add1 cnd))]
|
||||
[else
|
||||
(vector-set! t pos cnd)
|
||||
(let loop2 ([pos pos]
|
||||
[cnd (vector-ref t cnd)])
|
||||
(cond
|
||||
[(not cnd)
|
||||
(loop (add1 pos) 0)]
|
||||
[(char=? (string-ref sub pos)
|
||||
(string-ref sub cnd))
|
||||
(loop (add1 pos) (add1 cnd))]
|
||||
[else
|
||||
(loop2 pos (vector-ref t cnd))]))])))
|
||||
|
||||
(define (string-contains? str sub)
|
||||
(unless (string? str)
|
||||
(raise-argument-error 'string-contains? "string?" str))
|
||||
|
@ -182,12 +251,59 @@
|
|||
(define L1 (string-length str))
|
||||
(define L2 (string-length sub))
|
||||
(define d (- L1 L2))
|
||||
(or (zero? L2)
|
||||
(let loop ([start 0])
|
||||
(define d-4 (- d 4))
|
||||
|
||||
(or (= L2 0)
|
||||
(let loop ([start 0]
|
||||
[offset 0])
|
||||
(define start+offset (+ start offset))
|
||||
(and (<= start d)
|
||||
(or (let loop2 ([offset 0])
|
||||
(or (= offset L2)
|
||||
(and (char=? (string-ref str (+ start offset))
|
||||
(string-ref sub offset))
|
||||
(loop2 (add1 offset)))))
|
||||
(loop (add1 start)))))))
|
||||
(or (= offset L2)
|
||||
(cond
|
||||
[(char=? (string-ref sub offset)
|
||||
(string-ref str start+offset))
|
||||
(loop start (add1 offset))]
|
||||
[(or (<= offset 4) (>= start d-4))
|
||||
(loop (add1 start) 0)]
|
||||
[else
|
||||
(define t (build-kmp-table/partial/first sub offset))
|
||||
(define skip (vector-ref t offset))
|
||||
(cond
|
||||
[skip
|
||||
(string-contains?/table str sub (- start+offset skip) skip t)]
|
||||
[else
|
||||
(string-contains?/table str sub (add1 start+offset) 0 t)])]))))))
|
||||
|
||||
(define (string-contains?/table str sub start offset t)
|
||||
(define L1 (string-length str))
|
||||
(define L2 (string-length sub))
|
||||
(define d (- L1 L2))
|
||||
|
||||
(or (= L2 0)
|
||||
(let loop ([start start]
|
||||
[offset offset]
|
||||
[t t]
|
||||
[tL (sub1 (vector-length t))])
|
||||
(define start+offset (+ start offset))
|
||||
(and (<= start d)
|
||||
(or (= offset L2)
|
||||
(cond
|
||||
[(char=? (string-ref sub offset)
|
||||
(string-ref str start+offset))
|
||||
(loop start (add1 offset) t tL)]
|
||||
[(< offset tL)
|
||||
(define skip (vector-ref t offset))
|
||||
(cond
|
||||
[skip
|
||||
(loop (- start+offset skip) skip t tL)]
|
||||
[else
|
||||
(loop (add1 start+offset) 0 t tL)])]
|
||||
[else
|
||||
(let* ([t (build-kmp-table/partial/next sub offset t)]
|
||||
[tL (sub1 (vector-length t))]
|
||||
[skip (vector-ref t offset)])
|
||||
(cond
|
||||
[skip
|
||||
(loop (- start+offset skip) skip t tL)]
|
||||
[else
|
||||
(loop (add1 start+offset) 0 t tL)]))]))))))
|
||||
|
|
Loading…
Reference in New Issue
Block a user