Fixing a bug reported by 张虎成. Original and small test case were added. I reimplemented Knuth-Morris-Pratt because I couldn't debug the existing implementation.

Please include in 5.0
This commit is contained in:
Jay McCarthy 2010-05-25 07:45:52 -06:00
parent 23617b08bd
commit 2df8fac233
2 changed files with 60 additions and 30 deletions

View File

@ -324,6 +324,22 @@ END
"<!-- comment --><br />"
"read-xml: parse-error: expected root element - received #<comment>")
(test-read-xml/element
"<title><![CDATA[hello world[mp3]]]></title>"
'(make-element
(make-source (make-location 1 0 1) (make-location 1 43 44))
'title
(list)
(list (make-cdata (make-source (make-location 1 7 8) (make-location 1 35 36)) "<![CDATA[hello world[mp3]]]>"))))
(test-read-xml/element
"<title><![CDATA[]]]></title>"
'(make-element
(make-source (make-location 1 0 1) (make-location 1 28 29))
'title
(list)
(list (make-cdata (make-source (make-location 1 7 8) (make-location 1 20 21)) "<![CDATA[]]]>"))))
; XXX need more read-xml/element tests
)

View File

@ -384,35 +384,49 @@
;; uses Knuth-Morris-Pratt from
;; Introduction to Algorithms, Cormen, Leiserson, and Rivest, pages 869-876
;; discards stop from input
(define (gen-read-until-string stop)
(let* ([len (string-length stop)]
[prefix (make-vector len 0)]
[fall-back
(lambda (k c)
(let ([k (let loop ([k k])
(cond
[(and (> k 0) (not (eq? (string-ref stop k) c)))
(loop (vector-ref prefix (sub1 k)))]
[else k]))])
(if (eq? (string-ref stop k) c)
(add1 k)
k)))])
(let init ([k 0] [q 1])
(when (< q len)
(let ([k (fall-back k (string-ref stop q))])
(vector-set! prefix q k)
(init k (add1 q)))))
;; (vector-ref prefix x) = the longest suffix that matches a prefix of stop
(lambda (in pos)
(list->string
(let/ec out
(let loop ([matched 0] [out out])
(let* ([c (non-eof read-char in pos)]
[matched (fall-back matched c)])
(cond
[(= matched len) (out null)]
[(zero? matched) (cons c (let/ec out (loop matched out)))]
[else (cons c (loop matched out))]))))))))
;; ---
;; Modified by Jay to look more like the version on Wikipedia after discovering a bug when parsing CDATA
;; The use of the hasheq table and the purely numeric code trades hash efficiency for stack/ec capture efficiency
(struct hash-string (port pos ht))
(define (hash-string-ref hs k)
(match-define (hash-string port pos ht) hs)
(hash-ref! ht k (lambda () (non-eof read-char port pos))))
(define (gen-read-until-string W)
(define Wlen (string-length W))
(define T (make-vector Wlen #f))
(vector-set! T 0 -1)
(vector-set! T 1 0)
(let kmp-table ([pos 2] [cnd 0])
(when (pos . < . Wlen)
(cond
[(char=? (string-ref W (sub1 pos)) (string-ref W cnd))
(vector-set! T pos (add1 cnd))
(kmp-table (add1 pos) (add1 cnd))]
[(cnd . > . 0)
(kmp-table pos (vector-ref T cnd))]
[(zero? cnd)
(vector-set! T pos 0)
(kmp-table (add1 pos) 0)])))
(lambda (S-as-port S-pos)
(define S (hash-string S-as-port S-pos (make-hasheq)))
(define W-starts-at
(let kmp-search ([m 0] [i 0])
(if (char=? (string-ref W i) (hash-string-ref S (+ m i)))
(let ([i (add1 i)])
(if (= i Wlen)
m
(kmp-search m i)))
(let* ([Ti (vector-ref T i)]
[m (+ m i (* -1 Ti))])
(if (Ti . > . -1)
(let ([i Ti])
(kmp-search m i))
(let ([i 0])
(kmp-search m i)))))))
(list->string
(for/list ([i (in-range 0 W-starts-at)])
(hash-string-ref S i)))))
;; "-->" makes more sense, but "--" follows the spec.
(define lex-comment-contents (gen-read-until-string "--"))
@ -460,4 +474,4 @@
(define (format-source loc)
(if (location? loc)
(format "~a.~a/~a" (location-line loc) (location-char loc) (location-offset loc))
(format "~a" loc)))
(format "~a" loc)))