svn: r13855
This commit is contained in:
Jay McCarthy 2009-02-26 17:15:44 +00:00
parent 196ec00f16
commit 243fb2f0e4
2 changed files with 512 additions and 512 deletions

View File

@ -8,130 +8,130 @@
"html-spec.ss" "html-spec.ss"
"html-sig.ss" "html-sig.ss"
(prefix-in sgml: "sgml-reader.ss") (prefix-in sgml: "sgml-reader.ss")
xml) xml)
(provide-signature-elements html^) (provide-signature-elements html^)
;; Html-content = Html-element | Pc-data | Entity ;; Html-content = Html-element | Pc-data | Entity
(include "html-structs.ss") (include "html-structs.ss")
(include "case.ss") (include "case.ss")
;; xml->html : Document -> Html ;; xml->html : Document -> Html
(define (xml->html doc) (define (xml->html doc)
(let ([root (document-element doc)]) (let ([root (document-element doc)])
(unless (eq? 'html (element-name root)) (unless (eq? 'html (element-name root))
(error 'xml->html "This is not an html document. Expected 'html, given ~a" (element-name root))) (error 'xml->html "This is not an html document. Expected 'html, given ~a" (element-name root)))
(make-html (element-attributes root) (xml-contents->html (element-content root))))) (make-html (element-attributes root) (xml-contents->html (element-content root)))))
;; xml-content->html : (listof Content) -> (listof Html-element) ;; xml-content->html : (listof Content) -> (listof Html-element)
(define (xml-contents->html contents) (define (xml-contents->html contents)
(foldr xml-single-content->html (foldr xml-single-content->html
null null
contents)) contents))
;; read-xhtml : [Input-port] -> Html ;; read-xhtml : [Input-port] -> Html
(define read-xhtml (compose xml->html read-xml)) (define read-xhtml (compose xml->html read-xml))
;; peel-f : (Html-content -> Bool) (listof Html-content) (listof Html-content) -> (listof Html-content) ;; peel-f : (Html-content -> Bool) (listof Html-content) (listof Html-content) -> (listof Html-content)
(define (peel-f toss? to-toss acc0) (define (peel-f toss? to-toss acc0)
(foldr (lambda (x acc) (foldr (lambda (x acc)
(if (toss? x) (if (toss? x)
(append (html-full-content x) acc) (append (html-full-content x) acc)
(cons x acc))) (cons x acc)))
acc0 acc0
to-toss)) to-toss))
;; repackage-html : (listof Html-content) -> Html ;; repackage-html : (listof Html-content) -> Html
(define (repackage-html contents) (define (repackage-html contents)
(let* ([html (memf html? contents)] (let* ([html (memf html? contents)]
[peeled (peel-f html? contents null)] [peeled (peel-f html? contents null)]
[body (memf body? peeled)]) [body (memf body? peeled)])
(make-html (if html (make-html (if html
(html-element-attributes (car html)) (html-element-attributes (car html))
null) null)
(append (filter head? peeled) (append (filter head? peeled)
(list (make-body (if body (list (make-body (if body
(html-element-attributes (car body)) (html-element-attributes (car body))
null) null)
(filter (compose not head?) (peel-f body? peeled null)))))))) (filter (compose not head?) (peel-f body? peeled null))))))))
;; clean-up-pcdata : (listof Content) -> (listof Content)
;; Each pcdata inside a tag that isn't supposed to contain pcdata is either
;; a) appended to the end of the previous subelement, if that subelement may contain pcdata
;; b) prepended to the front of the next subelement, if that subelement may contain pcdata
;; c) discarded
;; unknown tags may contain pcdata
;; the top level may contain pcdata
(define clean-up-pcdata
;; clean-up-pcdata : (listof Content) -> (listof Content) ;; clean-up-pcdata : (listof Content) -> (listof Content)
;; Each pcdata inside a tag that isn't supposed to contain pcdata is either (letrec ([clean-up-pcdata
;; a) appended to the end of the previous subelement, if that subelement may contain pcdata (lambda (content)
;; b) prepended to the front of the next subelement, if that subelement may contain pcdata (map (lambda (to-fix)
;; c) discarded (cond
;; unknown tags may contain pcdata [(element? to-fix)
;; the top level may contain pcdata (recontent-xml to-fix
(define clean-up-pcdata (let ([possible (may-contain (element-name to-fix))]
;; clean-up-pcdata : (listof Content) -> (listof Content) [content (element-content to-fix)])
(letrec ([clean-up-pcdata (if (or (not possible) (memq 'pcdata possible))
(lambda (content) (clean-up-pcdata content)
(map (lambda (to-fix) (eliminate-pcdata content))))]
(cond [else to-fix]))
[(element? to-fix) content))]
(recontent-xml to-fix [eliminate-pcdata
(let ([possible (may-contain (element-name to-fix))] ;: (listof Content) -> (listof Content)
[content (element-content to-fix)]) (lambda (content)
(if (or (not possible) (memq 'pcdata possible)) (let ([non-elements (first-non-elements content)]
(clean-up-pcdata content) [more (memf element? content)])
(eliminate-pcdata content))))] (if more
[else to-fix])) (let* ([el (car more)]
content))] [possible (may-contain (element-name el))])
[eliminate-pcdata (if (or (not possible) (memq 'pcdata possible))
;: (listof Content) -> (listof Content) (cons (recontent-xml el (append non-elements (clean-up-pcdata (element-content el)) (eliminate-pcdata (first-non-elements (cdr more)))))
(lambda (content) (or (memf element? (cdr more)) null))
(let ([non-elements (first-non-elements content)] (cons (recontent-xml el (eliminate-pcdata (element-content el)))
[more (memf element? content)]) (eliminate-pcdata (cdr more)))))
(if more null)))])
(let* ([el (car more)] clean-up-pcdata))
[possible (may-contain (element-name el))])
(if (or (not possible) (memq 'pcdata possible)) ;; first-non-elements : (listof Content) -> (listof Content)
(cons (recontent-xml el (append non-elements (clean-up-pcdata (element-content el)) (eliminate-pcdata (first-non-elements (cdr more))))) (define (first-non-elements content)
(or (memf element? (cdr more)) null)) (cond
(cons (recontent-xml el (eliminate-pcdata (element-content el))) [(null? content) null]
(eliminate-pcdata (cdr more))))) [else (if (element? (car content))
null)))]) null
clean-up-pcdata)) (cons (car content) (first-non-elements (cdr content))))]))
;; first-non-elements : (listof Content) -> (listof Content) ;; recontent-xml : Element (listof Content) -> Element
(define (first-non-elements content) (define (recontent-xml e c)
(cond (make-element (source-start e) (source-stop e) (element-name e) (element-attributes e) c))
[(null? content) null]
[else (if (element? (car content)) ;; implicit-starts : Symbol Symbol -> (U #f Symbol)
null (define (implicit-starts parent child)
(cons (car content) (first-non-elements (cdr content))))])) (or (and (eq? child 'tr) (eq? parent 'table) 'tbody)
(and (eq? child 'td) (memq parent '(table tbody tfoot thead)) 'tr)))
;; recontent-xml : Element (listof Content) -> Element
(define (recontent-xml e c) ;; may-contain : Kid-lister
(make-element (source-start e) (source-stop e) (element-name e) (element-attributes e) c)) (define may-contain
(sgml:gen-may-contain html-spec))
;; implicit-starts : Symbol Symbol -> (U #f Symbol)
(define (implicit-starts parent child) (define may-contain-anything
(or (and (eq? child 'tr) (eq? parent 'table) 'tbody) (sgml:gen-may-contain null))
(and (eq? child 'td) (memq parent '(table tbody tfoot thead)) 'tr)))
(define use-html-spec (make-parameter #t))
;; may-contain : Kid-lister
(define may-contain ;; read-html-as-xml : [Input-port] -> (listof Content)
(sgml:gen-may-contain html-spec)) (define read-html-as-xml
(case-lambda
(define may-contain-anything [(port)
(sgml:gen-may-contain null)) ((if (use-html-spec) clean-up-pcdata values)
((sgml:gen-read-sgml (if (use-html-spec)
(define use-html-spec (make-parameter #t)) may-contain
may-contain-anything)
;; read-html-as-xml : [Input-port] -> (listof Content) implicit-starts) port))]
(define read-html-as-xml [() (read-html-as-xml (current-input-port))]))
(case-lambda
[(port) ;; read-html : [Input-port] -> Html
((if (use-html-spec) clean-up-pcdata values) (define read-html
((sgml:gen-read-sgml (if (use-html-spec) (compose repackage-html xml-contents->html read-html-as-xml))
may-contain
may-contain-anything)
implicit-starts) port))]
[() (read-html-as-xml (current-input-port))]))
;; read-html : [Input-port] -> Html
(define read-html
(compose repackage-html xml-contents->html read-html-as-xml))

View File

@ -5,286 +5,286 @@
(require mzlib/list (require mzlib/list
mzlib/string mzlib/string
"sgml-reader-sig.ss" "sgml-reader-sig.ss"
xml) xml)
(provide-signature-elements sgml-reader^) (provide-signature-elements sgml-reader^)
;; Start-tag ::= (make-start-tag Location Location Symbol (listof Attribute)) ;; Start-tag ::= (make-start-tag Location Location Symbol (listof Attribute))
(define-struct (start-tag source) (name attrs)) (define-struct (start-tag source) (name attrs))
;; End-tag ::= (make-end-tag Location Location Symbol) ;; End-tag ::= (make-end-tag Location Location Symbol)
(define-struct (end-tag source) (name)) (define-struct (end-tag source) (name))
;; Token ::= Contents | Start-tag | End-tag | Eof ;; Token ::= Contents | Start-tag | End-tag | Eof
(define read-html-comments (make-parameter #f)) (define read-html-comments (make-parameter #f))
(define trim-whitespace (make-parameter #f)) (define trim-whitespace (make-parameter #f))
;; Kid-lister : (Symbol -> (U (listof Symbol) #f)) ;; Kid-lister : (Symbol -> (U (listof Symbol) #f))
;; gen-may-contain : Spec -> Kid-lister ;; gen-may-contain : Spec -> Kid-lister
(define (gen-may-contain spec) (define (gen-may-contain spec)
(let ([table (make-hash)]) (let ([table (make-hash)])
(for-each (lambda (def) (for-each (lambda (def)
(let ([rhs (cdr def)]) (let ([rhs (cdr def)])
(for-each (lambda (name) (hash-set! table name rhs)) (for-each (lambda (name) (hash-set! table name rhs))
(car def)))) (car def))))
spec) spec)
(lambda (name) (lambda (name)
(hash-ref table name (lambda () #f))))) (hash-ref table name (lambda () #f)))))
;; gen-read-sgml : Kid-lister (Symbol Symbol -> (U #f Symbol)) -> [Input-port] -> (listof Content) ;; gen-read-sgml : Kid-lister (Symbol Symbol -> (U #f Symbol)) -> [Input-port] -> (listof Content)
(define (gen-read-sgml may-contain auto-insert) (define (gen-read-sgml may-contain auto-insert)
(case-lambda (case-lambda
[(in) (read-from-port may-contain auto-insert in)] [(in) (read-from-port may-contain auto-insert in)]
[() (read-from-port may-contain auto-insert (current-input-port))])) [() (read-from-port may-contain auto-insert (current-input-port))]))
;; read-from-port : Kid-lister (Symbol Symbol -> (U #f Symbol)) Input-port -> (listof Content) ;; read-from-port : Kid-lister (Symbol Symbol -> (U #f Symbol)) Input-port -> (listof Content)
(define (read-from-port may-contain auto-insert in) (define (read-from-port may-contain auto-insert in)
(let loop ([tokens (let read-tokens () (let loop ([tokens (let read-tokens ()
(let ([tok (lex in)]) (let ([tok (lex in)])
(cond (cond
[(eof-object? tok) null] [(eof-object? tok) null]
[else (cons tok (read-tokens))])))]) [else (cons tok (read-tokens))])))])
(cond
[(null? tokens) null]
[else
(let ([tok (car tokens)] [rest-tokens (cdr tokens)])
(cond
[(start-tag? tok)
(let-values ([(el more-tokens) (read-element tok null may-contain auto-insert rest-tokens)])
(cons el (loop more-tokens)))]
[(end-tag? tok) (loop rest-tokens)]
[else (let ([rest-contents (loop rest-tokens)])
(expand-content tok rest-contents))]))])))
;; read-element : Start-tag (listof Symbol) Kid-lister (Symbol Symbol -> (U #f Symbol)) (listof Token) -> Element (listof Token)
;; Note: How elements nest depends on their content model.
;; If a kind of element can't contain anything, then its start tags are implicitly ended, and
;; end tags are implicitly started.
;; Unknown elements can contain anything and can go inside anything.
;; Otherwise, only the subelements listed in the content model can go inside an element.
;; more here - may-contain shouldn't be used to decide if an element is known or not.
;; The edgar dtd puts tags in may-contain's range that aren't in its domain.
;; more here (or not) - the (memq name context) test leaks for a worst case of O(n^2) in the
;; tag nesting depth. However, this only should be a problem when the tag is there,
;; but far back. That shouldn't happen often. I'm guessing n will be about 3.
(define (read-element start-tag context may-contain auto-insert tokens)
(let read-el ([start-tag start-tag] [context (cons (start-tag-name start-tag) context)] [tokens tokens])
(let* ([start-name (start-tag-name start-tag)]
[ok-kids (may-contain start-name)])
(let-values ([(content remaining)
(cond
[(null? ok-kids) (values null tokens)]
[else
;; read-content : (listof Token) -> (listof Content) (listof Token)
(let read-content ([tokens tokens])
(cond
[(null? tokens) (values null tokens)]
[else
(let ([tok (car tokens)] [next-tokens (cdr tokens)])
(cond
[(start-tag? tok)
(let* ([name (start-tag-name tok)]
[auto-start (auto-insert start-name name)])
(if auto-start
(read-content (cons (make-start-tag (source-start tok) (source-stop tok) auto-start null) tokens))
(if (and ok-kids
(not (memq name ok-kids))
(may-contain name))
(values null tokens)
(let*-values ([(element post-element)
(read-el tok (cons name context) next-tokens)]
[(more-contents left-overs) (read-content post-element)])
(values (cons element more-contents) left-overs)))))]
[(end-tag? tok)
(let ([name (end-tag-name tok)])
(if (eq? name start-name)
(values null next-tokens)
(if (memq name context)
(values null tokens)
(read-content next-tokens))))]
[else ;; content
(let-values ([(more-contents left-overs) (read-content next-tokens)])
(values
(expand-content tok more-contents)
left-overs))]))]))])])
(values (make-element (source-start start-tag)
(source-stop start-tag)
start-name
(start-tag-attrs start-tag)
content)
remaining)))))
;; expand-content : Content (listof Content) -> (listof Content)
(define (expand-content x lst)
(cond (cond
[(entity? x) (cons (expand-entity x) lst)] [(null? tokens) null]
[(comment? x) (if (read-html-comments) [else
(cons x lst) (let ([tok (car tokens)] [rest-tokens (cdr tokens)])
lst)] (cond
[else (cons x lst)])) [(start-tag? tok)
(let-values ([(el more-tokens) (read-element tok null may-contain auto-insert rest-tokens)])
;; expand-entity : Entity -> (U Entity Pcdata) (cons el (loop more-tokens)))]
;; more here - allow expansion of user defined entities [(end-tag? tok) (loop rest-tokens)]
(define (expand-entity x) [else (let ([rest-contents (loop rest-tokens)])
(let ([expanded (default-entity-table (entity-text x))]) (expand-content tok rest-contents))]))])))
(if expanded
(make-pcdata (source-start x) (source-stop x) expanded) ;; read-element : Start-tag (listof Symbol) Kid-lister (Symbol Symbol -> (U #f Symbol)) (listof Token) -> Element (listof Token)
x))) ;; Note: How elements nest depends on their content model.
;; If a kind of element can't contain anything, then its start tags are implicitly ended, and
;; default-entity-table : Symbol -> (U #f String) ;; end tags are implicitly started.
(define (default-entity-table name) ;; Unknown elements can contain anything and can go inside anything.
(case name ;; Otherwise, only the subelements listed in the content model can go inside an element.
[(amp) "&"] ;; more here - may-contain shouldn't be used to decide if an element is known or not.
[(lt) "<"] ;; The edgar dtd puts tags in may-contain's range that aren't in its domain.
[(gt) ">"] ;; more here (or not) - the (memq name context) test leaks for a worst case of O(n^2) in the
[(quot) "\""] ;; tag nesting depth. However, this only should be a problem when the tag is there,
[(apos) "'"] ;; but far back. That shouldn't happen often. I'm guessing n will be about 3.
[else #f])) (define (read-element start-tag context may-contain auto-insert tokens)
(let read-el ([start-tag start-tag] [context (cons (start-tag-name start-tag) context)] [tokens tokens])
;; lex : Input-port -> Token (let* ([start-name (start-tag-name start-tag)]
(define (lex in) [ok-kids (may-contain start-name)])
(when (trim-whitespace) (let-values ([(content remaining)
(skip-space in))
(let ([c (peek-char in)])
(cond
[(eof-object? c) c]
[(eq? c #\&) (lex-entity in)]
[(eq? c #\<) (lex-tag-cdata-pi-comment in)]
[else (lex-pcdata in)])))
;; lex-entity : Input-port -> Token
;; This might not return an entity if it doesn't look like one afterall.
(define (lex-entity in)
(let ([start (file-position in)])
(read-char in)
(case (peek-char in)
;; more here - read while it's numeric (or hex) not until #\;
[(#\#)
(read-char in)
(let* ([hex? (if (equal? #\x (peek-char in))
(and (read-char in) #t)
#f)]
[str (read-until #\; in)]
[n (cond
[hex?
(string->number str 16)]
[else (string->number str)])])
(if (number? n)
(make-entity start (file-position in) n)
(make-pcdata start (file-position in) (string-append "&#" str))))]
[else
(let ([name (lex-name/case-sensitive in)]
[c (peek-char in)])
(if (eq? c #\;)
(begin (read-char in) (make-entity start (file-position in) name))
(make-pcdata start (file-position in) (format "&~a" name))))])))
;; lex-tag-cdata-pi-comment : Input-port -> Start-tag | Element | End-tag | Pcdata | Pi | Comment
(define (lex-tag-cdata-pi-comment in)
(let ([start (file-position in)])
(read-char in)
(case (peek-char in)
[(#\!)
(read-char in)
(case (peek-char in)
[(#\-) (read-char in)
(let ([c (read-char in)])
(cond (cond
[(eq? c #\-) [(null? ok-kids) (values null tokens)]
(let ([data (lex-comment-contents in)]) [else
(make-comment data))] ;; read-content : (listof Token) -> (listof Content) (listof Token)
[else (make-pcdata start (file-position in) (format "<!-~a" c))]))] (let read-content ([tokens tokens])
[(#\[) (read-char in) (cond
(let ([s (read-string 6 in)]) [(null? tokens) (values null tokens)]
(if (string=? s "CDATA[") [else
(let ([data (lex-cdata-contents in)]) (let ([tok (car tokens)] [next-tokens (cdr tokens)])
(make-pcdata start (file-position in) data)) (cond
(make-pcdata start (file-position in) (format "<[~a" s))))] [(start-tag? tok)
[else (skip-dtd in) (lex in)])] (let* ([name (start-tag-name tok)]
[(#\?) (read-char in) [auto-start (auto-insert start-name name)])
(let ([name (lex-name in)]) (if auto-start
(skip-space in) (read-content (cons (make-start-tag (source-start tok) (source-stop tok) auto-start null) tokens))
(let ([data (lex-pi-data in)]) (if (and ok-kids
(make-p-i start (file-position in) name data)))] (not (memq name ok-kids))
[(#\/) (read-char in) (may-contain name))
(let ([name (lex-name in)]) (values null tokens)
(skip-space in) (let*-values ([(element post-element)
(read-char in) ;; skip #\> or whatever else is there (read-el tok (cons name context) next-tokens)]
(make-end-tag start (file-position in) name))] [(more-contents left-overs) (read-content post-element)])
[else (values (cons element more-contents) left-overs)))))]
(let ([name (lex-name in)] [(end-tag? tok)
[attrs (lex-attributes in)]) (let ([name (end-tag-name tok)])
(skip-space in) (if (eq? name start-name)
(case (read-char in) (values null next-tokens)
[(#\/) (if (memq name context)
(read-char in) ;; skip #\> or something (values null tokens)
(make-element start (file-position in) name attrs null)] (read-content next-tokens))))]
[else (make-start-tag start (file-position in) name attrs)]))]))) [else ;; content
(let-values ([(more-contents left-overs) (read-content next-tokens)])
(values
;; lex-attributes : Input-port -> (listof Attribute) (expand-content tok more-contents)
(define (lex-attributes in) left-overs))]))]))])])
(sort (let loop () (values (make-element (source-start start-tag)
(skip-space in) (source-stop start-tag)
(cond [(name-start? (peek-char in)) start-name
(cons (lex-attribute in) (loop))] (start-tag-attrs start-tag)
[else null])) content)
(lambda (a b) remaining)))))
(string<? (symbol->string (attribute-name a))
(symbol->string (attribute-name b)))))) ;; expand-content : Content (listof Content) -> (listof Content)
(define (expand-content x lst)
;; lex-attribute : Input-port -> Attribute (cond
;; Note: entities in attributes are ignored, since defacto html uses & in them for URL syntax [(entity? x) (cons (expand-entity x) lst)]
(define (lex-attribute in) [(comment? x) (if (read-html-comments)
(let ([start (file-position in)] (cons x lst)
[name (lex-name in)]) lst)]
(skip-space in) [else (cons x lst)]))
(cond
[(eq? (peek-char in) #\=) ;; expand-entity : Entity -> (U Entity Pcdata)
(read-char in) ;; more here - allow expansion of user defined entities
(define (expand-entity x)
(let ([expanded (default-entity-table (entity-text x))])
(if expanded
(make-pcdata (source-start x) (source-stop x) expanded)
x)))
;; default-entity-table : Symbol -> (U #f String)
(define (default-entity-table name)
(case name
[(amp) "&"]
[(lt) "<"]
[(gt) ">"]
[(quot) "\""]
[(apos) "'"]
[else #f]))
;; lex : Input-port -> Token
(define (lex in)
(when (trim-whitespace)
(skip-space in))
(let ([c (peek-char in)])
(cond
[(eof-object? c) c]
[(eq? c #\&) (lex-entity in)]
[(eq? c #\<) (lex-tag-cdata-pi-comment in)]
[else (lex-pcdata in)])))
;; lex-entity : Input-port -> Token
;; This might not return an entity if it doesn't look like one afterall.
(define (lex-entity in)
(let ([start (file-position in)])
(read-char in)
(case (peek-char in)
;; more here - read while it's numeric (or hex) not until #\;
[(#\#)
(read-char in)
(let* ([hex? (if (equal? #\x (peek-char in))
(and (read-char in) #t)
#f)]
[str (read-until #\; in)]
[n (cond
[hex?
(string->number str 16)]
[else (string->number str)])])
(if (number? n)
(make-entity start (file-position in) n)
(make-pcdata start (file-position in) (string-append "&#" str))))]
[else
(let ([name (lex-name/case-sensitive in)]
[c (peek-char in)])
(if (eq? c #\;)
(begin (read-char in) (make-entity start (file-position in) name))
(make-pcdata start (file-position in) (format "&~a" name))))])))
;; lex-tag-cdata-pi-comment : Input-port -> Start-tag | Element | End-tag | Pcdata | Pi | Comment
(define (lex-tag-cdata-pi-comment in)
(let ([start (file-position in)])
(read-char in)
(case (peek-char in)
[(#\!)
(read-char in)
(case (peek-char in)
[(#\-) (read-char in)
(let ([c (read-char in)])
(cond
[(eq? c #\-)
(let ([data (lex-comment-contents in)])
(make-comment data))]
[else (make-pcdata start (file-position in) (format "<!-~a" c))]))]
[(#\[) (read-char in)
(let ([s (read-string 6 in)])
(if (string=? s "CDATA[")
(let ([data (lex-cdata-contents in)])
(make-pcdata start (file-position in) data))
(make-pcdata start (file-position in) (format "<[~a" s))))]
[else (skip-dtd in) (lex in)])]
[(#\?) (read-char in)
(let ([name (lex-name in)])
(skip-space in)
(let ([data (lex-pi-data in)])
(make-p-i start (file-position in) name data)))]
[(#\/) (read-char in)
(let ([name (lex-name in)])
(skip-space in)
(read-char in) ;; skip #\> or whatever else is there
(make-end-tag start (file-position in) name))]
[else
(let ([name (lex-name in)]
[attrs (lex-attributes in)])
(skip-space in) (skip-space in)
(let* ([delimiter (read-char in)] (case (read-char in)
[value (list->string [(#\/)
(case delimiter (read-char in) ;; skip #\> or something
[(#\' #\") (make-element start (file-position in) name attrs null)]
(let read-more () [else (make-start-tag start (file-position in) name attrs)]))])))
(let ([c (read-char in)])
(cond
[(or (eq? c delimiter) (eof-object? c)) null] ;; lex-attributes : Input-port -> (listof Attribute)
[else (cons c (read-more))])))] (define (lex-attributes in)
[else (cons delimiter (read-up-to (lambda (c) (or (char-whitespace? c) (eq? c #\>))) in))]))]) (sort (let loop ()
(make-attribute start (file-position in) name value))] (skip-space in)
[else (make-attribute start (file-position in) name (symbol->string name))]))) (cond [(name-start? (peek-char in))
(cons (lex-attribute in) (loop))]
;; skip-space : Input-port -> Void [else null]))
;; deviation - should sometimes insist on at least one space (lambda (a b)
(define (skip-space in) (string<? (symbol->string (attribute-name a))
(let loop () (symbol->string (attribute-name b))))))
(let ([c (peek-char in)])
(when (and (not (eof-object? c)) (char-whitespace? c)) ;; lex-attribute : Input-port -> Attribute
(read-char in) ;; Note: entities in attributes are ignored, since defacto html uses & in them for URL syntax
(loop))))) (define (lex-attribute in)
(let ([start (file-position in)]
;; lex-pcdata : Input-port -> Pcdata [name (lex-name in)])
;; deviation - disallow ]]> "for compatability" with SGML, sec 2.4 XML spec (skip-space in)
(define (lex-pcdata in) (cond
(let ([start (file-position in)]) [(eq? (peek-char in) #\=)
;; The following regexp match must use bytes, not chars, because (read-char in)
;; `in' might not be a well-formed UTF-8 sequence. If it isn't, (skip-space in)
;; and it goes wrong with the first byte sequence, then a char-based (let* ([delimiter (read-char in)]
;; pattern would match 0 characters. Meanwhile, the caller of this function [value (list->string
;; expects characters to be read. (case delimiter
(let ([s (regexp-match #rx#"^[^&<]*" in)]) [(#\' #\")
(make-pcdata start (let read-more ()
(file-position in) (let ([c (read-char in)])
(bytes->string/utf-8 (cond
(if (trim-whitespace) [(or (eq? c delimiter) (eof-object? c)) null]
(regexp-replace* #rx#"[ \t\v\r\n]+" (car s) #"") [else (cons c (read-more))])))]
(car s)) [else (cons delimiter (read-up-to (lambda (c) (or (char-whitespace? c) (eq? c #\>))) in))]))])
#\?))))) (make-attribute start (file-position in) name value))]
#| [else (make-attribute start (file-position in) name (symbol->string name))])))
;; skip-space : Input-port -> Void
;; deviation - should sometimes insist on at least one space
(define (skip-space in)
(let loop ()
(let ([c (peek-char in)])
(when (and (not (eof-object? c)) (char-whitespace? c))
(read-char in)
(loop)))))
;; lex-pcdata : Input-port -> Pcdata
;; deviation - disallow ]]> "for compatability" with SGML, sec 2.4 XML spec
(define (lex-pcdata in)
(let ([start (file-position in)])
;; The following regexp match must use bytes, not chars, because
;; `in' might not be a well-formed UTF-8 sequence. If it isn't,
;; and it goes wrong with the first byte sequence, then a char-based
;; pattern would match 0 characters. Meanwhile, the caller of this function
;; expects characters to be read.
(let ([s (regexp-match #rx#"^[^&<]*" in)])
(make-pcdata start
(file-position in)
(bytes->string/utf-8
(if (trim-whitespace)
(regexp-replace* #rx#"[ \t\v\r\n]+" (car s) #"")
(car s))
#\?)))))
#|
;; Original slow version: ;; Original slow version:
(define (lex-pcdata in) (define (lex-pcdata in)
(let ([start (file-position in)] (let ([start (file-position in)]
@ -304,23 +304,23 @@
(file-position in) (file-position in)
(list->string data)))) (list->string data))))
|# |#
;; lex-name : Input-port -> Symbol ;; lex-name : Input-port -> Symbol
(define (lex-name in) (define (lex-name in)
(let ([s (bytes->string/utf-8 (car (regexp-match #rx"^[a-zA-Z_:0-9&.-]*" in)))]) (let ([s (bytes->string/utf-8 (car (regexp-match #rx"^[a-zA-Z_:0-9&.-]*" in)))])
(string->symbol (string->symbol
;; Common case: string is already lowercased ;; Common case: string is already lowercased
(if (regexp-match-positions #rx"[A-Z]" s) (if (regexp-match-positions #rx"[A-Z]" s)
(begin (begin
(string-lowercase! s) (string-lowercase! s)
s) s)
s)))) s))))
;; lex-name/case-sensitive : Input-port -> Symbol ;; lex-name/case-sensitive : Input-port -> Symbol
(define (lex-name/case-sensitive in) (define (lex-name/case-sensitive in)
(let ([s (bytes->string/utf-8 (car (regexp-match #rx"^[a-zA-Z_:0-9&.-]*" in)))]) (let ([s (bytes->string/utf-8 (car (regexp-match #rx"^[a-zA-Z_:0-9&.-]*" in)))])
(string->symbol s))) (string->symbol s)))
#| #|
(define (lex-name in) (define (lex-name in)
(string->symbol (string->symbol
(list->string (list->string
@ -330,100 +330,100 @@
(cons (char-downcase (read-char in)) (lex-rest))] (cons (char-downcase (read-char in)) (lex-rest))]
[else null]))))) [else null])))))
|# |#
;; skip-dtd : Input-port -> Void ;; skip-dtd : Input-port -> Void
(define (skip-dtd in) (define (skip-dtd in)
(let skip () (let skip ()
(let ([c (read-char in)]) (let ([c (read-char in)])
(if (eof-object? c) (if (eof-object? c)
(void) (void)
(case c (case c
[(#\') (read-until #\' in) (skip)] [(#\') (read-until #\' in) (skip)]
[(#\") (read-until #\" in) (skip)] [(#\") (read-until #\" in) (skip)]
[(#\<) [(#\<)
(case (read-char in) (case (read-char in)
[(#\!) (case (read-char in) [(#\!) (case (read-char in)
[(#\-) (read-char in) (lex-comment-contents in) (skip)] [(#\-) (read-char in) (lex-comment-contents in) (skip)]
[else (skip) (skip)])] [else (skip) (skip)])]
[(#\?) (lex-pi-data in) (skip)] [(#\?) (lex-pi-data in) (skip)]
[else (skip) (skip)])] [else (skip) (skip)])]
[(#\>) (void)] [(#\>) (void)]
[else (skip)]))))) [else (skip)])))))
;; name-start? : TST -> Bool ;; name-start? : TST -> Bool
(define (name-start? ch) (define (name-start? ch)
(and (char? ch) (char-name-start? ch))) (and (char? ch) (char-name-start? ch)))
;; char-name-start? : Char -> Bool ;; char-name-start? : Char -> Bool
(define (char-name-start? ch) (define (char-name-start? ch)
(or (char-alphabetic? ch) (or (char-alphabetic? ch)
(eq? ch #\_) (eq? ch #\_)
(eq? ch #\:))) (eq? ch #\:)))
;; name-char? : TST -> Bool ;; name-char? : TST -> Bool
(define (name-char? ch) (define (name-char? ch)
(and (char? ch) (and (char? ch)
(or (char-name-start? ch) (or (char-name-start? ch)
(char-numeric? ch) (char-numeric? ch)
(eq? ch #\&) ; ugly illegal junk for SEC's EDGAR database (eq? ch #\&) ; ugly illegal junk for SEC's EDGAR database
(eq? ch #\.) (eq? ch #\.)
(eq? ch #\-)))) (eq? ch #\-))))
;; read-up-to : (Char -> Bool) Input-port -> (listof Char) ;; read-up-to : (Char -> Bool) Input-port -> (listof Char)
;; abstract this with read-until ;; abstract this with read-until
(define (read-up-to p? in) (define (read-up-to p? in)
(let loop () (let loop ()
(let ([c (peek-char in)]) (let ([c (peek-char in)])
(cond (cond
[(or (eof-object? c) (p? c)) null] [(or (eof-object? c) (p? c)) null]
[else (cons (read-char in) (loop))])))) [else (cons (read-char in) (loop))]))))
;; read-until : Char Input-port -> String ;; read-until : Char Input-port -> String
;; discards the stop character, too ;; discards the stop character, too
(define (read-until char in) (define (read-until char in)
(list->string (list->string
(let read-more () (let read-more ()
(let ([c (read-char in)]) (let ([c (read-char in)])
(cond (cond
[(or (eof-object? c) (eq? c char)) null] [(or (eof-object? c) (eq? c char)) null]
[else (cons c (read-more))]))))) [else (cons c (read-more))])))))
;; gen-read-until-string : String -> Input-port -> String ;; gen-read-until-string : String -> Input-port -> String
;; uses Knuth-Morris-Pratt from ;; uses Knuth-Morris-Pratt from
;; Introduction to Algorithms, Cormen, Leiserson, and Rivest, pages 869-876 ;; Introduction to Algorithms, Cormen, Leiserson, and Rivest, pages 869-876
;; discards stop from input ;; discards stop from input
(define (gen-read-until-string stop) (define (gen-read-until-string stop)
(let* ([len (string-length stop)] (let* ([len (string-length stop)]
[prefix (make-vector len 0)] [prefix (make-vector len 0)]
[fall-back [fall-back
(lambda (k c) (lambda (k c)
(let ([k (let loop ([k k]) (let ([k (let loop ([k k])
(cond (cond
[(and (> k 0) (not (eq? (string-ref stop k) c))) [(and (> k 0) (not (eq? (string-ref stop k) c)))
(loop (vector-ref prefix (sub1 k)))] (loop (vector-ref prefix (sub1 k)))]
[else k]))]) [else k]))])
(if (eq? (string-ref stop k) c) (if (eq? (string-ref stop k) c)
(add1 k) (add1 k)
k)))]) k)))])
(let init ([k 0] [q 1]) (let init ([k 0] [q 1])
(when (< q len) (when (< q len)
(let ([k (fall-back k (string-ref stop q))]) (let ([k (fall-back k (string-ref stop q))])
(vector-set! prefix q k) (vector-set! prefix q k)
(init k (add1 q))))) (init k (add1 q)))))
;; (vector-ref prefix x) = the longest suffix that matches a prefix of stop ;; (vector-ref prefix x) = the longest suffix that matches a prefix of stop
(lambda (in) (lambda (in)
(list->string (list->string
(let/ec out (let/ec out
(let loop ([matched 0] [out out]) (let loop ([matched 0] [out out])
(let* ([c (read-char in)] (let* ([c (read-char in)]
[matched (fall-back matched c)]) [matched (fall-back matched c)])
(cond (cond
[(or (eof-object? c) (= matched len)) (out null)] [(or (eof-object? c) (= matched len)) (out null)]
[(zero? matched) (cons c (let/ec out (loop matched out)))] [(zero? matched) (cons c (let/ec out (loop matched out)))]
[else (cons c (loop matched out))])))))))) [else (cons c (loop matched out))]))))))))
;; "-->" makes more sense, but "--" follows the spec, but this isn't XML anymore. ;; "-->" makes more sense, but "--" follows the spec, but this isn't XML anymore.
(define lex-comment-contents (gen-read-until-string "-->")) (define lex-comment-contents (gen-read-until-string "-->"))
(define lex-pi-data (gen-read-until-string "?>")) (define lex-pi-data (gen-read-until-string "?>"))
(define lex-cdata-contents (gen-read-until-string "]]>")) (define lex-cdata-contents (gen-read-until-string "]]>"))