svn: r13855
This commit is contained in:
Jay McCarthy 2009-02-26 17:15:44 +00:00
parent 196ec00f16
commit 243fb2f0e4
2 changed files with 512 additions and 512 deletions

View File

@ -12,30 +12,30 @@
(provide-signature-elements html^)
;; Html-content = Html-element | Pc-data | Entity
;; Html-content = Html-element | Pc-data | Entity
(include "html-structs.ss")
(include "case.ss")
(include "html-structs.ss")
(include "case.ss")
;; xml->html : Document -> Html
(define (xml->html doc)
;; xml->html : Document -> Html
(define (xml->html doc)
(let ([root (document-element doc)])
(unless (eq? 'html (element-name root))
(error 'xml->html "This is not an html document. Expected 'html, given ~a" (element-name root)))
(make-html (element-attributes root) (xml-contents->html (element-content root)))))
;; xml-content->html : (listof Content) -> (listof Html-element)
(define (xml-contents->html contents)
;; xml-content->html : (listof Content) -> (listof Html-element)
(define (xml-contents->html contents)
(foldr xml-single-content->html
null
contents))
;; read-xhtml : [Input-port] -> Html
(define read-xhtml (compose xml->html read-xml))
;; read-xhtml : [Input-port] -> Html
(define read-xhtml (compose xml->html read-xml))
;; peel-f : (Html-content -> Bool) (listof Html-content) (listof Html-content) -> (listof Html-content)
(define (peel-f toss? to-toss acc0)
;; peel-f : (Html-content -> Bool) (listof Html-content) (listof Html-content) -> (listof Html-content)
(define (peel-f toss? to-toss acc0)
(foldr (lambda (x acc)
(if (toss? x)
(append (html-full-content x) acc)
@ -43,8 +43,8 @@
acc0
to-toss))
;; repackage-html : (listof Html-content) -> Html
(define (repackage-html contents)
;; repackage-html : (listof Html-content) -> Html
(define (repackage-html contents)
(let* ([html (memf html? contents)]
[peeled (peel-f html? contents null)]
[body (memf body? peeled)])
@ -57,14 +57,14 @@
null)
(filter (compose not head?) (peel-f body? peeled null))))))))
;; clean-up-pcdata : (listof Content) -> (listof Content)
;; Each pcdata inside a tag that isn't supposed to contain pcdata is either
;; a) appended to the end of the previous subelement, if that subelement may contain pcdata
;; b) prepended to the front of the next subelement, if that subelement may contain pcdata
;; c) discarded
;; unknown tags may contain pcdata
;; the top level may contain pcdata
(define clean-up-pcdata
;; clean-up-pcdata : (listof Content) -> (listof Content)
;; Each pcdata inside a tag that isn't supposed to contain pcdata is either
;; a) appended to the end of the previous subelement, if that subelement may contain pcdata
;; b) prepended to the front of the next subelement, if that subelement may contain pcdata
;; c) discarded
;; unknown tags may contain pcdata
;; the top level may contain pcdata
(define clean-up-pcdata
;; clean-up-pcdata : (listof Content) -> (listof Content)
(letrec ([clean-up-pcdata
(lambda (content)
@ -95,34 +95,34 @@
null)))])
clean-up-pcdata))
;; first-non-elements : (listof Content) -> (listof Content)
(define (first-non-elements content)
;; first-non-elements : (listof Content) -> (listof Content)
(define (first-non-elements content)
(cond
[(null? content) null]
[else (if (element? (car content))
null
(cons (car content) (first-non-elements (cdr content))))]))
;; recontent-xml : Element (listof Content) -> Element
(define (recontent-xml e c)
;; recontent-xml : Element (listof Content) -> Element
(define (recontent-xml e c)
(make-element (source-start e) (source-stop e) (element-name e) (element-attributes e) c))
;; implicit-starts : Symbol Symbol -> (U #f Symbol)
(define (implicit-starts parent child)
;; implicit-starts : Symbol Symbol -> (U #f Symbol)
(define (implicit-starts parent child)
(or (and (eq? child 'tr) (eq? parent 'table) 'tbody)
(and (eq? child 'td) (memq parent '(table tbody tfoot thead)) 'tr)))
;; may-contain : Kid-lister
(define may-contain
;; may-contain : Kid-lister
(define may-contain
(sgml:gen-may-contain html-spec))
(define may-contain-anything
(define may-contain-anything
(sgml:gen-may-contain null))
(define use-html-spec (make-parameter #t))
(define use-html-spec (make-parameter #t))
;; read-html-as-xml : [Input-port] -> (listof Content)
(define read-html-as-xml
;; read-html-as-xml : [Input-port] -> (listof Content)
(define read-html-as-xml
(case-lambda
[(port)
((if (use-html-spec) clean-up-pcdata values)
@ -132,6 +132,6 @@
implicit-starts) port))]
[() (read-html-as-xml (current-input-port))]))
;; read-html : [Input-port] -> Html
(define read-html
;; read-html : [Input-port] -> Html
(define read-html
(compose repackage-html xml-contents->html read-html-as-xml))

View File

@ -9,21 +9,21 @@
(provide-signature-elements sgml-reader^)
;; Start-tag ::= (make-start-tag Location Location Symbol (listof Attribute))
(define-struct (start-tag source) (name attrs))
;; Start-tag ::= (make-start-tag Location Location Symbol (listof Attribute))
(define-struct (start-tag source) (name attrs))
;; End-tag ::= (make-end-tag Location Location Symbol)
(define-struct (end-tag source) (name))
;; End-tag ::= (make-end-tag Location Location Symbol)
(define-struct (end-tag source) (name))
;; Token ::= Contents | Start-tag | End-tag | Eof
;; Token ::= Contents | Start-tag | End-tag | Eof
(define read-html-comments (make-parameter #f))
(define trim-whitespace (make-parameter #f))
(define read-html-comments (make-parameter #f))
(define trim-whitespace (make-parameter #f))
;; Kid-lister : (Symbol -> (U (listof Symbol) #f))
;; Kid-lister : (Symbol -> (U (listof Symbol) #f))
;; gen-may-contain : Spec -> Kid-lister
(define (gen-may-contain spec)
;; gen-may-contain : Spec -> Kid-lister
(define (gen-may-contain spec)
(let ([table (make-hash)])
(for-each (lambda (def)
(let ([rhs (cdr def)])
@ -33,14 +33,14 @@
(lambda (name)
(hash-ref table name (lambda () #f)))))
;; gen-read-sgml : Kid-lister (Symbol Symbol -> (U #f Symbol)) -> [Input-port] -> (listof Content)
(define (gen-read-sgml may-contain auto-insert)
;; gen-read-sgml : Kid-lister (Symbol Symbol -> (U #f Symbol)) -> [Input-port] -> (listof Content)
(define (gen-read-sgml may-contain auto-insert)
(case-lambda
[(in) (read-from-port may-contain auto-insert in)]
[() (read-from-port may-contain auto-insert (current-input-port))]))
;; read-from-port : Kid-lister (Symbol Symbol -> (U #f Symbol)) Input-port -> (listof Content)
(define (read-from-port may-contain auto-insert in)
;; read-from-port : Kid-lister (Symbol Symbol -> (U #f Symbol)) Input-port -> (listof Content)
(define (read-from-port may-contain auto-insert in)
(let loop ([tokens (let read-tokens ()
(let ([tok (lex in)])
(cond
@ -58,18 +58,18 @@
[else (let ([rest-contents (loop rest-tokens)])
(expand-content tok rest-contents))]))])))
;; read-element : Start-tag (listof Symbol) Kid-lister (Symbol Symbol -> (U #f Symbol)) (listof Token) -> Element (listof Token)
;; Note: How elements nest depends on their content model.
;; If a kind of element can't contain anything, then its start tags are implicitly ended, and
;; end tags are implicitly started.
;; Unknown elements can contain anything and can go inside anything.
;; Otherwise, only the subelements listed in the content model can go inside an element.
;; more here - may-contain shouldn't be used to decide if an element is known or not.
;; The edgar dtd puts tags in may-contain's range that aren't in its domain.
;; more here (or not) - the (memq name context) test leaks for a worst case of O(n^2) in the
;; tag nesting depth. However, this only should be a problem when the tag is there,
;; but far back. That shouldn't happen often. I'm guessing n will be about 3.
(define (read-element start-tag context may-contain auto-insert tokens)
;; read-element : Start-tag (listof Symbol) Kid-lister (Symbol Symbol -> (U #f Symbol)) (listof Token) -> Element (listof Token)
;; Note: How elements nest depends on their content model.
;; If a kind of element can't contain anything, then its start tags are implicitly ended, and
;; end tags are implicitly started.
;; Unknown elements can contain anything and can go inside anything.
;; Otherwise, only the subelements listed in the content model can go inside an element.
;; more here - may-contain shouldn't be used to decide if an element is known or not.
;; The edgar dtd puts tags in may-contain's range that aren't in its domain.
;; more here (or not) - the (memq name context) test leaks for a worst case of O(n^2) in the
;; tag nesting depth. However, this only should be a problem when the tag is there,
;; but far back. That shouldn't happen often. I'm guessing n will be about 3.
(define (read-element start-tag context may-contain auto-insert tokens)
(let read-el ([start-tag start-tag] [context (cons (start-tag-name start-tag) context)] [tokens tokens])
(let* ([start-name (start-tag-name start-tag)]
[ok-kids (may-contain start-name)])
@ -116,8 +116,8 @@
content)
remaining)))))
;; expand-content : Content (listof Content) -> (listof Content)
(define (expand-content x lst)
;; expand-content : Content (listof Content) -> (listof Content)
(define (expand-content x lst)
(cond
[(entity? x) (cons (expand-entity x) lst)]
[(comment? x) (if (read-html-comments)
@ -125,16 +125,16 @@
lst)]
[else (cons x lst)]))
;; expand-entity : Entity -> (U Entity Pcdata)
;; more here - allow expansion of user defined entities
(define (expand-entity x)
;; expand-entity : Entity -> (U Entity Pcdata)
;; more here - allow expansion of user defined entities
(define (expand-entity x)
(let ([expanded (default-entity-table (entity-text x))])
(if expanded
(make-pcdata (source-start x) (source-stop x) expanded)
x)))
;; default-entity-table : Symbol -> (U #f String)
(define (default-entity-table name)
;; default-entity-table : Symbol -> (U #f String)
(define (default-entity-table name)
(case name
[(amp) "&"]
[(lt) "<"]
@ -143,8 +143,8 @@
[(apos) "'"]
[else #f]))
;; lex : Input-port -> Token
(define (lex in)
;; lex : Input-port -> Token
(define (lex in)
(when (trim-whitespace)
(skip-space in))
(let ([c (peek-char in)])
@ -154,9 +154,9 @@
[(eq? c #\<) (lex-tag-cdata-pi-comment in)]
[else (lex-pcdata in)])))
;; lex-entity : Input-port -> Token
;; This might not return an entity if it doesn't look like one afterall.
(define (lex-entity in)
;; lex-entity : Input-port -> Token
;; This might not return an entity if it doesn't look like one afterall.
(define (lex-entity in)
(let ([start (file-position in)])
(read-char in)
(case (peek-char in)
@ -181,8 +181,8 @@
(begin (read-char in) (make-entity start (file-position in) name))
(make-pcdata start (file-position in) (format "&~a" name))))])))
;; lex-tag-cdata-pi-comment : Input-port -> Start-tag | Element | End-tag | Pcdata | Pi | Comment
(define (lex-tag-cdata-pi-comment in)
;; lex-tag-cdata-pi-comment : Input-port -> Start-tag | Element | End-tag | Pcdata | Pi | Comment
(define (lex-tag-cdata-pi-comment in)
(let ([start (file-position in)])
(read-char in)
(case (peek-char in)
@ -224,8 +224,8 @@
[else (make-start-tag start (file-position in) name attrs)]))])))
;; lex-attributes : Input-port -> (listof Attribute)
(define (lex-attributes in)
;; lex-attributes : Input-port -> (listof Attribute)
(define (lex-attributes in)
(sort (let loop ()
(skip-space in)
(cond [(name-start? (peek-char in))
@ -235,9 +235,9 @@
(string<? (symbol->string (attribute-name a))
(symbol->string (attribute-name b))))))
;; lex-attribute : Input-port -> Attribute
;; Note: entities in attributes are ignored, since defacto html uses & in them for URL syntax
(define (lex-attribute in)
;; lex-attribute : Input-port -> Attribute
;; Note: entities in attributes are ignored, since defacto html uses & in them for URL syntax
(define (lex-attribute in)
(let ([start (file-position in)]
[name (lex-name in)])
(skip-space in)
@ -258,18 +258,18 @@
(make-attribute start (file-position in) name value))]
[else (make-attribute start (file-position in) name (symbol->string name))])))
;; skip-space : Input-port -> Void
;; deviation - should sometimes insist on at least one space
(define (skip-space in)
;; skip-space : Input-port -> Void
;; deviation - should sometimes insist on at least one space
(define (skip-space in)
(let loop ()
(let ([c (peek-char in)])
(when (and (not (eof-object? c)) (char-whitespace? c))
(read-char in)
(loop)))))
;; lex-pcdata : Input-port -> Pcdata
;; deviation - disallow ]]> "for compatability" with SGML, sec 2.4 XML spec
(define (lex-pcdata in)
;; lex-pcdata : Input-port -> Pcdata
;; deviation - disallow ]]> "for compatability" with SGML, sec 2.4 XML spec
(define (lex-pcdata in)
(let ([start (file-position in)])
;; The following regexp match must use bytes, not chars, because
;; `in' might not be a well-formed UTF-8 sequence. If it isn't,
@ -284,7 +284,7 @@
(regexp-replace* #rx#"[ \t\v\r\n]+" (car s) #"")
(car s))
#\?)))))
#|
#|
;; Original slow version:
(define (lex-pcdata in)
(let ([start (file-position in)]
@ -306,8 +306,8 @@
|#
;; lex-name : Input-port -> Symbol
(define (lex-name in)
;; lex-name : Input-port -> Symbol
(define (lex-name in)
(let ([s (bytes->string/utf-8 (car (regexp-match #rx"^[a-zA-Z_:0-9&.-]*" in)))])
(string->symbol
;; Common case: string is already lowercased
@ -316,11 +316,11 @@
(string-lowercase! s)
s)
s))))
;; lex-name/case-sensitive : Input-port -> Symbol
(define (lex-name/case-sensitive in)
;; lex-name/case-sensitive : Input-port -> Symbol
(define (lex-name/case-sensitive in)
(let ([s (bytes->string/utf-8 (car (regexp-match #rx"^[a-zA-Z_:0-9&.-]*" in)))])
(string->symbol s)))
#|
#|
(define (lex-name in)
(string->symbol
(list->string
@ -332,8 +332,8 @@
|#
;; skip-dtd : Input-port -> Void
(define (skip-dtd in)
;; skip-dtd : Input-port -> Void
(define (skip-dtd in)
(let skip ()
(let ([c (read-char in)])
(if (eof-object? c)
@ -351,18 +351,18 @@
[(#\>) (void)]
[else (skip)])))))
;; name-start? : TST -> Bool
(define (name-start? ch)
;; name-start? : TST -> Bool
(define (name-start? ch)
(and (char? ch) (char-name-start? ch)))
;; char-name-start? : Char -> Bool
(define (char-name-start? ch)
;; char-name-start? : Char -> Bool
(define (char-name-start? ch)
(or (char-alphabetic? ch)
(eq? ch #\_)
(eq? ch #\:)))
;; name-char? : TST -> Bool
(define (name-char? ch)
;; name-char? : TST -> Bool
(define (name-char? ch)
(and (char? ch)
(or (char-name-start? ch)
(char-numeric? ch)
@ -370,18 +370,18 @@
(eq? ch #\.)
(eq? ch #\-))))
;; read-up-to : (Char -> Bool) Input-port -> (listof Char)
;; abstract this with read-until
(define (read-up-to p? in)
;; read-up-to : (Char -> Bool) Input-port -> (listof Char)
;; abstract this with read-until
(define (read-up-to p? in)
(let loop ()
(let ([c (peek-char in)])
(cond
[(or (eof-object? c) (p? c)) null]
[else (cons (read-char in) (loop))]))))
;; read-until : Char Input-port -> String
;; discards the stop character, too
(define (read-until char in)
;; read-until : Char Input-port -> String
;; discards the stop character, too
(define (read-until char in)
(list->string
(let read-more ()
(let ([c (read-char in)])
@ -389,11 +389,11 @@
[(or (eof-object? c) (eq? c char)) null]
[else (cons c (read-more))])))))
;; gen-read-until-string : String -> Input-port -> String
;; uses Knuth-Morris-Pratt from
;; Introduction to Algorithms, Cormen, Leiserson, and Rivest, pages 869-876
;; discards stop from input
(define (gen-read-until-string stop)
;; gen-read-until-string : String -> Input-port -> String
;; uses Knuth-Morris-Pratt from
;; Introduction to Algorithms, Cormen, Leiserson, and Rivest, pages 869-876
;; discards stop from input
(define (gen-read-until-string stop)
(let* ([len (string-length stop)]
[prefix (make-vector len 0)]
[fall-back
@ -423,7 +423,7 @@
[(zero? matched) (cons c (let/ec out (loop matched out)))]
[else (cons c (loop matched out))]))))))))
;; "-->" makes more sense, but "--" follows the spec, but this isn't XML anymore.
(define lex-comment-contents (gen-read-until-string "-->"))
(define lex-pi-data (gen-read-until-string "?>"))
(define lex-cdata-contents (gen-read-until-string "]]>"))
;; "-->" makes more sense, but "--" follows the spec, but this isn't XML anymore.
(define lex-comment-contents (gen-read-until-string "-->"))
(define lex-pi-data (gen-read-until-string "?>"))
(define lex-cdata-contents (gen-read-until-string "]]>"))