diff --git a/collects/html/html-mod.ss b/collects/html/html-mod.ss
index 49a6f71b9e..5356f9949d 100644
--- a/collects/html/html-mod.ss
+++ b/collects/html/html-mod.ss
@@ -8,130 +8,130 @@
"html-spec.ss"
"html-sig.ss"
(prefix-in sgml: "sgml-reader.ss")
- xml)
+ xml)
(provide-signature-elements html^)
-
- ;; Html-content = Html-element | Pc-data | Entity
-
- (include "html-structs.ss")
- (include "case.ss")
-
- ;; xml->html : Document -> Html
- (define (xml->html doc)
- (let ([root (document-element doc)])
- (unless (eq? 'html (element-name root))
- (error 'xml->html "This is not an html document. Expected 'html, given ~a" (element-name root)))
- (make-html (element-attributes root) (xml-contents->html (element-content root)))))
-
-
- ;; xml-content->html : (listof Content) -> (listof Html-element)
- (define (xml-contents->html contents)
- (foldr xml-single-content->html
- null
- contents))
-
- ;; read-xhtml : [Input-port] -> Html
- (define read-xhtml (compose xml->html read-xml))
-
- ;; peel-f : (Html-content -> Bool) (listof Html-content) (listof Html-content) -> (listof Html-content)
- (define (peel-f toss? to-toss acc0)
- (foldr (lambda (x acc)
- (if (toss? x)
- (append (html-full-content x) acc)
- (cons x acc)))
- acc0
- to-toss))
-
- ;; repackage-html : (listof Html-content) -> Html
- (define (repackage-html contents)
- (let* ([html (memf html? contents)]
- [peeled (peel-f html? contents null)]
- [body (memf body? peeled)])
- (make-html (if html
- (html-element-attributes (car html))
- null)
- (append (filter head? peeled)
- (list (make-body (if body
- (html-element-attributes (car body))
- null)
- (filter (compose not head?) (peel-f body? peeled null))))))))
-
+
+;; Html-content = Html-element | Pc-data | Entity
+
+(include "html-structs.ss")
+(include "case.ss")
+
+;; xml->html : Document -> Html
+(define (xml->html doc)
+ (let ([root (document-element doc)])
+ (unless (eq? 'html (element-name root))
+ (error 'xml->html "This is not an html document. Expected 'html, given ~a" (element-name root)))
+ (make-html (element-attributes root) (xml-contents->html (element-content root)))))
+
+
+;; xml-content->html : (listof Content) -> (listof Html-element)
+(define (xml-contents->html contents)
+ (foldr xml-single-content->html
+ null
+ contents))
+
+;; read-xhtml : [Input-port] -> Html
+(define read-xhtml (compose xml->html read-xml))
+
+;; peel-f : (Html-content -> Bool) (listof Html-content) (listof Html-content) -> (listof Html-content)
+(define (peel-f toss? to-toss acc0)
+ (foldr (lambda (x acc)
+ (if (toss? x)
+ (append (html-full-content x) acc)
+ (cons x acc)))
+ acc0
+ to-toss))
+
+;; repackage-html : (listof Html-content) -> Html
+(define (repackage-html contents)
+ (let* ([html (memf html? contents)]
+ [peeled (peel-f html? contents null)]
+ [body (memf body? peeled)])
+ (make-html (if html
+ (html-element-attributes (car html))
+ null)
+ (append (filter head? peeled)
+ (list (make-body (if body
+ (html-element-attributes (car body))
+ null)
+ (filter (compose not head?) (peel-f body? peeled null))))))))
+
+;; clean-up-pcdata : (listof Content) -> (listof Content)
+;; Each pcdata inside a tag that isn't supposed to contain pcdata is either
+;; a) appended to the end of the previous subelement, if that subelement may contain pcdata
+;; b) prepended to the front of the next subelement, if that subelement may contain pcdata
+;; c) discarded
+;; unknown tags may contain pcdata
+;; the top level may contain pcdata
+(define clean-up-pcdata
;; clean-up-pcdata : (listof Content) -> (listof Content)
- ;; Each pcdata inside a tag that isn't supposed to contain pcdata is either
- ;; a) appended to the end of the previous subelement, if that subelement may contain pcdata
- ;; b) prepended to the front of the next subelement, if that subelement may contain pcdata
- ;; c) discarded
- ;; unknown tags may contain pcdata
- ;; the top level may contain pcdata
- (define clean-up-pcdata
- ;; clean-up-pcdata : (listof Content) -> (listof Content)
- (letrec ([clean-up-pcdata
- (lambda (content)
- (map (lambda (to-fix)
- (cond
- [(element? to-fix)
- (recontent-xml to-fix
- (let ([possible (may-contain (element-name to-fix))]
- [content (element-content to-fix)])
- (if (or (not possible) (memq 'pcdata possible))
- (clean-up-pcdata content)
- (eliminate-pcdata content))))]
- [else to-fix]))
- content))]
- [eliminate-pcdata
- ;: (listof Content) -> (listof Content)
- (lambda (content)
- (let ([non-elements (first-non-elements content)]
- [more (memf element? content)])
- (if more
- (let* ([el (car more)]
- [possible (may-contain (element-name el))])
- (if (or (not possible) (memq 'pcdata possible))
- (cons (recontent-xml el (append non-elements (clean-up-pcdata (element-content el)) (eliminate-pcdata (first-non-elements (cdr more)))))
- (or (memf element? (cdr more)) null))
- (cons (recontent-xml el (eliminate-pcdata (element-content el)))
- (eliminate-pcdata (cdr more)))))
- null)))])
- clean-up-pcdata))
-
- ;; first-non-elements : (listof Content) -> (listof Content)
- (define (first-non-elements content)
- (cond
- [(null? content) null]
- [else (if (element? (car content))
- null
- (cons (car content) (first-non-elements (cdr content))))]))
-
- ;; recontent-xml : Element (listof Content) -> Element
- (define (recontent-xml e c)
- (make-element (source-start e) (source-stop e) (element-name e) (element-attributes e) c))
-
- ;; implicit-starts : Symbol Symbol -> (U #f Symbol)
- (define (implicit-starts parent child)
- (or (and (eq? child 'tr) (eq? parent 'table) 'tbody)
- (and (eq? child 'td) (memq parent '(table tbody tfoot thead)) 'tr)))
-
- ;; may-contain : Kid-lister
- (define may-contain
- (sgml:gen-may-contain html-spec))
-
- (define may-contain-anything
- (sgml:gen-may-contain null))
-
- (define use-html-spec (make-parameter #t))
-
- ;; read-html-as-xml : [Input-port] -> (listof Content)
- (define read-html-as-xml
- (case-lambda
- [(port)
- ((if (use-html-spec) clean-up-pcdata values)
- ((sgml:gen-read-sgml (if (use-html-spec)
- may-contain
- may-contain-anything)
- implicit-starts) port))]
- [() (read-html-as-xml (current-input-port))]))
-
- ;; read-html : [Input-port] -> Html
- (define read-html
- (compose repackage-html xml-contents->html read-html-as-xml))
\ No newline at end of file
+ (letrec ([clean-up-pcdata
+ (lambda (content)
+ (map (lambda (to-fix)
+ (cond
+ [(element? to-fix)
+ (recontent-xml to-fix
+ (let ([possible (may-contain (element-name to-fix))]
+ [content (element-content to-fix)])
+ (if (or (not possible) (memq 'pcdata possible))
+ (clean-up-pcdata content)
+ (eliminate-pcdata content))))]
+ [else to-fix]))
+ content))]
+ [eliminate-pcdata
+ ;: (listof Content) -> (listof Content)
+ (lambda (content)
+ (let ([non-elements (first-non-elements content)]
+ [more (memf element? content)])
+ (if more
+ (let* ([el (car more)]
+ [possible (may-contain (element-name el))])
+ (if (or (not possible) (memq 'pcdata possible))
+ (cons (recontent-xml el (append non-elements (clean-up-pcdata (element-content el)) (eliminate-pcdata (first-non-elements (cdr more)))))
+ (or (memf element? (cdr more)) null))
+ (cons (recontent-xml el (eliminate-pcdata (element-content el)))
+ (eliminate-pcdata (cdr more)))))
+ null)))])
+ clean-up-pcdata))
+
+;; first-non-elements : (listof Content) -> (listof Content)
+(define (first-non-elements content)
+ (cond
+ [(null? content) null]
+ [else (if (element? (car content))
+ null
+ (cons (car content) (first-non-elements (cdr content))))]))
+
+;; recontent-xml : Element (listof Content) -> Element
+(define (recontent-xml e c)
+ (make-element (source-start e) (source-stop e) (element-name e) (element-attributes e) c))
+
+;; implicit-starts : Symbol Symbol -> (U #f Symbol)
+(define (implicit-starts parent child)
+ (or (and (eq? child 'tr) (eq? parent 'table) 'tbody)
+ (and (eq? child 'td) (memq parent '(table tbody tfoot thead)) 'tr)))
+
+;; may-contain : Kid-lister
+(define may-contain
+ (sgml:gen-may-contain html-spec))
+
+(define may-contain-anything
+ (sgml:gen-may-contain null))
+
+(define use-html-spec (make-parameter #t))
+
+;; read-html-as-xml : [Input-port] -> (listof Content)
+(define read-html-as-xml
+ (case-lambda
+ [(port)
+ ((if (use-html-spec) clean-up-pcdata values)
+ ((sgml:gen-read-sgml (if (use-html-spec)
+ may-contain
+ may-contain-anything)
+ implicit-starts) port))]
+ [() (read-html-as-xml (current-input-port))]))
+
+;; read-html : [Input-port] -> Html
+(define read-html
+ (compose repackage-html xml-contents->html read-html-as-xml))
\ No newline at end of file
diff --git a/collects/html/sgml-reader.ss b/collects/html/sgml-reader.ss
index 4dfe8a95be..6de4bbb93b 100644
--- a/collects/html/sgml-reader.ss
+++ b/collects/html/sgml-reader.ss
@@ -5,286 +5,286 @@
(require mzlib/list
mzlib/string
"sgml-reader-sig.ss"
- xml)
+ xml)
(provide-signature-elements sgml-reader^)
- ;; Start-tag ::= (make-start-tag Location Location Symbol (listof Attribute))
- (define-struct (start-tag source) (name attrs))
-
- ;; End-tag ::= (make-end-tag Location Location Symbol)
- (define-struct (end-tag source) (name))
-
- ;; Token ::= Contents | Start-tag | End-tag | Eof
-
- (define read-html-comments (make-parameter #f))
- (define trim-whitespace (make-parameter #f))
-
- ;; Kid-lister : (Symbol -> (U (listof Symbol) #f))
-
- ;; gen-may-contain : Spec -> Kid-lister
- (define (gen-may-contain spec)
- (let ([table (make-hash)])
- (for-each (lambda (def)
- (let ([rhs (cdr def)])
- (for-each (lambda (name) (hash-set! table name rhs))
- (car def))))
- spec)
- (lambda (name)
- (hash-ref table name (lambda () #f)))))
-
- ;; gen-read-sgml : Kid-lister (Symbol Symbol -> (U #f Symbol)) -> [Input-port] -> (listof Content)
- (define (gen-read-sgml may-contain auto-insert)
- (case-lambda
- [(in) (read-from-port may-contain auto-insert in)]
- [() (read-from-port may-contain auto-insert (current-input-port))]))
-
- ;; read-from-port : Kid-lister (Symbol Symbol -> (U #f Symbol)) Input-port -> (listof Content)
- (define (read-from-port may-contain auto-insert in)
- (let loop ([tokens (let read-tokens ()
- (let ([tok (lex in)])
- (cond
- [(eof-object? tok) null]
- [else (cons tok (read-tokens))])))])
- (cond
- [(null? tokens) null]
- [else
- (let ([tok (car tokens)] [rest-tokens (cdr tokens)])
- (cond
- [(start-tag? tok)
- (let-values ([(el more-tokens) (read-element tok null may-contain auto-insert rest-tokens)])
- (cons el (loop more-tokens)))]
- [(end-tag? tok) (loop rest-tokens)]
- [else (let ([rest-contents (loop rest-tokens)])
- (expand-content tok rest-contents))]))])))
-
- ;; read-element : Start-tag (listof Symbol) Kid-lister (Symbol Symbol -> (U #f Symbol)) (listof Token) -> Element (listof Token)
- ;; Note: How elements nest depends on their content model.
- ;; If a kind of element can't contain anything, then its start tags are implicitly ended, and
- ;; end tags are implicitly started.
- ;; Unknown elements can contain anything and can go inside anything.
- ;; Otherwise, only the subelements listed in the content model can go inside an element.
- ;; more here - may-contain shouldn't be used to decide if an element is known or not.
- ;; The edgar dtd puts tags in may-contain's range that aren't in its domain.
- ;; more here (or not) - the (memq name context) test leaks for a worst case of O(n^2) in the
- ;; tag nesting depth. However, this only should be a problem when the tag is there,
- ;; but far back. That shouldn't happen often. I'm guessing n will be about 3.
- (define (read-element start-tag context may-contain auto-insert tokens)
- (let read-el ([start-tag start-tag] [context (cons (start-tag-name start-tag) context)] [tokens tokens])
- (let* ([start-name (start-tag-name start-tag)]
- [ok-kids (may-contain start-name)])
- (let-values ([(content remaining)
- (cond
- [(null? ok-kids) (values null tokens)]
- [else
- ;; read-content : (listof Token) -> (listof Content) (listof Token)
- (let read-content ([tokens tokens])
- (cond
- [(null? tokens) (values null tokens)]
- [else
- (let ([tok (car tokens)] [next-tokens (cdr tokens)])
- (cond
- [(start-tag? tok)
- (let* ([name (start-tag-name tok)]
- [auto-start (auto-insert start-name name)])
- (if auto-start
- (read-content (cons (make-start-tag (source-start tok) (source-stop tok) auto-start null) tokens))
- (if (and ok-kids
- (not (memq name ok-kids))
- (may-contain name))
- (values null tokens)
- (let*-values ([(element post-element)
- (read-el tok (cons name context) next-tokens)]
- [(more-contents left-overs) (read-content post-element)])
- (values (cons element more-contents) left-overs)))))]
- [(end-tag? tok)
- (let ([name (end-tag-name tok)])
- (if (eq? name start-name)
- (values null next-tokens)
- (if (memq name context)
- (values null tokens)
- (read-content next-tokens))))]
- [else ;; content
- (let-values ([(more-contents left-overs) (read-content next-tokens)])
- (values
- (expand-content tok more-contents)
- left-overs))]))]))])])
- (values (make-element (source-start start-tag)
- (source-stop start-tag)
- start-name
- (start-tag-attrs start-tag)
- content)
- remaining)))))
-
- ;; expand-content : Content (listof Content) -> (listof Content)
- (define (expand-content x lst)
+;; Start-tag ::= (make-start-tag Location Location Symbol (listof Attribute))
+(define-struct (start-tag source) (name attrs))
+
+;; End-tag ::= (make-end-tag Location Location Symbol)
+(define-struct (end-tag source) (name))
+
+;; Token ::= Contents | Start-tag | End-tag | Eof
+
+(define read-html-comments (make-parameter #f))
+(define trim-whitespace (make-parameter #f))
+
+;; Kid-lister : (Symbol -> (U (listof Symbol) #f))
+
+;; gen-may-contain : Spec -> Kid-lister
+(define (gen-may-contain spec)
+ (let ([table (make-hash)])
+ (for-each (lambda (def)
+ (let ([rhs (cdr def)])
+ (for-each (lambda (name) (hash-set! table name rhs))
+ (car def))))
+ spec)
+ (lambda (name)
+ (hash-ref table name (lambda () #f)))))
+
+;; gen-read-sgml : Kid-lister (Symbol Symbol -> (U #f Symbol)) -> [Input-port] -> (listof Content)
+(define (gen-read-sgml may-contain auto-insert)
+ (case-lambda
+ [(in) (read-from-port may-contain auto-insert in)]
+ [() (read-from-port may-contain auto-insert (current-input-port))]))
+
+;; read-from-port : Kid-lister (Symbol Symbol -> (U #f Symbol)) Input-port -> (listof Content)
+(define (read-from-port may-contain auto-insert in)
+ (let loop ([tokens (let read-tokens ()
+ (let ([tok (lex in)])
+ (cond
+ [(eof-object? tok) null]
+ [else (cons tok (read-tokens))])))])
(cond
- [(entity? x) (cons (expand-entity x) lst)]
- [(comment? x) (if (read-html-comments)
- (cons x lst)
- lst)]
- [else (cons x lst)]))
-
- ;; expand-entity : Entity -> (U Entity Pcdata)
- ;; more here - allow expansion of user defined entities
- (define (expand-entity x)
- (let ([expanded (default-entity-table (entity-text x))])
- (if expanded
- (make-pcdata (source-start x) (source-stop x) expanded)
- x)))
-
- ;; default-entity-table : Symbol -> (U #f String)
- (define (default-entity-table name)
- (case name
- [(amp) "&"]
- [(lt) "<"]
- [(gt) ">"]
- [(quot) "\""]
- [(apos) "'"]
- [else #f]))
-
- ;; lex : Input-port -> Token
- (define (lex in)
- (when (trim-whitespace)
- (skip-space in))
- (let ([c (peek-char in)])
- (cond
- [(eof-object? c) c]
- [(eq? c #\&) (lex-entity in)]
- [(eq? c #\<) (lex-tag-cdata-pi-comment in)]
- [else (lex-pcdata in)])))
-
- ;; lex-entity : Input-port -> Token
- ;; This might not return an entity if it doesn't look like one afterall.
- (define (lex-entity in)
- (let ([start (file-position in)])
- (read-char in)
- (case (peek-char in)
- ;; more here - read while it's numeric (or hex) not until #\;
- [(#\#)
- (read-char in)
- (let* ([hex? (if (equal? #\x (peek-char in))
- (and (read-char in) #t)
- #f)]
- [str (read-until #\; in)]
- [n (cond
- [hex?
- (string->number str 16)]
- [else (string->number str)])])
- (if (number? n)
- (make-entity start (file-position in) n)
- (make-pcdata start (file-position in) (string-append "" str))))]
- [else
- (let ([name (lex-name/case-sensitive in)]
- [c (peek-char in)])
- (if (eq? c #\;)
- (begin (read-char in) (make-entity start (file-position in) name))
- (make-pcdata start (file-position in) (format "&~a" name))))])))
-
- ;; lex-tag-cdata-pi-comment : Input-port -> Start-tag | Element | End-tag | Pcdata | Pi | Comment
- (define (lex-tag-cdata-pi-comment in)
- (let ([start (file-position in)])
- (read-char in)
- (case (peek-char in)
- [(#\!)
- (read-char in)
- (case (peek-char in)
- [(#\-) (read-char in)
- (let ([c (read-char in)])
+ [(null? tokens) null]
+ [else
+ (let ([tok (car tokens)] [rest-tokens (cdr tokens)])
+ (cond
+ [(start-tag? tok)
+ (let-values ([(el more-tokens) (read-element tok null may-contain auto-insert rest-tokens)])
+ (cons el (loop more-tokens)))]
+ [(end-tag? tok) (loop rest-tokens)]
+ [else (let ([rest-contents (loop rest-tokens)])
+ (expand-content tok rest-contents))]))])))
+
+;; read-element : Start-tag (listof Symbol) Kid-lister (Symbol Symbol -> (U #f Symbol)) (listof Token) -> Element (listof Token)
+;; Note: How elements nest depends on their content model.
+;; If a kind of element can't contain anything, then its start tags are implicitly ended, and
+;; end tags are implicitly started.
+;; Unknown elements can contain anything and can go inside anything.
+;; Otherwise, only the subelements listed in the content model can go inside an element.
+;; more here - may-contain shouldn't be used to decide if an element is known or not.
+;; The edgar dtd puts tags in may-contain's range that aren't in its domain.
+;; more here (or not) - the (memq name context) test leaks for a worst case of O(n^2) in the
+;; tag nesting depth. However, this only should be a problem when the tag is there,
+;; but far back. That shouldn't happen often. I'm guessing n will be about 3.
+(define (read-element start-tag context may-contain auto-insert tokens)
+ (let read-el ([start-tag start-tag] [context (cons (start-tag-name start-tag) context)] [tokens tokens])
+ (let* ([start-name (start-tag-name start-tag)]
+ [ok-kids (may-contain start-name)])
+ (let-values ([(content remaining)
(cond
- [(eq? c #\-)
- (let ([data (lex-comment-contents in)])
- (make-comment data))]
- [else (make-pcdata start (file-position in) (format " or whatever else is there
- (make-end-tag start (file-position in) name))]
- [else
- (let ([name (lex-name in)]
- [attrs (lex-attributes in)])
- (skip-space in)
- (case (read-char in)
- [(#\/)
- (read-char in) ;; skip #\> or something
- (make-element start (file-position in) name attrs null)]
- [else (make-start-tag start (file-position in) name attrs)]))])))
-
-
- ;; lex-attributes : Input-port -> (listof Attribute)
- (define (lex-attributes in)
- (sort (let loop ()
- (skip-space in)
- (cond [(name-start? (peek-char in))
- (cons (lex-attribute in) (loop))]
- [else null]))
- (lambda (a b)
- (string (symbol->string (attribute-name a))
- (symbol->string (attribute-name b))))))
-
- ;; lex-attribute : Input-port -> Attribute
- ;; Note: entities in attributes are ignored, since defacto html uses & in them for URL syntax
- (define (lex-attribute in)
- (let ([start (file-position in)]
- [name (lex-name in)])
- (skip-space in)
- (cond
- [(eq? (peek-char in) #\=)
- (read-char in)
+ [(null? ok-kids) (values null tokens)]
+ [else
+ ;; read-content : (listof Token) -> (listof Content) (listof Token)
+ (let read-content ([tokens tokens])
+ (cond
+ [(null? tokens) (values null tokens)]
+ [else
+ (let ([tok (car tokens)] [next-tokens (cdr tokens)])
+ (cond
+ [(start-tag? tok)
+ (let* ([name (start-tag-name tok)]
+ [auto-start (auto-insert start-name name)])
+ (if auto-start
+ (read-content (cons (make-start-tag (source-start tok) (source-stop tok) auto-start null) tokens))
+ (if (and ok-kids
+ (not (memq name ok-kids))
+ (may-contain name))
+ (values null tokens)
+ (let*-values ([(element post-element)
+ (read-el tok (cons name context) next-tokens)]
+ [(more-contents left-overs) (read-content post-element)])
+ (values (cons element more-contents) left-overs)))))]
+ [(end-tag? tok)
+ (let ([name (end-tag-name tok)])
+ (if (eq? name start-name)
+ (values null next-tokens)
+ (if (memq name context)
+ (values null tokens)
+ (read-content next-tokens))))]
+ [else ;; content
+ (let-values ([(more-contents left-overs) (read-content next-tokens)])
+ (values
+ (expand-content tok more-contents)
+ left-overs))]))]))])])
+ (values (make-element (source-start start-tag)
+ (source-stop start-tag)
+ start-name
+ (start-tag-attrs start-tag)
+ content)
+ remaining)))))
+
+;; expand-content : Content (listof Content) -> (listof Content)
+(define (expand-content x lst)
+ (cond
+ [(entity? x) (cons (expand-entity x) lst)]
+ [(comment? x) (if (read-html-comments)
+ (cons x lst)
+ lst)]
+ [else (cons x lst)]))
+
+;; expand-entity : Entity -> (U Entity Pcdata)
+;; more here - allow expansion of user defined entities
+(define (expand-entity x)
+ (let ([expanded (default-entity-table (entity-text x))])
+ (if expanded
+ (make-pcdata (source-start x) (source-stop x) expanded)
+ x)))
+
+;; default-entity-table : Symbol -> (U #f String)
+(define (default-entity-table name)
+ (case name
+ [(amp) "&"]
+ [(lt) "<"]
+ [(gt) ">"]
+ [(quot) "\""]
+ [(apos) "'"]
+ [else #f]))
+
+;; lex : Input-port -> Token
+(define (lex in)
+ (when (trim-whitespace)
+ (skip-space in))
+ (let ([c (peek-char in)])
+ (cond
+ [(eof-object? c) c]
+ [(eq? c #\&) (lex-entity in)]
+ [(eq? c #\<) (lex-tag-cdata-pi-comment in)]
+ [else (lex-pcdata in)])))
+
+;; lex-entity : Input-port -> Token
+;; This might not return an entity if it doesn't look like one afterall.
+(define (lex-entity in)
+ (let ([start (file-position in)])
+ (read-char in)
+ (case (peek-char in)
+ ;; more here - read while it's numeric (or hex) not until #\;
+ [(#\#)
+ (read-char in)
+ (let* ([hex? (if (equal? #\x (peek-char in))
+ (and (read-char in) #t)
+ #f)]
+ [str (read-until #\; in)]
+ [n (cond
+ [hex?
+ (string->number str 16)]
+ [else (string->number str)])])
+ (if (number? n)
+ (make-entity start (file-position in) n)
+ (make-pcdata start (file-position in) (string-append "" str))))]
+ [else
+ (let ([name (lex-name/case-sensitive in)]
+ [c (peek-char in)])
+ (if (eq? c #\;)
+ (begin (read-char in) (make-entity start (file-position in) name))
+ (make-pcdata start (file-position in) (format "&~a" name))))])))
+
+;; lex-tag-cdata-pi-comment : Input-port -> Start-tag | Element | End-tag | Pcdata | Pi | Comment
+(define (lex-tag-cdata-pi-comment in)
+ (let ([start (file-position in)])
+ (read-char in)
+ (case (peek-char in)
+ [(#\!)
+ (read-char in)
+ (case (peek-char in)
+ [(#\-) (read-char in)
+ (let ([c (read-char in)])
+ (cond
+ [(eq? c #\-)
+ (let ([data (lex-comment-contents in)])
+ (make-comment data))]
+ [else (make-pcdata start (file-position in) (format " or whatever else is there
+ (make-end-tag start (file-position in) name))]
+ [else
+ (let ([name (lex-name in)]
+ [attrs (lex-attributes in)])
(skip-space in)
- (let* ([delimiter (read-char in)]
- [value (list->string
- (case delimiter
- [(#\' #\")
- (let read-more ()
- (let ([c (read-char in)])
- (cond
- [(or (eq? c delimiter) (eof-object? c)) null]
- [else (cons c (read-more))])))]
- [else (cons delimiter (read-up-to (lambda (c) (or (char-whitespace? c) (eq? c #\>))) in))]))])
- (make-attribute start (file-position in) name value))]
- [else (make-attribute start (file-position in) name (symbol->string name))])))
-
- ;; skip-space : Input-port -> Void
- ;; deviation - should sometimes insist on at least one space
- (define (skip-space in)
- (let loop ()
- (let ([c (peek-char in)])
- (when (and (not (eof-object? c)) (char-whitespace? c))
- (read-char in)
- (loop)))))
-
- ;; lex-pcdata : Input-port -> Pcdata
- ;; deviation - disallow ]]> "for compatability" with SGML, sec 2.4 XML spec
- (define (lex-pcdata in)
- (let ([start (file-position in)])
- ;; The following regexp match must use bytes, not chars, because
- ;; `in' might not be a well-formed UTF-8 sequence. If it isn't,
- ;; and it goes wrong with the first byte sequence, then a char-based
- ;; pattern would match 0 characters. Meanwhile, the caller of this function
- ;; expects characters to be read.
- (let ([s (regexp-match #rx#"^[^&<]*" in)])
- (make-pcdata start
- (file-position in)
- (bytes->string/utf-8
- (if (trim-whitespace)
- (regexp-replace* #rx#"[ \t\v\r\n]+" (car s) #"")
- (car s))
- #\?)))))
- #|
+ (case (read-char in)
+ [(#\/)
+ (read-char in) ;; skip #\> or something
+ (make-element start (file-position in) name attrs null)]
+ [else (make-start-tag start (file-position in) name attrs)]))])))
+
+
+;; lex-attributes : Input-port -> (listof Attribute)
+(define (lex-attributes in)
+ (sort (let loop ()
+ (skip-space in)
+ (cond [(name-start? (peek-char in))
+ (cons (lex-attribute in) (loop))]
+ [else null]))
+ (lambda (a b)
+ (string (symbol->string (attribute-name a))
+ (symbol->string (attribute-name b))))))
+
+;; lex-attribute : Input-port -> Attribute
+;; Note: entities in attributes are ignored, since defacto html uses & in them for URL syntax
+(define (lex-attribute in)
+ (let ([start (file-position in)]
+ [name (lex-name in)])
+ (skip-space in)
+ (cond
+ [(eq? (peek-char in) #\=)
+ (read-char in)
+ (skip-space in)
+ (let* ([delimiter (read-char in)]
+ [value (list->string
+ (case delimiter
+ [(#\' #\")
+ (let read-more ()
+ (let ([c (read-char in)])
+ (cond
+ [(or (eq? c delimiter) (eof-object? c)) null]
+ [else (cons c (read-more))])))]
+ [else (cons delimiter (read-up-to (lambda (c) (or (char-whitespace? c) (eq? c #\>))) in))]))])
+ (make-attribute start (file-position in) name value))]
+ [else (make-attribute start (file-position in) name (symbol->string name))])))
+
+;; skip-space : Input-port -> Void
+;; deviation - should sometimes insist on at least one space
+(define (skip-space in)
+ (let loop ()
+ (let ([c (peek-char in)])
+ (when (and (not (eof-object? c)) (char-whitespace? c))
+ (read-char in)
+ (loop)))))
+
+;; lex-pcdata : Input-port -> Pcdata
+;; deviation - disallow ]]> "for compatability" with SGML, sec 2.4 XML spec
+(define (lex-pcdata in)
+ (let ([start (file-position in)])
+ ;; The following regexp match must use bytes, not chars, because
+ ;; `in' might not be a well-formed UTF-8 sequence. If it isn't,
+ ;; and it goes wrong with the first byte sequence, then a char-based
+ ;; pattern would match 0 characters. Meanwhile, the caller of this function
+ ;; expects characters to be read.
+ (let ([s (regexp-match #rx#"^[^&<]*" in)])
+ (make-pcdata start
+ (file-position in)
+ (bytes->string/utf-8
+ (if (trim-whitespace)
+ (regexp-replace* #rx#"[ \t\v\r\n]+" (car s) #"")
+ (car s))
+ #\?)))))
+#|
;; Original slow version:
(define (lex-pcdata in)
(let ([start (file-position in)]
@@ -304,23 +304,23 @@
(file-position in)
(list->string data))))
|#
-
-
- ;; lex-name : Input-port -> Symbol
- (define (lex-name in)
- (let ([s (bytes->string/utf-8 (car (regexp-match #rx"^[a-zA-Z_:0-9&.-]*" in)))])
- (string->symbol
- ;; Common case: string is already lowercased
- (if (regexp-match-positions #rx"[A-Z]" s)
- (begin
- (string-lowercase! s)
- s)
- s))))
- ;; lex-name/case-sensitive : Input-port -> Symbol
- (define (lex-name/case-sensitive in)
- (let ([s (bytes->string/utf-8 (car (regexp-match #rx"^[a-zA-Z_:0-9&.-]*" in)))])
- (string->symbol s)))
- #|
+
+
+;; lex-name : Input-port -> Symbol
+(define (lex-name in)
+ (let ([s (bytes->string/utf-8 (car (regexp-match #rx"^[a-zA-Z_:0-9&.-]*" in)))])
+ (string->symbol
+ ;; Common case: string is already lowercased
+ (if (regexp-match-positions #rx"[A-Z]" s)
+ (begin
+ (string-lowercase! s)
+ s)
+ s))))
+;; lex-name/case-sensitive : Input-port -> Symbol
+(define (lex-name/case-sensitive in)
+ (let ([s (bytes->string/utf-8 (car (regexp-match #rx"^[a-zA-Z_:0-9&.-]*" in)))])
+ (string->symbol s)))
+#|
(define (lex-name in)
(string->symbol
(list->string
@@ -330,100 +330,100 @@
(cons (char-downcase (read-char in)) (lex-rest))]
[else null])))))
|#
-
-
- ;; skip-dtd : Input-port -> Void
- (define (skip-dtd in)
- (let skip ()
- (let ([c (read-char in)])
- (if (eof-object? c)
- (void)
- (case c
- [(#\') (read-until #\' in) (skip)]
- [(#\") (read-until #\" in) (skip)]
- [(#\<)
- (case (read-char in)
- [(#\!) (case (read-char in)
- [(#\-) (read-char in) (lex-comment-contents in) (skip)]
- [else (skip) (skip)])]
- [(#\?) (lex-pi-data in) (skip)]
- [else (skip) (skip)])]
- [(#\>) (void)]
- [else (skip)])))))
-
- ;; name-start? : TST -> Bool
- (define (name-start? ch)
- (and (char? ch) (char-name-start? ch)))
-
- ;; char-name-start? : Char -> Bool
- (define (char-name-start? ch)
- (or (char-alphabetic? ch)
- (eq? ch #\_)
- (eq? ch #\:)))
-
- ;; name-char? : TST -> Bool
- (define (name-char? ch)
- (and (char? ch)
- (or (char-name-start? ch)
- (char-numeric? ch)
- (eq? ch #\&) ; ugly illegal junk for SEC's EDGAR database
- (eq? ch #\.)
- (eq? ch #\-))))
-
- ;; read-up-to : (Char -> Bool) Input-port -> (listof Char)
- ;; abstract this with read-until
- (define (read-up-to p? in)
- (let loop ()
- (let ([c (peek-char in)])
- (cond
- [(or (eof-object? c) (p? c)) null]
- [else (cons (read-char in) (loop))]))))
-
- ;; read-until : Char Input-port -> String
- ;; discards the stop character, too
- (define (read-until char in)
- (list->string
- (let read-more ()
- (let ([c (read-char in)])
- (cond
- [(or (eof-object? c) (eq? c char)) null]
- [else (cons c (read-more))])))))
-
- ;; gen-read-until-string : String -> Input-port -> String
- ;; uses Knuth-Morris-Pratt from
- ;; Introduction to Algorithms, Cormen, Leiserson, and Rivest, pages 869-876
- ;; discards stop from input
- (define (gen-read-until-string stop)
- (let* ([len (string-length stop)]
- [prefix (make-vector len 0)]
- [fall-back
- (lambda (k c)
- (let ([k (let loop ([k k])
- (cond
- [(and (> k 0) (not (eq? (string-ref stop k) c)))
- (loop (vector-ref prefix (sub1 k)))]
- [else k]))])
- (if (eq? (string-ref stop k) c)
- (add1 k)
- k)))])
- (let init ([k 0] [q 1])
- (when (< q len)
- (let ([k (fall-back k (string-ref stop q))])
- (vector-set! prefix q k)
- (init k (add1 q)))))
- ;; (vector-ref prefix x) = the longest suffix that matches a prefix of stop
- (lambda (in)
- (list->string
- (let/ec out
- (let loop ([matched 0] [out out])
- (let* ([c (read-char in)]
- [matched (fall-back matched c)])
- (cond
- [(or (eof-object? c) (= matched len)) (out null)]
- [(zero? matched) (cons c (let/ec out (loop matched out)))]
- [else (cons c (loop matched out))]))))))))
-
- ;; "-->" makes more sense, but "--" follows the spec, but this isn't XML anymore.
- (define lex-comment-contents (gen-read-until-string "-->"))
- (define lex-pi-data (gen-read-until-string "?>"))
- (define lex-cdata-contents (gen-read-until-string "]]>"))
+
+
+;; skip-dtd : Input-port -> Void
+(define (skip-dtd in)
+ (let skip ()
+ (let ([c (read-char in)])
+ (if (eof-object? c)
+ (void)
+ (case c
+ [(#\') (read-until #\' in) (skip)]
+ [(#\") (read-until #\" in) (skip)]
+ [(#\<)
+ (case (read-char in)
+ [(#\!) (case (read-char in)
+ [(#\-) (read-char in) (lex-comment-contents in) (skip)]
+ [else (skip) (skip)])]
+ [(#\?) (lex-pi-data in) (skip)]
+ [else (skip) (skip)])]
+ [(#\>) (void)]
+ [else (skip)])))))
+
+;; name-start? : TST -> Bool
+(define (name-start? ch)
+ (and (char? ch) (char-name-start? ch)))
+
+;; char-name-start? : Char -> Bool
+(define (char-name-start? ch)
+ (or (char-alphabetic? ch)
+ (eq? ch #\_)
+ (eq? ch #\:)))
+
+;; name-char? : TST -> Bool
+(define (name-char? ch)
+ (and (char? ch)
+ (or (char-name-start? ch)
+ (char-numeric? ch)
+ (eq? ch #\&) ; ugly illegal junk for SEC's EDGAR database
+ (eq? ch #\.)
+ (eq? ch #\-))))
+
+;; read-up-to : (Char -> Bool) Input-port -> (listof Char)
+;; abstract this with read-until
+(define (read-up-to p? in)
+ (let loop ()
+ (let ([c (peek-char in)])
+ (cond
+ [(or (eof-object? c) (p? c)) null]
+ [else (cons (read-char in) (loop))]))))
+
+;; read-until : Char Input-port -> String
+;; discards the stop character, too
+(define (read-until char in)
+ (list->string
+ (let read-more ()
+ (let ([c (read-char in)])
+ (cond
+ [(or (eof-object? c) (eq? c char)) null]
+ [else (cons c (read-more))])))))
+
+;; gen-read-until-string : String -> Input-port -> String
+;; uses Knuth-Morris-Pratt from
+;; Introduction to Algorithms, Cormen, Leiserson, and Rivest, pages 869-876
+;; discards stop from input
+(define (gen-read-until-string stop)
+ (let* ([len (string-length stop)]
+ [prefix (make-vector len 0)]
+ [fall-back
+ (lambda (k c)
+ (let ([k (let loop ([k k])
+ (cond
+ [(and (> k 0) (not (eq? (string-ref stop k) c)))
+ (loop (vector-ref prefix (sub1 k)))]
+ [else k]))])
+ (if (eq? (string-ref stop k) c)
+ (add1 k)
+ k)))])
+ (let init ([k 0] [q 1])
+ (when (< q len)
+ (let ([k (fall-back k (string-ref stop q))])
+ (vector-set! prefix q k)
+ (init k (add1 q)))))
+ ;; (vector-ref prefix x) = the longest suffix that matches a prefix of stop
+ (lambda (in)
+ (list->string
+ (let/ec out
+ (let loop ([matched 0] [out out])
+ (let* ([c (read-char in)]
+ [matched (fall-back matched c)])
+ (cond
+ [(or (eof-object? c) (= matched len)) (out null)]
+ [(zero? matched) (cons c (let/ec out (loop matched out)))]
+ [else (cons c (loop matched out))]))))))))
+
+;; "-->" makes more sense, but "--" follows the spec, but this isn't XML anymore.
+(define lex-comment-contents (gen-read-until-string "-->"))
+(define lex-pi-data (gen-read-until-string "?>"))
+(define lex-cdata-contents (gen-read-until-string "]]>"))