142 lines
4.9 KiB
Scheme
142 lines
4.9 KiB
Scheme
;; copyright by Paul Graunke June 2000 AD
|
|
|
|
(module html-unit mzscheme
|
|
(require (lib "unitsig.ss")
|
|
(lib "file.ss")
|
|
(lib "list.ss")
|
|
(lib "etc.ss")
|
|
(lib "include.ss")
|
|
"html-sig.ss"
|
|
"sgml-reader-sig.ss"
|
|
(lib "xml-sig.ss" "xml"))
|
|
|
|
(provide html@)
|
|
|
|
(define html@
|
|
(unit/sig html^
|
|
(import xml^ (sgml : sgml-reader^))
|
|
|
|
;; Html-content = Html-element | Pc-data | Entity
|
|
|
|
(include "html-structs.ss")
|
|
(include "case.ss")
|
|
|
|
;; xml->html : Document -> Html
|
|
(define (xml->html doc)
|
|
(let ([root (document-element doc)])
|
|
(unless (eq? 'html (element-name root))
|
|
(error 'xml->html "This is not an html document. Expected 'html, given ~a" (element-name root)))
|
|
(make-html (element-attributes root) (xml-contents->html (element-content root)))))
|
|
|
|
|
|
;; xml-content->html : (listof Content) -> (listof Html-element)
|
|
(define (xml-contents->html contents)
|
|
(foldr xml-single-content->html
|
|
null
|
|
contents))
|
|
|
|
;; read-xhtml : [Input-port] -> Html
|
|
(define read-xhtml (compose xml->html read-xml))
|
|
|
|
;; peel-f : (Html-content -> Bool) (listof Html-content) (listof Html-content) -> (listof Html-content)
|
|
(define (peel-f toss? to-toss acc0)
|
|
(foldr (lambda (x acc)
|
|
(if (toss? x)
|
|
(append (html-full-content x) acc)
|
|
(cons x acc)))
|
|
acc0
|
|
to-toss))
|
|
|
|
;; repackage-html : (listof Html-content) -> Html
|
|
(define (repackage-html contents)
|
|
(let* ([html (memf html? contents)]
|
|
[peeled (peel-f html? contents null)]
|
|
[body (memf body? peeled)])
|
|
(make-html (if html
|
|
(html-element-attributes (car html))
|
|
null)
|
|
(append (filter head? peeled)
|
|
(list (make-body (if body
|
|
(html-element-attributes (car body))
|
|
null)
|
|
(filter (compose not head?) (peel-f body? peeled null))))))))
|
|
|
|
;; clean-up-pcdata : (listof Content) -> (listof Content)
|
|
;; Each pcdata inside a tag that isn't supposed to contain pcdata is either
|
|
;; a) appended to the end of the previous subelement, if that subelement may contain pcdata
|
|
;; b) prepended to the front of the next subelement, if that subelement may contain pcdata
|
|
;; c) discarded
|
|
;; unknown tags may contain pcdata
|
|
;; the top level may contain pcdata
|
|
(define clean-up-pcdata
|
|
;; clean-up-pcdata : (listof Content) -> (listof Content)
|
|
(letrec ([clean-up-pcdata
|
|
(lambda (content)
|
|
(map (lambda (to-fix)
|
|
(cond
|
|
[(element? to-fix)
|
|
(recontent-xml to-fix
|
|
(let ([possible (may-contain (element-name to-fix))]
|
|
[content (element-content to-fix)])
|
|
(if (or (not possible) (memq 'pcdata possible))
|
|
(clean-up-pcdata content)
|
|
(eliminate-pcdata content))))]
|
|
[else to-fix]))
|
|
content))]
|
|
[eliminate-pcdata
|
|
;: (listof Content) -> (listof Content)
|
|
(lambda (content)
|
|
(let ([non-elements (first-non-elements content)]
|
|
[more (memf element? content)])
|
|
(if more
|
|
(let* ([el (car more)]
|
|
[possible (may-contain (element-name el))])
|
|
(if (or (not possible) (memq 'pcdata possible))
|
|
(cons (recontent-xml el (append non-elements (clean-up-pcdata (element-content el)) (eliminate-pcdata (first-non-elements (cdr more)))))
|
|
(or (memf element? (cdr more)) null))
|
|
(cons (recontent-xml el (eliminate-pcdata (element-content el)))
|
|
(eliminate-pcdata (cdr more)))))
|
|
null)))])
|
|
clean-up-pcdata))
|
|
|
|
;; first-non-elements : (listof Content) -> (listof Content)
|
|
(define (first-non-elements content)
|
|
(cond
|
|
[(null? content) null]
|
|
[else (if (element? (car content))
|
|
null
|
|
(cons (car content) (first-non-elements (cdr content))))]))
|
|
|
|
;; recontent-xml : Element (listof Content) -> Element
|
|
(define (recontent-xml e c)
|
|
(make-element (source-start e) (source-stop e) (element-name e) (element-attributes e) c))
|
|
|
|
;; implicit-starts : Symbol Symbol -> (U #f Symbol)
|
|
(define (implicit-starts parent child)
|
|
(or (and (eq? child 'tr) (eq? parent 'table) 'tbody)
|
|
(and (eq? child 'td) (memq parent '(table tbody tfoot thead)) 'tr)))
|
|
|
|
;; may-contain : Kid-lister
|
|
(define may-contain
|
|
(sgml:gen-may-contain (call-with-input-file (find-library "html-spec" "html") read)))
|
|
|
|
(define may-contain-anything
|
|
(sgml:gen-may-contain null))
|
|
|
|
(define use-html-spec (make-parameter #t))
|
|
|
|
;; read-html-as-xml : [Input-port] -> (listof Content)
|
|
(define read-html-as-xml
|
|
(case-lambda
|
|
[(port)
|
|
((if (use-html-spec) clean-up-pcdata values)
|
|
((sgml:gen-read-sgml (if (use-html-spec)
|
|
may-contain
|
|
may-contain-anything)
|
|
implicit-starts) port))]
|
|
[() (read-html-as-xml (current-input-port))]))
|
|
|
|
;; read-html : [Input-port] -> Html
|
|
(define read-html
|
|
(compose repackage-html xml-contents->html read-html-as-xml)))))
|