racket/collects/html/html-unit.ss
Eli Barzilay 7d50e61c7f * Newlines at EOFs
* Another big chunk of v4-require-isms
* Allow `#lang framework/keybinding-lang' for keybinding files
* Move hierlist sources into "mrlib/hierlist", leave stub behind

svn: r10689
2008-07-09 07:18:06 +00:00

142 lines
4.8 KiB
Scheme

;; copyright by Paul Graunke June 2000 AD
(module html-unit mzscheme
(require mzlib/unitsig
mzlib/file
mzlib/list
mzlib/etc
mzlib/include
"html-sig.ss"
"sgml-reader-sig.ss"
xml/xml-sig)
(provide html@)
(define html@
(unit/sig html^
(import xml^ (sgml : sgml-reader^))
;; Html-content = Html-element | Pc-data | Entity
(include "html-structs.ss")
(include "case.ss")
;; xml->html : Document -> Html
(define (xml->html doc)
(let ([root (document-element doc)])
(unless (eq? 'html (element-name root))
(error 'xml->html "This is not an html document. Expected 'html, given ~a" (element-name root)))
(make-html (element-attributes root) (xml-contents->html (element-content root)))))
;; xml-content->html : (listof Content) -> (listof Html-element)
(define (xml-contents->html contents)
(foldr xml-single-content->html
null
contents))
;; read-xhtml : [Input-port] -> Html
(define read-xhtml (compose xml->html read-xml))
;; peel-f : (Html-content -> Bool) (listof Html-content) (listof Html-content) -> (listof Html-content)
(define (peel-f toss? to-toss acc0)
(foldr (lambda (x acc)
(if (toss? x)
(append (html-full-content x) acc)
(cons x acc)))
acc0
to-toss))
;; repackage-html : (listof Html-content) -> Html
(define (repackage-html contents)
(let* ([html (memf html? contents)]
[peeled (peel-f html? contents null)]
[body (memf body? peeled)])
(make-html (if html
(html-element-attributes (car html))
null)
(append (filter head? peeled)
(list (make-body (if body
(html-element-attributes (car body))
null)
(filter (compose not head?) (peel-f body? peeled null))))))))
;; clean-up-pcdata : (listof Content) -> (listof Content)
;; Each pcdata inside a tag that isn't supposed to contain pcdata is either
;; a) appended to the end of the previous subelement, if that subelement may contain pcdata
;; b) prepended to the front of the next subelement, if that subelement may contain pcdata
;; c) discarded
;; unknown tags may contain pcdata
;; the top level may contain pcdata
(define clean-up-pcdata
;; clean-up-pcdata : (listof Content) -> (listof Content)
(letrec ([clean-up-pcdata
(lambda (content)
(map (lambda (to-fix)
(cond
[(element? to-fix)
(recontent-xml to-fix
(let ([possible (may-contain (element-name to-fix))]
[content (element-content to-fix)])
(if (or (not possible) (memq 'pcdata possible))
(clean-up-pcdata content)
(eliminate-pcdata content))))]
[else to-fix]))
content))]
[eliminate-pcdata
;: (listof Content) -> (listof Content)
(lambda (content)
(let ([non-elements (first-non-elements content)]
[more (memf element? content)])
(if more
(let* ([el (car more)]
[possible (may-contain (element-name el))])
(if (or (not possible) (memq 'pcdata possible))
(cons (recontent-xml el (append non-elements (clean-up-pcdata (element-content el)) (eliminate-pcdata (first-non-elements (cdr more)))))
(or (memf element? (cdr more)) null))
(cons (recontent-xml el (eliminate-pcdata (element-content el)))
(eliminate-pcdata (cdr more)))))
null)))])
clean-up-pcdata))
;; first-non-elements : (listof Content) -> (listof Content)
(define (first-non-elements content)
(cond
[(null? content) null]
[else (if (element? (car content))
null
(cons (car content) (first-non-elements (cdr content))))]))
;; recontent-xml : Element (listof Content) -> Element
(define (recontent-xml e c)
(make-element (source-start e) (source-stop e) (element-name e) (element-attributes e) c))
;; implicit-starts : Symbol Symbol -> (U #f Symbol)
(define (implicit-starts parent child)
(or (and (eq? child 'tr) (eq? parent 'table) 'tbody)
(and (eq? child 'td) (memq parent '(table tbody tfoot thead)) 'tr)))
;; may-contain : Kid-lister
(define may-contain
(sgml:gen-may-contain (call-with-input-file (find-library "html-spec" "html") read)))
(define may-contain-anything
(sgml:gen-may-contain null))
(define use-html-spec (make-parameter #t))
;; read-html-as-xml : [Input-port] -> (listof Content)
(define read-html-as-xml
(case-lambda
[(port)
((if (use-html-spec) clean-up-pcdata values)
((sgml:gen-read-sgml (if (use-html-spec)
may-contain
may-contain-anything)
implicit-starts) port))]
[() (read-html-as-xml (current-input-port))]))
;; read-html : [Input-port] -> Html
(define read-html
(compose repackage-html xml-contents->html read-html-as-xml)))))