racket/collects/html/html.rkt

620 lines
17 KiB
Racket

#lang racket
;; copyright by Paul Graunke June 2000 AD
(require "html-structs.rkt"
"html-spec.rkt"
"sgml-reader.rkt"
xml)
(provide (all-from-out "html-structs.rkt")
read-html-comments)
(provide/contract
[use-html-spec (parameter/c boolean?)]
[read-html (() (input-port?) . ->* . html?)]
[read-xhtml (() (input-port?) . ->* . html?)]
[read-html-as-xml (() (input-port?) . ->* . (listof content/c))])
;; xml-single-content->html : Content (listof Html-content) -> (listof Html-content)
(define (xml-single-content->html x acc)
(cond
((element? x)
(case (element-name x)
((basefont) (cons (make-basefont (element-attributes x)) acc))
((br) (cons (make-br (element-attributes x)) acc))
((area) (cons (make-area (element-attributes x)) acc))
((link) (cons (make-alink (element-attributes x)) acc))
((img) (cons (make-img (element-attributes x)) acc))
((param) (cons (make-param (element-attributes x)) acc))
((hr) (cons (make-hr (element-attributes x)) acc))
((input) (cons (make-input (element-attributes x)) acc))
((col) (cons (make-col (element-attributes x)) acc))
((isindex) (cons (make-isindex (element-attributes x)) acc))
((base) (cons (make-base (element-attributes x)) acc))
((meta) (cons (make-meta (element-attributes x)) acc))
((mzscheme)
(cons
(make-mzscheme
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((html)
(cons
(make-html
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((div)
(cons
(make-div
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((center)
(cons
(make-center
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((blockquote)
(cons
(make-blockquote
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((ins)
(cons
(make-ins
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((del)
(cons
(make-del
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((dd)
(cons
(make-dd
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((li)
(cons
(make-li
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((th)
(cons
(make-th
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((td)
(cons
(make-td
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((iframe)
(cons
(make-iframe
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((noframes)
(cons
(make-noframes
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((noscript)
(cons
(make-noscript
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((style)
(cons
(make-style
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((script)
(cons
(make-script
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((option)
(cons
(make-option
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((textarea)
(cons
(make-textarea
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((title)
(cons
(make-title
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((head)
(cons
(make-head
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((tr)
(cons
(make-tr
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((colgroup)
(cons
(make-colgroup
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((thead)
(cons
(make-thead
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((tfoot)
(cons
(make-tfoot
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((tbody)
(cons
(make-tbody
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((tt)
(cons
(make-tt
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((i)
(cons
(make-i
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((b)
(cons
(make-b
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((u)
(cons
(make-u
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((s)
(cons
(make-s
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((strike)
(cons
(make-strike
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((big)
(cons
(make-big
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((small)
(cons
(make-small
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((em)
(cons
(make-em
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((strong)
(cons
(make-strong
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((dfn)
(cons
(make-dfn
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((code)
(cons
(make-code
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((samp)
(cons
(make-samp
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((kbd)
(cons
(make-kbd
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((var)
(cons
(make-var
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((cite)
(cons
(make-cite
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((abbr)
(cons
(make-abbr
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((acronym)
(cons
(make-acronym
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((sub)
(cons
(make-sub
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((sup)
(cons
(make-sup
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((span)
(cons
(make-span
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((bdo)
(cons
(make-bdo
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((font)
(cons
(make-font
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((p)
(cons
(make-p
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((h1)
(cons
(make-h1
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((h2)
(cons
(make-h2
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((h3)
(cons
(make-h3
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((h4)
(cons
(make-h4
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((h5)
(cons
(make-h5
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((h6)
(cons
(make-h6
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((q)
(cons
(make-q
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((dt)
(cons
(make-dt
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((legend)
(cons
(make-legend
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((caption)
(cons
(make-caption
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((table)
(cons
(make-table
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((button)
(cons
(make-button
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((fieldset)
(cons
(make-fieldset
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((optgroup)
(cons
(make-optgroup
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((select)
(cons
(make-select
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((label)
(cons
(make-label
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((form)
(cons
(make-form
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((ol)
(cons
(make-ol
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((ul)
(cons
(make-ul
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((dir)
(cons
(make-dir
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((menu)
(cons
(make-menu
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((dl)
(cons
(make-dl
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((pre)
(cons
(make-pre
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((object)
(cons
(make-object (element-attributes x)
(xml-contents->html (element-content x)))
acc))
((applet)
(cons
(make-applet
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((map)
(cons
(make--map
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((a)
(cons
(make-a
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((address)
(cons
(make-address
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
((body)
(cons
(make-body
(element-attributes x)
(xml-contents->html (element-content x)))
acc))
(else acc)))
((or (pcdata? x) (entity? x)) (cons x acc))
(else acc)))
;; xml->html : Document -> Html
(define (xml->html doc)
(let ([root (document-element doc)])
(unless (eq? 'html (element-name root))
(error 'xml->html "This is not an html document. Expected 'html, given ~a" (element-name root)))
(make-html (element-attributes root) (xml-contents->html (element-content root)))))
;; xml-content->html : (listof Content) -> (listof Html-element)
(define (xml-contents->html contents)
(foldr xml-single-content->html
null
contents))
;; read-xhtml : [Input-port] -> Html
(define read-xhtml (compose xml->html read-xml))
;; peel-f : (Html-content -> Bool) (listof Html-content) (listof Html-content) -> (listof Html-content)
(define (peel-f toss? to-toss acc0)
(foldr (lambda (x acc)
(if (toss? x)
(append (html-full-content x) acc)
(cons x acc)))
acc0
to-toss))
;; repackage-html : (listof Html-content) -> Html
(define (repackage-html contents)
(let* ([html (memf html? contents)]
[peeled (peel-f html? contents null)]
[body (memf body? peeled)])
(make-html (if html
(html-element-attributes (car html))
null)
(append (filter head? peeled)
(list (make-body (if body
(html-element-attributes (car body))
null)
(filter (compose not head?) (peel-f body? peeled null))))))))
;; clean-up-pcdata : (listof Content) -> (listof Content)
;; Each pcdata inside a tag that isn't supposed to contain pcdata is either
;; a) appended to the end of the previous subelement, if that subelement may contain pcdata
;; b) prepended to the front of the next subelement, if that subelement may contain pcdata
;; c) discarded
;; unknown tags may contain pcdata
;; the top level may contain pcdata
(define clean-up-pcdata
;; clean-up-pcdata : (listof Content) -> (listof Content)
(letrec ([clean-up-pcdata
(lambda (content)
(map (lambda (to-fix)
(cond
[(element? to-fix)
(recontent-xml to-fix
(let ([possible (may-contain (element-name to-fix))]
[content (element-content to-fix)])
(if (or (not possible) (memq 'pcdata possible))
(clean-up-pcdata content)
(eliminate-pcdata content))))]
[else to-fix]))
content))]
[eliminate-pcdata
;: (listof Content) -> (listof Content)
(lambda (content)
(let ([non-elements (first-non-elements content)]
[more (memf element? content)])
(if more
(let* ([el (car more)]
[possible (may-contain (element-name el))])
(if (or (not possible) (memq 'pcdata possible))
(cons (recontent-xml el (append non-elements (clean-up-pcdata (element-content el)) (eliminate-pcdata (first-non-elements (cdr more)))))
(or (memf element? (cdr more)) null))
(cons (recontent-xml el (eliminate-pcdata (element-content el)))
(eliminate-pcdata (cdr more)))))
null)))])
clean-up-pcdata))
;; first-non-elements : (listof Content) -> (listof Content)
(define (first-non-elements content)
(cond
[(null? content) null]
[else (if (element? (car content))
null
(cons (car content) (first-non-elements (cdr content))))]))
;; recontent-xml : Element (listof Content) -> Element
(define (recontent-xml e c)
(make-element (source-start e) (source-stop e) (element-name e) (element-attributes e) c))
;; implicit-starts : Symbol Symbol -> (U #f Symbol)
(define (implicit-starts parent child)
(or (and (eq? child 'tr) (eq? parent 'table) 'tbody)
(and (eq? child 'td) (memq parent '(table tbody tfoot thead)) 'tr)))
;; may-contain : Kid-lister
(define may-contain
(gen-may-contain html-spec))
(define may-contain-anything
(gen-may-contain null))
(define use-html-spec (make-parameter #t))
;; read-html-as-xml : [Input-port] -> (listof Content)
(define (read-html-as-xml [port (current-input-port)])
((if (use-html-spec) clean-up-pcdata values)
((gen-read-sgml (if (use-html-spec)
may-contain
may-contain-anything)
implicit-starts) port)))
;; read-html : [Input-port] -> Html
(define read-html
(compose repackage-html xml-contents->html read-html-as-xml))