620 lines
17 KiB
Racket
620 lines
17 KiB
Racket
#lang racket
|
|
;; copyright by Paul Graunke June 2000 AD
|
|
|
|
(require "html-structs.rkt"
|
|
"html-spec.rkt"
|
|
"sgml-reader.rkt"
|
|
xml)
|
|
|
|
(provide (all-from-out "html-structs.rkt")
|
|
read-html-comments)
|
|
(provide/contract
|
|
[use-html-spec (parameter/c boolean?)]
|
|
[read-html (() (input-port?) . ->* . html?)]
|
|
[read-xhtml (() (input-port?) . ->* . html?)]
|
|
[read-html-as-xml (() (input-port?) . ->* . (listof content/c))])
|
|
|
|
;; xml-single-content->html : Content (listof Html-content) -> (listof Html-content)
|
|
(define (xml-single-content->html x acc)
|
|
(cond
|
|
((element? x)
|
|
(case (element-name x)
|
|
((basefont) (cons (make-basefont (element-attributes x)) acc))
|
|
((br) (cons (make-br (element-attributes x)) acc))
|
|
((area) (cons (make-area (element-attributes x)) acc))
|
|
((link) (cons (make-alink (element-attributes x)) acc))
|
|
((img) (cons (make-img (element-attributes x)) acc))
|
|
((param) (cons (make-param (element-attributes x)) acc))
|
|
((hr) (cons (make-hr (element-attributes x)) acc))
|
|
((input) (cons (make-input (element-attributes x)) acc))
|
|
((col) (cons (make-col (element-attributes x)) acc))
|
|
((isindex) (cons (make-isindex (element-attributes x)) acc))
|
|
((base) (cons (make-base (element-attributes x)) acc))
|
|
((meta) (cons (make-meta (element-attributes x)) acc))
|
|
((mzscheme)
|
|
(cons
|
|
(make-mzscheme
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((html)
|
|
(cons
|
|
(make-html
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((div)
|
|
(cons
|
|
(make-div
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((center)
|
|
(cons
|
|
(make-center
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((blockquote)
|
|
(cons
|
|
(make-blockquote
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((ins)
|
|
(cons
|
|
(make-ins
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((del)
|
|
(cons
|
|
(make-del
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((dd)
|
|
(cons
|
|
(make-dd
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((li)
|
|
(cons
|
|
(make-li
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((th)
|
|
(cons
|
|
(make-th
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((td)
|
|
(cons
|
|
(make-td
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((iframe)
|
|
(cons
|
|
(make-iframe
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((noframes)
|
|
(cons
|
|
(make-noframes
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((noscript)
|
|
(cons
|
|
(make-noscript
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((style)
|
|
(cons
|
|
(make-style
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((script)
|
|
(cons
|
|
(make-script
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((option)
|
|
(cons
|
|
(make-option
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((textarea)
|
|
(cons
|
|
(make-textarea
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((title)
|
|
(cons
|
|
(make-title
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((head)
|
|
(cons
|
|
(make-head
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((tr)
|
|
(cons
|
|
(make-tr
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((colgroup)
|
|
(cons
|
|
(make-colgroup
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((thead)
|
|
(cons
|
|
(make-thead
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((tfoot)
|
|
(cons
|
|
(make-tfoot
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((tbody)
|
|
(cons
|
|
(make-tbody
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((tt)
|
|
(cons
|
|
(make-tt
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((i)
|
|
(cons
|
|
(make-i
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((b)
|
|
(cons
|
|
(make-b
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((u)
|
|
(cons
|
|
(make-u
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((s)
|
|
(cons
|
|
(make-s
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((strike)
|
|
(cons
|
|
(make-strike
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((big)
|
|
(cons
|
|
(make-big
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((small)
|
|
(cons
|
|
(make-small
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((em)
|
|
(cons
|
|
(make-em
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((strong)
|
|
(cons
|
|
(make-strong
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((dfn)
|
|
(cons
|
|
(make-dfn
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((code)
|
|
(cons
|
|
(make-code
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((samp)
|
|
(cons
|
|
(make-samp
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((kbd)
|
|
(cons
|
|
(make-kbd
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((var)
|
|
(cons
|
|
(make-var
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((cite)
|
|
(cons
|
|
(make-cite
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((abbr)
|
|
(cons
|
|
(make-abbr
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((acronym)
|
|
(cons
|
|
(make-acronym
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((sub)
|
|
(cons
|
|
(make-sub
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((sup)
|
|
(cons
|
|
(make-sup
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((span)
|
|
(cons
|
|
(make-span
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((bdo)
|
|
(cons
|
|
(make-bdo
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((font)
|
|
(cons
|
|
(make-font
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((p)
|
|
(cons
|
|
(make-p
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((h1)
|
|
(cons
|
|
(make-h1
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((h2)
|
|
(cons
|
|
(make-h2
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((h3)
|
|
(cons
|
|
(make-h3
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((h4)
|
|
(cons
|
|
(make-h4
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((h5)
|
|
(cons
|
|
(make-h5
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((h6)
|
|
(cons
|
|
(make-h6
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((q)
|
|
(cons
|
|
(make-q
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((dt)
|
|
(cons
|
|
(make-dt
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((legend)
|
|
(cons
|
|
(make-legend
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((caption)
|
|
(cons
|
|
(make-caption
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((table)
|
|
(cons
|
|
(make-table
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((button)
|
|
(cons
|
|
(make-button
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((fieldset)
|
|
(cons
|
|
(make-fieldset
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((optgroup)
|
|
(cons
|
|
(make-optgroup
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((select)
|
|
(cons
|
|
(make-select
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((label)
|
|
(cons
|
|
(make-label
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((form)
|
|
(cons
|
|
(make-form
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((ol)
|
|
(cons
|
|
(make-ol
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((ul)
|
|
(cons
|
|
(make-ul
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((dir)
|
|
(cons
|
|
(make-dir
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((menu)
|
|
(cons
|
|
(make-menu
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((dl)
|
|
(cons
|
|
(make-dl
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((pre)
|
|
(cons
|
|
(make-pre
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((object)
|
|
(cons
|
|
(make-object (element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((applet)
|
|
(cons
|
|
(make-applet
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((map)
|
|
(cons
|
|
(make--map
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((a)
|
|
(cons
|
|
(make-a
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((address)
|
|
(cons
|
|
(make-address
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
((body)
|
|
(cons
|
|
(make-body
|
|
(element-attributes x)
|
|
(xml-contents->html (element-content x)))
|
|
acc))
|
|
(else acc)))
|
|
((or (pcdata? x) (entity? x)) (cons x acc))
|
|
(else acc)))
|
|
|
|
;; xml->html : Document -> Html
|
|
(define (xml->html doc)
|
|
(let ([root (document-element doc)])
|
|
(unless (eq? 'html (element-name root))
|
|
(error 'xml->html "This is not an html document. Expected 'html, given ~a" (element-name root)))
|
|
(make-html (element-attributes root) (xml-contents->html (element-content root)))))
|
|
|
|
|
|
;; xml-content->html : (listof Content) -> (listof Html-element)
|
|
(define (xml-contents->html contents)
|
|
(foldr xml-single-content->html
|
|
null
|
|
contents))
|
|
|
|
;; read-xhtml : [Input-port] -> Html
|
|
(define read-xhtml (compose xml->html read-xml))
|
|
|
|
;; peel-f : (Html-content -> Bool) (listof Html-content) (listof Html-content) -> (listof Html-content)
|
|
(define (peel-f toss? to-toss acc0)
|
|
(foldr (lambda (x acc)
|
|
(if (toss? x)
|
|
(append (html-full-content x) acc)
|
|
(cons x acc)))
|
|
acc0
|
|
to-toss))
|
|
|
|
;; repackage-html : (listof Html-content) -> Html
|
|
(define (repackage-html contents)
|
|
(let* ([html (memf html? contents)]
|
|
[peeled (peel-f html? contents null)]
|
|
[body (memf body? peeled)])
|
|
(make-html (if html
|
|
(html-element-attributes (car html))
|
|
null)
|
|
(append (filter head? peeled)
|
|
(list (make-body (if body
|
|
(html-element-attributes (car body))
|
|
null)
|
|
(filter (compose not head?) (peel-f body? peeled null))))))))
|
|
|
|
;; clean-up-pcdata : (listof Content) -> (listof Content)
|
|
;; Each pcdata inside a tag that isn't supposed to contain pcdata is either
|
|
;; a) appended to the end of the previous subelement, if that subelement may contain pcdata
|
|
;; b) prepended to the front of the next subelement, if that subelement may contain pcdata
|
|
;; c) discarded
|
|
;; unknown tags may contain pcdata
|
|
;; the top level may contain pcdata
|
|
(define clean-up-pcdata
|
|
;; clean-up-pcdata : (listof Content) -> (listof Content)
|
|
(letrec ([clean-up-pcdata
|
|
(lambda (content)
|
|
(map (lambda (to-fix)
|
|
(cond
|
|
[(element? to-fix)
|
|
(recontent-xml to-fix
|
|
(let ([possible (may-contain (element-name to-fix))]
|
|
[content (element-content to-fix)])
|
|
(if (or (not possible) (memq 'pcdata possible))
|
|
(clean-up-pcdata content)
|
|
(eliminate-pcdata content))))]
|
|
[else to-fix]))
|
|
content))]
|
|
[eliminate-pcdata
|
|
;: (listof Content) -> (listof Content)
|
|
(lambda (content)
|
|
(let ([non-elements (first-non-elements content)]
|
|
[more (memf element? content)])
|
|
(if more
|
|
(let* ([el (car more)]
|
|
[possible (may-contain (element-name el))])
|
|
(if (or (not possible) (memq 'pcdata possible))
|
|
(cons (recontent-xml el (append non-elements (clean-up-pcdata (element-content el)) (eliminate-pcdata (first-non-elements (cdr more)))))
|
|
(or (memf element? (cdr more)) null))
|
|
(cons (recontent-xml el (eliminate-pcdata (element-content el)))
|
|
(eliminate-pcdata (cdr more)))))
|
|
null)))])
|
|
clean-up-pcdata))
|
|
|
|
;; first-non-elements : (listof Content) -> (listof Content)
|
|
(define (first-non-elements content)
|
|
(cond
|
|
[(null? content) null]
|
|
[else (if (element? (car content))
|
|
null
|
|
(cons (car content) (first-non-elements (cdr content))))]))
|
|
|
|
;; recontent-xml : Element (listof Content) -> Element
|
|
(define (recontent-xml e c)
|
|
(make-element (source-start e) (source-stop e) (element-name e) (element-attributes e) c))
|
|
|
|
;; implicit-starts : Symbol Symbol -> (U #f Symbol)
|
|
(define (implicit-starts parent child)
|
|
(or (and (eq? child 'tr) (eq? parent 'table) 'tbody)
|
|
(and (eq? child 'td) (memq parent '(table tbody tfoot thead)) 'tr)))
|
|
|
|
;; may-contain : Kid-lister
|
|
(define may-contain
|
|
(gen-may-contain html-spec))
|
|
|
|
(define may-contain-anything
|
|
(gen-may-contain null))
|
|
|
|
(define use-html-spec (make-parameter #t))
|
|
|
|
;; read-html-as-xml : [Input-port] -> (listof Content)
|
|
(define (read-html-as-xml [port (current-input-port)])
|
|
((if (use-html-spec) clean-up-pcdata values)
|
|
((gen-read-sgml (if (use-html-spec)
|
|
may-contain
|
|
may-contain-anything)
|
|
implicit-starts) port)))
|
|
|
|
;; read-html : [Input-port] -> Html
|
|
(define read-html
|
|
(compose repackage-html xml-contents->html read-html-as-xml))
|