racket/collects/html/dtd.ss
2005-05-27 18:56:37 +00:00

205 lines
8.3 KiB
Scheme

; copyright by Paul Graunke June 2000 AD
(module dtd mzscheme
(require "dtd-ast.ss" "entity-expander.ss"
(lib "list.ss")
(lib "string.ss"))
(provide read-sgml-dtd summarize-dtd)
; Note: this library only supports a the subset of SGML used for HTML as descibed in
; http://www.w3.org/TR/html401/intro/sgmltut.html
; Spec = (listof (cons (listof Symbol) (listof Symbol)))
; read-sgml-dtd : Input-port -> Dtd
(define (read-sgml-dtd in)
(parameterize ([read-case-sensitive #t])
(let ([in (filter-comments in)])
(let read-items ()
(let skip-entity-refs ()
(skip-whitespace in)
(when (eq? (peek-char in) #\%)
(skip-until (lambda (c) (eq? c #\;)) in)
(skip-entity-refs)))
(cond
[(eof-object? (peek-char in)) null]
[else
(unless (and (eq? (read-char in) #\<)
(eq? (read-char in) #\!))
(error 'read-sgml-dtd "unknown junk in dtd at ~a" (file-position in)))
(case (peek-char in)
[(#\>) (read-char in) (read-items)]
[(#\[) (read-char in)
(skip-whitespace in)
(let ([uh (read-id in)])
(skip-until (lambda (c) (eq? c #\[)) in)
(cons (begin0 (make-thingy uh (read-until (lambda (c) (eq? c #\])) in))
(unless (and (eq? (read-char in) #\]) (eq? (read-char in) #\>))
(error 'read-sgml-dtd "Invalid <![ uh [ whatever ]]> thingy.")))
(read-items)))]
[else (cons (let ([name (read-id in)])
(case name
[(entity) (skip-until (lambda (c) (eq? c #\%)) in)
(skip-whitespace in)
(begin0 (make-entity-def (read-until char-whitespace? in); more here check case sensitivity ; (read-id in)
(begin (skip-until quote? in) (read-until quote? in)))
(skip-until gt? in))]
[(element)
(make-element-def (read-names in)
(read-required/optional in)
(read-required/optional in)
(read-until gt? in))]
[(attlist)
(make-att-list (read-names in) (read-until gt? in))]
[else (error 'read-sgml-dtd "Unknown name: <!~s ...>" name)]))
(read-items))])])))))
; quote? : Char -> Bool
(define (quote? c) (eq? c #\"))
; gt? : Char -> Bool
(define (gt? c) (eq? c #\>))
; read-names : Input-port -> (listof Symbol)
(define (read-names in)
(skip-whitespace in)
(case (peek-char in)
[(#\() (read-char in)
(skip-whitespace in)
(cons (read-id in)
(let loop ()
(skip-whitespace in)
(case (read-char in)
[(#\| #\,) (skip-whitespace in) (cons (read-id in) (loop))]
[(#\)) null]
[else (error 'read-names "unexpected character in element names '~a'" (read-line in))])))]
[else (list (read-id in))]))
; read-required/optional : Input-port -> Bool
(define (read-required/optional in)
(skip-whitespace in)
(eq? (char-downcase (read-char in)) #\o))
; read-id : Input-port -> Symbol
(define (read-id in)
(string->symbol
(list->string
(let loop ()
(let ([c (peek-char in)])
(if (and (not (eof-object? c))
(or (char-id? c)
; this is a hack - allowing % and ; characters accepts parameter entities.
(memq c '(#\% #\;))))
(cons (char-downcase c) (begin (read-char in) (loop)))
null))))))
; char-id? : Char -> Bool
; Note: the & is not legal, really, but SEC's Edgar format uses it. Yuck.
; Nope. & messes up SGML dtds - remove it.
(define (char-id? c)
(or (char-alphabetic? c) (char-numeric? c) (memq c '(#\_ #\- #\: #\.))))
; skip-whitespace : Input-port -> Void
(define (skip-whitespace in) (skip-past char-whitespace? in))
; skip-past : (Char -> Bool) Input-port -> Void
(define (skip-past skip? in)
(let loop ()
(when (let ([c (peek-char in)])
(and (not (eof-object? c)) (skip? c)))
(read-char in)
(loop))))
; skip-until : (Char -> Bool) Input-port -> Void
(define (skip-until delimiter? in)
(let loop ()
(unless (delimiter? (read-char in))
(loop))))
; read-until : (Char -> Bool) Input-port -> String
; skips delimiter
(define (read-until delimiter? in)
(list->string
(let loop ()
(let ([c (read-char in)])
(if (delimiter? c)
null
(cons c (loop)))))))
; filter-comments : Input-port -> Input-port
; Note: <!-- blah --> comments come out <!>, which need to be removed later.
(define (filter-comments in)
(make-input-port
(lambda ()
(let ([char (read-char in)])
(if (and (eq? char #\-) (eq? (peek-char in) #\-))
(let loop ()
(if (and (eq? (read-char in) #\-) (eq? (read-char in) #\-))
(read-char in)
(loop)))
char)))
(lambda ()
; more here - this is broken if the next char is #\- and starts a comment and more chars
; aren't ready after the comment.
(char-ready? in))
(lambda () (close-input-port in))))
; summarize-dtd : Dtd -> Spec
(define (summarize-dtd dtd)
(let ([expander
(foldr (lambda (x acc)
(extend-entity-expander (entity-def-name x) (entity-def-value x) acc))
empty-entity-expander
(filter entity-def? dtd))]
[hack-content ;: String -> (listof Symbol)
(lambda (content)
(let* ([stripped (filter (lambda (x) (not (char-whitespace? x))) (string->list content))]
[cludged (map (lambda (c)
(cond
[(char-id? c) c]
[else #\space]))
stripped)]
[symbols (read-from-string-all (list->string cludged))]
[nix (memq '- symbols)])
(cond
[nix (filter (lambda (s) (not (memq s nix))) symbols)]
[(and (pair? symbols) (null? (cdr symbols)) (eq? (car symbols) 'empty)) null]
[else symbols])))])
(merge-contents
(map (lambda (x)
(cons (foldr (lambda (s acc)
(let ([in (open-input-string
(format "(~a)" (expand-entities expander (symbol->string s))))])
(append (read-names in) acc)))
null
(element-def-name x))
(quicksort (hack-content (expand-entities expander (element-def-content x)))
(lambda (a b)
(string<=? (symbol->string a)
(symbol->string b))))))
(filter element-def? dtd)))))
; merge-contents : Spec -> Spec
(define merge-contents
(letrec ([comb
(lambda (x rst)
(cond
[(null? rst) (list x)]
[else (cond
[(equal? (cdr x) (cdar rst))
(cons (cons (append (car x) (caar rst))
(cdr x))
(cdr rst))]
[else (cons (car rst) (comb x (cdr rst)))])]))])
(lambda (lst)
(cond
[(null? lst) null]
[else (comb (car lst)
(merge-contents (cdr lst)))])))))
; loosedtd.txt is from www.w3c.org's html4.0.1 spec.
;(define dtd (call-with-input-file "/home/ptg/world/junk/loosedtd.txt" read-sgml-dtd))
;(call-with-output-file "html-spec"
; (lambda (out) (write (cons '((mzscheme) pcdata) (summarize-dtd dtd)) out))
; 'truncate)