
It was very hard to figure out why � is invalid. Both the XML 1.0 and 1.1 specs say that a CharRef can be '&#' [0-9]+ ';' (sec 4.1), but there is a note on the side that it must also be a valid Char. The Char definition (sec 2.2) then lays out which characters are valid. (The 1.0 and 1.1 specs disagree though.) I've gone with the 1.1 definition. I did not update the rest of the reader to disallow those characters in pcdata segments. If this hurts you or you morally disagree, submit another PR, please.
499 lines
19 KiB
Racket
499 lines
19 KiB
Racket
#lang racket
|
|
(require "structures.rkt")
|
|
|
|
(provide/contract
|
|
[read-xml (() (input-port?) . ->* . document?)]
|
|
[read-xml/document (() (input-port?) . ->* . document?)]
|
|
[read-xml/element (() (input-port?) . ->* . element?)]
|
|
[read-comments (parameter/c boolean?)]
|
|
[collapse-whitespace (parameter/c boolean?)]
|
|
[exn:xml? (any/c . -> . boolean?)])
|
|
|
|
;; Start-tag ::= (make-start-tag Location Location Symbol (listof Attribute))
|
|
(define-struct (start-tag source) (name attrs))
|
|
|
|
;; End-tag ::= (make-end-tag Location Location Symbol)
|
|
(define-struct (end-tag source) (name))
|
|
|
|
;; Token ::= Contents | Start-tag | End-tag | Eof
|
|
|
|
(define read-comments (make-parameter #f))
|
|
(define collapse-whitespace (make-parameter #f))
|
|
|
|
;; read-xml : [Input-port] -> Document
|
|
(define read-xml
|
|
(lambda ([in (current-input-port)])
|
|
(let*-values ([(in pos) (positionify in)]
|
|
[(misc0 start) (read-misc in pos)])
|
|
(make-document (make-prolog misc0 #f empty)
|
|
(read-xml-element-helper pos in start)
|
|
(let-values ([(misc1 end-of-file) (read-misc in pos)])
|
|
(unless (EOF? end-of-file)
|
|
(parse-error (list
|
|
(make-srcloc
|
|
(object-name in)
|
|
#f
|
|
#f
|
|
(location-offset (source-start end-of-file))
|
|
(- (location-offset (source-stop end-of-file))
|
|
(location-offset (source-start end-of-file)))))
|
|
"extra stuff at end of document ~e"
|
|
end-of-file))
|
|
misc1)))))
|
|
|
|
;; read-xml : [Input-port] -> Document
|
|
(define (read-xml/document [in (current-input-port)])
|
|
(let*-values ([(in pos) (positionify in)]
|
|
[(misc0 start) (read-misc in pos)])
|
|
(make-document (make-prolog misc0 #f empty)
|
|
(read-xml-element-helper pos in start)
|
|
empty)))
|
|
|
|
;; read-xml/element : [Input-port] -> Element
|
|
(define read-xml/element
|
|
(lambda ([in (current-input-port)])
|
|
(let-values ([(in pos) (positionify in)])
|
|
(skip-space in)
|
|
(read-xml-element-helper pos in (lex in pos)))))
|
|
|
|
;; read-xml-element-helper : Nat Iport Token -> Element
|
|
(define (read-xml-element-helper pos in start)
|
|
(cond
|
|
[(start-tag? start) (read-element start in pos)]
|
|
[(element? start) start]
|
|
[else
|
|
(parse-error
|
|
(list
|
|
(make-srcloc
|
|
(object-name in)
|
|
#f
|
|
#f
|
|
; XXX Some data structures should really be changed to be sources
|
|
(if (source? start)
|
|
(location-offset (source-start start))
|
|
#f)
|
|
(if (source? start)
|
|
(- (location-offset (source-stop start))
|
|
(location-offset (source-start start)))
|
|
#f)))
|
|
"expected root element - received ~e"
|
|
(cond
|
|
[(pcdata? start) (pcdata-string start)]
|
|
[(EOF? start) eof]
|
|
[else start]))]))
|
|
|
|
;; read-misc : Input-port (-> Location) -> (listof Misc) Token
|
|
(define (read-misc in pos)
|
|
(let read-more ()
|
|
(let ([x (lex in pos)])
|
|
(cond
|
|
[(p-i? x)
|
|
(let-values ([(lst next) (read-more)])
|
|
(values (cons x lst) next))]
|
|
[(comment? x)
|
|
(let-values ([(lst next) (read-more)])
|
|
(if (read-comments)
|
|
(values (cons x lst) next)
|
|
(values lst next)))]
|
|
[(and (pcdata? x) (andmap char-whitespace? (string->list (pcdata-string x))))
|
|
(read-more)]
|
|
[else (values null x)]))))
|
|
|
|
;; read-element : Start-tag Input-port (-> Location) -> Element
|
|
(define (read-element start in pos)
|
|
(let ([name (start-tag-name start)]
|
|
[a (source-start start)]
|
|
[b (source-stop start)])
|
|
(let read-content ([k (lambda (body end-loc)
|
|
(make-element
|
|
a end-loc name (start-tag-attrs start)
|
|
body))])
|
|
(let ([x (lex in pos)])
|
|
(cond
|
|
[(EOF? x)
|
|
(parse-error (list
|
|
(make-srcloc
|
|
(object-name in)
|
|
#f
|
|
#f
|
|
(location-offset (source-start start))
|
|
(- (location-offset (source-stop start))
|
|
(location-offset (source-start start)))))
|
|
"unclosed `~a' tag at [~a ~a]"
|
|
name
|
|
(format-source a)
|
|
(format-source b))]
|
|
[(start-tag? x)
|
|
(let ([next-el (read-element x in pos)])
|
|
(read-content (lambda (body end-loc)
|
|
(k (cons next-el body)
|
|
end-loc))))]
|
|
[(end-tag? x)
|
|
(let ([end-loc (source-stop x)])
|
|
(unless (eq? name (end-tag-name x))
|
|
(parse-error
|
|
(list
|
|
(make-srcloc (object-name in)
|
|
#f
|
|
#f
|
|
(location-offset a)
|
|
(- (location-offset b) (location-offset a)))
|
|
(make-srcloc (object-name in)
|
|
#f
|
|
#f
|
|
(location-offset (source-start x))
|
|
(- (location-offset end-loc) (location-offset (source-start x)))))
|
|
"start tag `~a' at [~a ~a] doesn't match end tag `~a' at [~a ~a]"
|
|
name
|
|
(format-source a)
|
|
(format-source b)
|
|
(end-tag-name x)
|
|
(format-source (source-start x))
|
|
(format-source end-loc)))
|
|
(k null end-loc))]
|
|
[(entity? x) (read-content (lambda (body end-loc)
|
|
(k (cons (expand-entity x) body)
|
|
end-loc)))]
|
|
[(comment? x) (if (read-comments)
|
|
(read-content (lambda (body end-loc) (k (cons x body) end-loc)))
|
|
(read-content k))]
|
|
[else (read-content (lambda (body end-loc) (k (cons x body) end-loc)))])))))
|
|
|
|
;; expand-entity : Entity -> (U Entity Pcdata)
|
|
;; more here - allow expansion of user defined entities
|
|
(define (expand-entity x)
|
|
(let ([expanded (default-entity-table (entity-text x))])
|
|
(if expanded
|
|
(make-pcdata (source-start x) (source-stop x) expanded)
|
|
x)))
|
|
|
|
;; default-entity-table : Symbol -> (U #f String)
|
|
(define (default-entity-table name)
|
|
(case name
|
|
[(amp) "&"]
|
|
[(lt) "<"]
|
|
[(gt) ">"]
|
|
[(quot) "\""]
|
|
[(apos) "'"]
|
|
[else #f]))
|
|
|
|
(define-struct (EOF source) ())
|
|
|
|
;; lex : Input-port (-> Location) -> (U Token special)
|
|
(define (lex in pos)
|
|
(let ([c (peek-char-or-special in)])
|
|
(cond
|
|
[(eof-object? c) (read-char in) (EOF (pos) (pos))]
|
|
[(eq? c #\&) (lex-entity in pos)]
|
|
[(eq? c #\<) (lex-tag-cdata-pi-comment in pos)]
|
|
[(not (char? c)) (read-char-or-special in)]
|
|
[else (lex-pcdata in pos)])))
|
|
|
|
; lex-entity : Input-port (-> Location) -> Entity
|
|
; pre: the first char is a #\&
|
|
(define (lex-entity in pos)
|
|
(let ([start (pos)])
|
|
(read-char in)
|
|
(let ([data (case (peek-char in)
|
|
[(#\#)
|
|
(read-char in)
|
|
(let ([n (case (peek-char in)
|
|
[(#\x) (read-char in)
|
|
(string->number (read-until #\; in pos) 16)]
|
|
[else (string->number (read-until #\; in pos))])])
|
|
(unless (number? n)
|
|
(lex-error in pos "malformed numeric entity"))
|
|
(unless (valid-char? n)
|
|
(lex-error in pos "not a well-formed numeric entity (does not match the production for Char, see XML 4.1)"))
|
|
n)]
|
|
[else
|
|
(begin0
|
|
(lex-name in pos)
|
|
(unless (eq? (read-char in) #\;)
|
|
(lex-error in pos "expected ; at the end of an entity")))])])
|
|
(make-entity start (pos) data))))
|
|
|
|
; lex-tag-cdata-pi-comment : Input-port (-> Location) -> Start-tag | Element | End-tag | Cdata | p-i | Comment
|
|
; pre: the first char is a #\<
|
|
(define (lex-tag-cdata-pi-comment in pos)
|
|
(let ([start (pos)])
|
|
(read-char in)
|
|
(case (non-eof peek-char-or-special in pos)
|
|
[(#\!)
|
|
(read-char in)
|
|
(case (non-eof peek-char in pos)
|
|
[(#\-) (read-char in)
|
|
(unless (eq? (read-char-or-special in) #\-)
|
|
(lex-error in pos "expected second - after <!-"))
|
|
(let ([data (lex-comment-contents in pos)])
|
|
(unless (eq? (read-char in) #\>)
|
|
(lex-error in pos "expected > to end comment (\"--\" can't appear in comments)"))
|
|
;(make-comment start (pos) data)
|
|
(make-comment data))]
|
|
[(#\[) (read-char in)
|
|
(unless (string=? (read-string 6 in) "CDATA[")
|
|
(lex-error in pos "expected CDATA following <["))
|
|
(let ([data (lex-cdata-contents in pos)])
|
|
(make-cdata start (pos) (format "<![CDATA[~a]]>" data)))]
|
|
[else (skip-dtd in pos)
|
|
(skip-space in)
|
|
(unless (eq? (peek-char-or-special in) #\<)
|
|
(lex-error in pos "expected p-i, comment, or element after doctype"))
|
|
(lex-tag-cdata-pi-comment in pos)])]
|
|
[(#\?) (read-char in)
|
|
(let ([name (lex-name in pos)])
|
|
(skip-space in)
|
|
(let ([data (lex-pi-data in pos)])
|
|
(make-p-i start (pos) name data)))]
|
|
[(#\/) (read-char in)
|
|
(let ([name (lex-name in pos)])
|
|
(skip-space in)
|
|
(unless (eq? (read-char-or-special in) #\>)
|
|
(lex-error in pos "expected > to close ~a's end tag" name))
|
|
(make-end-tag start (pos) name))]
|
|
[else ; includes 'special, but lex-name will fail in that case
|
|
(let ([name (lex-name in pos)]
|
|
[attrs (lex-attributes in pos)])
|
|
(skip-space in)
|
|
(case (read-char-or-special in)
|
|
[(#\/)
|
|
(unless (eq? (read-char in) #\>)
|
|
(lex-error in pos "expected > to close empty element ~a" name))
|
|
(make-element start (pos) name attrs null)]
|
|
[(#\>) (make-start-tag start (pos) name attrs)]
|
|
[else (lex-error in pos "expected / or > to close tag `~a'" name)]))])))
|
|
|
|
;; lex-attributes : Input-port (-> Location) -> (listof Attribute)
|
|
(define (lex-attributes in pos)
|
|
(sort (let loop ()
|
|
(skip-space in)
|
|
(cond [(name-start? (peek-char-or-special in))
|
|
(cons (lex-attribute in pos) (loop))]
|
|
[else null]))
|
|
(lambda (a b)
|
|
(let ([na (attribute-name a)]
|
|
[nb (attribute-name b)])
|
|
(cond
|
|
[(eq? na nb) (lex-error in pos "duplicated attribute name ~a" na)]
|
|
[else (string<? (symbol->string na) (symbol->string nb))])))))
|
|
|
|
;; lex-attribute : Input-port (-> Location) -> Attribute
|
|
(define (lex-attribute in pos)
|
|
(let ([start (pos)]
|
|
[name (lex-name in pos)])
|
|
(skip-space in)
|
|
(unless (eq? (read-char in) #\=)
|
|
(lex-error in pos "expected = in attribute ~a" name))
|
|
(skip-space in)
|
|
;; more here - handle entites and disallow "<"
|
|
(let* ([delimiter (read-char-or-special in)]
|
|
[value (case delimiter
|
|
[(#\' #\")
|
|
(list->string
|
|
(let read-more ()
|
|
(let ([c (non-eof peek-char-or-special in pos)])
|
|
(cond
|
|
[(eq? c 'special)
|
|
(lex-error in pos "attribute values cannot contain non-text values")]
|
|
[(eq? c delimiter) (read-char in) null]
|
|
[(eq? c #\&)
|
|
(let ([entity (expand-entity (lex-entity in pos))])
|
|
(if (pcdata? entity)
|
|
(append (string->list (pcdata-string entity)) (read-more))
|
|
;; more here - do something with user defined entites
|
|
(read-more)))]
|
|
[else (read-char in) (cons c (read-more))]))))]
|
|
[else (if (char? delimiter)
|
|
(lex-error in pos "attribute values must be in ''s or in \"\"s")
|
|
delimiter)])])
|
|
(make-attribute start (pos) name value))))
|
|
|
|
;; skip-space : Input-port -> Void
|
|
;; deviation - should sometimes insist on at least one space
|
|
(define (skip-space in)
|
|
(let loop ()
|
|
(let ([c (peek-char-or-special in)])
|
|
(when (and (char? c)
|
|
(char-whitespace? c))
|
|
(read-char in)
|
|
(loop)))))
|
|
|
|
;; lex-pcdata : Input-port (-> Location) -> Pcdata
|
|
;; deviation - disallow ]]> "for compatability" with SGML, sec 2.4 XML spec
|
|
(define (lex-pcdata in pos)
|
|
(let ([start (pos)]
|
|
[data (let loop ()
|
|
(let ([next (peek-char-or-special in)])
|
|
(cond
|
|
[(or (eof-object? next)
|
|
(not (char? next))
|
|
(eq? next #\&)
|
|
(eq? next #\<))
|
|
null]
|
|
[(and (char-whitespace? next) (collapse-whitespace))
|
|
(skip-space in)
|
|
(cons #\space (loop))]
|
|
[else (cons (read-char in) (loop))])))])
|
|
(make-pcdata start
|
|
(pos)
|
|
(list->string data))))
|
|
|
|
;; lex-name : Input-port (-> Location) -> Symbol
|
|
(define (lex-name in pos)
|
|
(let ([c (non-eof read-char-or-special in pos)])
|
|
(unless (name-start? c)
|
|
(lex-error in pos "expected name, received ~e" c))
|
|
(string->symbol
|
|
(list->string
|
|
(cons c (let lex-rest ()
|
|
(let ([c (non-eof peek-char-or-special in pos)])
|
|
(cond
|
|
[(eq? c 'special)
|
|
(lex-error in pos "names cannot contain non-text values")]
|
|
[(name-char? c)
|
|
(cons (read-char in) (lex-rest))]
|
|
[else null]))))))))
|
|
|
|
;; skip-dtd : Input-port (-> Location) -> Void
|
|
(define (skip-dtd in pos)
|
|
(let skip ()
|
|
(case (non-eof read-char in pos)
|
|
[(#\') (read-until #\' in pos) (skip)]
|
|
[(#\") (read-until #\" in pos) (skip)]
|
|
[(#\<)
|
|
(case (non-eof read-char in pos)
|
|
[(#\!) (case (non-eof read-char in pos)
|
|
[(#\-) (read-char in) (lex-comment-contents in pos) (read-char in) (skip)]
|
|
[else (skip) (skip)])]
|
|
[(#\?) (lex-pi-data in pos) (skip)]
|
|
[else (skip) (skip)])]
|
|
[(#\>) (void)]
|
|
[else (skip)])))
|
|
|
|
;; name-start? : Char -> Bool
|
|
(define (name-start? ch)
|
|
(and (char? ch)
|
|
(or (char-alphabetic? ch)
|
|
(eq? ch #\_)
|
|
(eq? ch #\:))))
|
|
|
|
;; name-char? : Char -> Bool
|
|
(define (name-char? ch)
|
|
(and (char? ch)
|
|
(or (name-start? ch)
|
|
(char-numeric? ch)
|
|
(eq? ch #\.)
|
|
(eq? ch #\-))))
|
|
|
|
;; read-until : Char Input-port (-> Location) -> String
|
|
;; discards the stop character, too
|
|
(define (read-until char in pos)
|
|
(list->string
|
|
(let read-more ()
|
|
(let ([c (non-eof read-char in pos)])
|
|
(cond
|
|
[(eq? c char) null]
|
|
[else (cons c (read-more))])))))
|
|
|
|
;; non-eof : (Input-port -> (U Char Eof)) Input-port (-> Location) -> Char
|
|
(define (non-eof f in pos)
|
|
(let ([c (f in)])
|
|
(cond
|
|
[(eof-object? c) (lex-error in pos "unexpected eof")]
|
|
[else c])))
|
|
|
|
;; gen-read-until-string : String -> Input-port (-> Location) -> String
|
|
;; uses Knuth-Morris-Pratt from
|
|
;; Introduction to Algorithms, Cormen, Leiserson, and Rivest, pages 869-876
|
|
;; discards stop from input
|
|
;; ---
|
|
;; Modified by Jay to look more like the version on Wikipedia after discovering a bug when parsing CDATA
|
|
;; The use of the hasheq table and the purely numeric code trades hash efficiency for stack/ec capture efficiency
|
|
(struct hash-string (port pos ht))
|
|
(define (hash-string-ref hs k)
|
|
(match-define (hash-string port pos ht) hs)
|
|
(hash-ref! ht k (lambda () (non-eof read-char port pos))))
|
|
|
|
(define (gen-read-until-string W)
|
|
(define Wlen (string-length W))
|
|
(define T (make-vector Wlen #f))
|
|
(vector-set! T 0 -1)
|
|
(vector-set! T 1 0)
|
|
(let kmp-table ([pos 2] [cnd 0])
|
|
(when (pos . < . Wlen)
|
|
(cond
|
|
[(char=? (string-ref W (sub1 pos)) (string-ref W cnd))
|
|
(vector-set! T pos (add1 cnd))
|
|
(kmp-table (add1 pos) (add1 cnd))]
|
|
[(cnd . > . 0)
|
|
(kmp-table pos (vector-ref T cnd))]
|
|
[(zero? cnd)
|
|
(vector-set! T pos 0)
|
|
(kmp-table (add1 pos) 0)])))
|
|
(lambda (S-as-port S-pos)
|
|
(define S (hash-string S-as-port S-pos (make-hasheq)))
|
|
(define W-starts-at
|
|
(let kmp-search ([m 0] [i 0])
|
|
(if (char=? (string-ref W i) (hash-string-ref S (+ m i)))
|
|
(let ([i (add1 i)])
|
|
(if (= i Wlen)
|
|
m
|
|
(kmp-search m i)))
|
|
(let* ([Ti (vector-ref T i)]
|
|
[m (+ m i (* -1 Ti))])
|
|
(if (Ti . > . -1)
|
|
(let ([i Ti])
|
|
(kmp-search m i))
|
|
(let ([i 0])
|
|
(kmp-search m i)))))))
|
|
(list->string
|
|
(for/list ([i (in-range 0 W-starts-at)])
|
|
(hash-string-ref S i)))))
|
|
|
|
;; "-->" makes more sense, but "--" follows the spec.
|
|
(define lex-comment-contents (gen-read-until-string "--"))
|
|
(define lex-pi-data (gen-read-until-string "?>"))
|
|
(define lex-cdata-contents (gen-read-until-string "]]>"))
|
|
|
|
;; positionify : Input-port -> Input-port (-> Location)
|
|
; This function predates port-count-lines! and port-next-location.
|
|
; Otherwise I would have used those directly at the call sites.
|
|
(define (positionify in)
|
|
(port-count-lines! in)
|
|
(values
|
|
in
|
|
(lambda ()
|
|
(let-values ([(line column offset) (port-next-location in)])
|
|
(make-location line column offset)))))
|
|
|
|
;; locs : (listof (list number number))
|
|
(define-struct (exn:xml exn:fail:read) ())
|
|
|
|
;; lex-error : Input-port String (-> Location) TST* -> alpha
|
|
;; raises a lexer error, using exn:xml
|
|
(define (lex-error in pos str . rest)
|
|
(let* ([the-pos (pos)]
|
|
[offset (location-offset the-pos)])
|
|
(raise
|
|
(make-exn:xml
|
|
(format "read-xml: lex-error: at position ~a: ~a"
|
|
(format-source the-pos)
|
|
(apply format str rest))
|
|
(current-continuation-marks)
|
|
(list
|
|
(make-srcloc (object-name in) #f #f offset 1))))))
|
|
|
|
;; parse-error : (listof srcloc) (listof TST) *-> alpha
|
|
;; raises a parsing error, using exn:xml
|
|
(define (parse-error src fmt . args)
|
|
(raise (make-exn:xml (string-append "read-xml: parse-error: "
|
|
(apply format fmt args))
|
|
(current-continuation-marks)
|
|
src)))
|
|
|
|
;; format-source : Location -> string
|
|
;; to format the source location for an error message
|
|
(define (format-source loc)
|
|
(if (location? loc)
|
|
(format "~a.~a/~a" (location-line loc) (location-char loc) (location-offset loc))
|
|
(format "~a" loc))) |