; Module header is generated automatically #cs(module id mzscheme (require "common.ss") (require "myenv.ss") (require "access-remote.ss") (require "sxpathlib.ss") ;; Creation and manipulation of the ID-index ;; Provides the DTD parser for extracting ID attribute declarations ; ; This software is in Public Domain. ; IT IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND. ; ; Please send bug reports and comments to: ; lizorkin@hotbox.ru Dmitry Lizorkin ; ; ID-index provides fast access to XML elements by their unique ID. ; ID-index has the following structure: ; id-index = ( (id . element) (id . element) ... ) ; i.e. ; id-index = (list ; (cons id element) ; (cons id element) ; ...) ; where ; id - (a string) element's unique ID ; element - an SXML presentation of an element ; ; Creation of an id-index generally consists of two steps. ; On the first step, a document declaration (internal and external DTD) ; is read and information of ID attributes is extracted. ; This is presented in a following form: ; id-attrs = ( (elem-name attr-name attr-name attr-name ...) ; (elem-name attr-name attr-name attr-name ...) ... ) ; i.e. ; id-attrs = (list ; (cons ; elem-name ; (list attr-name attr-name attr-name ...) ; (cons ; elem-name ; (list attr-name attr-name attr-name ...) ; ...) ; where ; elem-name - (a symbol) a name of the element ; attr-name - (a symbol) element's attribute having an ID type ; ; On the second step, if an SXML presentation of the document is available, ; 'id-attrs' are used for forming an 'id-index'. ; If there is no SXML presentation for a document yet, both steps are ; performed as a single function call - to a specialized SSAX parser. ; This parser constructs an SXML presentation and an 'id-index' ; in a single pass ; ; ATTENTION: ; 1. Only non-qualified 'elem-name' and 'attr-name' are correctly supported ; 2. Parameter entity reference (PEReference) is NOT supported ;========================================================================= ; Functions which read XML document declaration ;------------------------------------------------ ; Trivial functions that ignore symbols ; Function reads a whitespace (S production) (define (id:process-s port) (let ((symb (peek-char port))) (cond((eof-object? symb) symb) ((char=? symb #\space) (read-char port) (id:process-s port)) ((char=? symb #\return) (read-char port) (id:process-s port)) ((char=? symb #\newline)(read-char port) (id:process-s port)) ((char=? symb #\tab)(read-char port) (id:process-s port)) (else symb)))) ; Ignores all symbols until template-symbol (define (id:ignore-until templ-sym port) (let loop ((symb (peek-char port))) (cond((eof-object? symb) symb) ((equal? symb templ-sym) (read-char port) symb) (else (read-char port) (loop (peek-char port)))))) ;------------------------------------------------ ; These functions perform reading from a file ; Read N symbols from a port (define (id:read-n num port) (id:process-s port) (let loop ((num num) (res '())) (if(= num 0) (list->string (reverse res)) (let((symb (peek-char port))) (cond((eof-object? symb) symb) (else (read-char port) (loop (- num 1) (cons symb res)))))))) ; This function reads a name - a sequence of characters ending with ; a whitespace or '<'. '>', '(', ')', '[', ']', '|' (define (id:read-name port) (id:process-s port) (let loop ((res "")) (let ((symb (peek-char port))) (cond((eof-object? symb) res) ((member symb '(#\space #\tab #\return #\newline #\< #\> #\( #\) #\[ #\] #\|)) res) (else (loop (string-append res (string (read-char port))))))))) ; This function reads a literal ; literal ::= ('"' [^"]* '"') | ("'" [^']* "'") ; A string is returned (define (id:process-literal port) (id:process-s port) (let((quot (peek-char port))) (if(eof-object? quot) ; an incorrect situaltion "" (let((quot (if (char=? (read-char port) #\") #\" #\'))) (let loop ((res '())) (let((symb (peek-char port))) (cond ((eof-object? symb) (list->string (reverse res))) ((char=? symb quot) ; end of the string (read-char port) (list->string (reverse res))) (else (read-char port) (loop (cons symb res)))))))))) ;------------------------------------------------ ; Miscellaneous ; Converts a string into small letters (define (id:to-small str) (let loop ((arg (string->list str)) (res '())) (cond((null? arg) (list->string (reverse res))) ((char-upper-case? (car arg)) (loop (cdr arg) (cons (char-downcase (car arg)) res))) (else (loop (cdr arg) (cons (car arg) res)))))) ; Takes an 'id-attrs' which can contain equal element names ; Returns a new 'id-attrs' where all element names are unique (define (id:unite-id-attrs id-attrs) (let loop ((id-attrs id-attrs) (new '())) (if (null? id-attrs) new (let rpt ((elem-name (caar id-attrs)) (atts (cdar id-attrs)) (rest (cdr id-attrs)) (id-attrs '())) (cond ((null? rest) (loop id-attrs (cons (cons elem-name atts) new))) ((equal? (caar rest) elem-name) (rpt elem-name (append atts (cdar rest)) (cdr rest) id-attrs)) (else (rpt elem-name atts (cdr rest) (cons (car rest) id-attrs)))))))) ;------------------------------------------------ ; Parsing XML productions concerning document declaration ; These functions are not intendes for error detection, they assume that ; the document is correct ; This function ignores information related to a PI production [16] ; [16] PI ::= '' Char*)))? '?>' ; It looks for an ending '?>' template (define (id:ignore-PI port) (id:ignore-until #\? port) (let ((symb (peek-char port))) (cond((eof-object? symb) symb) ((equal? symb #\>) (read-char port) symb) (else (id:ignore-PI port))))) ; This function ignores information related to a Comment production [15] ; [15] Comment ::= '' ; The starting '' template (define (id:ignore-comment port) (read-char port) ; it is '-' (read-char port) ; it is '-' (id:ignore-until #\- port) (let((sym1 (peek-char port))) (cond((eof-object? sym1) sym1) ((char=? sym1 #\-) (read-char port) (let((sym2 (read-char port))) ; must be '>' sym2)) (else (id:ignore-comment port))))) ; This function processes AttType production ([54]-[59] in XML specification) ; [54] AttType ::= StringType | TokenizedType | EnumeratedType ; [55] StringType ::= 'CDATA' ; [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' ; | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS' ; [57] EnumeratedType ::= NotationType | Enumeration ; [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' ; [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' ; The function returnd #t if the attribute has an ID type and #f otherwise (define (id:AttType-ID? port) (let((type (id:to-small (id:read-name port)))) (cond((string=? type "id") #t) ((string=? type "notation") (id:process-s port) (read-char port) ; it is #\( (id:ignore-until #\) port) #f) ((and (string=? type "") (char=? (peek-char port) #\()) ; see [59] (id:ignore-until #\) port) #f) (else #f)))) ; This function processes DefaultDecl production ([60] in XML specification) ; [60] DefaultDecl ::= '#REQUIRED' ; | '#IMPLIED' ; | (('#FIXED' S)? AttValue) ; The result is always #t (define (id:process-DefaultDecl port) (let((type (id:to-small (id:read-name port)))) (cond((string=? type "#fixed") (id:read-name port) ; reads a default value #t) (else #t)))) ; This function processes AttDef production ([53] in XML specification) ; [53] AttDef ::= S Name S AttType S DefaultDecl ; If an attribute has an ID type, (list attribule-name) is returned ; (a list of one element). Otherwise, function returns an empty list (define (id:process-AttDef port) (let((att-name (string->symbol (id:read-name port)))) (let((bool (id:AttType-ID? port))) (id:process-DefaultDecl port) (if bool (list att-name) '())))) ; The function processes AttlistDecl production ([52] in XML specification) ; [52] AttlistDecl ::= '' ; The starting 'symbol (id:read-name port)))) (let loop ((atts '())) (id:process-s port) (cond((char=? (peek-char port) #\>) ; no more attributes will be declared (read-char port) (if(null? atts) '() (list (cons element-name atts)))) (else (loop (append (id:process-AttDef port) atts))))))) ; This function processes a multiple markupdecl production [29] ; [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl ; | NotationDecl | PI | Comment ; 'id-attrs' are returned as a result (define (id:process-markupdecl* port) (let loop ((id-attrs '())) (let((beg (id:read-n 2 port))) (cond((eof-object? beg) id-attrs) ; the file is over ((string=? beg "]>") id-attrs) ; the end of the markupdecl ((string=? beg " port) (loop id-attrs))))) (else ; an error condition (cerr "Error in markupdecl production: unexpected " beg nl) (id:ignore-until #\> port) id-attrs))))) ; This function processes a doctypedecl production ([75] in XML specification) ; [75] ExternalID ::= 'SYSTEM' S SystemLiteral ; | 'PUBLIC' S PubidLiteral S SystemLiteral ; The function ignores a PubidLiteral ; 'id-attrs' are returned as a result (define (id:process-ExternalID port) (let((system-literal (let((name (id:to-small (id:read-name port)))) (cond ((string=? name "system") (id:process-literal port)) ((string=? name "public") (id:process-literal port) (id:process-literal port)) (else #f))))) (if(not system-literal) '() ; an incorrect situation (let((external-port (open-input-resource system-literal))) (if(not external-port) '() ; a failure (let((id-attrs (id:process-markupdecl* external-port))) (close-input-port external-port) id-attrs)))))) ; This function processes a doctypedecl production ([28] in XML specification) ; [28] doctypedecl ::= '' ; The function doesn't process a DeclSep (this is a PEReference which ; this programme doesn't support) ; The starting '' ; [27] Misc ::= Comment | PI | S ; 'id-attrs' are returned as a result (define (id:process-prolog port) (let((beg (id:read-n 2 port))) (cond((eof-object? beg) '()) ; a file is over - strange... ((string=? beg "SXML+id document id-attrs) (let((aux-subtrees (let((aux ((select-kids (ntype?? '@@)) document))) (if(null? aux) '() (let rpt ((res '()) (to-see (cdar aux))) (cond ((null? to-see) (reverse res)) ((equal? (caar to-see) 'id-index) (rpt res (cdr to-see))) (else (rpt (cons (car to-see) res) (cdr to-see))))))))) (let loop ((nodeset (list document)) (id-index '())) (if(null? nodeset) (let((kids ((select-kids (lambda (node) (not (and (pair? node) (equal? (car node) '@@))))) document))) (cons* '*TOP* (cons* '@@ (cons 'id-index id-index) aux-subtrees) kids)) (let((cur-node (car nodeset))) (cond ((not (pair? cur-node)) ; a text node (loop (cdr nodeset) id-index)) ((assoc (car cur-node) id-attrs) => (lambda (lst) (let((id-values ((select-kids (lambda (x) #t)) ((sxml:filter (lambda (x) (member (car x) (cdr lst)))) ((select-kids (lambda (x) #t)) ((select-kids (ntype?? '@)) cur-node)))))) (loop (append ((select-kids (ntype?? '*)) (car nodeset)) (cdr nodeset)) (append id-index (map (lambda (x) (cons x cur-node)) id-values)))))) (else (loop (append ((select-kids (ntype?? '*)) (car nodeset)) (cdr nodeset)) id-index)))))))) ;========================================================================= ; Some stuff for a SSAX multi parser ;------------------------------------------------ ; Id-related part of the seed ; id:seed = (list id-attrs id-index) ; id-attrs, id-index - see a head comment ; Mutator (define (id:make-seed id-attrs id-index) (list id-attrs id-index)) ; Accessors (define (id:seed-attrs id:seed) (car id:seed)) (define (id:seed-index id:seed) (cadr id:seed)) ;------------------------------------------------ ; Handler units ; This function is called by the NEW-LEVEL-SEED handler ; A new 'id:seed' is returned (define (id:new-level-seed-handler id:seed) id:seed) ; This function is called by the FINISH-ELEMENT handler ; A new 'id:seed' is returned (define (id:finish-element-handler elem-gi attributes id:seed element) (cond ((assoc elem-gi (id:seed-attrs id:seed)) => (lambda (lst) (let loop ((atts attributes) (id-index (id:seed-index id:seed))) (if (null? atts) (id:make-seed (id:seed-attrs id:seed) id-index) (let((att (car atts))) (cond ((pair? (car att)) ; namespace aware (loop (cdr atts) id-index)) ((member (car att) (cdr lst)) (loop (cdr atts) (cons (cons (cdr att) element) id-index))) (else (loop (cdr atts) id-index)))))))) (else id:seed))) ; This function is called by the DOCTYPE handler ; A new 'id:seed' is returned (define (id:doctype-handler port systemid internal-subset?) (let((id-attrs (if (not systemid) '() ; systemid not supplied (let((external-port (open-input-resource systemid))) (if (not external-port) '() ; a failure (let((id-attrs (id:process-markupdecl* external-port))) (close-input-port external-port) id-attrs)))))) (let((id-attrs (if internal-subset? (id:unite-id-attrs (append id-attrs (id:process-markupdecl* port))) (id:unite-id-attrs id-attrs)))) (id:make-seed id-attrs '())))) ; This function constructs the member of an axuiliary list (define (id:ending-action id:seed) (let((id-index (id:seed-index id:seed))) (cons 'id-index id-index))) (provide (all-defined)))