- add xml/e4x and xml/dom dataMode options

- parse XML encoding declarations in translate.js
- fix errors importing MODS from clipboard
This commit is contained in:
Simon Kornblith 2009-07-13 22:45:10 +00:00
parent e3e8881282
commit 40443c6b91
2 changed files with 200 additions and 163 deletions

View File

@ -1820,55 +1820,23 @@ Zotero.Translate.prototype._importDoneSniffing = function(charset) {
/* /*
* set up import for IO * set up import for IO
*/ */
Zotero.Translate.prototype._importConfigureIO = function(charset) { Zotero.Translate.prototype._importConfigureIO = function(charset) {
if(this.configOptions.dataMode && (this.configOptions.dataMode == "rdf" || this.configOptions.dataMode == "rdf/n3")) {
if(!this._rdf) {
Zotero.debug("Translate: initializing data store");
// initialize data store
this._rdf = new Zotero.RDF.AJAW.RDFIndexedFormula();
Zotero.debug("Translate: loading data");
// load data into store
var IOService = Components.classes['@mozilla.org/network/io-service;1']
.getService(Components.interfaces.nsIIOService);
if(this._storage) {
// parse from string
var baseURI = (this.location ? IOService.newURI(this.location, "utf-8", null) : null);
var nodeTree = (new DOMParser()).parseFromString(this._storage, 'text/xml');
} else {
// get URI
var fileHandler = IOService.getProtocolHandler("file")
.QueryInterface(Components.interfaces.nsIFileProtocolHandler);
var baseURI = fileHandler.getURLSpecFromFile(this.location);
// load XML from file using xmlhttp for charset detection
var xmlhttp = Components.classes["@mozilla.org/xmlextras/xmlhttprequest;1"].
createInstance(Components.interfaces.nsIXMLHttpRequest);
xmlhttp.overrideMimeType("text/xml");
xmlhttp.open("GET", baseURI, false); // Synchronous
xmlhttp.send("");
var nodeTree = xmlhttp.responseXML;
if(nodeTree.getElementsByTagName("parsererror").length) {
this._rdf = false;
throw("RDF/XML parse error; loading data into data store failed");
}
}
var parser = new Zotero.RDF.AJAW.RDFParser(this._rdf);
parser.parse(nodeTree, baseURI);
}
Zotero.debug("adding apis");
// add RDF features to sandbox
this._sandbox.Zotero.RDF = new Zotero.Translate.RDF(this._rdf);
return;
}
if(this._storage) { if(this._storage) {
// import from string // import from string
this._storageFunctions(true);
this._storagePointer = 0; this._storagePointer = 0;
if(this.configOptions.dataMode && this.configOptions.dataMode == "xml/dom" || this.configOptions.dataMode == "rdf") {
// for DOM XML, handle with parseFromString
if(this.configOptions.dataMode == "xml/dom" || !this._rdf) {
var xmlNodes = Components.classes["@mozilla.org/xmlextras/domparser;1"]
.createInstance(Components.interfaces.nsIDOMParser)
.parseFromString(this._storage, "text/xml");
}
} else if(this.configOptions.dataMode && this.configOptions.dataMode == "xml/e4x") {
var xmlNodes = new XML(this._storage.replace(/<\?xml[^>]+\?>/, ""));
} else {
this._storageFunctions(true);
}
} else { } else {
// import from file // import from file
@ -1885,8 +1853,10 @@ Zotero.Translate.prototype._importConfigureIO = function(charset) {
this._streams.push(this._inputStream); this._streams.push(this._inputStream);
} }
var sStream = null;
var bomLength = 0; var bomLength = 0;
if(charset === undefined || (charset && charset.length > 3 && charset.substr(0, 3) == "UTF")) { if(!charset && this._charset) charset = this._charset;
if(!charset || (charset && charset.length > 3 && charset.substr(0, 3) == "UTF")) {
// seek past BOM // seek past BOM
var bomCharset = this._importGetBOM(); var bomCharset = this._importGetBOM();
var bomLength = (bomCharset ? BOMs[bomCharset].length : 0); var bomLength = (bomCharset ? BOMs[bomCharset].length : 0);
@ -1895,80 +1865,185 @@ Zotero.Translate.prototype._importConfigureIO = function(charset) {
if(bomCharset) charset = this._charset = bomCharset; if(bomCharset) charset = this._charset = bomCharset;
} }
var intlStream = null; // look for/seek past XML charset declaration
if(charset) { if(this.configOptions.dataMode && (this.configOptions.dataMode == "xml/e4x"
// if have detected charset || this.configOptions.dataMode == "xml/dom"
Zotero.debug("Translate: Using detected character set "+charset, 3); || this.configOptions.dataMode == "rdf")) {
// convert from detected charset
intlStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"] sStream = Components.classes["@mozilla.org/scriptableinputstream;1"]
.createInstance(Components.interfaces.nsIConverterInputStream); .createInstance(Components.interfaces.nsIScriptableInputStream);
intlStream.init(this._inputStream, charset, 65535, sStream.init(this._inputStream);
Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER);
me._streams.push(intlStream);
}
// allow translator to set charset
this._sandbox.Zotero.setCharacterSet = function(charset) {
// seek back to the beginning
me._inputStream.QueryInterface(Components.interfaces.nsISeekableStream)
.seek(Components.interfaces.nsISeekableStream.NS_SEEK_SET, bomLength);
intlStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"]
.createInstance(Components.interfaces.nsIConverterInputStream);
try {
intlStream.init(me._inputStream, charset, 65535,
Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER);
} catch(e) {
throw "Text encoding not supported";
}
me._streams.push(intlStream);
}
var str = new Object();
if(this.configOptions.dataMode && this.configOptions.dataMode == "line") { // line by line reading
this._inputStream.QueryInterface(Components.interfaces.nsILineInputStream);
this._sandbox.Zotero.read = function() {
if(intlStream && intlStream instanceof Components.interfaces.nsIUnicharLineInputStream) {
var amountRead = intlStream.readLine(str);
} else {
var amountRead = me._inputStream.readLine(str);
}
if(amountRead) {
return str.value;
} else {
return false;
}
}
} else { // block reading
var sStream;
this._sandbox.Zotero.read = function(amount) {
if(intlStream) {
// read from international stream, if one is available
var amountRead = intlStream.readString(amount, str);
if(amountRead) {
return str.value;
} else {
return false;
}
} else {
// allocate sStream on the fly
if(!sStream) {
sStream = Components.classes["@mozilla.org/scriptableinputstream;1"]
.createInstance(Components.interfaces.nsIScriptableInputStream);
sStream.init(me._inputStream);
}
// read from the scriptable input stream
var string = sStream.read(amount);
return string;
}
}
// attach sStream to stack of streams to close
this._streams.push(sStream); this._streams.push(sStream);
// read until we see if the file begins with a parse instruction
const whitespaceRe = /\s/g;
var read;
do {
read = sStream.read(1);
} while(whitespaceRe.test(read))
if(read != "<") throw "XML load error: text does not start with <";
var firstPart = read + sStream.read(4);
if(firstPart == "<?xml") {
// got a parse instruction, read until it ends
read = true;
while((read !== false) && (read !== ">")) {
read = sStream.read(1);
firstPart += read;
}
var encodingRe = /encoding=['"]([^'"]+)['"]/;
var m = encodingRe.exec(firstPart);
if(m) {
try {
var charconv = Components.classes["@mozilla.org/charset-converter-manager;1"]
.getService(Components.interfaces.nsICharsetConverterManager)
.getCharsetTitle(m[1]);
if(charconv) charset = this._charset = m[1];
} catch(e) {}
}
} else {
this._inputStream.QueryInterface(Components.interfaces.nsISeekableStream)
.seek(Components.interfaces.nsISeekableStream.NS_SEEK_SET, bomLength);
}
if(!charset) charset = "UTF-8";
}
if(this.configOptions.dataMode && this.configOptions.dataMode == "xml/dom" || this.configOptions.dataMode == "rdf") {
// for DOM XML, pass charset to parseFromStream
if(this.configOptions.dataMode == "xml/dom" || !this._rdf) {
var xmlNodes = Components.classes["@mozilla.org/xmlextras/domparser;1"]
.createInstance(Components.interfaces.nsIDOMParser)
.parseFromStream(this._inputStream, charset, this.location.fileSize, "text/xml");
}
} else {
var intlStream = null;
if(charset) {
// if have detected charset
Zotero.debug("Translate: Using detected character set "+charset, 3);
// convert from detected charset
intlStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"]
.createInstance(Components.interfaces.nsIConverterInputStream);
intlStream.init(this._inputStream, charset, 65535,
Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER);
me._streams.push(intlStream);
}
if(this.configOptions.dataMode && this.configOptions.dataMode == "xml/e4x") {
// read in 16384 byte increments
var xmlNodes = "";
var str = {};
while(intlStream.readString(16384, str)) {
xmlNodes += str.value;
}
// create xml
xmlNodes = new XML(xmlNodes);
} else {
// standard text reading tools
// allow translator to set charset
this._sandbox.Zotero.setCharacterSet = function(charset) {
// seek back to the beginning
me._inputStream.QueryInterface(Components.interfaces.nsISeekableStream)
.seek(Components.interfaces.nsISeekableStream.NS_SEEK_SET, bomLength);
intlStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"]
.createInstance(Components.interfaces.nsIConverterInputStream);
try {
intlStream.init(me._inputStream, charset, 65535,
Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER);
} catch(e) {
throw "Text encoding not supported";
}
me._streams.push(intlStream);
}
var str = new Object();
if(this.configOptions.dataMode && this.configOptions.dataMode == "line") { // line by line reading
this._inputStream.QueryInterface(Components.interfaces.nsILineInputStream);
this._sandbox.Zotero.read = function() {
if(intlStream && intlStream instanceof Components.interfaces.nsIUnicharLineInputStream) {
var amountRead = intlStream.readLine(str);
} else {
var amountRead = me._inputStream.readLine(str);
}
if(amountRead) {
return str.value;
} else {
return false;
}
}
} else { // block reading
this._sandbox.Zotero.read = function(amount) {
if(intlStream) {
// read from international stream, if one is available
var amountRead = intlStream.readString(amount, str);
if(amountRead) {
return str.value;
} else {
return false;
}
} else {
// allocate sStream on the fly
if(!sStream) {
sStream = Components.classes["@mozilla.org/scriptableinputstream;1"]
.createInstance(Components.interfaces.nsIScriptableInputStream);
sStream.init(me._inputStream);
this._streams.push(sStream);
}
// read from the scriptable input stream
var string = sStream.read(amount);
return string;
}
}
}
}
}
}
if(this.configOptions.dataMode) {
// make sure DOM XML actually got parsed
if(xmlNodes && (this.configOptions.dataMode == "rdf" || this.configOptions.dataMode == "xml/dom")
&& xmlNodes.getElementsByTagName("parsererror").length) {
this._rdf = undefined;
throw("XML parser error: loading data into data store failed");
}
if(this.configOptions.dataMode == "rdf") {
// set up RDF
if(!this._rdf) {
// get URI
var IOService = Components.classes['@mozilla.org/network/io-service;1']
.getService(Components.interfaces.nsIIOService);
if(this._storage) {
var baseURI = this.location ? this.location : null;
} else {
var fileHandler = IOService.getProtocolHandler("file")
.QueryInterface(Components.interfaces.nsIFileProtocolHandler);
var baseURI = fileHandler.getURLSpecFromFile(this.location);
}
Zotero.debug("Translate: initializing data store");
this._rdf = new Zotero.RDF.AJAW.RDFIndexedFormula();
Zotero.debug("Translate: loading data");
var parser = new Zotero.RDF.AJAW.RDFParser(this._rdf);
parser.parse(xmlNodes, "");
}
// add RDF features to sandbox
this._sandbox.Zotero.RDF = new Zotero.Translate.RDF(this._rdf);
} else if(this.configOptions.dataMode == "xml/e4x" || this.configOptions.dataMode == "xml/dom") {
// add getXML function
this._sandbox.Zotero.getXML = function() {
return xmlNodes;
}
} }
} }
} }

View File

@ -4,7 +4,7 @@
"label":"MODS", "label":"MODS",
"creator":"Simon Kornblith", "creator":"Simon Kornblith",
"target":"xml", "target":"xml",
"minVersion":"1.0.8", "minVersion":"2.0b6.3",
"maxVersion":"", "maxVersion":"",
"priority":50, "priority":50,
"inRepository":true, "inRepository":true,
@ -12,13 +12,11 @@
} }
Zotero.addOption("exportNotes", true); Zotero.addOption("exportNotes", true);
Zotero.configure("dataMode", "xml/e4x");
function detectImport() { function detectImport() {
var read = Zotero.read(512); var name = Zotero.getXML().name();
var modsTagRegexp = /<mods[^>]+>/ return name.uri == "http://www.loc.gov/mods/v3" && (name.localName == "modsCollection" || name.localName == "mods");
if(modsTagRegexp.test(read)) {
return true;
}
} }
var partialItemTypes = ["bookSection", "journalArticle", "magazineArticle", "newspaperArticle"]; var partialItemTypes = ["bookSection", "journalArticle", "magazineArticle", "newspaperArticle"];
@ -341,47 +339,11 @@ function doImport() {
"web site":"webpage" "web site":"webpage"
}; };
var read;
// read until we see if the file begins with a parse instruction
read = " ";
while(read == " " || read == "\n" || read == "\r") {
read = Zotero.read(1);
}
var firstPart = read + Zotero.read(4);
if(firstPart == "<?xml") {
// got a parse instruction, read until it ends
read = true;
while((read !== false) && (read !== ">")) {
read = Zotero.read(1);
firstPart += read;
}
var encodingRe = /encoding=['"]([^'"]+)['"]/;
var m = encodingRe.exec(firstPart);
// set character set
try {
Zotero.setCharacterSet(m[1]);
} catch(e) {
Zotero.setCharacterSet("utf-8");
}
} else {
Zotero.setCharacterSet("utf-8");
}
// read in 16384 byte increments
var text = "";
while(read = Zotero.read(16384)) {
text += read;
}
text = text.replace(/<\?xml[^>]+\?>/, "");
// parse with E4X // parse with E4X
var m = new Namespace("http://www.loc.gov/mods/v3"); var m = new Namespace("http://www.loc.gov/mods/v3");
// why does this default namespace declaration not work!? // why does this default namespace declaration not work!?
default xml namespace = m; default xml namespace = m;
var xml = new XML(text); var xml = Zotero.getXML();
if(xml.m::mods.length()) { if(xml.m::mods.length()) {
var modsElements = xml.m::mods; var modsElements = xml.m::mods;