/* ***** BEGIN LICENSE BLOCK ***** Copyright © 2009 Center for History and New Media George Mason University, Fairfax, Virginia, USA http://zotero.org This file is part of Zotero. Zotero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Zotero is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Zotero. If not, see . ***** END LICENSE BLOCK ***** */ const BOMs = { "UTF-8":"\xEF\xBB\xBF", "UTF-16BE":"\xFE\xFF", "UTF-16LE":"\xFF\xFE", "UTF-32BE":"\x00\x00\xFE\xFF", "UTF-32LE":"\xFF\xFE\x00\x00" } Components.utils.import("resource://gre/modules/NetUtil.jsm"); /** * @class Manages the translator sandbox * @param {Zotero.Translate} translate * @param {String|window} sandboxLocation */ Zotero.Translate.SandboxManager = function(translate, sandboxLocation) { this.sandbox = new Components.utils.Sandbox(sandboxLocation); this.sandbox.Zotero = {}; this._translate = translate; // import functions missing from global scope into Fx sandbox this.sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult; this.sandbox.DOMParser = function() { // get principal if(typeof sandboxLocation !== "string") { // if sandbox specified by DOM document var principal = sandboxLocation.nodePrincipal; } else { // if sandbox specified by URI var secMan = Components.classes["@mozilla.org/scriptsecuritymanager;1"] .getService(Components.interfaces.nsIScriptSecurityManager); var ioService = Components.classes["@mozilla.org/network/io-service;1"] .getService(Components.interfaces.nsIIOService); var uri = ioService.newURI(sandboxLocation, "UTF-8", null); var principal = secMan.getCodebasePrincipal(uri); } // initialize DOM parser var _DOMParser = Components.classes["@mozilla.org/xmlextras/domparser;1"] .createInstance(Components.interfaces.nsIDOMParser); _DOMParser.init(principal, uri, uri); // expose parseFromString this.__exposedProps__ = {"parseFromString":"r"}; this.parseFromString = function(str, contentType) _DOMParser.parseFromString(str, contentType); } this.sandbox.DOMParser.__exposedProps__ = {"prototype":"r"}; this.sandbox.DOMParser.prototype = {}; } Zotero.Translate.SandboxManager.prototype = { /** * Evaluates code in the sandbox */ "eval":function(code) { Components.utils.evalInSandbox(code, this.sandbox); }, /** * Imports an object into the sandbox * * @param {Object} object Object to be imported (under Zotero) * @param {Boolean} passTranslateAsFirstArgument Whether the translate instance should be passed * as the first argument to the function. */ "importObject":function(object, passAsFirstArgument, attachTo) { if(!attachTo) attachTo = this.sandbox.Zotero; var newExposedProps = false; if(!object.__exposedProps__) newExposedProps = {}; for(var key in (newExposedProps ? object : object.__exposedProps__)) { let localKey = key; if(newExposedProps) newExposedProps[localKey] = "r"; // magical XPCSafeJSObjectWrappers for sandbox if(typeof object[localKey] === "function" || typeof object[localKey] === "object") { if(attachTo == this.sandbox) Zotero.debug(localKey); attachTo[localKey] = function() { var args = (passAsFirstArgument ? [passAsFirstArgument] : []); for(var i=0; i")) { read = sStream.read(1); firstPart += read; } const encodingRe = /encoding=['"]([^'"]+)['"]/; var m = encodingRe.exec(firstPart); if(m) { try { var charconv = Components.classes["@mozilla.org/charset-converter-manager;1"] .getService(Components.interfaces.nsICharsetConverterManager) .getCharsetTitle(m[1]); if(charconv) this._charset = m[1]; } catch(e) {} } // if we know for certain document is XML, we also know for certain that the // default charset for XML is UTF-8 if(!this._charset) this._charset = "UTF-8"; } } // If we managed to get a charset here, then translators shouldn't be able to override it, // since it's almost certainly correct. Otherwise, we allow override. this._allowCharsetOverride = !!this._charset; this._seekToStart(); if(!this._charset) { // No XML parse instruction or BOM. // Check whether the user has specified a charset preference var charsetPref = Zotero.Prefs.get("import.charset"); if(charsetPref == "auto") { Zotero.debug("Translate: Checking whether file is UTF-8"); // For auto-detect, we are basically going to check if the file could be valid // UTF-8, and if this is true, we will treat it as UTF-8. Prior likelihood of // UTF-8 is very high, so this should be a reasonable strategy. // from http://codex.wordpress.org/User:Hakre/UTF8 const UTF8Regex = new RegExp('^(?:' + '[\x09\x0A\x0D\x20-\x7E]' + // ASCII '|[\xC2-\xDF][\x80-\xBF]' + // non-overlong 2-byte '|\xE0[\xA0-\xBF][\x80-\xBF]' + // excluding overlongs '|[\xE1-\xEC\xEE][\x80-\xBF]{2}' + // 3-byte, but exclude U-FFFE and U-FFFF '|\xEF[\x80-\xBE][\x80-\xBF]' + '|\xEF\xBF[\x80-\xBD]' + '|\xED[\x80-\x9F][\x80-\xBF]' + // excluding surrogates '|\xF0[\x90-\xBF][\x80-\xBF]{2}' + // planes 1-3 '|[\xF1-\xF3][\x80-\xBF]{3}' + // planes 4-15 '|\xF4[\x80-\x8F][\x80-\xBF]{2}' + // plane 16 ')*$'); // Read all currently available bytes from file. This seems to be the entire file, // since the IO is blocking anyway. this._charset = "UTF-8"; let bytesAvailable; while(bytesAvailable = this._rawStream.available()) { // read 131072 bytes let fileContents = binStream.readBytes(Math.min(131072, bytesAvailable)); // on failure, try reading up to 3 more bytes and see if that makes this // valid (since we have chunked it) let isUTF8; for(let i=1; !(isUTF8 = UTF8Regex.test(fileContents)) && i <= 3; i++) { if(this._rawStream.available()) { fileContents += binStream.readBytes(1); } } // if the regexp continues to fail, this is not UTF-8 if(!isUTF8) { // Can't be UTF-8; see if a default charset is defined this._charset = Zotero.Prefs.get("intl.charset.default", true); // ISO-8859-1 by default if(!this._charset) this._charset = "ISO-8859-1"; break; } } // Seek back to beginning of file this._seekToStart(); } else { // No need to auto-detect; user has specified a charset this._charset = charsetPref; } } } Zotero.debug("Translate: Detected file charset as "+this._charset); // We know the charset now. Open a converter stream. if(mode) this.reset(mode); } Zotero.Translate.IO.Read.prototype = { "__exposedProps__":{ "_getXML":"r", "RDF":"r", "read":"r", "setCharacterSet":"r" }, "_seekToStart":function() { this._rawStream.QueryInterface(Components.interfaces.nsISeekableStream) .seek(Components.interfaces.nsISeekableStream.NS_SEEK_SET, this._bomLength); this.bytesRead = this._bomLength; }, "_readToString":function() { var str = {}; this.inputStream.readString(this.file.fileSize, str); return str.value; }, "_initRDF":function() { // get URI var IOService = Components.classes['@mozilla.org/network/io-service;1'] .getService(Components.interfaces.nsIIOService); var fileHandler = IOService.getProtocolHandler("file") .QueryInterface(Components.interfaces.nsIFileProtocolHandler); var baseURI = fileHandler.getURLSpecFromFile(this.file); Zotero.debug("Translate: Initializing RDF data store"); this._dataStore = new Zotero.RDF.AJAW.RDFIndexedFormula(); var parser = new Zotero.RDF.AJAW.RDFParser(this._dataStore); var nodes = Zotero.Translate.IO.parseDOMXML(this._rawStream, this._charset, this.file.fileSize); parser.parse(nodes, baseURI); this.RDF = new Zotero.Translate.IO._RDFSandbox(this._dataStore); }, "setCharacterSet":function(charset) { if(typeof charset !== "string") { throw "Translate: setCharacterSet: charset must be a string"; } // seek back to the beginning this._seekToStart(); if(this._allowCharsetOverride) { try { this.inputStream.init(this._rawStream, charset, 65535, Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER); } catch(e) { throw "Translate: setCharacterSet: text encoding not supported"; } } else { Zotero.debug("Translate: setCharacterSet: translate charset override ignored due to BOM or XML parse instruction"); } }, "read":function(bytes) { var str = {}; if(bytes) { // read number of bytes requested var amountRead = this.inputStream.readString(bytes, str); } else { // bytes not specified; read a line this.inputStream.QueryInterface(Components.interfaces.nsIUnicharLineInputStream); var amountRead = this.inputStream.readLine(str); } if(amountRead) { this.bytesRead += amountRead; return str.value; } else { return false; } }, "_getXML":function() { if(this._mode == "xml/dom") { return Zotero.Translate.IO.parseDOMXML(this._rawStream, this._charset, this.file.fileSize); } else { return this._readToString().replace(/<\?xml[^>]+\?>/, ""); } }, "reset":function(newMode) { this._seekToStart(); if(!this.inputStream) { this.inputStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"] .createInstance(Components.interfaces.nsIConverterInputStream); Zotero.Translate.IO.streamsToKeepOpen.push(this.inputStream); } this.inputStream.init(this._rawStream, this._charset, 65535, Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER); this._mode = newMode; if(Zotero.Translate.IO.rdfDataModes.indexOf(this._mode) !== -1 && !this.RDF) { this._initRDF(); } }, "close":function() { Zotero.Translate.IO.streamsToKeepOpen.splice(Zotero.Translate.IO.streamsToKeepOpen.indexOf(this._rawStream), 1); if(this.inputStream) { Zotero.Translate.IO.streamsToKeepOpen.splice(Zotero.Translate.IO.streamsToKeepOpen.indexOf(this.inputStream), 1); } this._rawStream.close(); } } Zotero.Translate.IO.String.prototype.__defineGetter__("contentLength", function() { return this.file.fileSize; }); /******* Write support *******/ Zotero.Translate.IO.Write = function(file, mode, charset) { this._rawStream = Components.classes["@mozilla.org/network/file-output-stream;1"] .createInstance(Components.interfaces.nsIFileOutputStream); Zotero.Translate.IO.streamsToKeepOpen.push(this._rawStream); this._rawStream.init(file, 0x02 | 0x08 | 0x20, 0664, 0); // write, create, truncate this._writtenToStream = false; if(mode || charset) this.reset(mode, charset); } Zotero.Translate.IO.Write.prototype = { "__exposedProps__":{ "RDF":"r", "write":"r", "setCharacterSet":"r" }, "_initRDF":function() { Zotero.debug("Translate: Initializing RDF data store"); this._dataStore = new Zotero.RDF.AJAW.RDFIndexedFormula(); this.RDF = new Zotero.Translate.IO._RDFSandbox(this._dataStore); }, "setCharacterSet":function(charset) { if(typeof charset !== "string") { throw "Translate: setCharacterSet: charset must be a string"; } if(!this.outputStream) { this.outputStream = Components.classes["@mozilla.org/intl/converter-output-stream;1"] .createInstance(Components.interfaces.nsIConverterOutputStream); Zotero.Translate.IO.streamsToKeepOpen.push(this.outputStream); } if(charset == "UTF-8xBOM") charset = "UTF-8"; this.outputStream.init(this._rawStream, charset, 1024, "?".charCodeAt(0)); this._charset = charset; }, "write":function(data) { if(!this._charset) this.setCharacterSet("UTF-8"); if(!this._writtenToStream && this._charset.substr(this._charset.length-4) == "xBOM" && BOMs[this._charset.substr(0, this._charset.length-4).toUpperCase()]) { // If stream has not yet been written to, and a UTF type has been selected, write BOM this._rawStream.write(BOMs[streamCharset], BOMs[streamCharset].length); } if(this._charset == "MACINTOSH") { // fix buggy Mozilla MacRoman var splitData = data.split(/([\r\n]+)/); for(var i=0; i