From 539957a93b7ff3d0c0444cf73dadd4fd38e97f4e Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Fri, 8 Sep 2006 20:44:05 +0000 Subject: [PATCH] - closes #281, look for BOM when importing to override charset. the BOM is a nice way to detect UTF encodings, although it won't help distinguish, e.g., ISO 8859-1 from MacRoman. since EndNote adds a BOM to all of its export files, this means non-ASCII characters should now be preserved when exported from EndNote. - better error handling for translators ("Could Not Add Item" should now pop up in all circumstances) --- .../content/scholar/xpcom/translate.js | 163 +++++++++++++++--- 1 file changed, 136 insertions(+), 27 deletions(-) diff --git a/chrome/chromeFiles/content/scholar/xpcom/translate.js b/chrome/chromeFiles/content/scholar/xpcom/translate.js index df0e8ddc0..908388704 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/translate.js +++ b/chrome/chromeFiles/content/scholar/xpcom/translate.js @@ -65,6 +65,7 @@ * _storage - the stored string to be treated as input * _storageLength - the length of the stored string * _exportFileDirectory - the directory to which files will be exported + * _hasBOM - null if the file to be imported has not yet been checked for a BOM, false if it has none, otherwise the name of the charset its BOM indicates * * WEB-ONLY PRIVATE PROPERTIES: * @@ -371,6 +372,9 @@ Scholar.Translate.prototype.setHandler = function(type, handler) { * itemType - the type of item this scraper says it will scrape */ Scholar.Translate.prototype.getTranslators = function() { + // clear BOM + this._hasBOM = null; + if(Scholar.Translate.cache) { var translators = Scholar.Translate.cache[this.type]; } else { @@ -445,9 +449,14 @@ Scholar.Translate.prototype._loadTranslator = function() { try { Components.utils.evalInSandbox(this.translator[0].code, this._sandbox); } catch(e) { - Scholar.debug(e+' in parsing code for '+this.translator[0].label); - this._translationComplete(false); - return false; + var error = e+' in parsing code for '+this.translator[0].label; + if(this._parentTranslator) { 
+ throw error; + } else { + Scholar.debug(error); + this._translationComplete(false); + return false; + } } return true; @@ -459,10 +468,14 @@ Scholar.Translate.prototype._loadTranslator = function() { Scholar.Translate.prototype.translate = function() { Scholar.debug("translate called"); + /* + * initialize properties + */ this.newItems = new Array(); this.newCollections = new Array(); this._IDMap = new Array(); this._complete = false; + this._hasBOM = null; if(!this.translator || !this.translator.length) { throw("cannot translate: no translator specified"); @@ -1274,8 +1287,13 @@ Scholar.Translate.prototype._web = function() { try { this._sandbox.doWeb(this.document, this.location); } catch(e) { - Scholar.debug(e+' in executing code for '+this.translator[0].label); - return false; + var error = e+' in executing code for '+this.translator[0].label; + if(this._parentTranslator) { + throw error; + } else { + Scholar.debug(); + return false; + } } return true; @@ -1304,8 +1322,14 @@ Scholar.Translate.prototype._import = function() { try { this._sandbox.doImport(); } catch(e) { - Scholar.debug(e+' in executing code for '+this.translator[0].label); - return false; + Scholar.debug(e.toSource()); + var error = e+' in executing code for '+this.translator[0].label; + if(this._parentTranslator) { + throw error; + } else { + Scholar.debug(error); + return false; + } } return true; @@ -1370,27 +1394,32 @@ Scholar.Translate.prototype._importConfigureIO = function() { this._streams.push(this._inputStream); } - var intlStream = null; var filePosition = 0; - - // allow translator to set charset - this._sandbox.Scholar.setCharacterSet = function(charset) { - // seek - if(filePosition != 0) { - me._inputStream.QueryInterface(Components.interfaces.nsISeekableStream) - .seek(Components.interfaces.nsISeekableStream.NS_SEEK_SET, filePosition); - me._inputStream.QueryInterface(Components.interfaces.nsIFileInputStream); + var intlStream = this._importDefuseBOM(); + if(intlStream) { + // 
found a UTF BOM at the beginning of the file; don't allow + // translator to set the character set + this._sandbox.Scholar.setCharacterSet = function() {} + } else { + // allow translator to set charset + this._sandbox.Scholar.setCharacterSet = function(charset) { + // seek + if(filePosition != 0) { + me._inputStream.QueryInterface(Components.interfaces.nsISeekableStream) + .seek(Components.interfaces.nsISeekableStream.NS_SEEK_SET, filePosition); + me._inputStream.QueryInterface(Components.interfaces.nsIFileInputStream); + } + + intlStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"] + .createInstance(Components.interfaces.nsIConverterInputStream); + try { + intlStream.init(me._inputStream, charset, 1024, + Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER); + } catch(e) { + throw "Text encoding not supported"; + } + me._streams.push(intlStream); } - - intlStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"] - .createInstance(Components.interfaces.nsIConverterInputStream); - try { - intlStream.init(me._inputStream, charset, 1024, - Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER); - } catch(e) { - throw "Text encoding not supported"; - } - me._streams.push(intlStream); } var str = new Object(); @@ -1398,7 +1427,8 @@ Scholar.Translate.prototype._importConfigureIO = function() { this._inputStream.QueryInterface(Components.interfaces.nsILineInputStream); this._sandbox.Scholar.read = function() { - if(intlStream && intlStream instanceof Components.interfaces.nsIUnicharLineInputStream) { + if(intlStream && intlStream instanceof Components.interfaces.nsIUnicharLineInputStream) { + Scholar.debug("using intlStream"); var amountRead = intlStream.readLine(str); } else { var amountRead = me._inputStream.readLine(str); @@ -1446,6 +1476,85 @@ Scholar.Translate.prototype._importConfigureIO = function() { } } +/* + * searches for a UTF BOM at the beginning of the input stream. 
if one is found,
+ * returns an appropriate converter-input-stream for the UTF type, and sets
+ * _hasBOM to the UTF type. if one is not found, returns false, and sets
+ * _hasBOM to false to prevent further checking.
+ *
+ * _hasBOM acts as a cache across calls:
+ *   false              - a previous call found no BOM; returns false at once
+ *                        without touching the stream
+ *   null/undefined     - not checked yet; the first bytes of _inputStream
+ *                        are read to look for a BOM
+ *   a charset name     - a previous call found a BOM; the stream is re-seeked
+ *                        to just past it (3 bytes for UTF-8, otherwise 2)
+ *
+ * when no BOM is found on the first check, _inputStream is rewound to
+ * offset 0 so that the caller can read the file from the start.
+ *
+ * returns: an nsIConverterInputStream wrapping _inputStream and initialized
+ *          with the detected charset, or false if the file has no
+ *          recognized BOM
+ */
+Scholar.Translate.prototype._importDefuseBOM = function() {
+	// if already found not to have a BOM, skip
+	if(this._hasBOM === false) {
+		return false;
+	}
+	
+	if(!this._hasBOM) {
+		// if not checked for a BOM, open a binary input stream and read
+		var binStream = Components.classes["@mozilla.org/binaryinputstream;1"].
+								   createInstance(Components.interfaces.nsIBinaryInputStream);
+		binStream.setInputStream(this._inputStream);
+		
+		// read the first byte
+		var byte1 = binStream.read8();
+		
+		// at the moment, we don't support UTF-32 or UTF-7. while mozilla
+		// supports these encodings, they add slight additional complexity to
+		// the function and anyone using them for storing bibliographic metadata
+		// is insane.
+		// NOTE: a UTF-32LE BOM (FF FE 00 00) starts with FF FE, so such a
+		// file is classified as UTF-16LE by the checks below.
+		if(byte1 == 0xEF) {			// UTF-8: EF BB BF
+			var byte2 = binStream.read8();
+			if(byte2 == 0xBB) {
+				var byte3 = binStream.read8();
+				if(byte3 == 0xBF) {
+					this._hasBOM = "UTF-8";
+				}
+			}
+		} else if(byte1 == 0xFE) {	// UTF-16BE: FE FF
+			var byte2 = binStream.read8();
+			if(byte2 == 0xFF) {
+				this._hasBOM = "UTF-16BE";
+			}
+		} else if(byte1 == 0xFF) {	// UTF-16LE: FF FE
+			var byte2 = binStream.read8();
+			if(byte2 == 0xFE) {
+				// must be the real charset label; this string is handed
+				// straight to nsIConverterInputStream.init() below
+				this._hasBOM = "UTF-16LE";
+			}
+		}
+		
+		if(!this._hasBOM) {
+			// the bytes consumed above were ordinary content, so seek back
+			// to the beginning of the file
+			this._inputStream.QueryInterface(Components.interfaces.nsISeekableStream)
+						 .seek(Components.interfaces.nsISeekableStream.NS_SEEK_SET, 0);
+			this._inputStream.QueryInterface(Components.interfaces.nsIFileInputStream);
+			
+			// say there's no BOM
+			this._hasBOM = false;
+			
+			return false;
+		}
+	} else {
+		// if it had a BOM the last time, it has one this time, too. seek to the
+		// correct position.
+		
+		// the UTF-8 BOM is three bytes long; both UTF-16 BOMs are two
+		var seekPosition = (this._hasBOM == "UTF-8") ? 3 : 2;
+		
+		this._inputStream.QueryInterface(Components.interfaces.nsISeekableStream)
+					 .seek(Components.interfaces.nsISeekableStream.NS_SEEK_SET, seekPosition);
+		this._inputStream.QueryInterface(Components.interfaces.nsIFileInputStream);
+	}
+	
+	// if we know what kind of BOM it has, generate an input stream.
+	// declared with var so that the stream does not leak into the global scope
+	var intlStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"]
+							   .createInstance(Components.interfaces.nsIConverterInputStream);
+	intlStream.init(this._inputStream, this._hasBOM, 1024,
+		Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER);
+	return intlStream;
+}
+
 /*
  * does the actual export, after code has been loaded and parsed
  */