- closes #281, look for BOM when importing to override charset. the BOM is a nice way to detect UTF encodings, although it won't help distinguish, e.g., ISO 8859-1 from MacRoman. since EndNote adds a BOM to all of its export files, this means non-ASCII characters should now be preserved when exported from EndNote.

- better error handling for translators ("Could Not Add Item" should now pop up in all circumstances)
This commit is contained in:
Simon Kornblith 2006-09-08 20:44:05 +00:00
parent ad5ce20c82
commit 539957a93b

View File

@ -65,6 +65,7 @@
* _storage - the stored string to be treated as input * _storage - the stored string to be treated as input
* _storageLength - the length of the stored string * _storageLength - the length of the stored string
* _exportFileDirectory - the directory to which files will be exported * _exportFileDirectory - the directory to which files will be exported
* _hasBOM - BOM state of the file being imported: null if not yet checked, false if no BOM was found, otherwise the name of the UTF charset the BOM indicates
* *
* WEB-ONLY PRIVATE PROPERTIES: * WEB-ONLY PRIVATE PROPERTIES:
* *
@ -371,6 +372,9 @@ Scholar.Translate.prototype.setHandler = function(type, handler) {
* itemType - the type of item this scraper says it will scrape * itemType - the type of item this scraper says it will scrape
*/ */
Scholar.Translate.prototype.getTranslators = function() { Scholar.Translate.prototype.getTranslators = function() {
// clear BOM
this._hasBOM = null;
if(Scholar.Translate.cache) { if(Scholar.Translate.cache) {
var translators = Scholar.Translate.cache[this.type]; var translators = Scholar.Translate.cache[this.type];
} else { } else {
@ -445,10 +449,15 @@ Scholar.Translate.prototype._loadTranslator = function() {
try { try {
Components.utils.evalInSandbox(this.translator[0].code, this._sandbox); Components.utils.evalInSandbox(this.translator[0].code, this._sandbox);
} catch(e) { } catch(e) {
Scholar.debug(e+' in parsing code for '+this.translator[0].label); var error = e+' in parsing code for '+this.translator[0].label;
if(this._parentTranslator) {
throw error;
} else {
Scholar.debug(error);
this._translationComplete(false); this._translationComplete(false);
return false; return false;
} }
}
return true; return true;
} }
@ -459,10 +468,14 @@ Scholar.Translate.prototype._loadTranslator = function() {
Scholar.Translate.prototype.translate = function() { Scholar.Translate.prototype.translate = function() {
Scholar.debug("translate called"); Scholar.debug("translate called");
/*
* initialize properties
*/
this.newItems = new Array(); this.newItems = new Array();
this.newCollections = new Array(); this.newCollections = new Array();
this._IDMap = new Array(); this._IDMap = new Array();
this._complete = false; this._complete = false;
this._hasBOM = null;
if(!this.translator || !this.translator.length) { if(!this.translator || !this.translator.length) {
throw("cannot translate: no translator specified"); throw("cannot translate: no translator specified");
@ -1274,9 +1287,14 @@ Scholar.Translate.prototype._web = function() {
try { try {
this._sandbox.doWeb(this.document, this.location); this._sandbox.doWeb(this.document, this.location);
} catch(e) { } catch(e) {
Scholar.debug(e+' in executing code for '+this.translator[0].label); var error = e+' in executing code for '+this.translator[0].label;
if(this._parentTranslator) {
throw error;
} else {
Scholar.debug();
return false; return false;
} }
}
return true; return true;
} }
@ -1304,9 +1322,15 @@ Scholar.Translate.prototype._import = function() {
try { try {
this._sandbox.doImport(); this._sandbox.doImport();
} catch(e) { } catch(e) {
Scholar.debug(e+' in executing code for '+this.translator[0].label); Scholar.debug(e.toSource());
var error = e+' in executing code for '+this.translator[0].label;
if(this._parentTranslator) {
throw error;
} else {
Scholar.debug(error);
return false; return false;
} }
}
return true; return true;
} }
@ -1370,9 +1394,13 @@ Scholar.Translate.prototype._importConfigureIO = function() {
this._streams.push(this._inputStream); this._streams.push(this._inputStream);
} }
var intlStream = null;
var filePosition = 0; var filePosition = 0;
var intlStream = this._importDefuseBOM();
if(intlStream) {
// found a UTF BOM at the beginning of the file; don't allow
// translator to set the character set
this._sandbox.Scholar.setCharacterSet = function() {}
} else {
// allow translator to set charset // allow translator to set charset
this._sandbox.Scholar.setCharacterSet = function(charset) { this._sandbox.Scholar.setCharacterSet = function(charset) {
// seek // seek
@ -1392,6 +1420,7 @@ Scholar.Translate.prototype._importConfigureIO = function() {
} }
me._streams.push(intlStream); me._streams.push(intlStream);
} }
}
var str = new Object(); var str = new Object();
if(this._configOptions.dataMode == "line") { // line by line reading if(this._configOptions.dataMode == "line") { // line by line reading
@ -1399,6 +1428,7 @@ Scholar.Translate.prototype._importConfigureIO = function() {
this._sandbox.Scholar.read = function() { this._sandbox.Scholar.read = function() {
if(intlStream && intlStream instanceof Components.interfaces.nsIUnicharLineInputStream) { if(intlStream && intlStream instanceof Components.interfaces.nsIUnicharLineInputStream) {
Scholar.debug("using intlStream");
var amountRead = intlStream.readLine(str); var amountRead = intlStream.readLine(str);
} else { } else {
var amountRead = me._inputStream.readLine(str); var amountRead = me._inputStream.readLine(str);
@ -1446,6 +1476,85 @@ Scholar.Translate.prototype._importConfigureIO = function() {
} }
} }
/*
 * searches for a UTF BOM at the beginning of the input stream
 * (this._inputStream).
 *
 * this._hasBOM caches the result between calls:
 *   null/undefined - the stream has not been checked yet
 *   false          - the stream was checked and has no (supported) BOM
 *   charset string - "UTF-8", "UTF-16BE" or "UTF-16LE", as indicated by the BOM
 *
 * if a BOM is found (or was found on a previous call), the input stream is
 * left positioned just past the BOM and an nsIConverterInputStream for the
 * detected charset is returned. if no BOM is found, the input stream is
 * rewound to the beginning, _hasBOM is set to false to prevent further
 * checking, and false is returned.
 */
Scholar.Translate.prototype._importDefuseBOM = function() {
	// if already found not to have a BOM, skip
	if(this._hasBOM === false) {
		return false;
	}
	
	var seekableStream = Components.interfaces.nsISeekableStream;
	
	if(!this._hasBOM) {
		// if not checked for a BOM, open a binary input stream and read
		// NOTE(review): read8() presumably throws if the file is shorter than
		// the BOM being probed (e.g. an empty file) -- confirm caller handles it
		var binStream = Components.classes["@mozilla.org/binaryinputstream;1"].
								   createInstance(Components.interfaces.nsIBinaryInputStream);
		binStream.setInputStream(this._inputStream);
		
		// read the first byte; further bytes are only consumed while the
		// sequence still matches a known BOM
		var byte1 = binStream.read8();
		
		// at the moment, we don't support UTF-32 or UTF-7. while mozilla
		// supports these encodings, they add slight additional complexity to
		// the function and are not expected for bibliographic metadata.
		if(byte1 == 0xEF) {				// UTF-8: EF BB BF
			if(binStream.read8() == 0xBB && binStream.read8() == 0xBF) {
				this._hasBOM = "UTF-8";
			}
		} else if(byte1 == 0xFE) {		// UTF-16BE: FE FF
			if(binStream.read8() == 0xFF) {
				this._hasBOM = "UTF-16BE";
			}
		} else if(byte1 == 0xFF) {		// UTF-16LE: FF FE
			if(binStream.read8() == 0xFE) {
				// must be the canonical charset name; the converter stream
				// is initialized with this string below
				this._hasBOM = "UTF-16LE";
			}
		}
		
		if(!this._hasBOM) {
			// the probe consumed up to three bytes; seek back to beginning
			// of file so the translator sees the whole input
			this._inputStream.QueryInterface(seekableStream)
						 .seek(seekableStream.NS_SEEK_SET, 0);
			this._inputStream.QueryInterface(Components.interfaces.nsIFileInputStream);
			
			// say there's no BOM
			this._hasBOM = false;
			
			return false;
		}
	} else {
		// if it had a BOM the last time, it has one this time, too. seek to
		// the correct position: 3 bytes for UTF-8, 2 for either UTF-16 variant
		var seekPosition = (this._hasBOM == "UTF-8") ? 3 : 2;
		
		this._inputStream.QueryInterface(seekableStream)
					 .seek(seekableStream.NS_SEEK_SET, seekPosition);
		this._inputStream.QueryInterface(Components.interfaces.nsIFileInputStream);
	}
	
	// we know what kind of BOM it has, so generate a converter input stream.
	// declared locally so it doesn't leak into the global scope.
	var intlStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"]
							   .createInstance(Components.interfaces.nsIConverterInputStream);
	intlStream.init(this._inputStream, this._hasBOM, 1024,
		Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER);
	return intlStream;
}
/* /*
* does the actual export, after code has been loaded and parsed * does the actual export, after code has been loaded and parsed
*/ */