closes #250, figure out proper text encodings for import/export

MODS uses the encoding as specified in the <?xml tag, or else UTF-8
RIS uses IBM850, since the spec says "IBM Extended Character Set" and it's the only code page Mozilla supports. (should I do this? or just use unicode?)
MARC uses UTF-8, since I don't think there's any way to get full MARC-8 support, and UTF-8 is now the preferred encoding anyway
This commit is contained in:
Simon Kornblith 2006-09-05 07:51:55 +00:00
parent cec35d7566
commit 045780d9ac
3 changed files with 175 additions and 56 deletions

View File

@ -321,7 +321,7 @@ var Scholar_File_Interface = new function() {
// create UTF-8 output stream
var os = Components.classes["@mozilla.org/intl/converter-output-stream;1"].
createInstance(Components.interfaces.nsIConverterOutputStream);
os.init(fStream, "UTF-8", 0, "?");
os.init(fStream, "UTF-8", 0, "?".charCodeAt(0));
os.writeString(html);

View File

@ -389,6 +389,8 @@ Scholar.Translate.prototype.getTranslators = function() {
// see which translators can translate
var possibleTranslators = this._findTranslators(translators);
this._closeStreams();
return possibleTranslators;
}
@ -647,7 +649,13 @@ Scholar.Translate.prototype._generateSandbox = function() {
/*
* Check to see if _scraper_ can scrape this document
*/
Scholar.Translate.prototype._canTranslate = function(translator, ignoreExtensions) {
Scholar.Translate.prototype._canTranslate = function(translator, ignoreExtensions) {
if((this.type == "import" || this.type == "web") && !this.location) {
// if no location yet (e.g., getting list of possible web translators),
// just return true
return true;
}
// Test location with regular expression
if(translator.target && (this.type == "import" || this.type == "web")) {
var canTranslate = false;
@ -662,9 +670,10 @@ Scholar.Translate.prototype._canTranslate = function(translator, ignoreExtension
if(translator.importRegexp) {
var regularExpression = translator.importRegexp;
} else {
var regularExpression = new RegExp("\."+translator.target+"$", "i");
var regularExpression = new RegExp("\\."+translator.target+"$", "i");
}
}
Scholar.debug("path is "+this.path);
if(regularExpression.test(this.path)) {
canTranslate = true;
@ -861,23 +870,6 @@ Scholar.Translate.prototype._translationComplete = function(returnValue) {
} else {
Scholar.debug("translation complete");
// serialize RDF and unregister dataSource
if(this._rdf) {
if(this._rdf.serializer) {
this._rdf.serializer.Serialize(this._streams[0]);
}
try {
var rdfService = Components.classes["@mozilla.org/rdf/rdf-service;1"].
getService(Components.interfaces.nsIRDFService);
rdfService.UnregisterDataSource(this._rdf.dataSource);
} catch(e) {
Scholar.debug("could not unregister data source");
}
delete this._rdf.dataSource;
}
// close open streams
this._closeStreams();
@ -902,6 +894,21 @@ Scholar.Translate.prototype._translationComplete = function(returnValue) {
* closes open file streams, if any exist
*/
Scholar.Translate.prototype._closeStreams = function() {
// serialize RDF and unregister dataSource
if(this._rdf) {
if(this._rdf.serializer) {
this._rdf.serializer.Serialize(this._streams[0]);
}
try {
var rdfService = Components.classes["@mozilla.org/rdf/rdf-service;1"].
getService(Components.interfaces.nsIRDFService);
rdfService.UnregisterDataSource(this._rdf.dataSource);
} catch(e) {}
delete this._rdf.dataSource;
}
if(this._streams.length) {
for(var i in this._streams) {
var stream = this._streams[i];
@ -924,8 +931,10 @@ Scholar.Translate.prototype._closeStreams = function() {
}
}
}
delete this._streams;
this._streams = new Array();
this._inputStream = null;
}
/*
@ -1327,49 +1336,105 @@ Scholar.Translate.prototype._importConfigureIO = function() {
this._storagePointer = 0;
}
} else {
var me = this;
if(this._configOptions.dataMode == "rdf") {
this._rdf = new Object()
var IOService = Components.classes['@mozilla.org/network/io-service;1']
.getService(Components.interfaces.nsIIOService);
var fileHandler = IOService.getProtocolHandler("file")
.QueryInterface(Components.interfaces.nsIFileProtocolHandler);
var URL = fileHandler.getURLSpecFromFile(this.location);
var RDFService = Components.classes['@mozilla.org/rdf/rdf-service;1']
.getService(Components.interfaces.nsIRDFService);
this._rdf.dataSource = RDFService.GetDataSourceBlocking(URL);
// make an instance of the RDF handler
this._sandbox.Scholar.RDF = new Scholar.Translate.RDF(this._rdf.dataSource);
if(!this._rdf) {
this._rdf = new Object()
var IOService = Components.classes['@mozilla.org/network/io-service;1']
.getService(Components.interfaces.nsIIOService);
var fileHandler = IOService.getProtocolHandler("file")
.QueryInterface(Components.interfaces.nsIFileProtocolHandler);
var URL = fileHandler.getURLSpecFromFile(this.location);
var RDFService = Components.classes['@mozilla.org/rdf/rdf-service;1']
.getService(Components.interfaces.nsIRDFService);
this._rdf.dataSource = RDFService.GetDataSourceBlocking(URL);
// make an instance of the RDF handler
this._sandbox.Scholar.RDF = new Scholar.Translate.RDF(this._rdf.dataSource);
}
} else {
// open file and set read methods
var fStream = Components.classes["@mozilla.org/network/file-input-stream;1"]
.createInstance(Components.interfaces.nsIFileInputStream);
fStream.init(this.location, 0x01, 0664, 0);
this._streams.push(fStream);
if(this._inputStream) {
this._inputStream.QueryInterface(Components.interfaces.nsISeekableStream)
.seek(Components.interfaces.nsISeekableStream.NS_SEEK_SET, 0);
this._inputStream.QueryInterface(Components.interfaces.nsIFileInputStream);
} else {
this._inputStream = Components.classes["@mozilla.org/network/file-input-stream;1"]
.createInstance(Components.interfaces.nsIFileInputStream);
this._inputStream.init(this.location, 0x01, 0664, 0);
this._streams.push(this._inputStream);
}
if(this._configOptions.dataMode == "line") { // line by line reading
var notEof = true;
var lineData = new Object();
var intlStream = null;
var filePosition = 0;
// allow translator to set charset
this._sandbox.Scholar.setCharacterSet = function(charset) {
// seek
if(filePosition != 0) {
me._inputStream.QueryInterface(Components.interfaces.nsISeekableStream)
.seek(Components.interfaces.nsISeekableStream.NS_SEEK_SET, filePosition);
me._inputStream.QueryInterface(Components.interfaces.nsIFileInputStream);
}
fStream.QueryInterface(Components.interfaces.nsILineInputStream);
intlStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"]
.createInstance(Components.interfaces.nsIConverterInputStream);
try {
intlStream.init(me._inputStream, charset, 1024,
Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER);
} catch(e) {
throw "Text encoding not supported";
}
me._streams.push(intlStream);
}
var str = new Object();
if(this._configOptions.dataMode == "line") { // line by line reading
this._inputStream.QueryInterface(Components.interfaces.nsILineInputStream);
this._sandbox.Scholar.read = function() {
if(notEof) {
notEof = fStream.readLine(lineData);
return lineData.value;
if(intlStream && intlStream instanceof Components.interfaces.nsIUnicharLineInputStream) {
var amountRead = intlStream.readLine(str);
} else {
var amountRead = me._inputStream.readLine(str);
}
if(amountRead) {
filePosition += amountRead;
return str.value;
} else {
return false;
}
}
} else { // block reading
var sStream = Components.classes["@mozilla.org/scriptableinputstream;1"]
.createInstance(Components.interfaces.nsIScriptableInputStream);
sStream.init(fStream);
var sStream;
this._sandbox.Scholar.read = function(amount) {
return sStream.read(amount);
if(intlStream) {
// read from international stream, if one is available
var amountRead = intlStream.readString(amount, str);
if(amountRead) {
filePosition += amountRead;
return str.value;
} else {
return false;
}
} else {
// allocate sStream on the fly
if(!sStream) {
sStream = Components.classes["@mozilla.org/scriptableinputstream;1"]
.createInstance(Components.interfaces.nsIScriptableInputStream);
sStream.init(me._inputStream);
}
// read from the scriptable input stream
var string = sStream.read(amount);
filePosition += string.length;
return string;
}
}
// attach sStream to stack of streams to close
@ -1473,8 +1538,24 @@ Scholar.Translate.prototype._exportConfigureIO = function() {
// make an instance of the RDF handler
this._sandbox.Scholar.RDF = new Scholar.Translate.RDF(this._rdf.dataSource, this._rdf.serializer);
} else { // regular io; write just writes to file
this._sandbox.Scholar.write = function(data) { fStream.write(data, data.length) };
} else {
// regular io; write just writes to file
var intlStream = null;
// allow setting of character sets
this._sandbox.Scholar.setCharacterSet = function(charset) {
intlStream = Components.classes["@mozilla.org/intl/converter-output-stream;1"]
.createInstance(Components.interfaces.nsIConverterOutputStream);
intlStream.init(fStream, charset, 1024, "?".charCodeAt(0));
};
this._sandbox.Scholar.write = function(data) {
if(intlStream) {
intlStream.writeString(data);
} else {
fStream.write(data, data.length);
}
};
}
}
@ -1628,6 +1709,10 @@ Scholar.Translate.prototype._initializeInternalIO = function() {
*/
Scholar.Translate.prototype._storageFunctions = function(read, write) {
var me = this;
// add setCharacterSet method that does nothing
this._sandbox.Scholar.setCharacterSet = function() {}
if(write) {
// set up write() method
this._sandbox.Scholar.write = function(data) {

View File

@ -1,4 +1,4 @@
-- 79
-- 80
-- Set the following timestamp to the most recent scraper update date
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-31 22:44:00'));
@ -3840,6 +3840,7 @@ function detectImport() {
'var partialItemTypes = ["bookSection", "journalArticle", "magazineArticle", "newspaperArticle"];
function doExport() {
Scholar.setCharacterSet("utf-8");
var modsCollection = <modsCollection xmlns="http://www.loc.gov/mods/v3" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-2.xsd" />;
var item;
@ -4115,15 +4116,39 @@ function doImport() {
var text = "";
var read;
// read until we see if the file begins with a parse instruction
read = " ";
while(read == " " || read == "\n" || read == "\r") {
read = Scholar.read(1);
}
var firstPart = read + Scholar.read(4);
if(firstPart == "<?xml") {
// got a parse instruction, read until it ends
read = true;
while((read !== false) && (read !== ">")) {
read = Scholar.read(1);
firstPart += read;
}
var encodingRe = /encoding=[''"]([^''"]+)[''"]/;
var m = encodingRe.exec(firstPart);
// set character set
try {
Scholar.setCharacterSet(m[1]);
} catch(e) {
Scholar.setCharacterSet("utf-8");
}
} else {
Scholar.setCharacterSet("utf-8");
text += firstPart;
}
// read in 16384 byte increments
while(read = Scholar.read(16384)) {
text += read;
}
Scholar.Utilities.debug("read in");
// eliminate <?xml ?> heading so we can parse as XML
text = text.replace(/<\?xml[^?]+\?>/, "");
// parse with E4X
var m = new Namespace("http://www.loc.gov/mods/v3");
// why does this default namespace declaration not work!?
@ -5495,7 +5520,9 @@ function processTag(item, tag, value) {
}
function doImport(attachments) {
Scholar.Utilities.debug("hello");
// this is apparently the proper character set for RIS, although i''m not
// sure how many people follow this
Scholar.setCharacterSet("IBM850");
var line = true;
var tag = data = false;
@ -5560,6 +5587,10 @@ function addTag(tag, value) {
}
function doExport() {
// this is apparently the proper character set for RIS, although i''m not
// sure how many people follow this
Scholar.setCharacterSet("IBM850");
var item;
while(item = Scholar.nextItem()) {
@ -5974,6 +6005,9 @@ function doImport() {
var text;
var holdOver = ""; // part of the text held over from the last loop
Scholar.Utilities.debug("doing import: about to set character set");
Scholar.setCharacterSet("utf-8");
while(text = Scholar.read(4096)) { // read in 4096 byte increments
var records = text.split("\x1D");