closes #162, do sniffing for import formats

Import should now work regardless of file extension. This should make #86 (stealing EndNote download links) fairly easy to implement.
This commit is contained in:
Simon Kornblith 2006-08-08 02:46:52 +00:00
parent d67d96c321
commit 504ebf8996
2 changed files with 78 additions and 25 deletions

View File

@ -261,21 +261,7 @@ Scholar.Translate.prototype.getTranslators = function() {
Scholar.debug("searching for translators for "+this.path);
// see which translators can translate
for(var i in translators) {
if(this._canTranslate(translators[i])) {
Scholar.debug("found translator "+translators[i].label);
// for some reason, and i'm not quite sure what this reason is,
// we HAVE to do this to get things to work right; we can't
// just push a normal translator object from an SQL statement
var translator = {translatorID:translators[i].translatorID,
label:translators[i].label,
target:translators[i].target,
itemType:translators[i].itemType}
possibleTranslators.push(translator);
}
}
var possibleTranslators = this._findTranslators(translators);
return possibleTranslators;
}
@ -289,6 +275,31 @@ Scholar.Translate.prototype.getTranslators = function() {
// No-op placeholder — presumably intended to be filled in (or overridden)
// with UI for translator-specific options; TODO confirm against callers.
Scholar.Translate.prototype.displayOptions = function() {
}
/*
 * Collects the translators that can handle the current target.
 *
 * translators      - collection of candidate translator rows to test
 * ignoreExtensions - when true, _canTranslate disregards file extensions
 *
 * Returns an array of plain translator objects. For an import with no
 * matches on the first pass, retries once with extensions ignored.
 */
Scholar.Translate.prototype._findTranslators = function(translators, ignoreExtensions) {
	var matches = [];

	for(var key in translators) {
		var candidate = translators[key];
		if(!this._canTranslate(candidate, ignoreExtensions)) {
			continue;
		}
		Scholar.debug("found translator "+candidate.label);

		// for some reason, and i'm not quite sure what this reason is,
		// the row object coming straight from the SQL statement cannot be
		// pushed as-is; copying its fields into a fresh object is required
		matches.push({translatorID:candidate.translatorID,
			label:candidate.label,
			target:candidate.target,
			itemType:candidate.itemType});
	}

	if(matches.length == 0 && this.type == "import" && !ignoreExtensions) {
		Scholar.debug("looking a second time");
		// try search again, ignoring file extensions
		return this._findTranslators(translators, true);
	}

	return matches;
}
Scholar.Translate.prototype._loadTranslator = function() {
if(!this._sandbox || this.type == "search") {
// create a new sandbox if none exists, or for searching (so that it's
@ -467,7 +478,7 @@ Scholar.Translate.prototype._generateSandbox = function() {
/*
* Check to see if _scraper_ can scrape this document
*/
Scholar.Translate.prototype._canTranslate = function(translator) {
Scholar.Translate.prototype._canTranslate = function(translator, ignoreExtensions) {
// Test location with regular expression
// If this is slow, we could preload all scrapers and compile regular
// expressions, so each check will be faster
@ -482,6 +493,12 @@ Scholar.Translate.prototype._canTranslate = function(translator) {
if(regularExpression.test(this.path)) {
canTranslate = true;
}
if(ignoreExtensions) {
// if we're ignoring extensions, that means we already tried
// everything without ignoring extensions and it didn't work
canTranslate = !canTranslate;
}
} else {
var canTranslate = true;
}
@ -917,7 +934,7 @@ Scholar.Translate.prototype._import = function() {
* sets up import for IO
*/
Scholar.Translate.prototype._importConfigureIO = function() {
if(this._configOptions.dataMode == "rdf") {
if(this._configOptions.dataMode == "rdf") {
var IOService = Components.classes['@mozilla.org/network/io-service;1']
.getService(Components.interfaces.nsIIOService);
var fileHandler = IOService.getProtocolHandler("file")

View File

@ -1,7 +1,7 @@
-- 38
-- 39
-- Set the following timestamp to the most recent scraper update date
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-07 01:09:00'));
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-07 21:55:00'));
REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-28 23:08:00', 4, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/|s/)',
'function detectWeb(doc, url) {
@ -2163,7 +2163,7 @@ REPLACE INTO "translators" VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '2006
}
}');
REPLACE INTO "translators" VALUES ('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-06-26 16:01:00', 12, 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)',
REPLACE INTO "translators" VALUES ('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-08-07 21:55:00', 12, 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)',
'function detectWeb(doc, url) {
if(doc.location.href.indexOf("list_uids=") >= 0) {
return "journalArticle";
@ -2738,7 +2738,15 @@ function doSearch(item) {
REPLACE INTO "translators" VALUES ('0e2235e7-babf-413c-9acf-f27cce5f059c', '2006-07-05 23:40:00', 3, 'MODS (XML)', 'Simon Kornblith', 'xml',
'Scholar.addOption("exportNotes", true);
Scholar.addOption("exportFileData", true);',
Scholar.addOption("exportFileData", true);
function detectImport() {
var read = Scholar.read(512);
var modsTagRegexp = /<mods[^>]+>/
if(modsTagRegexp.test(read)) {
return true;
}
}',
'var partialItemTypes = ["bookSection", "journalArticle", "magazineArticle", "newspaperArticle"];
function doExport() {
@ -3664,7 +3672,17 @@ REPLACE INTO "translators" VALUES ('6e372642-ed9d-4934-b5d1-c11ac758ebb7', '2006
}');
REPLACE INTO "translators" VALUES ('5e3ad958-ac79-463d-812b-a86a9235c28f', '2006-07-15 17:09:00', 1, 'RDF', 'Simon Kornblith', 'rdf',
'Scholar.configure("dataMode", "rdf");',
'Scholar.configure("dataMode", "rdf");
function detectImport() {
// unfortunately, Mozilla will let you create a data source from any type
// of XML, so we need to make sure there are actually nodes
var nodes = Scholar.RDF.getAllResources();
if(nodes) {
return true;
}
}',
'// gets the first result set for a property that can be encoded in multiple
// ontologies
function getFirstResults(node, properties, onlyOneString) {
@ -4052,7 +4070,20 @@ function doImport() {
REPLACE INTO "translators" VALUES ('32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7', '2006-06-30 15:36:00', 3, 'RIS', 'Simon Kornblith', 'ris',
'Scholar.configure("dataMode", "line");
Scholar.addOption("exportNotes", true);',
Scholar.addOption("exportNotes", true);
function detectImport() {
var line
while(line = Scholar.read()) {
if(line.replace(/\s/g, "") != "") {
if(line.substr(0, 6) == "TY - ") {
return true;
} else {
return false;
}
}
}
}',
'var itemsWithYears = ["book", "bookSection", "thesis", "film"];
var fieldMap = {
@ -4325,7 +4356,13 @@ function doExport() {
}');
REPLACE INTO "translators" VALUES ('a6ee60df-1ddc-4aae-bb25-45e0537be973', '2006-07-16 17:18:00', 1, 'MARC', 'Simon Kornblith', 'marc',
NULL,
'function detectImport() {
var marcRecordRegexp = /^[0-9]{5}[a-z ]{3}$/
var read = Scholar.read(8);
if(marcRecordRegexp.test(read)) {
return true;
}
}',
'/*
* Original version of MARC record library copyright (C) 2005 Stefano Bargioni,
* licensed under the LGPL
@ -4801,7 +4838,6 @@ function doImport(url) { // the URL is actually here for other translators
while(text = Scholar.read(4096)) { // read in 4096 byte increments
var records = text.split("\x1D");
Scholar.Utilities.debugPrint(records);
if(records.length > 1) {
records[0] = holdOver + records[0];