Closes #457: unAPI translator. There are currently some problems with recognizing the document types of foreign MODS documents, but I hope to resolve these soon.

2007-03-19 22:29:49 +00:00 · 2007-03-19 22:29:49 +00:00 · c3bd2579cc
commit c3bd2579cc
parent 172cdb4cec
2 changed files with 242 additions and 5 deletions
--- a/chrome/content/zotero/xpcom/translate.js
+++ b/chrome/content/zotero/xpcom/translate.js
@ -537,8 +537,6 @@ Zotero.Translate.prototype._loadTranslator = function() {
 * does the actual translation
 */
 Zotero.Translate.prototype.translate = function() {
-	Zotero.debug("translate called");
-	
 	/*
 	 * initialize properties
 	 */
@ -2164,7 +2162,7 @@ Zotero.Translate.TranslatorSearch.prototype.complete = function(returnValue, err
 	
 	this.currentTranslator = undefined;
 	this.asyncMode = false;
-	
+		
 	// resume execution
 	this.execute();
 }
--- a/scrapers.sql
+++ b/scrapers.sql
@ -1,4 +1,4 @@
-- 181
+-- 182

 --  ***** BEGIN LICENSE BLOCK *****
 --  
@ -22,7 +22,7 @@


 -- Set the following timestamp to the most recent scraper update date
-REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-03-16 23:28:15'));
+REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-03-19 22:20:40'));

 REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b3.r1', '', '2006-12-15 03:40:00', 1, 100, 4, 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon', 
 'function detectWeb(doc, url) {
@ -3155,6 +3155,245 @@ function doWeb(doc, url) {
 	}
 }');

+REPLACE INTO translators VALUES ('e7e01cac-1e37-4da6-b078-a0e8343b0e98', '1.0.0b4r1', '', '2007-03-19 22:20:40', '1', '90', '4', 'unAPI', 'Simon Kornblith', '', 
+'var RECOGNIZABLE_FORMATS = ["mods", "marc", "endnote", "ris", "bibtex", "rdf"];
+// The array above is read in order by checkFormats(), which returns the first entry
+// the server offers, so earlier entries take precedence over later ones.
+
+// Maps each recognizable format key to the GUID passed to translator.setTranslator()
+// in retrieveItem().
+var FORMAT_GUIDS = {
+	"mods":"0e2235e7-babf-413c-9acf-f27cce5f059c",
+	"marc":"a6ee60df-1ddc-4aae-bb25-45e0537be973",
+	"endnote":"881f60f2-0802-411a-9228-ce5f47b64c7d",
+	"ris":"32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7",
+	"bibtex":"9cb70025-a888-4a29-a210-93ec52da40d4",
+	"rdf":"5e3ad958-ac79-463d-812b-a86a9235c28f"
+};
+
+// State shared between the functions of this translator:
+//   unAPIResolver   - href of the <link rel="unapi-server"> element
+//   unsearchedIds   - titles of unapi-id abbrs whose formats have not been requested yet
+//   foundIds        - ids for which a usable format was found
+//   foundItems      - items retrieved so far
+//   foundFormat     - format key(s) for foundIds; an array with one entry per id, or a
+//                     single string when the resolver answered for all ids at once
+//   foundFormatName - the server-side name(s) of those formats, same shape as foundFormat
+var unAPIResolver, unsearchedIds, foundIds, foundItems, foundFormat, foundFormatName;
+
+function detectWeb(doc, url) {
+	// initialize variables
+	unsearchedIds = [];
+	foundIds = [];
+	foundItems = [];
+	foundFormat = [];
+	foundFormatName = [];
+	
+	var nsResolver = doc.createNSResolver(doc.documentElement);
+	
+	// look for a resolver
+	unAPIResolver = doc.evaluate(''//link[@rel="unapi-server"]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+	if(!unAPIResolver) return false;
+	unAPIResolver = unAPIResolver.getAttribute("href");
+	
+	// look for abbrs
+	var abbrs = doc.getElementsByTagName("abbr");
+	for each(var abbr in abbrs) {
+		if(abbr.getAttribute) {
+			if(abbr.getAttribute("class").split(" ").indexOf("unapi-id") != -1 && abbr.getAttribute("title")) {
+				// found an abbr
+				unsearchedIds.push(abbr.getAttribute("title"));
+			}
+		}
+	}
+	
+	if(!unsearchedIds.length) return false;
+	
+	// now we need to see if the server actually gives us bibliographic metadata.
+	
+	// one way to signal this is with a META tag
+	var zoteroMeta = doc.evaluate(''//meta[@name="ZoteroItemType"]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+	if(zoteroMeta) return zoteroMeta.getAttribute("content");
+	
+	// otherwise, things will be a bit more complicated, and we''ll have to do some HTTP requests
+	Zotero.wait();
+	
+	if(unsearchedIds.length == 1) {
+		// if there''s only one abbr tag, we should go ahead and retrieve types for it
+		getItemType();
+	} else {
+		// if there''s more than one, we should first see if the resolver gives metadata for all of them
+		Zotero.Utilities.HTTP.doGet(unAPIResolver, function(text) {
+			var format = checkFormats(text);
+			if(format) {
+				// move unsearchedIds to foundIds
+				foundIds = unsearchedIds;
+				unsearchedIds = [];
+				// save format and formatName
+				foundFormat = format[0];
+				foundFormatName = format[1];
+				
+				Zotero.done("multiple");
+			} else {
+				getItemType();
+			}
+		});
+	}
+}
+
+function getItemType() {
+	// if there are no items left to search, use the only item''s type (if there is one) or give up
+	if(!unsearchedIds.length) {
+		if(foundIds.length) {
+			getOnlyItem();
+		} else {
+			Zotero.done(false);
+		}
+		return;
+	}
+	
+	var id = unsearchedIds.shift();
+	Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+id, function(text) {
+		var format = checkFormats(text);
+		if(format) {
+			// save data
+			foundIds.push(id);
+			foundFormat.push(format[0]);
+			foundFormatName.push(format[1]);
+			
+			if(foundIds.length == 2) {
+				// this is our second; use multiple
+				Zotero.done("multiple");
+				return;
+			}
+		}
+		
+		// keep going
+		getItemType();
+	});
+}
+
+/**
+ * Picks the most preferred recognizable format from an unAPI formats response.
+ *
+ * @param text  the response body, expected to be XML containing <format> elements
+ * @return      a two-element array [format key, name of the format as given by the
+ *              server], or false if none of the offered formats is recognized
+ */
+function checkFormats(text) {
+	// strip the DOCTYPE and the XML declaration before handing the text to the XML
+	// constructor (presumably because the constructor cannot parse them)
+	text = text.replace(/<!DOCTYPE[^>]*>/, "").replace(/<\?xml[^>]*\?>/, "");
+	var xml = new XML(text);
+	
+	// local map from format key to server-side name; note that this shadows the
+	// shared foundFormat variable inside this function
+	var foundFormat = new Object();
+	
+	// this is such an ugly, disgusting hack, and I hate how Mozilla decided to neuter an ECMA standard
+	for each(var format in xml.format) {
+		var name = format.@name.toString();
+		var lowerName = name.toLowerCase();
+		
+		// mods, marc and ris keep the first name seen unless a later one looks like a
+		// better variant ("full" for mods, "utf8" for marc and ris); for the other
+		// formats the last matching name wins
+		if(format.@namespace_uri == "http://www.loc.gov/mods/v3" || lowerName == "mods" || format.@docs == "http://www.loc.gov/standards/mods/") {
+			if(!foundFormat["mods"] || lowerName.indexOf("full") != -1) {
+				foundFormat["mods"] = name;
+			}
+		} else if(lowerName.match(/^marc\b/)) {
+			if(!foundFormat["marc"] || lowerName.indexOf("utf8") != -1) {
+				foundFormat["marc"] = name;
+			}
+		} else if(lowerName == "rdf_dc") {
+			foundFormat["rdf"] = name;
+		} else if(format.@docs.text() == "http://www.refman.com/support/risformat_intro.asp" || lowerName.match(/^ris\b/)) {
+			if(!foundFormat["ris"] || lowerName.indexOf("utf8") != -1) {
+				foundFormat["ris"] = name;
+			}
+		} else if(lowerName == "bibtex") {
+			foundFormat["bibtex"] = name;
+		} else if(lowerName == "endnote") {
+			foundFormat["endnote"] = name;
+		}
+	}
+	
+	// loop through again, this time respecting preferences
+	for each(var format in RECOGNIZABLE_FORMATS) {
+		if(foundFormat[format]) return [format, foundFormat[format]];
+	}
+	
+	return false;
+}
+
+function getOnlyItem() {
+	// retrieve the only item
+	retrieveItem(foundIds[0], foundFormat[0], foundFormatName[0], function(obj, item) {
+		foundItems.push(item);
+		Zotero.done(item.itemType);
+	});
+}
+
+function retrieveItem(id, format, formatName, callback) {
+	// retrieve URL
+	Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+id+"&format="+formatName, function(text) {
+		var translator = Zotero.loadTranslator("import");
+		translator.setTranslator(FORMAT_GUIDS[format]);
+		translator.setString(text);
+		translator.setHandler("itemDone", callback);
+		translator.translate();
+	});
+}', 
+'/**
+ * Get formats and names for all usable ids; when done, get all items
+ **/
+function getAllIds() {
+	if(!unsearchedIds.length) {
+		// once all ids have been gotten, get all items
+		getAllItems();
+		return;
+	}
+	
+	var id = unsearchedIds.shift();
+	Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+id, function(text) {
+		var format = checkFormats(text);
+		if(format) {
+			// save data
+			foundIds.push(id);
+			foundFormat.push(format[0]);
+			foundFormatName.push(format[1]);
+		}
+		
+		// keep going
+		getAllIds();
+	});
+}
+
+/**
+ * Get all items; when done, show selectItems or scrape
+ **/
+function getAllItems() {
+	if(foundItems.length == foundIds.length) {
+		if(foundItems.length == 1) {
+			// if only one item, send complete()
+			foundItems[0].complete();
+		} else if(foundItems.length > 0) {
+			// if multiple items, show selectItems
+			var itemTitles = [];
+			for(var i in foundItems) {
+				itemTitles[i] = foundItems[i].title;
+			}
+			
+			var chosenItems = Zotero.selectItems(itemTitles);
+			if(!chosenItems) Zotero.done(true);
+			
+			for(var i in chosenItems) {
+				foundItems[i].complete();
+			}
+		}
+		
+		Zotero.done();
+		return;
+	}
+	
+	var id = foundIds[foundItems.length];
+	// foundFormat can be either a string or an array
+	if(typeof(foundFormat) == "string") {
+		var format = foundFormat;
+		var formatName = foundFormatName;
+	} else {
+		var format = foundFormat[foundItems.length];
+		var formatName = foundFormatName[foundItems.length];
+	}
+	
+	// get item
+	retrieveItem(id, format, formatName, function(obj, item) {
+		foundItems.push(item);
+		getAllItems();
+	});
+}
+
+/**
+ * Entry point for scraping: switches to asynchronous mode and starts resolving the
+ * remaining ids. The work continues in getAllIds() and getAllItems(), which call
+ * Zotero.done() when finished.
+ **/
+function doWeb() {
+	// the requests below complete asynchronously, so tell Zotero to wait for done()
+	Zotero.wait();
+	
+	// retrieve data for all ids
+	getAllIds();
+}');
+
 REPLACE INTO translators VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '1.0.0b3.r1', '', '2007-01-07 17:00:00', 1, 100, 4, 'Google Books', 'Simon Kornblith', '^http://books\.google\.[a-z]+/books\?(.*vid=.*\&id=.*|.*q=.*)',
 'function detectWeb(doc, url) {
 	var re = new RegExp(''^http://books\\.google\\.[a-z]+/books\\?vid=([^&]+).*\\&id=([^&]+)'', ''i'');