Rewritten unAPI translator

2011-04-19 20:00:26 +00:00 · 2011-04-19 20:00:26 +00:00 · 3e30f21ede
commit 3e30f21ede
parent 5951768ada
1 changed files with 277 additions and 203 deletions
--- a/translators/unAPI.js
+++ b/translators/unAPI.js
@ -4,257 +4,331 @@
 	"label":"unAPI",
 	"creator":"Simon Kornblith",
 	"target":null,
-	"minVersion":"1.0.0b4.r1",
+	"minVersion":"2.1",
 	"maxVersion":"",
 	"priority":200,
 	"inRepository":true,
 	"detectXPath":"//link[@rel='unapi-server']",
-	"lastUpdated":"2010-09-23 04:19:20"
+	"lastUpdated":"2011-04-19 19:40:07"
 }

-var RECOGNIZABLE_FORMATS = ["mods", "marc", "endnote", "ris", "bibtex", "rdf"];
+var RECOGNIZABLE_FORMATS = ["rdf_zotero", "rdf_bibliontology", "mods", "marc", "unimarc", "ris",
+	"refer", "bibtex", "rdf_dc"];
 var FORMAT_GUIDS = {
+	"rdf_zotero":"5e3ad958-ac79-463d-812b-a86a9235c28f",
+	"rdf_bibliontology":"14763d25-8ba0-45df-8f52-b8d1108e7ac9",
 	"mods":"0e2235e7-babf-413c-9acf-f27cce5f059c",
 	"marc":"a6ee60df-1ddc-4aae-bb25-45e0537be973",
-	"endnote":"881f60f2-0802-411a-9228-ce5f47b64c7d",
+	"unimarc":"a6ee60df-1ddc-4aae-bb25-45e0537be973",
 	"ris":"32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7",
+	"refer":"881f60f2-0802-411a-9228-ce5f47b64c7d",
 	"bibtex":"9cb70025-a888-4a29-a210-93ec52da40d4",
-	"rdf":"5e3ad958-ac79-463d-812b-a86a9235c28f"
+	"rdf_dc":"5e3ad958-ac79-463d-812b-a86a9235c28f"
 };

-var unAPIResolver, unsearchedIds, foundIds, foundItems, foundFormat, foundFormatName, domain;
+var unAPIResolver = false;
+var defaultFormat, unAPIIDs;

-function detectWeb(doc, url) {
-	// initialize variables
-	unsearchedIds = [];
-	foundIds = [];
-	foundItems = [];
-	foundFormat = [];
-	foundFormatName = [];
+/**
+ * A class to describe an unAPI format description
+ * @property isSupported {Boolean} Whether Zotero supports a format contained in this description
+ * @property name {String} The unAPI format name, used to retrieve item descriptions
+ * @property translatorID {String} The ID of the translator used to read this format
+ *
+ * @constructor
+ * @param {String} aXML unAPI format description XML
+ */
+UnAPIFormat = function(aXML) {
+	var parser = new DOMParser();
+	var doc = parser.parseFromString(aXML.replace(/<!DOCTYPE[^>]*>/, "").replace(/<\?xml[^>]*\?>/, ""), "text/xml");
 	
-	// Set the domain we're scraping
-	domain = doc.location.href.match(/https?:\/\/([^/]+)/);
+	var foundFormat = new Object();
 	
-	// This and the x: prefix in the XPath are to work around an issue with pages
-	// served as application/xhtml+xml
-	//
-	// https://developer.mozilla.org/en/Introduction_to_using_XPath_in_JavaScript#Implementing_a_default_namespace_for_XML_documents
-	function nsResolver() {
-		return 'http://www.w3.org/1999/xhtml';
+	// Loop through to determine format name
+	var nodes = doc.documentElement.getElementsByTagName("format");
+	var nNodes = nodes.length;
+	var node, name, lowerName, format;
+	for(var i=0; i<nNodes; i++) {
+		node = nodes[i];
+		name = node.getAttribute("name");
+		lowerName = name.toLowerCase();
+		format = false;
+		
+		// Look for formats we can recognize
+		if(["rdf_zotero", "rdf_bibliontology", "bibtex", "endnote", "rdf_dc"].indexOf(lowerName) != -1) {
+			format = lowerName;
+		} else if(lowerName == "rdf_bibliontology") {
+			format = "rdf_bibliontology";
+		} else if(lowerName === "mods"
+				|| node.getAttribute("namespace_uri") === "http://www.loc.gov/mods/v3"
+				|| node.getAttribute("docs") === "http://www.loc.gov/standards/mods/"
+				|| node.getAttribute("type") === "application/mods+xml") {
+			format = "mods";
+		} else if(lowerName.match(/^marc\b/)
+				|| node.getAttribute("type") === "application/marc") {
+			format = "marc";
+		} else if(lowerName.match(/^unimarc\b/)
+				|| node.getAttribute("type") === "application/unimarc") {
+			format = "unimarc";
+		} else if(node.getAttribute("docs") == "http://www.refman.com/support/risformat_intro.asp"
+				|| lowerName.match(/^ris\b/)) {
+			format = "ris";
+		}
+		
+		if(format) foundFormat[format] = name;
 	}
 	
-	// look for a resolver
-	unAPIResolver = doc.evaluate('//x:link[@rel="unapi-server"]', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
-	if(!unAPIResolver) return false;
-	unAPIResolver = unAPIResolver.getAttribute("href");
-	
-	// look for abbrs
-	var abbrs = doc.getElementsByTagName("abbr");
-	for each(var abbr in abbrs) {
-		if(abbr.getAttribute && abbr.getAttribute("class") &&
-		   abbr.getAttribute("class").split(" ").indexOf("unapi-id") != -1 && abbr.getAttribute("title")) {
-			// found an abbr
-			unsearchedIds.push(escape(abbr.getAttribute("title")));
+	// Loop through again to determine optimal supported format
+	for(var i=0; i<RECOGNIZABLE_FORMATS.length; i++) {
+		if(foundFormat[RECOGNIZABLE_FORMATS[i]]) {
+			this.isSupported = true;
+			this.name = foundFormat[RECOGNIZABLE_FORMATS[i]];
+			this.translatorID = FORMAT_GUIDS[RECOGNIZABLE_FORMATS[i]];
+			return;
 		}
 	}
-	if(!unsearchedIds.length) return false;
 	
-	// now we need to see if the server actually gives us bibliographic metadata.
-	Zotero.wait();
+	this.isSupported = false;
+}

-	if(unsearchedIds.length == 1) {
-		// if there's only one abbr tag, we should go ahead and retrieve types for it
-		getItemType();
-	} else {
-		// if there's more than one, we should first see if the resolver gives metadata for all of them
-		Zotero.Utilities.HTTP.doGet(unAPIResolver, function(text) {
-			var format = checkFormats(text);
-			if(format) {
-				// move unsearchedIds to foundIds
-				foundIds = unsearchedIds;
-				unsearchedIds = [];
-				// save format and formatName
-				foundFormat = format[0];
-				foundFormatName = format[1];
+/**
+ * A class encapsulating an UnAPI ID
+ * @property format {UnAPIFormat} Information regarding the format
+ * @property items {Zotero.Item[]} Items corresponding to this ID
+ *
+ * @constructor
+ * @param {String} id The ID contained in an abbr tag
+ */
+UnAPIID = function(id) {
+	this.id = id;
+	unAPIIDs[id] = this;
+}

-				Zotero.done("multiple");
+UnAPIID.prototype = {
+	/**
+	 * Gets the item type for this item
+	 * @param {Function} callback Callback to be passed itemType when it is known
+	 */
+	"getItemType":function(callback) {
+		var me = this;
+		this.getItems(function(items) {
+			if(items.length === 0) {
+				callback(false);
+			} else if(items.length === 1) {
+				callback(items[0].itemType);
 			} else {
-				getItemType();
+				callback("multiple");
+			}
+		});
+	},
+	
+	/**
+	 * Gets items associated with this ID
+	 * @param {Function} callback Callback to be passed items when they have been retrieved
+	 */
+	"getItems":function(callback) {
+		if(this.items) {
+			callback(me.items);
+			return;
+		}
+		
+		var me = this;
+		this.items = [];
+		this.isSupported(function(isSupported) {
+			if(!isSupported) {
+				callback([]);
+				return;
+			}
+			
+			Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+me.id+"&format="+me.format.name, function(text) {
+				var translator = Zotero.loadTranslator("import");
+				translator.setTranslator(me.format.translatorID);
+				translator.setString(text);
+				translator.setHandler("itemDone", function(obj, item) {
+					// add item to array
+					me.items.push(item);
+				});
+				translator.setHandler("done", function(obj) {
+					// run callback on item array
+					callback(me.items);
+				});
+				translator.translate();
+			});
+		});
+	},
+	
+	/**
+	 * Determines whether Zotero can handle this ID
+	 * @param {Function} callback Callback to be passed isSupported when it is known
+	 */
+	"isSupported":function(callback) {
+		if(this.hasOwnProperty("format")) {
+			callback(me.format.isSupported);
+			return;
+		}
+		
+		var me = this;
+		
+		getDefaultFormat(function() {
+			// first try default format, since this won't require >1 HTTP request
+			if(defaultFormat.isSupported) {
+				me.format = defaultFormat;
+				callback(true);
+			} else {
+				// if no supported default format, try format for this item
+				Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+me.id, function(text) {
+					me.format = UnAPIFormat(text);
+					callback(!!me.format.isSupported);
+				});
 			}
 		});
 	}
 }

-function getItemType() {
-	// if there are no items left to search, use the only item's type (if there is one) or give up
-	if(!unsearchedIds.length) {
-		if(foundIds.length) {
-			getOnlyItem();
+/**
+ * This and the x: prefix in the XPath are to work around an issue with pages
+ * served as application/xhtml+xml
+ *
+ * https://developer.mozilla.org/en/Introduction_to_using_XPath_in_JavaScript#Implementing_a_default_namespace_for_XML_documents
+ */
+function nsResolver() {
+	return 'http://www.w3.org/1999/xhtml';
+}
+
+/**
+ * Extracts UnAPIIDs from a document
+ * @param {document} A document object from which to extract unAPIIds
+ * @return {UnAPIID[]} The unAPI ID objects extracted from the document
+ */
+function getUnAPIIDs(doc) {
+	// look for a resolver
+	var newUnAPIResolver = doc.evaluate('//x:link[@rel="unapi-server"]', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+	if(!newUnAPIResolver) return [];
+	newUnAPIResolver = newUnAPIResolver.getAttribute("href");
+	if(unAPIResolver !== newUnAPIResolver) {
+		// if unAPI resolver has changed, clear
+		defaultFormat = false;
+		unAPIResolver = newUnAPIResolver;
+		unAPIIDs = [];
+	}
+	
+	// look for abbrs
+	var abbrs = doc.evaluate('//x:abbr[contains(@class, " unapi-id") or contains(@class, "unapi-id ") or @class="unapi-id"][@title]',
+		doc, nsResolver, XPathResult.ANY_TYPE, null);
+	var abbr;
+	var ids = [];
+	while(abbr = abbrs.iterateNext()) {
+		var id = abbr.getAttribute("title");
+		ids.push(unAPIIDs[id] ? unAPIIDs[id] : new UnAPIID(id));
+	}
+	
+	return ids;
+}
+
+
+/**
+ * Retrieves the list of formats available for all items accessible through this unAPI resolver
+ * @param {Function} callback A callback to be passed the format when it is available
+ */
+function getDefaultFormat(callback) {
+	if(defaultFormat) {
+		callback(defaultFormat);
+	} else {
+		Zotero.Utilities.HTTP.doGet(unAPIResolver, function(text) {
+			// determine format of this item
+			defaultFormat = new UnAPIFormat(text);
+			callback(defaultFormat);
+		});
+	}
+}
+/**
+ * Determines itemType for detection
+ */
+function determineDetectItemType(ids, supportedId) {
+	var id = ids.shift();
+	id.isSupported(function(isSupported) {
+		if(isSupported && supportedId !== undefined) {
+			// If there are multiple items with valid itemTypes, use "multiple"
+			Zotero.done("multiple");
+		} else if(ids.length) {
+			// If IDs remain to be handled, handle the next one
+			determineDetectItemType(ids, (isSupported ? id : supportedId));
 		} else {
-			Zotero.done(false);
+			// If all IDs have been handled, get foundItemType for only supported ID
+			supportedId.getItemType(Zotero.done);
 		}
-		return;
-	}
-	
-	var id = unsearchedIds.shift();
-	Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+id, function(text) {
-		var format = checkFormats(text);
-		if(format) {
-			// save data
-			foundIds.push(id);
-			foundFormat.push(format[0]);
-			foundFormatName.push(format[1]);
-			
-			if(foundIds.length == 2) {
-				// this is our second; use multiple
-				Zotero.done("multiple");
-				return;
-			}
-		}
-		
-		// keep going
-		getItemType();
-	});
-}
-
-function checkFormats(text) {
-	text = text.replace(/<!DOCTYPE[^>]*>/, "").replace(/<\?xml[^>]*\?>/, "");
-	var xml = new XML(text);
-	
-	var foundFormat = new Object();
-	
-	// this is such an ugly, disgusting hack, and I hate how Mozilla decided to neuter an ECMA standard
-	for each(var format in xml.format) {
-		var name = format.@name.toString();
-		var lowerName = name.toLowerCase();
-		
-		if(format.@namespace_uri == "http://www.loc.gov/mods/v3" || lowerName == "mods" || format.@docs == "http://www.loc.gov/standards/mods/") {
-			if(!foundFormat["mods"] || lowerName.indexOf("full") != -1) {
-				foundFormat["mods"] = escape(name);
-			}
-		} else if(lowerName.match(/^marc\b/)) {
-			if(!foundFormat["marc"] || lowerName.indexOf("utf8") != -1) {
-				foundFormat["marc"] = escape(name);
-			}
-		} else if(lowerName == "rdf_dc") {
-			foundFormat["rdf"] = escape(name);
-		} else if(format.@docs.text() == "http://www.refman.com/support/risformat_intro.asp" || lowerName.match(/^ris\b/)) {
-			if(!foundFormat["ris"] || lowerName.indexOf("utf8") != -1) {
-				foundFormat["ris"] = escape(name);
-			}
-		} else if(lowerName == "bibtex") {
-			foundFormat["bibtex"] = escape(name);
-		} else if(lowerName == "endnote") {
-			foundFormat["endnote"] = escape(name);
-		}
-	}
-	
-	// loop through again, this time respecting preferences
-	for each(var format in RECOGNIZABLE_FORMATS) {
-		if(foundFormat[format]) return [format, foundFormat[format]];
-	}
-	
-	return false;
-}
-
-function getOnlyItem() {
-	// retrieve the only item
-	retrieveItem(foundIds[0], foundFormat[0], foundFormatName[0], function(obj, item) {
-		foundItems.push(item);
-		Zotero.done(item.itemType);
-	});
-}
-
-function retrieveItem(id, format, formatName, callback) {
-	// retrieve URL
-	Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+id+"&format="+formatName, function(text) {
-		var translator = Zotero.loadTranslator("import");
-		translator.setTranslator(FORMAT_GUIDS[format]);
-		translator.setString(text);
-		translator.setHandler("itemDone", callback);
-		translator.translate();
 	});
 }

 /**
- * Get formats and names for all usable ids; when done, get all items
+ * Get all items
+ * @param {UnAPIID[]} ids List of UnAPI IDs
+ * @param {Function} callback Function to pass item array to when items have been retrieved
+ * @param {Zotero.Item[]} items Item array; used for recursive calls
 **/
-function getAllIds() {
-	if(!unsearchedIds.length) {
-		// once all ids have been gotten, get all items
-		getAllItems();
-		return;
-	}
+function getAllItems(ids, callback, items) {
+	var id = ids.shift();
+	id.getItems(function(retrievedItems) {
+		var collectedItems = (items ? items.concat(retrievedItems) : retrievedItems);
 		
-	var id = unsearchedIds.shift();
-	Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+id, function(text) {
-		var format = checkFormats(text);
-		if(format) {
-			// save data
-			foundIds.push(id);
-			foundFormat.push(format[0]);
-			foundFormatName.push(format[1]);
+		if(ids.length) {
+			getAllItems(ids, callback, collectedItems);
+		} else {
+			callback(collectedItems);
 		}
-		
-		// keep going
-		getAllIds();
 	});
 }

-/**
- * Get all items; when done, show selectItems or scrape
- **/
-function getAllItems() {
-	if(foundItems.length == foundIds.length) {
-		if(foundItems.length == 1) {
-			// Set the item Repository to the domain
-			foundItems[0].repository = domain[1];
-			// if only one item, send complete()
-			foundItems[0].complete();
-		} else if(foundItems.length > 0) {
-			// if multiple items, show selectItems
+function detectWeb(doc, url) {
+	// get unAPI IDs
+	var ids = getUnAPIIDs(doc);
+	if(!ids.length) return false;
+	
+	// now we need to see if the server actually gives us bibliographic metadata, and determine the
+	// type
+	Zotero.wait();
+	
+	if(!ids.length === 1) {
+		// Only one item, so we will just get its item type
+		ids[0].getItemType(Zotero.done);
+	} else {
+		// Several items. We will need to call determineDetectItemType
+		determineDetectItemType(ids);
+	}
+}
+
+function doWeb(doc, url) {
+	var ids = getUnAPIIDs(doc);
+	
+	Zotero.wait();
+	
+	getAllItems(ids, function(items) {
+		// get the domain we're scraping, so we can use it for libraryCatalog
+		domain = doc.location.href.match(/https?:\/\/([^/]+)/);
+		
+		if(items.length == 1) {
+			// If only one item, just complete it
+			items[0].libraryCatalog = domain[1];
+			items[0].complete();
+		} else if(items.length > 0) {
+			// If multiple items, extract their titles
 			var itemTitles = [];
-			for(var i in foundItems) {
-				itemTitles[i] = foundItems[i].title;
+			for(var i in items) {
+				itemTitles[i] = items[i].title;
 			}
 			
+			// Show item selection dialog
 			var chosenItems = Zotero.selectItems(itemTitles);
 			if(!chosenItems) Zotero.done(true);
 			
+			// Complete items
 			for(var i in chosenItems) {
-				// Set the item Repository to the domain
-				foundItems[i].repository = domain[1];
-				foundItems[i].complete();
+				items[i].libraryCatalog = domain[1];
+				items[i].complete();
 			}
 		}
 		
-		// reset items
-		foundItems = [];
-		
 		Zotero.done();
 		return;
-	}
-	
-	var id = foundIds[foundItems.length];
-	// foundFormat can be either a string or an array
-	if(typeof(foundFormat) == "string") {
-		var format = foundFormat;
-		var formatName = foundFormatName;
-	} else {
-		var format = foundFormat[foundItems.length];
-		var formatName = foundFormatName[foundItems.length];
-	}
-	
-	// get item
-	retrieveItem(id, format, formatName, function(obj, item) {
-		foundItems.push(item);
-		getAllItems();
 	});
 }
-
-function doWeb() {
-	Zotero.wait();
-	
-	// retrieve data for all ids
-	getAllIds();
-}