Rewritten unAPI translator
This commit is contained in:
parent
5951768ada
commit
3e30f21ede
|
@ -4,257 +4,331 @@
|
|||
"label":"unAPI",
|
||||
"creator":"Simon Kornblith",
|
||||
"target":null,
|
||||
"minVersion":"1.0.0b4.r1",
|
||||
"minVersion":"2.1",
|
||||
"maxVersion":"",
|
||||
"priority":200,
|
||||
"inRepository":true,
|
||||
"detectXPath":"//link[@rel='unapi-server']",
|
||||
"lastUpdated":"2010-09-23 04:19:20"
|
||||
"lastUpdated":"2011-04-19 19:40:07"
|
||||
}
|
||||
|
||||
var RECOGNIZABLE_FORMATS = ["mods", "marc", "endnote", "ris", "bibtex", "rdf"];
|
||||
var RECOGNIZABLE_FORMATS = ["rdf_zotero", "rdf_bibliontology", "mods", "marc", "unimarc", "ris",
|
||||
"refer", "bibtex", "rdf_dc"];
|
||||
var FORMAT_GUIDS = {
|
||||
"rdf_zotero":"5e3ad958-ac79-463d-812b-a86a9235c28f",
|
||||
"rdf_bibliontology":"14763d25-8ba0-45df-8f52-b8d1108e7ac9",
|
||||
"mods":"0e2235e7-babf-413c-9acf-f27cce5f059c",
|
||||
"marc":"a6ee60df-1ddc-4aae-bb25-45e0537be973",
|
||||
"endnote":"881f60f2-0802-411a-9228-ce5f47b64c7d",
|
||||
"unimarc":"a6ee60df-1ddc-4aae-bb25-45e0537be973",
|
||||
"ris":"32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7",
|
||||
"refer":"881f60f2-0802-411a-9228-ce5f47b64c7d",
|
||||
"bibtex":"9cb70025-a888-4a29-a210-93ec52da40d4",
|
||||
"rdf":"5e3ad958-ac79-463d-812b-a86a9235c28f"
|
||||
"rdf_dc":"5e3ad958-ac79-463d-812b-a86a9235c28f"
|
||||
};
|
||||
|
||||
var unAPIResolver, unsearchedIds, foundIds, foundItems, foundFormat, foundFormatName, domain;
|
||||
var unAPIResolver = false;
|
||||
var defaultFormat, unAPIIDs;
|
||||
|
||||
function detectWeb(doc, url) {
|
||||
// initialize variables
|
||||
unsearchedIds = [];
|
||||
foundIds = [];
|
||||
foundItems = [];
|
||||
foundFormat = [];
|
||||
foundFormatName = [];
|
||||
/**
|
||||
* A class to describe an unAPI format description
|
||||
* @property isSupported {Boolean} Whether Zotero supports a format contained in this description
|
||||
* @property name {String} The unAPI format name, used to retrieve item descriptions
|
||||
* @property translatorID {String} The ID of the translator used to read this format
|
||||
*
|
||||
* @constructor
|
||||
* @param {String} aXML unAPI format description XML
|
||||
*/
|
||||
UnAPIFormat = function(aXML) {
|
||||
var parser = new DOMParser();
|
||||
var doc = parser.parseFromString(aXML.replace(/<!DOCTYPE[^>]*>/, "").replace(/<\?xml[^>]*\?>/, ""), "text/xml");
|
||||
|
||||
// Set the domain we're scraping
|
||||
domain = doc.location.href.match(/https?:\/\/([^/]+)/);
|
||||
var foundFormat = new Object();
|
||||
|
||||
// This and the x: prefix in the XPath are to work around an issue with pages
|
||||
// served as application/xhtml+xml
|
||||
//
|
||||
// https://developer.mozilla.org/en/Introduction_to_using_XPath_in_JavaScript#Implementing_a_default_namespace_for_XML_documents
|
||||
function nsResolver() {
|
||||
return 'http://www.w3.org/1999/xhtml';
|
||||
// Loop through to determine format name
|
||||
var nodes = doc.documentElement.getElementsByTagName("format");
|
||||
var nNodes = nodes.length;
|
||||
var node, name, lowerName, format;
|
||||
for(var i=0; i<nNodes; i++) {
|
||||
node = nodes[i];
|
||||
name = node.getAttribute("name");
|
||||
lowerName = name.toLowerCase();
|
||||
format = false;
|
||||
|
||||
// Look for formats we can recognize
|
||||
if(["rdf_zotero", "rdf_bibliontology", "bibtex", "endnote", "rdf_dc"].indexOf(lowerName) != -1) {
|
||||
format = lowerName;
|
||||
} else if(lowerName == "rdf_bibliontology") {
|
||||
format = "rdf_bibliontology";
|
||||
} else if(lowerName === "mods"
|
||||
|| node.getAttribute("namespace_uri") === "http://www.loc.gov/mods/v3"
|
||||
|| node.getAttribute("docs") === "http://www.loc.gov/standards/mods/"
|
||||
|| node.getAttribute("type") === "application/mods+xml") {
|
||||
format = "mods";
|
||||
} else if(lowerName.match(/^marc\b/)
|
||||
|| node.getAttribute("type") === "application/marc") {
|
||||
format = "marc";
|
||||
} else if(lowerName.match(/^unimarc\b/)
|
||||
|| node.getAttribute("type") === "application/unimarc") {
|
||||
format = "unimarc";
|
||||
} else if(node.getAttribute("docs") == "http://www.refman.com/support/risformat_intro.asp"
|
||||
|| lowerName.match(/^ris\b/)) {
|
||||
format = "ris";
|
||||
}
|
||||
|
||||
if(format) foundFormat[format] = name;
|
||||
}
|
||||
|
||||
// look for a resolver
|
||||
unAPIResolver = doc.evaluate('//x:link[@rel="unapi-server"]', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
||||
if(!unAPIResolver) return false;
|
||||
unAPIResolver = unAPIResolver.getAttribute("href");
|
||||
|
||||
// look for abbrs
|
||||
var abbrs = doc.getElementsByTagName("abbr");
|
||||
for each(var abbr in abbrs) {
|
||||
if(abbr.getAttribute && abbr.getAttribute("class") &&
|
||||
abbr.getAttribute("class").split(" ").indexOf("unapi-id") != -1 && abbr.getAttribute("title")) {
|
||||
// found an abbr
|
||||
unsearchedIds.push(escape(abbr.getAttribute("title")));
|
||||
// Loop through again to determine optimal supported format
|
||||
for(var i=0; i<RECOGNIZABLE_FORMATS.length; i++) {
|
||||
if(foundFormat[RECOGNIZABLE_FORMATS[i]]) {
|
||||
this.isSupported = true;
|
||||
this.name = foundFormat[RECOGNIZABLE_FORMATS[i]];
|
||||
this.translatorID = FORMAT_GUIDS[RECOGNIZABLE_FORMATS[i]];
|
||||
return;
|
||||
}
|
||||
}
|
||||
if(!unsearchedIds.length) return false;
|
||||
|
||||
// now we need to see if the server actually gives us bibliographic metadata.
|
||||
Zotero.wait();
|
||||
this.isSupported = false;
|
||||
}
|
||||
|
||||
if(unsearchedIds.length == 1) {
|
||||
// if there's only one abbr tag, we should go ahead and retrieve types for it
|
||||
getItemType();
|
||||
} else {
|
||||
// if there's more than one, we should first see if the resolver gives metadata for all of them
|
||||
Zotero.Utilities.HTTP.doGet(unAPIResolver, function(text) {
|
||||
var format = checkFormats(text);
|
||||
if(format) {
|
||||
// move unsearchedIds to foundIds
|
||||
foundIds = unsearchedIds;
|
||||
unsearchedIds = [];
|
||||
// save format and formatName
|
||||
foundFormat = format[0];
|
||||
foundFormatName = format[1];
|
||||
/**
 * A class encapsulating an unAPI ID
 *
 * Constructing an instance also registers it in the shared unAPIIDs lookup under its ID.
 *
 * @property id {String} The ID contained in an abbr tag
 * @property format {UnAPIFormat} Information regarding the format
 * @property items {Zotero.Item[]} Items corresponding to this ID
 *
 * @constructor
 * @param {String} id The ID contained in an abbr tag
 */
// Declared with var (like unAPIResolver/defaultFormat/unAPIIDs) so that the constructor is
// not created as an implicit global, which throws in strict-mode code.
var UnAPIID = function(id) {
	this.id = id;
	unAPIIDs[id] = this;
};
|
||||
|
||||
Zotero.done("multiple");
|
||||
UnAPIID.prototype = {
	/**
	 * Gets the item type for this item
	 * @param {Function} callback Callback to be passed itemType when it is known: false if no
	 *     items were retrieved, the item's itemType if exactly one item was retrieved, or
	 *     "multiple" if several were
	 */
	"getItemType":function(callback) {
		this.getItems(function(items) {
			if(items.length === 0) {
				callback(false);
			} else if(items.length === 1) {
				callback(items[0].itemType);
			} else {
				callback("multiple");
			}
		});
	},

	/**
	 * Gets items associated with this ID
	 *
	 * The result is cached on this.items, so the record is requested from the resolver at most
	 * once per successfully completed retrieval.
	 *
	 * @param {Function} callback Callback to be passed items when they have been retrieved. It
	 *     receives an empty array if no supported format is available for this ID.
	 */
	"getItems":function(callback) {
		// Capture `this` before the cache check; the cached branch needs it too.
		var me = this;
		if(me.items) {
			callback(me.items);
			return;
		}

		me.isSupported(function(isSupported) {
			if(!isSupported) {
				me.items = [];
				callback(me.items);
				return;
			}

			// IDs are frequently URIs and format names may contain spaces, so both have to be
			// encoded before being placed in the query string.
			var url = unAPIResolver+"?id="+encodeURIComponent(me.id)
				+"&format="+encodeURIComponent(me.format.name);
			Zotero.Utilities.HTTP.doGet(url, function(text) {
				// Collect into a local array and publish it only once translation is done, so
				// that a half-filled list is never handed out as the cached result.
				var items = [];
				var translator = Zotero.loadTranslator("import");
				translator.setTranslator(me.format.translatorID);
				translator.setString(text);
				translator.setHandler("itemDone", function(obj, item) {
					// add item to array
					items.push(item);
				});
				translator.setHandler("done", function(obj) {
					// cache, then run callback on item array
					me.items = items;
					callback(items);
				});
				translator.translate();
			});
		});
	},

	/**
	 * Determines whether Zotero can handle this ID
	 *
	 * The format that was found is cached on this.format.
	 *
	 * @param {Function} callback Callback to be passed isSupported when it is known
	 */
	"isSupported":function(callback) {
		// Capture `this` before the cache check; the cached branch needs it too.
		var me = this;
		if(me.hasOwnProperty("format")) {
			callback(me.format.isSupported);
			return;
		}

		getDefaultFormat(function() {
			// first try default format, since this won't require >1 HTTP request
			if(defaultFormat.isSupported) {
				me.format = defaultFormat;
				callback(true);
			} else {
				// if no supported default format, try format for this item
				Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+encodeURIComponent(me.id), function(text) {
					// UnAPIFormat is a constructor; without `new` it returns undefined
					me.format = new UnAPIFormat(text);
					callback(!!me.format.isSupported);
				});
			}
		});
	}
};
|
||||
|
||||
function getItemType() {
|
||||
// if there are no items left to search, use the only item's type (if there is one) or give up
|
||||
if(!unsearchedIds.length) {
|
||||
if(foundIds.length) {
|
||||
getOnlyItem();
|
||||
/**
 * Namespace resolver handed to doc.evaluate() for every XPath query in this translator.
 *
 * This and the x: prefix in the XPath are to work around an issue with pages
 * served as application/xhtml+xml
 *
 * https://developer.mozilla.org/en/Introduction_to_using_XPath_in_JavaScript#Implementing_a_default_namespace_for_XML_documents
 *
 * @return {String} The XHTML namespace URI, regardless of the prefix asked for
 */
function nsResolver() {
	return 'http://www.w3.org/1999/xhtml';
}
|
||||
|
||||
/**
 * Extracts UnAPIIDs from a document
 *
 * Also updates the shared resolver state: when the document's unapi-server link differs from
 * the resolver seen previously, unAPIResolver is replaced and the cached default format and
 * cached IDs are discarded.
 *
 * @param {document} doc A document object from which to extract unAPI IDs
 * @return {UnAPIID[]} The unAPI ID objects extracted from the document; empty if the document
 *     has no unapi-server link
 */
function getUnAPIIDs(doc) {
	// look for a resolver
	var resolverLink = doc.evaluate('//x:link[@rel="unapi-server"]', doc, nsResolver,
		XPathResult.ANY_TYPE, null).iterateNext();
	if(!resolverLink) return [];

	var newUnAPIResolver = resolverLink.getAttribute("href");
	if(unAPIResolver !== newUnAPIResolver) {
		// if unAPI resolver has changed, clear
		defaultFormat = false;
		unAPIResolver = newUnAPIResolver;
		// A plain object, not an Array: the keys are arbitrary ID strings, and an ID such as
		// "length" would corrupt (or throw on) an Array.
		unAPIIDs = {};
	}

	// look for abbrs whose class list contains the exact token "unapi-id"; padding the
	// normalized class attribute with spaces avoids matching e.g. "notunapi-id"
	var abbrs = doc.evaluate('//x:abbr[contains(concat(" ", normalize-space(@class), " "), " unapi-id ")][@title]',
		doc, nsResolver, XPathResult.ANY_TYPE, null);
	var abbr, id;
	var ids = [];
	while((abbr = abbrs.iterateNext())) {
		id = abbr.getAttribute("title");
		// own-property check so that IDs like "constructor" are not mistaken for cached entries
		ids.push(Object.prototype.hasOwnProperty.call(unAPIIDs, id) ? unAPIIDs[id] : new UnAPIID(id));
	}

	return ids;
}
|
||||
|
||||
|
||||
/**
 * Retrieves the list of formats available for all items accessible through this unAPI resolver
 *
 * The parsed result is cached in defaultFormat, so the resolver is only queried while no
 * default format is cached.
 *
 * @param {Function} callback A callback to be passed the format when it is available
 */
function getDefaultFormat(callback) {
	if(defaultFormat) {
		// already known; answer without another request
		callback(defaultFormat);
		return;
	}

	// requesting the bare resolver URL (no id) returns the formats it offers in general
	Zotero.Utilities.HTTP.doGet(unAPIResolver, function(text) {
		defaultFormat = new UnAPIFormat(text);
		callback(defaultFormat);
	});
}
|
||||
/**
|
||||
* Determines itemType for detection
|
||||
*/
|
||||
function determineDetectItemType(ids, supportedId) {
|
||||
var id = ids.shift();
|
||||
id.isSupported(function(isSupported) {
|
||||
if(isSupported && supportedId !== undefined) {
|
||||
// If there are multiple items with valid itemTypes, use "multiple"
|
||||
Zotero.done("multiple");
|
||||
} else if(ids.length) {
|
||||
// If IDs remain to be handled, handle the next one
|
||||
determineDetectItemType(ids, (isSupported ? id : supportedId));
|
||||
} else {
|
||||
Zotero.done(false);
|
||||
// If all IDs have been handled, get foundItemType for only supported ID
|
||||
supportedId.getItemType(Zotero.done);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
var id = unsearchedIds.shift();
|
||||
Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+id, function(text) {
|
||||
var format = checkFormats(text);
|
||||
if(format) {
|
||||
// save data
|
||||
foundIds.push(id);
|
||||
foundFormat.push(format[0]);
|
||||
foundFormatName.push(format[1]);
|
||||
|
||||
if(foundIds.length == 2) {
|
||||
// this is our second; use multiple
|
||||
Zotero.done("multiple");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// keep going
|
||||
getItemType();
|
||||
});
|
||||
}
|
||||
|
||||
function checkFormats(text) {
|
||||
text = text.replace(/<!DOCTYPE[^>]*>/, "").replace(/<\?xml[^>]*\?>/, "");
|
||||
var xml = new XML(text);
|
||||
|
||||
var foundFormat = new Object();
|
||||
|
||||
// this is such an ugly, disgusting hack, and I hate how Mozilla decided to neuter an ECMA standard
|
||||
for each(var format in xml.format) {
|
||||
var name = format.@name.toString();
|
||||
var lowerName = name.toLowerCase();
|
||||
|
||||
if(format.@namespace_uri == "http://www.loc.gov/mods/v3" || lowerName == "mods" || format.@docs == "http://www.loc.gov/standards/mods/") {
|
||||
if(!foundFormat["mods"] || lowerName.indexOf("full") != -1) {
|
||||
foundFormat["mods"] = escape(name);
|
||||
}
|
||||
} else if(lowerName.match(/^marc\b/)) {
|
||||
if(!foundFormat["marc"] || lowerName.indexOf("utf8") != -1) {
|
||||
foundFormat["marc"] = escape(name);
|
||||
}
|
||||
} else if(lowerName == "rdf_dc") {
|
||||
foundFormat["rdf"] = escape(name);
|
||||
} else if(format.@docs.text() == "http://www.refman.com/support/risformat_intro.asp" || lowerName.match(/^ris\b/)) {
|
||||
if(!foundFormat["ris"] || lowerName.indexOf("utf8") != -1) {
|
||||
foundFormat["ris"] = escape(name);
|
||||
}
|
||||
} else if(lowerName == "bibtex") {
|
||||
foundFormat["bibtex"] = escape(name);
|
||||
} else if(lowerName == "endnote") {
|
||||
foundFormat["endnote"] = escape(name);
|
||||
}
|
||||
}
|
||||
|
||||
// loop through again, this time respecting preferences
|
||||
for each(var format in RECOGNIZABLE_FORMATS) {
|
||||
if(foundFormat[format]) return [format, foundFormat[format]];
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
function getOnlyItem() {
|
||||
// retrieve the only item
|
||||
retrieveItem(foundIds[0], foundFormat[0], foundFormatName[0], function(obj, item) {
|
||||
foundItems.push(item);
|
||||
Zotero.done(item.itemType);
|
||||
});
|
||||
}
|
||||
|
||||
function retrieveItem(id, format, formatName, callback) {
|
||||
// retrieve URL
|
||||
Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+id+"&format="+formatName, function(text) {
|
||||
var translator = Zotero.loadTranslator("import");
|
||||
translator.setTranslator(FORMAT_GUIDS[format]);
|
||||
translator.setString(text);
|
||||
translator.setHandler("itemDone", callback);
|
||||
translator.translate();
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Get formats and names for all usable ids; when done, get all items
|
||||
* Get all items
|
||||
* @param {UnAPIID[]} ids List of UnAPI IDs
|
||||
* @param {Function} callback Function to pass item array to when items have been retrieved
|
||||
* @param {Zotero.Item[]} items Item array; used for recursive calls
|
||||
**/
|
||||
function getAllIds() {
|
||||
if(!unsearchedIds.length) {
|
||||
// once all ids have been gotten, get all items
|
||||
getAllItems();
|
||||
return;
|
||||
}
|
||||
function getAllItems(ids, callback, items) {
|
||||
var id = ids.shift();
|
||||
id.getItems(function(retrievedItems) {
|
||||
var collectedItems = (items ? items.concat(retrievedItems) : retrievedItems);
|
||||
|
||||
var id = unsearchedIds.shift();
|
||||
Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+id, function(text) {
|
||||
var format = checkFormats(text);
|
||||
if(format) {
|
||||
// save data
|
||||
foundIds.push(id);
|
||||
foundFormat.push(format[0]);
|
||||
foundFormatName.push(format[1]);
|
||||
if(ids.length) {
|
||||
getAllItems(ids, callback, collectedItems);
|
||||
} else {
|
||||
callback(collectedItems);
|
||||
}
|
||||
|
||||
// keep going
|
||||
getAllIds();
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all items; when done, show selectItems or scrape
|
||||
**/
|
||||
function getAllItems() {
|
||||
if(foundItems.length == foundIds.length) {
|
||||
if(foundItems.length == 1) {
|
||||
// Set the item Repository to the domain
|
||||
foundItems[0].repository = domain[1];
|
||||
// if only one item, send complete()
|
||||
foundItems[0].complete();
|
||||
} else if(foundItems.length > 0) {
|
||||
// if multiple items, show selectItems
|
||||
/**
 * Detection entry point: decides whether the page exposes usable unAPI records.
 *
 * @param {document} doc The document to inspect
 * @param {String} url The document's URL (not used here)
 * @return {Boolean|undefined} false when the page contains no unAPI IDs. Otherwise nothing is
 *     returned: Zotero.wait() is called and the detected type is delivered later through
 *     Zotero.done
 */
function detectWeb(doc, url) {
	// get unAPI IDs
	var ids = getUnAPIIDs(doc);
	if(!ids.length) return false;

	// now we need to see if the server actually gives us bibliographic metadata, and determine the
	// type
	Zotero.wait();

	// Note: this used to read `!ids.length === 1`, which compares a boolean with 1 and is
	// therefore never true, so the single-item branch could not be reached.
	if(ids.length === 1) {
		// Only one item, so we will just get its item type
		ids[0].getItemType(Zotero.done);
	} else {
		// Several items. We will need to call determineDetectItemType
		determineDetectItemType(ids);
	}
}
|
||||
|
||||
function doWeb(doc, url) {
|
||||
var ids = getUnAPIIDs(doc);
|
||||
|
||||
Zotero.wait();
|
||||
|
||||
getAllItems(ids, function(items) {
|
||||
// get the domain we're scraping, so we can use it for libraryCatalog
|
||||
domain = doc.location.href.match(/https?:\/\/([^/]+)/);
|
||||
|
||||
if(items.length == 1) {
|
||||
// If only one item, just complete it
|
||||
items[0].libraryCatalog = domain[1];
|
||||
items[0].complete();
|
||||
} else if(items.length > 0) {
|
||||
// If multiple items, extract their titles
|
||||
var itemTitles = [];
|
||||
for(var i in foundItems) {
|
||||
itemTitles[i] = foundItems[i].title;
|
||||
for(var i in items) {
|
||||
itemTitles[i] = items[i].title;
|
||||
}
|
||||
|
||||
// Show item selection dialog
|
||||
var chosenItems = Zotero.selectItems(itemTitles);
|
||||
if(!chosenItems) Zotero.done(true);
|
||||
|
||||
// Complete items
|
||||
for(var i in chosenItems) {
|
||||
// Set the item Repository to the domain
|
||||
foundItems[i].repository = domain[1];
|
||||
foundItems[i].complete();
|
||||
items[i].libraryCatalog = domain[1];
|
||||
items[i].complete();
|
||||
}
|
||||
}
|
||||
|
||||
// reset items
|
||||
foundItems = [];
|
||||
|
||||
Zotero.done();
|
||||
return;
|
||||
}
|
||||
|
||||
var id = foundIds[foundItems.length];
|
||||
// foundFormat can be either a string or an array
|
||||
if(typeof(foundFormat) == "string") {
|
||||
var format = foundFormat;
|
||||
var formatName = foundFormatName;
|
||||
} else {
|
||||
var format = foundFormat[foundItems.length];
|
||||
var formatName = foundFormatName[foundItems.length];
|
||||
}
|
||||
|
||||
// get item
|
||||
retrieveItem(id, format, formatName, function(obj, item) {
|
||||
foundItems.push(item);
|
||||
getAllItems();
|
||||
});
|
||||
}
|
||||
|
||||
function doWeb() {
|
||||
Zotero.wait();
|
||||
|
||||
// retrieve data for all ids
|
||||
getAllIds();
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user