From c3bd2579ccb33a50c57be9f83f1903cf77747418 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Mon, 19 Mar 2007 22:29:49 +0000 Subject: [PATCH] closes #457, unAPI translator. there are currently some problems with recognizing document types of foreign MODS documents, but i hope to resolve these soon. --- chrome/content/zotero/xpcom/translate.js | 4 +- scrapers.sql | 243 ++++++++++++++++++++++- 2 files changed, 242 insertions(+), 5 deletions(-) diff --git a/chrome/content/zotero/xpcom/translate.js b/chrome/content/zotero/xpcom/translate.js index 6e06657cb..549ac5d01 100644 --- a/chrome/content/zotero/xpcom/translate.js +++ b/chrome/content/zotero/xpcom/translate.js @@ -537,8 +537,6 @@ Zotero.Translate.prototype._loadTranslator = function() { * does the actual translation */ Zotero.Translate.prototype.translate = function() { - Zotero.debug("translate called"); - /* * initialize properties */ @@ -2164,7 +2162,7 @@ Zotero.Translate.TranslatorSearch.prototype.complete = function(returnValue, err this.currentTranslator = undefined; this.asyncMode = false; - + // resume execution this.execute(); } diff --git a/scrapers.sql b/scrapers.sql index 26088c4e3..bbef74798 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -1,4 +1,4 @@ --- 181 +-- 182 -- ***** BEGIN LICENSE BLOCK ***** -- @@ -22,7 +22,7 @@ -- Set the following timestamp to the most recent scraper update date -REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-03-16 23:28:15')); +REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-03-19 22:20:40')); REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b3.r1', '', '2006-12-15 03:40:00', 1, 100, 4, 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon', 'function detectWeb(doc, url) { @@ -3155,6 +3155,245 @@ function doWeb(doc, url) { } }'); +REPLACE INTO translators VALUES ('e7e01cac-1e37-4da6-b078-a0e8343b0e98', '1.0.0b4r1', '', '2007-03-19 22:20:40', '1', '90', '4', 'unAPI', 'Simon Kornblith', '', +'var RECOGNIZABLE_FORMATS = ["mods", "marc", "endnote", "ris", "bibtex", "rdf"]; +var FORMAT_GUIDS = { + "mods":"0e2235e7-babf-413c-9acf-f27cce5f059c", + "marc":"a6ee60df-1ddc-4aae-bb25-45e0537be973", + "endnote":"881f60f2-0802-411a-9228-ce5f47b64c7d", + "ris":"32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7", + "bibtex":"9cb70025-a888-4a29-a210-93ec52da40d4", + "rdf":"5e3ad958-ac79-463d-812b-a86a9235c28f" +}; + +var unAPIResolver, unsearchedIds, foundIds, foundItems, foundFormat, foundFormatName; + +function detectWeb(doc, url) { + // initialize variables + unsearchedIds = []; + foundIds = []; + foundItems = []; + foundFormat = []; + foundFormatName = []; + + var nsResolver = doc.createNSResolver(doc.documentElement); + + // look for a resolver + unAPIResolver = doc.evaluate(''//link[@rel="unapi-server"]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if(!unAPIResolver) return false; + unAPIResolver = unAPIResolver.getAttribute("href"); + + // look for abbrs + var abbrs = doc.getElementsByTagName("abbr"); + for each(var abbr in abbrs) { + if(abbr.getAttribute) { + if(abbr.getAttribute("class").split(" ").indexOf("unapi-id") != -1 && abbr.getAttribute("title")) { + // found an abbr + unsearchedIds.push(abbr.getAttribute("title")); + } + } + } + + if(!unsearchedIds.length) return false; + + // now we need to see if the server actually gives us bibliographic metadata. + + // one way to signal this is with a META tag + var zoteroMeta = doc.evaluate(''//meta[@name="ZoteroItemType"]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if(zoteroMeta) return zoteroMeta.getAttribute("content"); + + // otherwise, things will be a bit more complicated, and we''ll have to do some HTTP requests + Zotero.wait(); + + if(unsearchedIds.length == 1) { + // if there''s only one abbr tag, we should go ahead and retrieve types for it + getItemType(); + } else { + // if there''s more than one, we should first see if the resolver gives metadata for all of them + Zotero.Utilities.HTTP.doGet(unAPIResolver, function(text) { + var format = checkFormats(text); + if(format) { + // move unsearchedIds to foundIds + foundIds = unsearchedIds; + unsearchedIds = []; + // save format and formatName + foundFormat = format[0]; + foundFormatName = format[1]; + + Zotero.done("multiple"); + } else { + getItemType(); + } + }); + } +} + +function getItemType() { + // if there are no items left to search, use the only item''s type (if there is one) or give up + if(!unsearchedIds.length) { + if(foundIds.length) { + getOnlyItem(); + } else { + Zotero.done(false); + } + return; + } + + var id = unsearchedIds.shift(); + Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+id, function(text) { + var format = checkFormats(text); + if(format) { + // save data + foundIds.push(id); + foundFormat.push(format[0]); + foundFormatName.push(format[1]); + + if(foundIds.length == 2) { + // this is our second; use multiple + Zotero.done("multiple"); + return; + } + } + + // keep going + getItemType(); + }); +} + +function checkFormats(text) { + text = text.replace(/]*>/, "").replace(/<\?xml[^>]*\?>/, ""); + var xml = new XML(text); + + var foundFormat = new Object(); + + // this is such an ugly, disgusting hack, and I hate how Mozilla decided to neuter an ECMA standard + for each(var format in xml.format) { + var name = format.@name.toString(); + var lowerName = name.toLowerCase(); + + if(format.@namespace_uri == "http://www.loc.gov/mods/v3" || lowerName == "mods" || format.@docs == "http://www.loc.gov/standards/mods/") { + if(!foundFormat["mods"] || lowerName.indexOf("full") != -1) { + foundFormat["mods"] = name; + } + } else if(lowerName.match(/^marc\b/)) { + if(!foundFormat["marc"] || lowerName.indexOf("utf8") != -1) { + foundFormat["marc"] = name; + } + } else if(lowerName == "rdf_dc") { + foundFormat["rdf"] = name; + } else if(format.@docs.text() == "http://www.refman.com/support/risformat_intro.asp" || lowerName.match(/^ris\b/)) { + if(!foundFormat["ris"] || lowerName.indexOf("utf8") != -1) { + foundFormat["ris"] = name; + } + } else if(lowerName == "bibtex") { + foundFormat["bibtex"] = name; + } else if(lowerName == "endnote") { + foundFormat["endnote"] = name; + } + } + + // loop through again, this time respecting preferences + for each(var format in RECOGNIZABLE_FORMATS) { + if(foundFormat[format]) return [format, foundFormat[format]]; + } + + return false; +} + +function getOnlyItem() { + // retrieve the only item + retrieveItem(foundIds[0], foundFormat[0], foundFormatName[0], function(obj, item) { + foundItems.push(item); + Zotero.done(item.itemType); + }); +} + +function retrieveItem(id, format, formatName, callback) { + // retrieve URL + Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+id+"&format="+formatName, function(text) { + var translator = Zotero.loadTranslator("import"); + translator.setTranslator(FORMAT_GUIDS[format]); + translator.setString(text); + translator.setHandler("itemDone", callback); + translator.translate(); + }); +}', +'/** + * Get formats and names for all usable ids; when done, get all items + **/ +function getAllIds() { + if(!unsearchedIds.length) { + // once all ids have been gotten, get all items + getAllItems(); + return; + } + + var id = unsearchedIds.shift(); + Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+id, function(text) { + var format = checkFormats(text); + if(format) { + // save data + foundIds.push(id); + foundFormat.push(format[0]); + foundFormatName.push(format[1]); + } + + // keep going + getAllIds(); + }); +} + +/** + * Get all items; when done, show selectItems or scrape + **/ +function getAllItems() { + if(foundItems.length == foundIds.length) { + if(foundItems.length == 1) { + // if only one item, send complete() + foundItems[0].complete(); + } else if(foundItems.length > 0) { + // if multiple items, show selectItems + var itemTitles = []; + for(var i in foundItems) { + itemTitles[i] = foundItems[i].title; + } + + var chosenItems = Zotero.selectItems(itemTitles); + if(!chosenItems) Zotero.done(true); + + for(var i in chosenItems) { + foundItems[i].complete(); + } + } + + Zotero.done(); + return; + } + + var id = foundIds[foundItems.length]; + // foundFormat can be either a string or an array + if(typeof(foundFormat) == "string") { + var format = foundFormat; + var formatName = foundFormatName; + } else { + var format = foundFormat[foundItems.length]; + var formatName = foundFormatName[foundItems.length]; + } + + // get item + retrieveItem(id, format, formatName, function(obj, item) { + foundItems.push(item); + getAllItems(); + }); +} + +function doWeb() { + Zotero.wait(); + + // retrieve data for all ids + getAllIds(); +}'); + REPLACE INTO translators VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '1.0.0b3.r1', '', '2007-01-07 17:00:00', 1, 100, 4, 'Google Books', 'Simon Kornblith', '^http://books\.google\.[a-z]+/books\?(.*vid=.*\&id=.*|.*q=.*)', 'function detectWeb(doc, url) { var re = new RegExp(''^http://books\\.google\\.[a-z]+/books\\?vid=([^&]+).*\\&id=([^&]+)'', ''i'');