From 1c911ff169470a4e1750b89dc34c52948535b05f Mon Sep 17 00:00:00 2001 From: Sean Takats Date: Wed, 10 Oct 2007 16:29:32 +0000 Subject: [PATCH] Fixes Project MUSE translator. At last! --- scrapers.sql | 147 +++++++++++++++++++-------------------------------- 1 file changed, 55 insertions(+), 92 deletions(-) diff --git a/scrapers.sql b/scrapers.sql index 2d94f8a4a..01f8eeff4 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -22,7 +22,7 @@ -- Set the following timestamp to the most recent scraper update date -REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-10-03 08:00:00')); +REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-10-10 20:00:00')); REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b4.r1', '', '2007-06-21 20:00:00', '1', '100', '4', 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon', 'function detectWeb(doc, url) { @@ -7063,22 +7063,22 @@ REPLACE INTO translators VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '1.0.0b Zotero.wait(); }'); -REPLACE INTO translators VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '1.0.0b3.r1', '', '2007-06-12 23:30:00', 1, 100, 4, 'Project MUSE', 'Simon Kornblith', '^https?://muse\.jhu\.edu[^/]*/(?:journals/[^/]+/[^/]+/[^/]+\.html|search/pia.cgi)', +REPLACE INTO translators VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '1.0.0b4.r1', '', '2007-10-10 20:00:00', '1', '100', '4', 'Project MUSE', 'Simon Kornblith', '^https?://muse\.jhu\.edu[^/]*/(?:journals/[^/]+/[^/]+/[^/]+\.html|search/results)', 'function detectWeb(doc, url) { - var searchRe = new RegExp("^https?://[^/]+/search/pia\.cgi"); + var searchRe = new RegExp("^https?://[^/]+/search/results"); if(searchRe.test(url)) { return "multiple"; } else { return "journalArticle"; } -}', +}', 'function doWeb(doc, url) { var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; - var searchRe = new RegExp("^https?://[^/]+/search/pia\.cgi"); + var searchRe = new RegExp("^https?://[^/]+/search/results"); if(searchRe.test(doc.location.href)) { var items = new Array(); var attachments = new Array(); @@ -7090,11 +7090,11 @@ REPLACE INTO translators VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '1.0.0b var tableRow; // Go through table rows while(tableRow = tableRows.iterateNext()) { - // article_id is what we need to get it all as one file - var input = doc.evaluate(''./tbody/tr/td/input[@name="article_id"]'', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); - var link = doc.evaluate(''.//b/i/text()'', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); - if(input && input.value && link && link.nodeValue) { - items[input.value] = link.nodeValue; + // aid (article id) is what we need to get it all as one file + var input = doc.evaluate(''./tbody/tr/td/input[@name="aid"]'', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + var title = doc.evaluate(''.//b/i/text()'', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if(input && input.value && title && title.nodeValue) { + items[input.value] = title.nodeValue; var aTags = tableRow.getElementsByTagName("a"); @@ -7119,100 +7119,63 @@ REPLACE INTO translators VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '1.0.0b return true; } - try { - var search_id = doc.forms.namedItem("results").elements.namedItem("search_id").value; - } catch(e) { - var search_id = ""; - } var articleString = ""; var newAttachments = new Array(); for(var i in items) { - articleString += "&article_id="+i; + articleString += "&aid="+i; newAttachments.push(attachments[i]); } - var savePostString = "actiontype=save&search_id="+search_id+articleString; - Zotero.Utilities.HTTP.doGet("http://muse.jhu.edu/search/save.cgi?"+savePostString, function() { - Zotero.Utilities.HTTP.doGet("http://muse.jhu.edu/search/export.cgi?exporttype=endnote"+articleString, function(text) { - Zotero.debug(text); - // load translator for RIS - var translator = Zotero.loadTranslator("import"); - translator.setTranslator("32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7"); - translator.setString(text); - translator.setHandler("itemDone", function(obj, item) { - if(item.notes && item.notes[0]) { - Zotero.debug(item.notes); - item.extra = item.notes[0].note; - - delete item.notes; - item.notes = undefined; - } - item.attachments = newAttachments.shift(); - Zotero.debug(item.attachments); - item.complete(); - }); - translator.translate(); - Zotero.done(); - }, function() {}); + Zotero.Utilities.HTTP.doGet("http://muse.jhu.edu/search/export.cgi?exporttype=endnote"+articleString, function(text) { + // load translator for RIS + var translator = Zotero.loadTranslator("import"); + translator.setTranslator("32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7"); + translator.setString(text); + translator.setHandler("itemDone", function(obj, item) { + if(item.notes && item.notes[0]) { + item.extra = item.notes[0].note; + delete item.notes; + item.notes = undefined; + } + item.attachments = newAttachments.shift(); + item.complete(); + }); + translator.translate(); + Zotero.done(); }, function() {}); Zotero.wait(); } else { - var newItem = new Zotero.Item("journalArticle"); - newItem.url = url; - newItem.attachments.push({document:doc, title:"Project MUSE Snapshot"}); - + var hostRe = new RegExp("^(http://[^/]+)/"); + var m = hostRe.exec(url); + var host = m[1]; + var getPDF = doc.evaluate(''//a[text() = "[Access article in PDF]"]'', doc, - nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); - if(getPDF) { - newItem.attachments.push({title:"Project MUSE Full Text PDF", mimeType:"application/pdf", - url:getPDF.href}); - } + nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); - var elmts = doc.evaluate(''//comment()'', doc, nsResolver, - XPathResult.ANY_TYPE, null); - - var headerRegexp = /HeaderData((?:.|\n)*)\#\#EndHeaders/i - while(elmt = elmts.iterateNext()) { - if(elmt.nodeValue.substr(0, 10) == "HeaderData") { - var m = headerRegexp.exec(elmt.nodeValue); - var headerData = m[1]; - } - } - - // Use E4X rather than DOM/XPath, because the Mozilla gods have decided not to - // expose DOM/XPath to sandboxed scripts - var newDOM = new XML(headerData); - - newItem.publicationTitle = newDOM.journal.text(); - newItem.volume = newDOM.volume.text(); - newItem.issue = newDOM.issue.text(); - newItem.date = newDOM.pubdate.text().toString(); - if(!newItem.date) { - newItem.date = newDOM.year.text(); - } - newItem.title = newDOM.doctitle.text(); - newItem.ISSN = newDOM.issn.text(); - - // Do pages - var fpage = newDOM.fpage.text(); - var lpage = newDOM.lpage.text(); - if(fpage != "") { - newItem.pages = fpage; - if(lpage) { - newItem.pages += "-"+lpage; - } - } - - // Do authors - var elmts = newDOM.docauthor; - for(var i in elmts) { - var fname = elmts[i].fname.text(); - var surname = elmts[i].surname.text(); - newItem.creators.push({firstName:fname, lastName:surname, creatorType:"author"}); - } - - newItem.complete(); + var newUrl = url.replace(host, host+"/metadata/zotero"); + Zotero.Utilities.HTTP.doGet(newUrl, function(text) { + var translator = Zotero.loadTranslator("import"); + //set RIS translator + translator.setTranslator("32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7"); + translator.setString(text); + translator.setHandler("itemDone", function(obj, item) { + if(item.notes && item.notes[0]) { + item.extra = item.notes[0].note; + delete item.notes; + item.notes = undefined; + } + item.attachments.splice(0); + item.attachments.push({document:doc, title:"Project MUSE Snapshot"}); + if(getPDF) { + item.attachments.push({title:"Project MUSE Full Text PDF", mimeType:"application/pdf", + url:getPDF.href}); + } + + item.complete(); + }); + translator.translate(); + }); } }');