From 68149c59c6a6f9eca9b705013ff8f1f015bb0d09 Mon Sep 17 00:00:00 2001 From: Sean Takats Date: Fri, 30 Nov 2007 20:44:04 +0000 Subject: [PATCH] Addresses #731. Single articles from LexisNexis should now work. --- scrapers.sql | 158 ++++++++++++++++++++------------------------------- 1 file changed, 61 insertions(+), 97 deletions(-) diff --git a/scrapers.sql b/scrapers.sql index 09a73a392..62ec7beaf 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -22,7 +22,7 @@ -- Set the following timestamp to the most recent scraper update date -REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-11-29 21:00:00')); +REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-11-30 21:00:00')); REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b4.r1', '', '2007-06-21 20:00:00', '1', '100', '4', 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon', 'function detectWeb(doc, url) { @@ -6640,107 +6640,71 @@ function doWeb(doc, url) { Zotero.wait(); }'); -REPLACE INTO translators VALUES ('b047a13c-fe5c-6604-c997-bef15e502b09', '1.0.0b3.r1', '', '2007-03-24 22:20:00', 1, 100, 4, 'LexisNexis', 'Simon Kornblith', '^https?://web\.lexis-?nexis\.com[^/]*/universe/(?:document|doclist)', +REPLACE INTO translators VALUES ('b047a13c-fe5c-6604-c997-bef15e502b09', '1.0.0b3.r1', '', '2007-11-30 21:00:00', '1', '100', '4', 'LexisNexis', 'Sean Takats', '^https?://(?:www\.|web\.)?lexis-?nexis\.com[^/]*/us/lnacademic', 'function detectWeb(doc, url) { - var detailRe = new RegExp("^https?://[^/]+/universe/document"); - if(detailRe.test(doc.location.href)) { - return "newspaperArticle"; - } else { - return "multiple"; - } -}', -'function scrape(doc) { - var newItem = new Zotero.Item(); - newItem.attachments.push({document:doc, title:"LexisNexis Snapshot"}); - - var citationDataDiv; - var divs = doc.getElementsByTagName("div"); - for(var i=0; i]*>/gi); - newItem.publicationTitle = elementParts[elementParts.length-1]; - - var dateRegexp = /]*>(?:)?([A-Z][a-z]+)(?:<\/b>)? ([0-9]+, [0-9]{4})/; - var m = dateRegexp.exec(centerElements[centerElements.length-1].innerHTML); - if(m) { - newItem.date = m[1]+" "+m[2]; - } else { - var elementParts = centerElements[centerElements.length-1].innerHTML.split(/]*>/gi); - newItem.date = elementParts[1]; - } - - var cutIndex = citationDataDiv.innerHTML.indexOf("BODY:"); - if(cutIndex < 0) { - cutIndex = citationDataDiv.innerHTML.indexOf("TEXT:"); - } - if(cutIndex > 0) { - citationData = citationDataDiv.innerHTML.substring(0, cutIndex); - } else { - citationData = citationDataDiv.innerHTML; - } - - citationData = Zotero.Utilities.cleanTags(citationData); - - var headlineRegexp = /\n(?:HEADLINE|TITLE|ARTICLE): ([^\n]+)\n/; - var m = headlineRegexp.exec(citationData); - if(m) { - newItem.title = Zotero.Utilities.cleanTags(m[1]); - } - - var bylineRegexp = /\nBYLINE: *(\w[\w\- ]+)/; - var m = bylineRegexp.exec(citationData); - if(m) { // there is a byline; use it as an author - if(m[1].substring(0, 3).toLowerCase() == "by ") { - m[1] = m[1].substring(3); - } - newItem.creators.push(Zotero.Utilities.cleanAuthor(m[1], "author")); - - newItem.itemType = "newspaperArticle"; - } else { // no byline; must be a journal - newItem.itemType = "journalArticle"; - } - - // other ways authors could be encoded - var authorRegexp = /\n(?:AUTHOR|NAME): ([^\n]+)\n/; - var m = authorRegexp.exec(citationData); - if(m) { - var authors = m[1].split(/, (?:and )?/); - for(var i in authors) { - newItem.creators.push(Zotero.Utilities.cleanAuthor(authors[i].replace(" *", ""), "author")); - } - } - - newItem.complete(); -} +}', +'function doWeb(doc, url) { + var hostRe = new RegExp("^http(?:s)?://[^/]+"); + var m = hostRe.exec(doc.location.href); + var host = m[0]; -function doWeb(doc, url) { - var detailRe = new RegExp("^https?://[^/]+/universe/document"); - if(detailRe.test(doc.location.href)) { - scrape(doc); - } else { - var items = Zotero.Utilities.getItemArray(doc, doc, "^https?://[^/]+/universe/document"); - items = Zotero.selectItems(items); - - if(!items) { - return true; - } - - var uris = new Array(); - for(var i in items) { - uris.push(i); - } - - Zotero.Utilities.processDocuments(uris, function(doc) { scrape(doc) }, - function() { Zotero.done(); }, null); - - Zotero.wait(); + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + var risb = doc.evaluate(''//input[@name="risb"]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().value; + var cisb = doc.evaluate(''//input[@name="cisb"]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().value; + var uri = host+"/us/lnacademic/results/listview/delPrep.do?cisb="+cisb+"&risb="+risb+"&mode=delivery_refworks"; + var hiddenInputs = doc.evaluate(''//form[@name="results_docview_DocumentForm"]//input[@type="hidden"]'', doc, nsResolver, + XPathResult.ANY_TYPE, null); + var hiddenInput; + var poststring=""; + while(hiddenInput = hiddenInputs.iterateNext()) { + poststring = poststring+"&"+hiddenInput.name+"="+encodeURIComponent(hiddenInput.value); } + poststring = poststring + "&hiddensearchfield=Narrow+Search&reloadClassif=&format=GNBFI&focusTerms=&nextSteps=0"; + Zotero.Utilities.HTTP.doPost(uri, poststring, function(text) { + uri = host+"/us/lnacademic/delivery/refExport.do"; + var disb = text.match(//); + var poststring = "delRange=cur&selDocs=&disb="+disb[1]+"&initializationPage=0"; + Zotero.Utilities.HTTP.doPost(uri, poststring, function(text) { + uri = text.match(/&url=([^'']+)''/) + uri = decodeURIComponent(uri[1]); + var uris = new Array(); + uris.push(uri); + Zotero.Utilities.processDocuments(uris, function(newDoc){ + var newItem = new Zotero.Item("newspaperArticle"); + var title = newDoc.evaluate(''//div[@class="HEADLINE"]'', newDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; + newItem.title = title; + var date = newDoc.evaluate(''//meta[@name="_lndateissue"]/@content'', newDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue; + date = date.substr(0,4) + "-" + date.substr(4,2) + "-" + date.substr(6,2); + newItem.date = date; + var publicationTitle = newDoc.evaluate(''//div[@class="PUB"]'', newDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; + newItem.publicationTitle = publicationTitle; + var section = newDoc.evaluate(''//div[@class="SECTION-INFO"]'', newDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; + newItem.section = section; + var authors = newDoc.evaluate(''//div[@class="BYLINE"]'', newDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; + newItem.creators.push(Zotero.Utilities.cleanAuthor(authors, "author")); + newItem.respository = "lexisnexis.com"; + newItem.url = url; + newItem.complete(); + Zotero.done(); + }); + }); + }); + Zotero.wait(); }'); REPLACE INTO translators VALUES ('5e3e6245-83da-4f55-a39b-b712df54a935', '1.0.0b3.r1', '', '2007-08-27 05:00:00', '0', '90', '4', 'Melvyl', 'Sean Takats', '^https?://(?:melvyl.cdlib.org|melvyl-dev.cdlib.org:8162)/F(?:/[A-Z0-9\-]+(?:\?.*)?$|\?func=find|\?func=scan)',