From 33e953aa025f3ab1ca59e326cb6a9c89ed7a9ae0 Mon Sep 17 00:00:00 2001 From: Sean Takats Date: Mon, 17 Mar 2008 05:31:18 +0000 Subject: [PATCH] LexisNexis translator now supports search results pages. --- scrapers.sql | 139 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 92 insertions(+), 47 deletions(-) diff --git a/scrapers.sql b/scrapers.sql index 74c6ecfac..7b4cc9612 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -22,7 +22,7 @@ -- Set the following timestamp to the most recent scraper update date -REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-03-14 20:15:00')); +REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-03-17 08:00:00')); REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b4.r1', '', '2007-06-21 20:00:00', '1', '100', '4', 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon', 'function detectWeb(doc, url) { @@ -8300,13 +8300,13 @@ function doWeb(doc, url) { Zotero.wait(); }'); -REPLACE INTO translators VALUES ('b047a13c-fe5c-6604-c997-bef15e502b09', '1.0.0b3.r1', '', '2008-01-29 23:00:00', '1', '100', '4', 'LexisNexis', 'Sean Takats', 'https?://[^/]*lexis-?nexis\.com[^/]*/us/lnacademic', +REPLACE INTO translators VALUES ('b047a13c-fe5c-6604-c997-bef15e502b09', '1.0.0b3.r1', '', '2008-03-17 08:00:00', '1', '100', '4', 'LexisNexis', 'Sean Takats', 'https?://[^/]*lexis-?nexis\.com[^/]*/us/lnacademic', 'function detectWeb(doc, url) { var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; - Zotero.debug(doc.title); + if (doc.title.substr(doc.title.length-8, 8)=="Document"){ var xpath = ''//input[@name="cisb"]''; var elmt = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null); @@ -8314,31 +8314,73 @@ REPLACE INTO translators VALUES ('b047a13c-fe5c-6604-c997-bef15e502b09', '1.0.0b return "newspaperArticle"; } } + var xpath = ''//input[@name="frm_tagged_documents" and @type="checkbox"]''; + var elmt = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null); + if (elmt.iterateNext()){ + return "multiple"; + } }', -'function doWeb(doc, url) { - var hostRe = new RegExp("^http(?:s)?://[^/]+"); - var m = hostRe.exec(doc.location.href); - var host = m[0]; - +'function doWeb(doc, url) { var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; - var risb = doc.evaluate(''//input[@name="risb"]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().value; - var cisb = doc.evaluate(''//input[@name="cisb"]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().value; - var uri = host+"/us/lnacademic/results/listview/delPrep.do?cisb="+cisb+"&risb="+risb+"&mode=delivery_refworks"; - var hiddenInputs = doc.evaluate(''//form[@name="results_docview_DocumentForm"]//input[@type="hidden"]'', doc, nsResolver, + + // define results navigation frame doc for export buttons and hidden fields + var rfDoc = doc.defaultView.window.top.frames[1].document; + var xpath = ''//img[@title="Export Bibliographic References"]''; + + var elmt = doc.evaluate(xpath, rfDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + + var hostRe = new RegExp("^http(?:s)?://[^/]+"); + var m = hostRe.exec(doc.location.href); + var host = m[0]; + + var risb = doc.evaluate(''//input[@name="risb"]'', rfDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().value; + var cisb = doc.evaluate(''//input[@name="cisb"]'', rfDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().value; + var uri = host+"/us/lnacademic/results/listview/delPrep.do?cisb="+cisb+"&risb="+risb+"&mode=delivery_refworks"; + var hiddenInputs = doc.evaluate(''//form[@name="results_docview_DocumentForm"]//input[@type="hidden" and not(@name="tagData")]'', rfDoc, nsResolver, XPathResult.ANY_TYPE, null); var hiddenInput; var poststring=""; while(hiddenInput = hiddenInputs.iterateNext()) { poststring = poststring+"&"+hiddenInput.name+"="+encodeURIComponent(hiddenInput.value); } - poststring = poststring + "&hiddensearchfield=Narrow+Search&reloadClassif=&format=GNBFI&focusTerms=&nextSteps=0"; + + var xpath = ''//input[@name="frm_tagged_documents" and @type="checkbox"]''; + var elmt = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null); + if (doc.title.substr(doc.title.length-8, 8)=="Document"){ + // single page + var delRange = "cur"; + poststring = poststring + "&hiddensearchfield=Narrow+Search&reloadClassif=&format=GNBFI&focusTerms=&nextSteps=0"; + } else { + // get multiple item titles and tags + var xpath = ''//tr[td/input[@name="frm_tagged_documents"]]''; + var rows = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null); + var title; + var tagNumber; + var items = new Object(); + while (row = rows.iterateNext()){ + title = doc.evaluate(''.//a'', row, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; + tagNumber = doc.evaluate(''./td/input[@name="frm_tagged_documents"]'', row, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().value; + items[tagNumber] = title; + } + var tagData = ""; + items = Zotero.selectItems(items); + if (!items) { + return true; + } + for (var i in items) { + tagData += "-"+i; + } + tagData = tagData.substr(1); + var delRange = "tag"; + poststring = poststring + "&tagData=" + tagData + "&hiddensearchfield=Narrow+Search&reloadClassif=&selDomainID=4&format=GNBLIST&focusTerms=&sort=RELEVANCE&nextSteps=0"; + } Zotero.Utilities.HTTP.doPost(uri, poststring, function(text) { uri = host+"/us/lnacademic/delivery/refExport.do"; var disb = text.match(//); - var poststring = "delRange=cur&selDocs=&disb="+disb[1]+"&initializationPage=0"; + poststring = "delRange="+delRange+"&selDocs=&disb="+disb[1]+"&initializationPage=0"; Zotero.Utilities.HTTP.doPost(uri, poststring, function(text) { uri = text.match(/&url=([^'']+)''/) uri = decodeURIComponent(uri[1]); @@ -8346,41 +8388,44 @@ REPLACE INTO translators VALUES ('b047a13c-fe5c-6604-c997-bef15e502b09', '1.0.0b var uris = new Array(); uris.push(uri); Zotero.Utilities.processDocuments(uris, function(newDoc){ - var newItem = new Zotero.Item("newspaperArticle"); - var title = newDoc.evaluate(''//div[@class="HEADLINE"]'', newDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); - if (title){ - newItem.title = title.textContent; - }else{ - newItem.title = " "; - } - var date = newDoc.evaluate(''//meta[@name="_lndateissue"]/@content'', newDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); - if (date){ - date = date.nodeValue; - var m = date.match(/([^T]+)T/); - date = m[1]; - Zotero.debug(date); - if (date.length == 8){ - date = date.substr(0,4) + "-" + date.substr(4,2) + "-" + date.substr(6,2); - } else if (date.length == 6){ - date = date.substr(0,4) + "-" + date.substr(4,2); + var elmts =newDoc.evaluate(''//html'', newDoc, nsResolver, XPathResult.ANY_TYPE, null); + var elmt; + while (elmt = elmts.iterateNext()){ + var newItem = new Zotero.Item("newspaperArticle"); + var title = newDoc.evaluate(''.//div[@class="HEADLINE"]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if (title.textContent){ + newItem.title = title.textContent; + }else{ + newItem.title = " "; } - newItem.date = date; + var date = newDoc.evaluate(''.//meta[@name="_lndateissue"]/@content'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if (date){ + date = date.nodeValue; + var m = date.match(/([^T]+)T/); + date = m[1]; + if (date.length == 8){ + date = date.substr(0,4) + "-" + date.substr(4,2) + "-" + date.substr(6,2); + } else if (date.length == 6){ + date = date.substr(0,4) + "-" + date.substr(4,2); + } + newItem.date = date; + } + var publicationTitle = newDoc.evaluate(''.//div[@class="PUB"]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if (publicationTitle){ + newItem.publicationTitle = publicationTitle.textContent; + } + var section = newDoc.evaluate(''.//div[@class="SECTION-INFO"]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if (section){ + newItem.section = section.textContent; + } + var author = newDoc.evaluate(''.//div[@class="BYLINE"]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if (author){ + newItem.creators.push(Zotero.Utilities.cleanAuthor(author.textContent, "author")); + } + newItem.respository = "lexisnexis.com"; + newItem.url = url; + newItem.complete() } - var publicationTitle = newDoc.evaluate(''//div[@class="PUB"]'', newDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); - if (publicationTitle){ - newItem.publicationTitle = publicationTitle.textContent; - } - var section = newDoc.evaluate(''//div[@class="SECTION-INFO"]'', newDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); - if (section){ - newItem.section = section.textContent; - } - var author = newDoc.evaluate(''//div[@class="BYLINE"]'', newDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); - if (author){ - newItem.creators.push(Zotero.Utilities.cleanAuthor(author.textContent, "author")); - } - newItem.respository = "lexisnexis.com"; - newItem.url = url; - newItem.complete(); Zotero.done(); }); });