From 448faedab52158cccb9300437593e3292ef01ba8 Mon Sep 17 00:00:00 2001
From: Simon Kornblith
Date: Sun, 17 Dec 2006 01:27:42 +0000
Subject: [PATCH] - added a "copy" feature to Scaffold, which copies a
 translator to the clipboard - implemented ability to test regex and run
 detectCode from within Scaffold. it is now possible to generate an entire
 translator from within the environment. - added Factiva translator, which
 should work, although Factiva just went down for maintenance a few minutes
 ago

---
Review notes on this revision of the patch:
 - The repository timestamp and the Factiva lastUpdated value now read
   2006-12-16 20:20:46. The generated value (2006-11-06 20:20:46) was older
   than the repository timestamp it replaced.
 - Restored the two regular expressions in doWeb whose angle-bracket content
   had been lost (DOCTYPE removal and ppsArticleResponse namespace removal).
 - Added explanatory comments to detectWeb/doWeb; the last hunk size and the
   diffstat are adjusted to match. The index line still carries the original
   blob ids.

 scrapers.sql | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 142 insertions(+), 2 deletions(-)

diff --git a/scrapers.sql b/scrapers.sql
index 1178a3fb6..83a786760 100644
--- a/scrapers.sql
+++ b/scrapers.sql
@@ -1,4 +1,4 @@
--- 136
+-- 137
 
 -- ***** BEGIN LICENSE BLOCK *****
 --
@@ -22,7 +22,7 @@
 
 
 -- Set the following timestamp to the most recent scraper update date
-REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-12-16 16:29:00'));
+REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-12-16 20:20:46'));
 
 REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b3.r1', '', '2006-12-15 03:40:00', 1, 100, 4, 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon',
 'function detectWeb(doc, url) {
@@ -4971,6 +4971,146 @@ REPLACE INTO translators VALUES ('92d4ed84-8d0-4d3c-941f-d4b9124cfbb', '1.0.0b2.
 	Zotero.wait();
 }');
 
+REPLACE INTO translators VALUES ('7bdb79e-a47f-4e3d-b317-ccd5a0a74456', '1.0.0b3r1', '', '2006-12-16 20:20:46', '1', '100', '4', 'Factiva', 'Simon Kornblith', '^http://global\.factiva\.com/ha/default\.aspx$',
+'function detectWeb(doc, url) {
+	// Rows with class "headline" mark a Factiva results page. A body class of
+	// "articleView" means a single article is shown; any other such page is a list.
+	// Without a headline row nothing is returned.
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == ''x'') return namespace; else return null;
+	} : null;
+
+	if(doc.evaluate(''//tr[@class="headline"]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
+		if(doc.body.className == ''articleView'') {
+			return "newspaperArticle";
+		} else {
+			return "multiple";
+		}
+	}
+}',
+'function doWeb(doc, url) {
+	// Collects the hdl value of each wanted article, re-posts the page form to the
+	// XML export URL and saves one newspaperArticle item per article returned.
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == ''x'') return namespace; else return null;
+	} : null;
+
+	var items = new Array();
+	var singlePage = doc.body.className == ''articleView'';
+
+	// list view: items maps hdl -> headline text, later passed to Zotero.selectItems
+	// article view: items maps the number in the row count cell -> hdl
+	var tableRows = doc.evaluate(''//tr[@class="headline"]'', doc, nsResolver, XPathResult.ANY_TYPE, null);
+	var tableRow;
+	while(tableRow = tableRows.iterateNext()) {
+		var hdl = doc.evaluate(''.//input[@name="hdl"]'', tableRow, nsResolver, XPathResult.ANY_TYPE,
+			null).iterateNext().value;
+		if(!singlePage){
+			items[hdl] = Zotero.Utilities.cleanString(tableRow.getElementsByTagName("a")[0].textContent);
+		} else {
+			var m = doc.evaluate(''.//td[@class="count"]'', tableRow, nsResolver, XPathResult.ANY_TYPE,
+				null).iterateNext().textContent.match(/[0-9]+/);
+			items[m[0]] = hdl;
+		}
+	}
+
+	if(!singlePage) {
+		items = Zotero.selectItems(items);
+		if(!items) return true;
+
+		var hdls = new Array();
+		for(var hdl in items) {
+			hdls.push(hdl);
+		}
+	} else {
+		// the number in the "Article N" header span selects the matching hdl
+		var m = doc.evaluate(''//div[@class="articleHeader"][@id="artHdr1"]/span[substring(text(), 1, 7) = "Article"]'',
+			doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent.match(/[0-9]+/);
+		var hdls = [items[m[0]]];
+	}
+
+	// rebuild the POST body from the hidden inputs and selects of PageBaseForm
+	var post = "";
+
+	var hiddenInputs = doc.evaluate(''//form[@name="PageBaseForm"]//input[@type="hidden"]'', doc, nsResolver,
+		XPathResult.ANY_TYPE, null);
+	var hiddenInput;
+	while(hiddenInput = hiddenInputs.iterateNext()) {
+		// Factiva is picky about encoding: a literal "+" must be sent as %2B and a space as "+"
+		post = post+"&"+hiddenInput.name+"="+escape(hiddenInput.value).replace(/\+/g, "%2B").replace(/\%20/g, "+");
+	}
+
+	var selects = doc.evaluate(''//form[@name="PageBaseForm"]//select'', doc, nsResolver,
+		XPathResult.ANY_TYPE, null);
+	var select;
+	while(select = selects.iterateNext()) {
+		post = post+"&"+select.name+"="+escape(select.options[select.selectedIndex].value);
+	}
+
+	// append one hdl parameter per requested article
+	for each(var hdl in hdls) {
+		post += "&hdl="+escape(hdl);
+	}
+	// drop the leading "&"
+	post = post.substr(1);
+
+	Zotero.Utilities.HTTP.doPost("http://global.factiva.com/pps/default.aspx?pp=XML", post, function(text) {
+		// Remove xml parse instruction and doctype
+		text = text.replace(/<!DOCTYPE[^>]*>/, "").replace(/<\?xml[^>]*\?>/, "");
+		// kill the XML namespace, too, because we have no way of knowing what it will be, which presents a problem
+		text = text.replace(/<ppsArticleResponse xmlns="[^"]+">/, "<ppsArticleResponse>");
+		// kill hlt tags; they just make parsing harder
+		text = text.replace(/<\/?hlt>/g, "");
+		var xml = new XML(text);
+
+		// loop through articles
+		for each(var ppsarticle in xml[0]..ppsarticle) {
+			var article = ppsarticle.article;
+			var newItem = new Zotero.Item("newspaperArticle");
+
+			newItem.title = Zotero.Utilities.cleanString(article.headline.paragraph.text().toString());
+			newItem.publicationTitle = Zotero.Utilities.cleanString(article.sourceName.text().toString());
+			// each descendant name element becomes a tag
+			for each(var tag in article..name) {
+				newItem.tags.push(tag.text().toString());
+			}
+			newItem.date = Zotero.Utilities.formatDate(Zotero.Utilities.strToDate(article.publicationDate.date.text().toString()));
+			if(article.byline.length()) {
+				// drop the first "By" (case-insensitive) and split co-authors on "&" or "and"
+				var byline = Zotero.Utilities.cleanString(article.byline.text().toString().replace(/By/i, ""));
+				var authors = byline.split(/ (?:\&|and) /i);
+				for each(var author in authors) {
+					newItem.creators.push(Zotero.Utilities.cleanAuthor(author, "author"));
+				}
+			}
+			newItem.section = article.sectionName.text().toString();
+			newItem.edition = article.edition.text().toString();
+
+			if(article.pages.length()) {
+				// join the page values with commas
+				newItem.pages = "";
+				for each(var page in article.pages.page) {
+					newItem.pages += ","+page.text().toString();
+				}
+				newItem.pages = newItem.pages.substr(1);
+			}
+
+			// the volume field may contain "ISSN: nnnn-nnnn"; keep only the number part
+			var m = article.volume.text().toString().match(/ISSN[:\s]*([\-0-9]{8,9})/i);
+			if(m) newItem.ISSN = m[1];
+
+			newItem.complete();
+		}
+
+		Zotero.done();
+	});
+
+	// NOTE(review): presumably holds the translation open until Zotero.done() runs in the callback
+	Zotero.wait();
+}');
+
 REPLACE INTO translators VALUES ('e07e9b8c-0e98-4915-bb5a-32a08cb2f365', '1.0.0b2.r2', '', '2006-10-02 17:00:00', 1, 100, 8, 'Open WorldCat', 'Simon Kornblith', 'http://partneraccess.oclc.org/',
 'function detectSearch(item) {
 	if(item.itemType == "book" || item.itemType == "bookSection") {