From a17e06469b0b8f863dd5623b18c761336361e610 Mon Sep 17 00:00:00 2001
From: Michael Berkowitz
Date: Wed, 9 Jul 2008 18:00:50 +0000
Subject: [PATCH] -Adds AlterNet translator from Zotero-Dev list.

---
 scrapers.sql | 188 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 188 insertions(+)

diff --git a/scrapers.sql b/scrapers.sql
index fb10a5f22..f8e30f251 100644
--- a/scrapers.sql
+++ b/scrapers.sql
@@ -5862,6 +5862,194 @@ REPLACE INTO translators VALUES ('f203db7f-7b7b-4dc4-b018-115b7885fe3b', '1.0.0b
 	}
 }');
 
+REPLACE INTO translators VALUES ('ea531652-cdeb-4ec2-940e-627d4b107263', '1.0.0b4.r1', '', '2008-07-09 13:41:54', '0', '100', '4', 'AlterNet', 'Jesse Johnson', '^http://(?:www\.)alternet.org',
+'function detectWeb(doc, url) {
+	// Classifies the page from its URL: returns "magazineArticle" or "blogPost"
+	// when a numeric article ID is present in the path, and null otherwise.
+	var index = url.toString().indexOf(''.org/'') + 5;
+	index += url.toString().substr(index).indexOf(''/'');
+	// NOTE(review): always true - index is at least 3 after the offsets above
+	if (index != -1) {
+		// ordinary article: the five characters after the first path segment
+		var id = url.toString().substr(index + 1, 5);
+		Zotero.Utilities.cleanString(id);
+		if (Number(id)) {
+			return "magazineArticle";
+		}
+		// columnist or blog article: the ID sits one path segment deeper
+		index += url.toString().substr(index + 1).indexOf(''/'');
+		id = url.toString().substr(index + 2, 5);
+		Zotero.Utilities.cleanString(id);
+		if (Number(id) && url.toString().search(''blog'') == -1) {
+			return "magazineArticle";
+		}
+		else if (Number(id)) {
+			return "blogPost";
+		}
+	}
+
+	return null;
+}',
+'function scrape(doc, url, title) {
+	// Creates an item for the article at url, fills it from the page and saves it.
+	// The item type is chosen by the same URL checks that detectWeb performs.
+	var newItem;
+	var index = url.toString().indexOf(''.org/'') + 5;
+	index += url.toString().substr(index).indexOf(''/'');
+	if (index != -1) {
+		// ordinary article
+		var id = url.toString().substr(index + 1, 5);
+		Zotero.Utilities.cleanString(id);
+		if (Number(id)) {
+			newItem = new Zotero.Item("magazineArticle");
+		}
+		// columnist or blog article; skipped once an ordinary ID matched so the
+		// type agrees with detectWeb, which returns on the first match
+		else {
+			index += url.toString().substr(index + 1).indexOf(''/'');
+			id = url.toString().substr(index + 2, 5);
+			Zotero.Utilities.cleanString(id);
+			if (Number(id) && url.toString().search(''blog'') == -1) {
+				newItem = new Zotero.Item("magazineArticle");
+			}
+			else if (Number(id)) {
+				newItem = new Zotero.Item("blogPost");
+			}
+		}
+	}
+	// no article ID in the URL: there is no item to fill in
+	if (!newItem) {
+		return;
+	}
+
+	newItem.url = url;
+	newItem.title = title;
+
+	if (newItem.itemType == "magazineArticle") {
+		newItem.publicationTitle = "AlterNet";
+		newItem.repository = "alternet.org";
+	}
+	else if (newItem.itemType == "blogPost") {
+		newItem.websiteType = "AlterNet Blog";
+	}
+
+	// general scraping variables
+	var xpath;
+	var temp;
+
+	// author
+	if (newItem.itemType == "magazineArticle") {
+		xpath = ''//p[@class="storybyline"]//a[contains(@href,"author")]'';
+	}
+	else if (newItem.itemType == "blogPost") {
+		xpath = ''//p[@class="storybyline"]//a[contains(@href,"bloggers")]'';
+	}
+	temp = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
+	if (temp) {
+		var author = Zotero.Utilities.trimInternal(temp.textContent);
+		if (author.substr(0, 3).toLowerCase() == "by ") {
+			author = author.substr(3);
+		}
+
+		// a byline may list several comma-separated names
+		var authors = author.split(",");
+		for (var i = 0; i < authors.length; i++) {
+			newItem.creators.push(Zotero.Utilities.cleanAuthor(authors[i], "author"));
+		}
+	}
+
+	// date
+	var date = null;
+	if (newItem.itemType == "magazineArticle") {
+		xpath = ''//p[@class="storybyline"]//a[contains(@href,"date")]'';
+		temp = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
+		if (temp) {
+			date = Zotero.Utilities.strToDate(temp.textContent);
+		}
+	}
+	else if (newItem.itemType == "blogPost") {
+		xpath = ''//p[@class="storybyline"]/b'';
+		temp = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
+		if (temp) {
+			// keep the text after the last " on " and drop its final character
+			var begin = temp.textContent.lastIndexOf(" on ");
+			var dateText = temp.textContent.substr(begin + 4);
+			date = Zotero.Utilities.strToDate(dateText.substr(0, dateText.length - 1));
+		}
+	}
+	if (date != null) {
+		var strdate;
+
+		// NOTE(review): presumably strToDate yields a zero-based month - confirm
+		date.month = date.month + 1;
+
+		strdate = date.year + ''-'';
+		if (date.month < 10) {
+			strdate += ''0'' + date.month;
+		}
+		else {
+			strdate += date.month;
+		}
+		// days 1 to 9 get a leading zero; 10 and up are already two digits
+		if (date.day >= 10) {
+			strdate += ''-'' + date.day;
+		}
+		else {
+			strdate += ''-0'' + date.day;
+		}
+
+		newItem.date = strdate;
+	}
+
+	// abstract
+	xpath = ''//div[@class="teaser"]//div[contains(@class,"teaser")]'';
+	temp = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
+	if (temp) {
+		newItem.abstractNote = Zotero.Utilities.trimInternal(temp.textContent);
+	}
+
+	// article snapshot
+	// grabs 5-digit article code from url and uses it to derive printable page url for use in article snapshot
+	index = url.toString().indexOf(''.org/'') + 5;
+	index += url.toString().substr(index).indexOf(''/'');
+	if (index != -1) {
+		var printurl;
+		// ordinary article
+		id = url.toString().substr(index + 1, 5);
+		if (Number(id)) {
+			printurl = "http://www.alternet.org/module/printversion/" + id;
+			newItem.attachments.push({url:printurl, title:"AlterNet Article Snapshot", mimeType:"text/html"});
+		}
+		// columnist article
+		else {
+			index += url.toString().substr(index + 1).indexOf(''/'');
+			id = url.toString().substr(index + 2, 5);
+			Zotero.Utilities.cleanString(id);
+			if (Number(id)) {
+				printurl = "http://www.alternet.org/module/printversion/" + id;
+				if (newItem.itemType == "blogPost") {
+					printurl += "/?type=blog";
+				}
+				newItem.attachments.push({url:printurl, title:"AlterNet Article Snapshot", mimeType:"text/html"});
+			}
+		}
+	}
+
+	newItem.complete();
+}
+
+function doWeb(doc, url) {
+	// Scrapes the page when a story headline is found; its text is the title.
+	// ordinary and columnist articles
+	var xpath = ''//p[@class="storyheadline"]'';
+	var title = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
+	if (title) {
+		scrape(doc, url, title.textContent);
+	}
+
+	return null;
+}');
+
 REPLACE INTO translators VALUES ('56ea09bc-57ee-4f50-976e-cf7cb1f6c6d8', '1.0.0b4.r5', '', '2008-04-23 09:45:00', '0', '100', '4', 'Royal Society Publishing', 'Michael Berkowitz', 'http://journals.royalsociety.org/',
 'function detectWeb(doc, url) {
 	if (doc.evaluate(''//div[@class="listItemName"]/a'', doc,
null, XPathResult.ANY_TYPE, null).iterateNext()) {