From 6fdc78548989f6cbb2a9227dfa9c4f42ca3b66b0 Mon Sep 17 00:00:00 2001
From: Michael Berkowitz
Date: Tue, 31 Jul 2007 15:03:46 +0000
Subject: [PATCH]

---
 scrapers.sql | 500 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 498 insertions(+), 2 deletions(-)

diff --git a/scrapers.sql b/scrapers.sql
index 695de8fd0..4e7362145 100644
--- a/scrapers.sql
+++ b/scrapers.sql
@@ -1,4 +1,4 @@
--- 249
+-- 250
 
 -- ***** BEGIN LICENSE BLOCK *****
 --
@@ -1639,7 +1639,503 @@ REPLACE INTO translators VALUES ('e4660e05-a935-43ec-8eec-df0347362e4c', '1.0.0b
 	}
 }');
 
-REPLACE INTO translators VALUES ('84bd421d-c6d1-4223-ab80-a156f98a8e30', '1.0.0b4r1', '', '2007-06-27 02:00:00', '0', '100', '4', 'International Herald Tribune', 'Michael Berkowitz', '^http://(www.)*iht.com/*',
+REPLACE INTO translators VALUES ('d9be934c-edb9-490c-a88d-34e2ee106cd7', '1.0.0b3r1', '', '2007-07-30 22:00:00', '0', '100', '4', 'Time.com', 'Michael Berkowitz', '^http://www.time.com/time/*',
+'function detectWeb(doc, url) { // "multiple" on the search-results title; "newspaperArticle" for URLs ending in "html" that carry a byline; otherwise nothing
+	if (doc.title == "TIME Magazine - Search Results") {
+		return "multiple";
+	} else {
+		var namespace = doc.documentElement.namespaceURI;
+		var nsResolver = namespace ? function(prefix) {
+			if (prefix == "x") return namespace; else return null;
+		} : null;
+
+		var xpath = ''//meta[@name="byline"]'';
+		var xpath2 = ''//div[@class="byline"]'';
+		var xpath3 = ''//div[@class="copy"]/div[@class="byline"]'';
+		if ((doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext() || doc.evaluate(xpath2, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext() || doc.evaluate(xpath3, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) ) {
+			if (url.substr(-4,4) == "html") {
+				return "newspaperArticle";
+			}
+		}
+	}
+}
+',
+'function associateMeta(newItem, metaTags, field, zoteroField) { // copy metaTags[field] to newItem[zoteroField] when that meta tag has a truthy value
+	if (metaTags[field]) {
+		newItem[zoteroField] = metaTags[field];
+	}
+}
+
+function scrape(doc, url) { // build one newspaperArticle from the meta tags of doc (head, description, date, keywords, byline) and complete it
+	var newItem = new Zotero.Item("newspaperArticle");
+	newItem.publicationTitle = "Time Magazine";
+	newItem.ISSN = "0040-718X";
+	newItem.url = doc.location.href;
+
+	var metaTags = new Object();
+
+	var metaTagHTML = doc.getElementsByTagName("meta");
+	for (var i = 0 ; i < metaTagHTML.length ; i++) {
+		metaTags[metaTagHTML[i].getAttribute("name")] = metaTagHTML[i].getAttribute("content");
+	}
+
+	if (metaTags["head"]) {
+		associateMeta(newItem, metaTags, "head", "title");
+	} else {
+		newItem.title = doc.title.substr(0, doc.title.length - 7); // no "head" tag: fall back to the page title minus its last 7 characters
+	}
+
+	if (metaTags["description"]) {
+		associateMeta(newItem, metaTags, "description", "abstractNote");
+	}
+
+	if (metaTags["date"]) {
+		var date = metaTags["date"];
+		var months = new Object();
+		months["jan"] = "January";
+		months["feb"] = "February";
+		months["mar"] = "March";
+		months["apr"] = "April";
+		months["may"] = "May";
+		months["jun"] = "June";
+		months["jul"] = "July";
+		months["aug"] = "August";
+		months["sep"] = "September";
+		months["oct"] = "October";
+		months["nov"] = "November";
+		months["dec"] = "December";
+		date = date.split(".").join("").split(", ").slice(1); // strip periods, split on ", ", drop the first field
+		date[0] = months[date[0].split(" ")[0].toLowerCase()] + " " + date[0].split(" ")[1];
+		newItem.date = date.join(", ");
+	}
+	if (metaTags["keywords"]) {
+		newItem.tags = Zotero.Utilities.cleanString(metaTags["keywords"]).split(", ");
+		for (var i in newItem.tags) {
+			if (newItem.tags[i] == "" || newItem.tags[i] == " ") {
+				break;
+			} else {
+				var words = newItem.tags[i].split(" ");
+				for (var j = 0 ; j < words.length ; j++) {
+					Zotero.debug(words[j]);
+					if (words[j] && words[j][0] == words[j][0].toLowerCase()) { // test for an empty word first: "" has no [0] to lower-case
+						words[j] = words[j][0].toUpperCase() + words[j].substr(1).toLowerCase();
+					}
+				}
+			}
+			newItem.tags[i] = words.join(" ");
+		}
+	}
+
+	if (metaTags["byline"]) {
+		var byline = Zotero.Utilities.cleanString(metaTags["byline"]);
+		var byline1 = byline.split(" and ");
+		for (var i = 0 ; i < byline1.length ; i++) {
+			var byline2 = byline1[i].split("/");
+			for (var j = 0 ; j < byline2.length ; j++) {
+				byline2[j] = Zotero.Utilities.cleanString(byline2[j]);
+				if (byline2[j].indexOf(" ") == -1) { // a single-word segment is stored in extra instead of as a creator
+					if (byline2[j].length == 2) {
+						newItem.extra = byline2[j];
+					} else {
+						newItem.extra = byline2[j][0].toUpperCase() + byline2[j].substr(1).toLowerCase();
+					}
+				} else {
+					var byline3 = byline2[j].split(" "); // declared with var so it no longer leaks as an implicit global
+					for (var x = 0 ; x < byline3.length ; x++) {
+						byline3[x] = byline3[x][0].toUpperCase() + byline3[x].substr(1).toLowerCase();
+					}
+					byline3 = byline3.join(" ");
+					newItem.creators.push(Zotero.Utilities.cleanAuthor(byline3, "author"));
+				}
+			}
+		}
+	}
+	newItem.complete();
+}
+
+
+function doWeb(doc, url) { // gather article URLs (user-selected search hits, or the current page when it has a byline) and run scrape on each
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == "x") return namespace; else return null;
+	} : null;
+
+	var urls = new Array();
+	if (doc.title == "TIME Magazine - Search Results") {
+		var items = new Array();
+		var items = Zotero.Utilities.getItemArray(doc, doc.getElementById("search_results").getElementsByTagName("h3"), ''^http://www.time.com/time/.*\.html$'');
+		Zotero.debug(items);
+
+		items = Zotero.selectItems(items);
+
+		if (!items) {
+			return true;
+		}
+
+		for (var i in items) {
+			if (i.match("covers") == null) { // skip any selected URL that contains "covers"
+				urls.push(i);
+			}
+		}
+	} else if (doc.evaluate(''//meta[@name="byline"]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext() || doc.evaluate(''//div[@class="byline"]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext() || doc.evaluate(''//div[@class="copy"]/div[@class="byline"]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext() ) {
+		urls.push(doc.location.href);
+	}
+	Zotero.Utilities.processDocuments(urls, scrape, function() { Zotero.done(); } );
+
+	Zotero.wait();
+}');
+
+REPLACE INTO translators VALUES ('b33bbb49-03d2-4175-91c4-3840501bc953', '1.0.0b3r1', '', '2007-07-30 22:00:00', '1', '100', '4', 'Time-Blog.com', 'Michael Berkowitz', '^http://time-blog.com/.*',
+'function detectWeb(doc, url) { // URLs ending in "html" are reported as "blogPost"; every other URL as "multiple"
+	if (url.substr(-4,4) == "html") {
+		return "blogPost";
+	} else {
+		return "multiple";
+	}
+}',
+'function scrape(doc, url) { // build one blogPost from the page title, the URL, the meta tags and the postedby span, then complete it
+	var newItem = new Zotero.Item("blogPost");
+
+	newItem.url = doc.location.href;
+	newItem.title = doc.title.substr(0, doc.title.indexOf(" - "));
+
+	var titleRE = new RegExp(''^http://time-blog.com/([^/]*)/'');
+	var title = titleRE.exec(doc.location.href)[1].split("_"); // first path segment of the URL, split on underscores
+	for (var i = 0 ; i < title.length ; i++) {
+		title[i] = title[i][0].toUpperCase() + title[i].substr(1).toLowerCase();
+	}
+	newItem.blogTitle = title.join(" ");
+	var metaTags = new Object();
+
+	var metaTagHTML = doc.getElementsByTagName("meta");
+	for (var i = 0 ; i < metaTagHTML.length ; i++) {
+		metaTags[metaTagHTML[i].getAttribute("name")] = metaTagHTML[i].getAttribute("content");
+	}
+
+	if (metaTags["description"]) {
+		newItem.abstractNote = Zotero.Utilities.cleanString(Zotero.Utilities.cleanTags(metaTags["description"]));
+	}
+
+	if (metaTags["date"]) {
+		var date = metaTags["date"];
+		var months = new Object();
+		months["jan"] = "January";
+		months["feb"] = "February";
+		months["mar"] = "March";
+		months["apr"] = "April";
+		months["may"] = "May";
+		months["jun"] = "June";
+		months["jul"] = "July";
+		months["aug"] = "August";
+		months["sep"] = "September";
+		months["oct"] = "October";
+		months["nov"] = "November";
+		months["dec"] = "December";
+		date = date.split(".").join("").split(", "); // strip periods, then split on ", "
+		date[0] = months[date[0].split(" ")[0].toLowerCase()] + " " + date[0].split(" ")[1];
+		newItem.date = date.join(", ");
+	}
+
+	if (metaTags["keywords"]) {
+		newItem.tags = metaTags["keywords"].split(", ");
+		for (var i in newItem.tags) {
+			if (newItem.tags[i] == "" || newItem.tags[i] == " ") {
+				break;
+			} else {
+				var words = newItem.tags[i].split(" ");
+				for (var j = 0 ; j < words.length ; j++) {
+					if (words[j] && words[j][0] == words[j][0].toLowerCase()) { // test for an empty word first: "" has no [0] to lower-case
+						words[j] = words[j][0].toUpperCase() + words[j].substr(1).toLowerCase();
+					}
+				}
+			}
+			newItem.tags[i] = words.join(" ");
+		}
+	}
+
+	if (doc.evaluate(''//span[@class="postedby"]'', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
+		var byline = Zotero.Utilities.cleanString(doc.evaluate(''//span[@class="postedby"]'', doc, null, XPathResult.ANY_TYPE, null).iterateNext().textContent);
+		if (byline.substr(0,9).toLowerCase() == "posted by") {
+			byline = byline.substr(10).split(" ");
+		} else {
+			byline = byline.split(" "); // keep the array: the loop and the join below both need one
+		}
+		for (var i = 0; i < byline.length ; i++) {
+			byline[i] = byline[i][0].toUpperCase() + byline[i].substr(1).toLowerCase();
+		}
+		newItem.creators.push(Zotero.Utilities.cleanAuthor(byline.join(" "), "author"));
+	} else if (newItem.blogTitle == "Theag") {
+		newItem.creators.push(Zotero.Utilities.cleanAuthor("Matthew Yeomans", "author"));
+		newItem.blogTitle = "the Aggregator";
+	}
+
+	Zotero.debug(newItem);
+
+	newItem.complete();
+
+}
+
+function doWeb(doc, url) { // with more than one entryTitle link, let the user pick posts; otherwise scrape the current URL
+
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == "x") return namespace; else return null;
+	} : null;
+
+	var URIS = new Array();
+
+	var xpath = ''//h1[@class="entryTitle"]/a'';
+	var articles = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+	var art = articles.iterateNext();
+	var arts = new Array();
+	var urls = new Array();
+	while (art) {
+		arts.push(art.textContent);
+		urls.push(art.href);
+		art = articles.iterateNext();
+	}
+	if (arts.length > 1) {
+		var items = new Object;
+		for (var i = 0; i < arts.length ; i++ ) {
+			items[urls[i]] = arts[i];
+		}
+		items = Zotero.selectItems(items);
+
+		for (i in items) {
+			URIS.push(i);
+		}
+	} else {
+		URIS.push(url);
+	}
+	Zotero.Utilities.processDocuments(URIS, scrape, function() { Zotero.done(); } );
+
+	Zotero.wait();
+}');
+
+REPLACE INTO translators VALUES ('9346ddef-126b-47ec-afef-8809ed1972ab', '1.0.0b3r1', '', '2007-07-30 22:00:00', '1', '99', '4', 'Institute of Physics', 'Michael Berkowitz', '^http://www.iop.org/EJ/(toc|abstract|search)/.*',
+'function detectWeb(doc, url) { // "journalArticle" unless the location contains "toc" or "search", in which case "multiple"
+	if ((doc.location.href.indexOf("toc") == -1) && (doc.location.href.indexOf("search") == -1)) {
+		Zotero.debug("journalArticle");
+		return "journalArticle";
+	} else {
+		Zotero.debug("multiple");
+		return "multiple";
+	}
+}',
+'function parseRIS(getURL, pdfURL) { // GET getURL, feed the response text to the import translator below, attach pdfURL to every item it yields
+	Zotero.Utilities.HTTP.doGet(getURL, function(text){
+		// load translator for RIS
+		var translator = Zotero.loadTranslator ("import");
+		translator.setTranslator("32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7");
+		translator.setString(text);
+		translator.setHandler("itemDone", function(obj, item) {
+			item.attachments = [
+				{url:pdfURL, title:"IOP Full Text PDF", mimeType:"application/pdf"}
+			];
+			item.complete();
+		});
+		translator.translate();
+		Zotero.done();
+	}, function() {});
+
+	Zotero.wait();
+}
+
+
+function doWeb(doc, url) { // pair abstract URLs with PDF URLs (from a listing or the current page), switch to the "sview" URLs and hand each pair to parseRIS
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == "x" ) return namespace; else return null;
+	} : null;
+
+	var xpath = ''//td[1][@id="toc-opts-left"]/span[@class="toclink"]/a[contains(text(), "Abstract")]'';
+	var PDFs = new Array();
+	var urls = new Array();
+	var pdfurls = new Array();
+	var items = new Array();
+
+	if (doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
+		var links = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+		var url = links.iterateNext();
+		while (url) {
+			items.push(url.href);
+			url = links.iterateNext ();
+		}
+
+		var titles = new Array();
+		var xpath2 = ''//strong[@class="tocTitle"]'';
+		var stuff = doc.evaluate(xpath2, doc, nsResolver, XPathResult.ANY_TYPE, null);
+		var title = stuff.iterateNext();
+		while (title) {
+			titles.push(title.textContent);
+			title = stuff.iterateNext();
+		}
+
+		var xpath3 = ''//table/tbody/tr/td[2]/span[@class="toclink"]/a'';
+		var PDFlinks = doc.evaluate(xpath3, doc, nsResolver, XPathResult.ANY_TYPE, null);
+		var newPDF = PDFlinks.iterateNext();
+		while (newPDF) {
+			PDFs.push(newPDF.href);
+			newPDF = PDFlinks.iterateNext();
+		}
+
+		var newItems = new Object();
+
+		Zotero.debug(items.length);
+		Zotero.debug(titles.length);
+		for (var x = 0 ; x < items.length ; x++) {
+			newItems[items[x]] = [titles[x], PDFs[x]]; // the three lists are matched by index only
+		}
+
+
+		Zotero.debug(newItems);
+
+
+		newItems = Zotero.selectItems(newItems);
+
+		if (!newItems) {
+			return true;
+		}
+
+		for (var i in newItems) {
+			Zotero.debug(i);
+			urls.push (i);
+			var newStuff = newItems[i].split('',''); // NOTE(review): needs a string here, presumably the stringified [title, pdf] pair - confirm what selectItems returns
+			pdfurls.push(Zotero.Utilities.cleanString(newStuff[newStuff.length - 1]));
+		}
+
+	} else {
+		urls.push(doc.location.href);
+		var xpath4 = ''//div[@id="abstract"]//td[2]/a'';
+		pdfurls.push(doc.evaluate(xpath4, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().href);
+	}
+
+	for (var i = 0 ; i < urls.length ; i++) {
+		urls[i] = urls[i].replace("abstract", "sview");
+	}
+
+	Zotero.debug(urls);
+	Zotero.debug(pdfurls);
+	Zotero.Utilities.HTTP.doPost(urls, "format=refmgr&submit=1", function(text) {
+		for (var j = 0 ; j < urls.length ; j++) {
+			parseRIS(urls[j] + "?format=refmgr&submit=1", pdfurls[j]);
+		}
+	});
+
+	Zotero.wait();
+}
+
+');
+
+REPLACE INTO translators VALUES ('6ec8008d-b206-4a4c-8d0a-8ef33807703b', '1.0.0b3r1', '', '2007-07-30 22:00:00', '1', '100', '4', 'The Economist', 'Michael Berkowitz', '^http://(www.)*economist.com/*',
+'function detectWeb(doc, url) { // "multiple" when the location contains "search"; "magazineArticle" for "displaystory" or "cityPage"; otherwise nothing
+	if (doc.location.href.indexOf("search") != -1) {
+		return "multiple";
+	} else if (doc.location.href.indexOf("displaystory") != -1 || doc.location.href.indexOf("cityPage") != -1) {
+		return "magazineArticle";
+	}
+}',
+'function scrape(doc, url) { // build one magazineArticle (title, pay-barrier note, abstract, date, printer-friendly snapshot) from doc and complete it
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == "x" ) return namespace; else return null;
+	} : null;
+
+	var newItem = new Zotero.Item("magazineArticle"); // declared with var so it no longer leaks as an implicit global
+	newItem.ISSN = "0013-0613";
+	newItem.url = doc.location.href;
+	newItem.publicationTitle = "The Economist";
+
+
+	//get headline
+	var title = new Array();
+	if (doc.title && doc.title != "" && doc.title != "Economist.com") {
+		title = doc.title.split(" | ");
+	} else {
+		title.push(doc.evaluate(''//div[@class="clear"][@id="pay-barrier"]/div[@class="col-left"]/div[@class="article"]/font/b'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent);
+	}
+
+
+	if (title.length == 1) {
+		title = title[0]; // single segment: use it as the title string (was "title.push = title", which left an array)
+	} else {
+		title = title.slice(0, title.length - 1);
+		title = title.join(": ");
+	}
+	newItem.title = title;
+
+	if (doc.evaluate(''//div[@class="clear"][@id="pay-barrier"]/div[@class="col-right"]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext() ) {
+		newItem.extra = "(Subscription only)";
+	}
+
+	//get abstract
+	if (doc.evaluate(''//div[@id="content"]/div[@class="clear top-border"]/div[@class="col-left"]/h2'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext() ) {
+		newItem.abstractNote = doc.evaluate(''//div[@id="content"]/div[@class="clear top-border"]/div[@class="col-left"]/h2'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
+	} else if (doc.evaluate(''//div[@class="clear"][@id="pay-barrier"]/div[@class="col-left"]/div[@class="article"]/p/strong'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext() ) {
+		newItem.abstractNote = doc.evaluate(''//div[@class="clear"][@id="pay-barrier"]/div[@class="col-left"]/div[@class="article"]/p/strong'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
+	}
+
+	if (newItem.abstractNote && newItem.abstractNote[newItem.abstractNote.length - 1] != ".") { // neither XPath may match, so only terminate an abstract that exists
+		newItem.abstractNote += ".";
+	}
+
+	//get date and extra stuff
+	if (doc.evaluate(''//div[@class="col-left"]/p[@class="info"]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext() ) {
+		newItem.date = doc.evaluate(''//div[@class="col-left"]/p[@class="info"]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent.substr(0,13);
+	}
+
+	var url = doc.location.href;
+	newItem.attachments = [
+		{url:url.replace("displaystory", "PrinterFriendly"), title:"The Economist Snapshot", mimeType:"text/html"}
+	];
+
+	newItem.complete();
+}
+
+
+function doWeb(doc, url) {
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == "x" ) return namespace; else return null;
+	} : null;
+
+	var urls = new Array();
+
+	if (doc.title == "Search | Economist.com") {
+		var items = new Array();
+		var uris = new Array();
+		var results = doc.evaluate(''//ol[@class="search-results"]/li/h2/a'', doc, nsResolver, XPathResult.ANY_TYPE, null);
+		var headline = results.iterateNext();
+		while (headline) {
+			items.push(headline.textContent);
+			uris.push(headline.href);
+			headline = results.iterateNext();
+		}
+
+		var newItems = new Object();
+		for (var i = 0 ; i