From babf25dc27cc8cb9d6c38647e95cb2f5f58cdff2 Mon Sep 17 00:00:00 2001 From: Michael Berkowitz Date: Tue, 10 Jun 2008 20:34:58 +0000 Subject: [PATCH] -Adds Adam's translator for BANQ. --- scrapers.sql | 254 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 253 insertions(+), 1 deletion(-) diff --git a/scrapers.sql b/scrapers.sql index d7f4aae71..8d517eb8e 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -22,7 +22,7 @@ -- Set the following timestamp to the most recent scraper update date -REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-06-10 19:30:00')); +REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-06-10 21:30:00')); REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b4.r1', '', '2008-03-21 20:00:00', '1', '100', '4', 'Amazon.com', 'Sean Takats and Michael Berkowitz', '^https?://(?:www\.)?amazon', 'function detectWeb(doc, url) { @@ -1089,6 +1089,258 @@ REPLACE INTO translators VALUES ('88915634-1af6-c134-0171-56fd198235ed', '1.0.0b Zotero.wait(); }'); +REPLACE INTO translators VALUES ('59cce211-9d77-4cdd-876d-6229ea20367f', '1.0.0b4.r5', '', '2008-06-10 21:30:00', '0', '100', '4', 'Bibliothèque et Archives nationales du Québec', 'Adam Crymble', 'http://catalogue.banq.qc.ca', +'function detectWeb(doc, url) { /* Classifies the page for Zotero. Returns "multiple" when doc.title contains "Search" or "Recherche". Otherwise the src of the first //td[2]/a/img node is matched against icon names: book, mmusic, manalytic give "book"; msdisc, msound, mscas give "audioRecording"; mvdisc gives "videoRecording"; mpaint gives "artwork"; mserial gives "report"; mcomponent gives "newspaperArticle". With no match the function ends without a return value. NOTE(review): iterateNext() is dereferenced unchecked, so a page with no //td[2]/a/img node would presumably throw here - confirm; the same XPath is also re-evaluated in every branch. */ + if (doc.title.match("Search")) { + return "multiple"; + } else if (doc.title.match("Recherche")) { + return "multiple"; + + } else if (doc.evaluate(''//td[2]/a/img'', doc, null, XPathResult.ANY_TYPE, null).iterateNext().src.match("book")) { + return "book"; + } else if (doc.evaluate(''//td[2]/a/img'', doc, null, XPathResult.ANY_TYPE, null).iterateNext().src.match("mmusic")) { + return "book"; + } else if (doc.evaluate(''//td[2]/a/img'', doc, null, XPathResult.ANY_TYPE, null).iterateNext().src.match("manalytic")) { + return "book"; + + } else if (doc.evaluate(''//td[2]/a/img'', doc, null, XPathResult.ANY_TYPE, null).iterateNext().src.match("msdisc")) { + return "audioRecording"; + } 
else if (doc.evaluate(''//td[2]/a/img'', doc, null, XPathResult.ANY_TYPE, null).iterateNext().src.match("msound")) { + return "audioRecording"; + } else if (doc.evaluate(''//td[2]/a/img'', doc, null, XPathResult.ANY_TYPE, null).iterateNext().src.match("mscas")) { + return "audioRecording"; + + } else if (doc.evaluate(''//td[2]/a/img'', doc, null, XPathResult.ANY_TYPE, null).iterateNext().src.match("mvdisc")) { + return "videoRecording"; + + } else if (doc.evaluate(''//td[2]/a/img'', doc, null, XPathResult.ANY_TYPE, null).iterateNext().src.match("mpaint")) { + return "artwork"; + + } else if (doc.evaluate(''//td[2]/a/img'', doc, null, XPathResult.ANY_TYPE, null).iterateNext().src.match("mserial")) { + return "report"; + + } else if (doc.evaluate(''//td[2]/a/img'', doc, null, XPathResult.ANY_TYPE, null).iterateNext().src.match("mcomponent")) { + return "newspaperArticle"; + } +} + +', +'//Bibliotheque et Archives National du Quebec. Code by Adam Crymble + +function associateData (newItem, dataTags, field, zoteroField) { /* Copies dataTags[field] to newItem[zoteroField] when that value is truthy; otherwise newItem is left unchanged. */ + if (dataTags[field]) { + newItem[zoteroField] = dataTags[field]; + } +} + +function scrape(doc, url) { /* Handles a single record page: creates a Zotero.Item whose type comes from detectWeb, sets its language from an icon src, and evaluates the XPaths for the heading and content cells. NOTE(review): the remainder of this function is missing from this copy of the patch (see the note at the first for-loop further down). */ + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var dataTags = new Object(); + var fieldTitle; + var contents; + var descriptionField; + var tagsContent= new Array(); + var inField = 0; + + //determines media type + if (detectWeb(doc, url) == "book") { + var newItem = new Zotero.Item("book"); + descriptionField = "pages"; + } else if (detectWeb(doc, url) == "audioRecording") { + var newItem = new Zotero.Item("audioRecording"); + descriptionField = "runningTime"; + } else if (detectWeb(doc, url) == "videoRecording") { + var newItem = new Zotero.Item("videoRecording"); + descriptionField = "runningTime"; + } else if (detectWeb(doc, url) == "artwork") { + var newItem = new Zotero.Item("artwork"); + descriptionField = "artworkSize"; + } else if (detectWeb(doc, url) == "report") { + var newItem = new Zotero.Item("report"); + descriptionField = "pages"; + } else if (detectWeb(doc, url) == "newspaperArticle") { + var newItem = new Zotero.Item("newspaperArticle"); + descriptionField = "pages" + } /* NOTE(review): for any other detectWeb result (for example "multiple" or no value) newItem stays undefined, and the newItem.language assignment below could then throw - confirm scrape is only reached for single-record pages. descriptionField holds a field name chosen per item type; where it is consumed is not visible in this copy. */ + +//determines language + var lang = doc.evaluate(''//td[2]/a/img'', doc, nsResolver, XPathResult.ANY_TYPE, null); + var langCount = doc.evaluate(''count (//td[2]/a/img)'', doc, nsResolver, XPathResult.ANY_TYPE, null); + var lang1 = lang.iterateNext().src; /* src of the first icon; replaced just below by the second icon when more than one is present, and that second src is checked for "lfre" (French) or "leng" (English). */ + + if (langCount.numberValue > 1) { + lang1 = lang.iterateNext().src; + + if (lang1.match("lfre")) { + newItem.language = "French"; + } else if (lang1.match("leng")) { + newItem.language = "English"; + } + } + +//scraping XPaths + var xPathHeadings = doc.evaluate(''//td/table/tbody/tr/td[2]/b'', doc, nsResolver, XPathResult.ANY_TYPE, null); + var xPathContents = doc.evaluate(''//td[2]/table/tbody/tr/td/table/tbody/tr/td[4]'', doc, nsResolver, XPathResult.ANY_TYPE, null); + var xPathCount = doc.evaluate(''count (//td/table/tbody/tr/td[2]/b)'', doc, nsResolver, XPathResult.ANY_TYPE, null); + + if (doc.evaluate(''//td/table/tbody/tr/td[2]/b'', doc, nsResolver, XPathResult.ANY_TYPE, 
null)) { /* NOTE(review): doc.evaluate returns an XPathResult object, so this condition is presumably always true - confirm whether a node-count check was intended. */ + + for (i=0; i 1) { /* NOTE(review): "for (i=0; i 1)" is not valid JavaScript; the text that followed "i" looks to have been dropped from this copy of the patch, apparently taking the rest of scrape() and the opening of the multiple-results handler with it. Recover the original hunk before relying on anything between here and Zotero.selectItems. */ + multipleTest = 0; + } + } + + for (j = 0; j < 10; j++) { /* NOTE(review): fixed count of 10; if fewer than 10 nodes match, links.iterateNext() would presumably return null and .href would throw - confirm the result page always lists 10. */ + links1[j] = links.iterateNext().href; + //Zotero.debug(links1[0]); + items[links1] = next_title1[j]; /* NOTE(review): the key is the whole links1 array (stringified), not links1[j]; looks unintended, since the keys of items are pushed to articles below and passed on as the documents to process - confirm. */ + } + + + items = Zotero.selectItems(items); + for (var i in items) { + articles.push(i); + } + } else { + articles = [url]; + } + Zotero.Utilities.processDocuments(articles, scrape, function() {Zotero.done();}); + Zotero.wait(); + +}'); + REPLACE INTO translators VALUES ('2d174277-7651-458f-86dd-20e168d2f1f3', '1.0.0b4.r5', '', '2008-06-06 08:45:00', '0', '100', '4', 'Canadiana.org', 'Adam Crymble', 'http://(www.)?canadiana.org', 'function detectWeb(doc, url) {