From cb269dc15acf25008ce88580d809fe57f33ebf8b Mon Sep 17 00:00:00 2001
From: Avram Lyon
Date: Tue, 14 Sep 2010 17:57:58 +0000
Subject: [PATCH] Adding NZZ.ch, Newsnetz, and FAZ.net translators, by Ibex

---
 translators/FAZ.NET.js  | 169 ++++++++++++++++++++++++++++++++++++++
 translators/NZZ.ch.js   | 180 ++++++++++++++++++++++++++++++++++++++++
 translators/Newsnetz.js | 176 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 525 insertions(+)
 create mode 100644 translators/FAZ.NET.js
 create mode 100644 translators/NZZ.ch.js
 create mode 100644 translators/Newsnetz.js

diff --git a/translators/FAZ.NET.js b/translators/FAZ.NET.js
new file mode 100644
index 000000000..9409f903b
--- /dev/null
+++ b/translators/FAZ.NET.js
@@ -0,0 +1,169 @@
+{
+    "translatorID":"4f0d0c90-5da0-11df-a08a-0800200c9a66",
+    "translatorType":4,
+    "label":"FAZ.NET",
+    "creator":"ibex",
+    "target":"^http://((www\\.)?faz\\.net/.)",
+    "minVersion":"2.0",
+    "maxVersion":"",
+    "priority":100,
+    "inRepository":false,
+    "lastUpdated":"2010-09-08 12:00:00"
+}
+
+/*
+    FAZ Translator - Parses FAZ articles and creates Zotero-based metadata.
+    Copyright (C) 2010 ibex
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ * Return the first node matching the XPath expression in doc, or null when
+ * nothing matches. When the document element has a namespace URI, the prefix
+ * "x" resolves to it; every other prefix resolves to null.
+ */
+function getXPath(xpath, doc) {
+    var namespace = doc.documentElement.namespaceURI;
+    var nsResolver = namespace ? function(prefix) {
+        return prefix == "x" ? namespace : null;
+    } : null;
+
+    return doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+}
+
+/*
+ * Return the text content of the first node matching xpath, passed through
+ * Zotero.Utilities.trimInternal, or null when nothing matches.
+ */
+function getXPathText(xpath, doc) {
+    var node = getXPath(xpath, doc);
+    return node ? Zotero.Utilities.trimInternal(node.textContent) : null;
+}
+
+/*
+ * Zotero API: classify the page. Returns "multiple" for a search result page,
+ * "newspaperArticle" for an article page and undefined for anything else.
+ */
+function detectWeb(doc, url) {
+    if (doc.title == "Suche - FAZ.NET" && getXPath('//div[@class = "SuchPagingModul"]', doc)) {
+        return "multiple";
+    } else if (getXPath('//div[@class = "Article"]', doc)) {
+        return "newspaperArticle";
+    }
+}
+
+/*
+ * Zotero API: save the current article or, on a search result page, the
+ * articles chosen through Zotero.selectItems. Returns true early when no
+ * candidate links are found or nothing is selected.
+ */
+function doWeb(doc, url) {
+    var urls = [];
+    if (detectWeb(doc, url) == "multiple") {
+        var items = Zotero.Utilities.getItemArray(doc, doc.getElementById("MainColumn").getElementsByTagName("h1"), '/s/.+\\.html');
+        if (!items || countObjectProperties(items) == 0) {
+            return true;
+        }
+        items = Zotero.selectItems(items);
+        if (!items) {
+            return true;
+        }
+
+        // The keys of the selection are used as the URLs to fetch.
+        for (var itemUrl in items) {
+            urls.push(itemUrl);
+        }
+    } else {
+        urls.push(doc.location.href);
+    }
+    Zotero.Utilities.processDocuments(urls, scrape, function() { Zotero.done(); } );
+    Zotero.wait();
+}
+
+/*
+ * Build a newspaperArticle item from one FAZ.NET article page and save it.
+ * The headline is required; every other field is filled in only when the
+ * corresponding element is present on the page.
+ */
+function scrape(doc) {
+    var newArticle = new Zotero.Item('newspaperArticle');
+    newArticle.url = doc.location.href;
+    newArticle.title = Zotero.Utilities.trimInternal(getXPath('//div[@class = "Article"]/h1', doc).textContent);
+
+    // A missing date element must not abort the whole item.
+    var date = getXPathText('//div[@class = "Article"]/span[@class = "Italic"][1]', doc);
+    if (date != null) {
+        newArticle.date = date;
+    }
+
+    // With a subtitle the title becomes "subtitle: headline" and the bare
+    // headline is kept as the short title.
+    var subtitle = getXPathText('//div[@class = "Article"]/h2', doc);
+    if (subtitle != null) {
+        newArticle.shortTitle = newArticle.title;
+        newArticle.title = subtitle + ": " + newArticle.title;
+    }
+
+    var teaser = getXPathText('//div[@class = "Article"]/h4', doc);
+    if (teaser != null) {
+        newArticle.abstractNote = teaser;
+    }
+
+    var authorline = getXPathText('//div[@class = "Article"]/p[@class = "Author"]', doc);
+    if (authorline != null) {
+        //assumption of authorline: "Von name1 [und Name2][, location]"
+        authorline = authorline.replace(/Von /, "");
+        //remove ", location"
+        authorline = Zotero.Utilities.trim(authorline.replace(/, .*$/, ""));
+
+        if (authorline.length > 0) {
+            var authors = authorline.split(" und ");
+            for (var i = 0; i < authors.length; i++) {
+                newArticle.creators.push(Zotero.Utilities.cleanAuthor(authors[i], "author"));
+            }
+        }
+    }
+
+    newArticle.publicationTitle = "FAZ.NET";
+
+    var section = getXPathText('//div[@id="FAZNavMain"]//li[@class = "tabSelected"]/a', doc);
+    if (section != null) {
+        newArticle.section = section;
+    }
+
+    var source = getXPath('//div[@id="MainColumn"]/div[@class = "Article"]/p[@class = "ArticleSrc"]', doc);
+    if (source != null) {
+        newArticle.extra = Zotero.Utilities.trimInternal(Zotero.Utilities.cleanTags(source.innerHTML));
+    }
+
+    // The snapshot uses the "~Afor~Eprint" variant of the article URL.
+    // Unfortunately a print dialog is shown when the snapshot is opened;
+    // the user has to click on cancel afterwards.
+    newArticle.attachments.push({title:"FAZ.NET Article Snapshot", mimeType:"text/html", url:doc.location.href.replace("~Scontent.html", "~Scontent~Afor~Eprint.html"), snapshot:true});
+
+    newArticle.complete();
+}
+
+/*
+ * Count the own enumerable properties of obj. There is no built-in function
+ * for this, and objects are often used as associative arrays.
+ */
+function countObjectProperties(obj) {
+    var size = 0;
+    for (var key in obj) {
+        if (obj.hasOwnProperty(key)) size++;
+    }
+    return size;
+}
diff --git a/translators/NZZ.ch.js b/translators/NZZ.ch.js
new file mode 100644
index 000000000..003c4a77b
--- /dev/null
+++ b/translators/NZZ.ch.js
@@ -0,0 +1,180 @@
+{
+    "translatorID":"61ffe600-55e0-11df-bed9-0002a5d5c51b",
+    "translatorType":4,
+    "label":"nzz.ch",
+    "creator":"ibex",
+    "target":"^http://((www\\.)?nzz\\.ch/.)",
+    "minVersion":"2.0",
+    "maxVersion":"",
+    "priority":100,
+    "inRepository":false,
+    "lastUpdated":"2010-09-08 12:00:00"
+}
+
+/*
+    NZZ Translator - Parses NZZ articles and creates Zotero-based metadata.
+    Copyright (C) 2010 ibex
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ * Return the first node matching the XPath expression in doc, or null when
+ * nothing matches. When the document element has a namespace URI, the prefix
+ * "x" resolves to it; every other prefix resolves to null.
+ */
+function getXPath(xpath, doc) {
+    var namespace = doc.documentElement.namespaceURI;
+    var nsResolver = namespace ? function(prefix) {
+        return prefix == "x" ? namespace : null;
+    } : null;
+
+    return doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+}
+
+/*
+ * Return the text content of the first node matching xpath, passed through
+ * Zotero.Utilities.trimInternal, or null when nothing matches.
+ */
+function getXPathText(xpath, doc) {
+    var node = getXPath(xpath, doc);
+    return node ? Zotero.Utilities.trimInternal(node.textContent) : null;
+}
+
+/*
+ * Zotero API: classify the page. Returns "multiple" for a search result page,
+ * "newspaperArticle" for an article page and undefined for anything else.
+ */
+function detectWeb(doc, url) {
+    if (doc.title.substr(0, 6) == "Suche " && getXPath('//div[@class = "searchdetails"]', doc)) {
+        return "multiple";
+    } else if (doc.location.href.match(/\.\d+\.html/) && getXPath('//li[@id = "article"]/div[@class = "article"]', doc)) {
+        return "newspaperArticle";
+    }
+}
+
+/*
+ * Zotero API: save the current article or, on a search result page, the
+ * articles chosen through Zotero.selectItems. Returns true early when no
+ * candidate links are found or nothing is selected.
+ */
+function doWeb(doc, url) {
+    var urls = [];
+    if (detectWeb(doc, url) == "multiple") {
+        var items = Zotero.Utilities.getItemArray(doc, doc.getElementById("searchresult").getElementsByTagName("h3"), '\\.\\d+\\.html');
+        if (!items || countObjectProperties(items) == 0) {
+            return true;
+        }
+        items = Zotero.selectItems(items);
+        if (!items) {
+            return true;
+        }
+
+        // The keys of the selection are used as the URLs to fetch.
+        for (var itemUrl in items) {
+            urls.push(itemUrl);
+        }
+    } else {
+        urls.push(doc.location.href);
+    }
+    Zotero.Utilities.processDocuments(urls, scrape, function() { Zotero.done(); } );
+    Zotero.wait();
+}
+
+/*
+ * Build a newspaperArticle item from one nzz.ch article page and save it.
+ * Three types of articles are handled: "Neue Zürcher Zeitung", "NZZ Online"
+ * and "NZZ am Sonntag". The headline is required; every other field is filled
+ * in only when the corresponding element is present on the page.
+ */
+function scrape(doc) {
+    var newArticle = new Zotero.Item('newspaperArticle');
+    newArticle.url = doc.location.href;
+    newArticle.title = Zotero.Utilities.trimInternal(getXPath('//li[@id = "article"]/div[@class = "article"]/div[@class = "header"]//h1', doc).textContent);
+
+    // The publication line is split at commas: the first part is taken as the
+    // date, the last part as the publication title. The class name
+    // "pubication" is spelled as in the original translator - presumably it
+    // mirrors the site's markup, so verify before "correcting" it.
+    var publ = getXPathText('//li[@id = "article"]/div[@class = "article"]/div[@class = "header"]/div[@class = "pubication"]', doc);
+    if (publ != null) {
+        publ = publ.split(',');
+        newArticle.date = Zotero.Utilities.trimInternal(publ[0]);
+        newArticle.publicationTitle = Zotero.Utilities.trimInternal(publ[publ.length - 1]);
+    } else {
+        // No publication line at all: keep the item and use the generic title.
+        newArticle.publicationTitle = "NZZ";
+    }
+    if (newArticle.publicationTitle.match(/^\d/)) {
+        //set a publication title if there is only a number (date)
+        newArticle.publicationTitle = "NZZ";
+    } else if (newArticle.publicationTitle == "Neue Zürcher Zeitung") {
+        newArticle.ISSN = "0376-6829";
+    } else if (newArticle.publicationTitle == "NZZ am Sonntag") {
+        newArticle.ISSN = "1660-0851";
+    }
+
+    var isSonntag = newArticle.publicationTitle == "NZZ am Sonntag";
+    var subtitle = getXPathText('//li[@id = "article"]/div[@class = "article"]/div[@class = "header"]//h2', doc);
+    if (subtitle != null && !isSonntag) {
+        newArticle.shortTitle = newArticle.title;
+        newArticle.title += ": " + subtitle;
+    }
+
+    var teaser = getXPathText('//li[@id = "article"]/div[@class = "article"]//div[@class = "body"]/h5', doc);
+    if (teaser != null) {
+        newArticle.abstractNote = teaser;
+    }
+
+    var authorline = getXPathText('//li[@id = "article"]/div[@class = "article"]//div[@class = "body"]/p[contains(@class, "quelle")]', doc);
+    if (authorline == null && isSonntag) {
+        // the author is in the subtitle in some cases of "NZZ am Sonntag"
+        authorline = subtitle;
+    }
+    if (authorline != null) {
+        //assumption of authorline: "[Interview:|Von ]name1 [und Name2][, location]"
+        authorline = authorline.replace(/^.*Von /, "");
+        authorline = authorline.replace(/Interview: /, "");
+        //remove ", location"
+        authorline = Zotero.Utilities.trim(authorline.replace(/, .*$/, ""));
+
+        if (authorline.length > 0) {
+            var authors = authorline.split(" und ");
+            for (var i = 0; i < authors.length; i++) {
+                newArticle.creators.push(Zotero.Utilities.cleanAuthor(authors[i], "author"));
+            }
+        }
+    }
+
+    var section = getXPath('//ul[@id="navContent"]/li/a[@id="navContentSelected"]', doc);
+    if (section != null) {
+        newArticle.section = Zotero.Utilities.trimInternal(section.textContent.replace(/·/,""));
+    }
+
+    newArticle.attachments.push({title:"NZZ Online Article Snapshot", mimeType:"text/html", url:doc.location.href + "?printview=true", snapshot:true});
+
+    newArticle.complete();
+}
+
+/*
+ * Count the own enumerable properties of obj. There is no built-in function
+ * for this, and objects are often used as associative arrays.
+ */
+function countObjectProperties(obj) {
+    var size = 0;
+    for (var key in obj) {
+        if (obj.hasOwnProperty(key)) size++;
+    }
+    return size;
+}
diff --git a/translators/Newsnetz.js b/translators/Newsnetz.js
new file mode 100644
index 000000000..cf8c9c488
--- /dev/null
+++ b/translators/Newsnetz.js
@@ -0,0 +1,176 @@
+{
+    "translatorID":"caecaea0-5d06-11df-a08a-0800200c9a66",
+    "translatorType":4,
+    "label":"tagesanzeiger.ch/Newsnetz",
+    "creator":"ibex",
+    "target":"^http://((www\\.)?(tagesanzeiger|bernerzeitung|bazonline|derbund|thurgauerzeitung)\\.ch/.)",
+    "minVersion":"2.0",
+    "maxVersion":"",
+    "priority":100,
+    "inRepository":false,
+    "lastUpdated":"2010-09-08 12:00:00"
+}
+
+/*
+    Tagesanzeiger.ch Translator - Parses tagesanzeiger.ch, bernerzeitung.ch,
+    bazonline.ch, derbund.ch, thurgauerzeitung.ch articles from the
+    Newsnetz and creates Zotero-based metadata.
+    Copyright (C) 2010 ibex
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ * Return the first node matching the XPath expression in doc, or null when
+ * nothing matches. When the document element has a namespace URI, the prefix
+ * "x" resolves to it; every other prefix resolves to null.
+ */
+function getXPath(xpath, doc) {
+    var namespace = doc.documentElement.namespaceURI;
+    var nsResolver = namespace ? function(prefix) {
+        return prefix == "x" ? namespace : null;
+    } : null;
+
+    return doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+}
+
+/*
+ * Return the text content of the first node matching xpath, passed through
+ * Zotero.Utilities.trimInternal, or null when nothing matches.
+ */
+function getXPathText(xpath, doc) {
+    var node = getXPath(xpath, doc);
+    return node ? Zotero.Utilities.trimInternal(node.textContent) : null;
+}
+
+/*
+ * Zotero API: classify the page. Returns "multiple" for a search result page,
+ * "newspaperArticle" for a story page and undefined for anything else.
+ */
+function detectWeb(doc, url) {
+    if (doc.location.href.indexOf("suche.html?") != -1 && doc.getElementById("panelArticleItems")) {
+        return "multiple";
+    } else if (doc.location.href.indexOf("/story/") != -1
+            && getXPath('//div[@id = "singlePage"]/div[@id = "singleLeft"]/h2', doc)) {
+        return "newspaperArticle";
+    }
+}
+
+/*
+ * Zotero API: save the current article or, on a search result page, the
+ * articles chosen through Zotero.selectItems. Returns true early when no
+ * candidate links are found or nothing is selected.
+ */
+function doWeb(doc, url) {
+    var urls = [];
+    if (detectWeb(doc, url) == "multiple") {
+        var items = Zotero.Utilities.getItemArray(doc, doc.getElementById("panelArticleItems").getElementsByTagName("h3"), '/story/\\d+');
+        if (!items || countObjectProperties(items) == 0) {
+            return true;
+        }
+        items = Zotero.selectItems(items);
+        if (!items) {
+            return true;
+        }
+
+        // The keys of the selection are used as the URLs to fetch.
+        for (var itemUrl in items) {
+            urls.push(itemUrl);
+        }
+    } else {
+        urls.push(doc.location.href);
+    }
+    Zotero.Utilities.processDocuments(urls, scrape, function() { Zotero.done(); } );
+    Zotero.wait();
+}
+
+/*
+ * Build a newspaperArticle item from one Newsnetz article page and save it.
+ * The headline is required; every other field is filled in only when the
+ * corresponding element is present on the page.
+ */
+function scrape(doc) {
+    var newArticle = new Zotero.Item('newspaperArticle');
+    newArticle.url = doc.location.href;
+    newArticle.title = Zotero.Utilities.trimInternal(getXPath('//div[@id = "singleLeft"]/h2', doc).textContent);
+
+    // The date is the second piece of the published-date line when it is
+    // split at ":" or ",". The field is skipped when the element or that
+    // piece is missing, instead of failing on null/undefined.
+    var date = getXPathText('//div[@id = "singleLeft"]/p[@class = "publishedDate"]', doc);
+    if (date != null) {
+        var dateParts = date.split(/[:,] */);
+        if (dateParts.length > 1) {
+            newArticle.date = Zotero.Utilities.trimInternal(dateParts[1]);
+        }
+    }
+
+    var authorline = getXPathText('//div[@id = "singleLeft"]/div[@id = "metaLine"]/h5', doc);
+    if (authorline) {
+        //remove script code "//"
+        authorline = authorline.replace(/\/\//, "");
+        //assumption of authorline: "[Interview:|Von name1 [und Name2][, location].] [Aktualisiert ...]"
+        authorline = authorline.replace(/Von /, "");
+        authorline = authorline.replace(/Interview: /, "");
+        authorline = authorline.replace(/Aktualisiert .*$/, "");
+        authorline = authorline.replace(/, .*$/, "");
+        authorline = Zotero.Utilities.trim(authorline.replace(/\. .*$/, ""));
+
+        if (authorline.length > 0) {
+            var authors = authorline.split(" und ");
+            for (var i = 0; i < authors.length; i++) {
+                newArticle.creators.push(Zotero.Utilities.cleanAuthor(authors[i], "author"));
+            }
+        }
+    }
+
+    var teaser = getXPathText('//div[@id = "singleLeft"]/p[@class = "teaser"]', doc);
+    if (teaser != null) {
+        newArticle.abstractNote = teaser;
+    }
+
+    // Default publication title: the host name without a leading "www.".
+    // The dot is escaped so that only a literal "www." prefix is removed.
+    var host = doc.location.host.replace(/^www\./, "");
+    var idcode = getXPathText('//div[@id = "singleLeft"]//span[@class = "idcode"]', doc);
+    if (idcode == '(Tages-Anzeiger)') {
+        newArticle.publicationTitle = "Tages-Anzeiger";
+        newArticle.ISSN = "1422-9994";
+    } else if (idcode != null) {
+        newArticle.publicationTitle = host + ": " + idcode;
+    } else {
+        newArticle.publicationTitle = host;
+    }
+
+    var section = getXPathText('//div[@id = "singleHeader"]/h1/span', doc);
+    if (section != null) {
+        newArticle.section = section;
+    }
+
+    newArticle.attachments.push({title:"tagesanzeiger.ch Article Snapshot", mimeType:"text/html", url:doc.location.href + "/print.html", snapshot:true});
+
+    newArticle.complete();
+}
+
+/*
+ * Count the own enumerable properties of obj. There is no built-in function
+ * for this, and objects are often used as associative arrays.
+ */
+function countObjectProperties(obj) {
+    var size = 0;
+    for (var key in obj) {
+        if (obj.hasOwnProperty(key)) size++;
+    }
+    return size;
+}
\ No newline at end of file