diff --git a/translators/Papers Past.js b/translators/Papers Past.js new file mode 100644 index 000000000..9a29130d6 --- /dev/null +++ b/translators/Papers Past.js @@ -0,0 +1,171 @@ +{ + "translatorID":"1b052690-16dd-431d-9828-9dc675eb55f6", + "label":"Papers Past", + "creator":"staplegun", + "target":"^http://paperspast\\.natlib\\.govt\\.nz", + "minVersion":"1.0", + "maxVersion":"", + "priority":100, + "inRepository":"1", + "translatorType":4, + "lastUpdated":"2010-09-14 19:04:32" +} + +/* + Papers Past Translator - Parses historic digitised newspaper articles and creates Zotero-based metadata + Copyright (C) 2010 staplegun + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +function detectWeb(doc, url) { + + // a results parameter in URL means search hitlist + if (url.match(/results=/) ) { + return "multiple"; + + } else { + + // init variables + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == "x" ) return namespace; else return null; + } : null; + var myXPath; + var myXPathObject; + + // publication title in meta tags means have an article + myXPath = '//meta[@name="newsarticle_publication"]/@content'; + myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null); + var meta = myXPathObject.iterateNext().textContent; + if (meta.length > 0) { + return "newspaperArticle"; + } + } +} + +function doWeb(doc, url) { + + // init variables + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == "x" ) return namespace; else return null; + } : null; + + // hitlist page: compile hitlist titles, user selects which are wanted + // (add &zto=1 to URL for usage tracking) + var articles = new Array(); + if (detectWeb(doc, url) == "multiple") { + var titlesXPath = '//div[@class="search-results"]/p/a'; + var titles = doc.evaluate(titlesXPath, doc, nsResolver, XPathResult.ANY_TYPE, null); + var nextTitle; + var items = new Array(); + while (nextTitle = titles.iterateNext()) { + items[nextTitle.href+"&zto=1"] = nextTitle.textContent; + } + // presented to user - who reduces list to those selected + items = Zotero.selectItems(items); + // transfer this list to articles array + for (var i in items) { + articles.push(i); + } + + // article page: just continue with single (current) page URL + } else { + articles = [url+"&zto=1"]; + } + + // process each selected article page URL + Zotero.Utilities.processDocuments(articles, scrape, function(){Zotero.done();}); + Zotero.wait(); +} + +function scrape(doc) { + + // init variables + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == "x" ) return namespace; else return null; + } : null; + var myXPath; + var myXPathObject; + + // basic item details + var newItem = new Zotero.Item('newspaperArticle'); + newItem.url = doc.location.href; + newItem.archive = 'Papers Past'; + + // publication title + myXPath = '//meta[@name="newsarticle_publication"]/@content'; + myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null); + newItem.publicationTitle = myXPathObject.iterateNext().textContent; + Zotero.debug(newItem.publicationTitle); + + // article title (convert to sentence case) + // NB: THE CONVERSION SEEMS TO FAIL IF HAS SPECIAL CHARS + myXPath = '//meta[@name="newsarticle_headline"]/@content'; + myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null); + var title = myXPathObject.iterateNext().textContent; + var words = title.split(/\s/); + var titleFixed = ''; + for (var i in words) { + words[i] = words[i][0].toUpperCase() + words[i].substr(1).toLowerCase(); + titleFixed = titleFixed + words[i] + ' '; + } + titleFixed = Zotero.Utilities.trim(titleFixed); + newItem.title = titleFixed; + + // publication date (is preformatted to ISO 8601) + myXPath = '//meta[@name="dc_date"]/@content'; + myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null); + newItem.date = myXPathObject.iterateNext().textContent; + + // pagination + myXPath = '//meta[@name="newsarticle_firstpage"]/@content'; + myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null); + var pages = myXPathObject.iterateNext().textContent; + + myXPath = '//meta[@name="newsarticle_otherpages"]/@content'; + myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null); + pages = pages + ' ' + myXPathObject.iterateNext().textContent; + + newItem.pages = Zotero.Utilities.trim(pages); + + // save copy of entire web page as attachment + var attachments = new Array(); + attachments.push({ + title:titleFixed + " : Article webpage", + mimeType:"text/html", + url:doc.location.href + }); + + // find image scans and add as attachments + myXPath = '//img[@class="veridianimage"]/@src'; + myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null); + var imgSrc; + var imgUrl; + var imgNo = 0; + while (imgSrc = myXPathObject.iterateNext() ) { + imgUrl = "http://paperspast.natlib.govt.nz" + imgSrc.textContent; + attachments.push({ + title: titleFixed + " : Scan image part " + ++imgNo, + mimeType: "image/gif", + url: imgUrl + }); + } + newItem.attachments = attachments; + + // finish + newItem.complete(); +}