172 lines
5.8 KiB
JavaScript
172 lines
5.8 KiB
JavaScript
{
|
|
"translatorID":"1b052690-16dd-431d-9828-9dc675eb55f6",
|
|
"label":"Papers Past",
|
|
"creator":"staplegun",
|
|
"target":"^http://paperspast\\.natlib\\.govt\\.nz",
|
|
"minVersion":"1.0",
|
|
"maxVersion":"",
|
|
"priority":100,
|
|
"inRepository":"1",
|
|
"translatorType":4,
|
|
"lastUpdated":"2010-09-14 19:04:32"
|
|
}
|
|
|
|
/*
|
|
Papers Past Translator - Parses historic digitised newspaper articles and creates Zotero-based metadata
|
|
Copyright (C) 2010 staplegun
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
/**
 * Decides what kind of Papers Past page is being viewed.
 *
 * @param {Document} doc - DOM document of the current page.
 * @param {string} url - Address of the current page.
 * @returns {string|undefined} "multiple" when the URL carries a results
 *   parameter (search hit list), "newspaperArticle" when the page has a
 *   non-empty newsarticle_publication meta tag, otherwise undefined.
 */
function detectWeb(doc, url) {

	// a results parameter in URL means search hitlist
	if (url.match(/results=/)) {
		return "multiple";
	}

	// init variables
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == "x") return namespace; else return null;
	} : null;

	// publication title in meta tags means have an article
	var myXPath = '//meta[@name="newsarticle_publication"]/@content';
	var myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);

	// iterateNext() returns null when the page has no such meta tag
	// (e.g. browse or help pages); guard so detection quietly reports
	// "nothing here" instead of throwing a TypeError.
	var metaNode = myXPathObject.iterateNext();
	if (metaNode && metaNode.textContent.length > 0) {
		return "newspaperArticle";
	}
}
|
|
|
|
/**
 * Translator entry point: works out which article pages to scrape and
 * hands them to scrape().
 *
 * On a search hit list the result links are offered to the user through
 * Zotero.selectItems and only the chosen ones are processed; on any other
 * page the current URL is processed on its own. Every URL gets a zto=1
 * parameter added (usage tracking).
 *
 * @param {Document} doc - DOM document of the current page.
 * @param {string} url - Address of the current page.
 * @returns {boolean|undefined} true when the user dismissed the selection
 *   dialog (nothing is processed); otherwise undefined.
 */
function doWeb(doc, url) {

	// init variables
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == "x") return namespace; else return null;
	} : null;

	// Adds the zto=1 usage-tracking parameter. Uses "?" when the URL has
	// no query string yet and keeps any #fragment at the very end so the
	// parameter is really part of the query.
	var addTrackingParam = function(pageUrl) {
		var hashPos = pageUrl.indexOf("#");
		var fragment = hashPos === -1 ? "" : pageUrl.substring(hashPos);
		var base = hashPos === -1 ? pageUrl : pageUrl.substring(0, hashPos);
		var separator = base.indexOf("?") === -1 ? "?" : "&";
		return base + separator + "zto=1" + fragment;
	};

	// hitlist page: compile hitlist titles, user selects which are wanted
	var articles = [];
	if (detectWeb(doc, url) == "multiple") {
		var titlesXPath = '//div[@class="search-results"]/p/a';
		var titles = doc.evaluate(titlesXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
		var nextTitle;
		// plain object: this is a URL -> title map, not a list
		var items = {};
		while ((nextTitle = titles.iterateNext())) {
			items[addTrackingParam(nextTitle.href)] = nextTitle.textContent;
		}

		// presented to user - who reduces list to those selected
		items = Zotero.selectItems(items);

		// selectItems is expected to hand back a falsy value when the
		// dialog is cancelled; stop here rather than processing nothing.
		if (!items) {
			return true;
		}

		// transfer this list to articles array
		for (var itemUrl in items) {
			articles.push(itemUrl);
		}

	// article page: just continue with single (current) page URL
	} else {
		articles = [addTrackingParam(url)];
	}

	// process each selected article page URL
	Zotero.Utilities.processDocuments(articles, scrape, function() { Zotero.done(); });
	Zotero.wait();
}
|
|
|
|
/**
 * Builds one newspaperArticle item from a Papers Past article page and
 * saves it via newItem.complete().
 *
 * Metadata is read from the page's meta tags (newsarticle_publication,
 * newsarticle_headline, dc_date, newsarticle_firstpage,
 * newsarticle_otherpages). The page itself and every img with class
 * "veridianimage" are added as attachments.
 *
 * @param {Document} doc - DOM document of the article page.
 */
function scrape(doc) {

	// init variables
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == "x") return namespace; else return null;
	} : null;
	var siteRoot = "http://paperspast.natlib.govt.nz";

	// Reads the content attribute of the named meta tag. Returns '' when
	// the tag is absent so one missing optional tag cannot abort the
	// whole scrape.
	var getMetaContent = function(name) {
		var xpath = '//meta[@name="' + name + '"]/@content';
		var result = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
		var node = result.iterateNext();
		return node ? node.textContent : '';
	};

	// basic item details
	var newItem = new Zotero.Item('newspaperArticle');
	newItem.url = doc.location.href;
	newItem.archive = 'Papers Past';

	// publication title
	newItem.publicationTitle = getMetaContent("newsarticle_publication");
	Zotero.debug(newItem.publicationTitle);

	// article title: capitalise the first letter of every word and
	// lower-case the rest. Empty fragments (from doubled, leading or
	// trailing whitespace) are skipped; indexing into them is what used
	// to make the conversion throw.
	var words = getMetaContent("newsarticle_headline").split(/\s+/);
	var fixedWords = [];
	for (var w = 0; w < words.length; w++) {
		var word = words[w];
		if (word.length === 0) {
			continue;
		}
		fixedWords.push(word.charAt(0).toUpperCase() + word.substring(1).toLowerCase());
	}
	var titleFixed = fixedWords.join(' ');
	newItem.title = titleFixed;

	// publication date (is preformatted to ISO 8601)
	newItem.date = getMetaContent("dc_date");

	// pagination: first page followed by any other pages
	var pages = getMetaContent("newsarticle_firstpage") + ' ' + getMetaContent("newsarticle_otherpages");
	newItem.pages = Zotero.Utilities.trim(pages);

	// save copy of entire web page as attachment
	var attachments = [];
	attachments.push({
		title: titleFixed + " : Article webpage",
		mimeType: "text/html",
		url: doc.location.href
	});

	// find image scans and add as attachments
	var images = doc.evaluate('//img[@class="veridianimage"]/@src', doc, nsResolver, XPathResult.ANY_TYPE, null);
	var imgSrc;
	var imgNo = 0;
	while ((imgSrc = images.iterateNext())) {
		var src = imgSrc.textContent;
		// only prefix the site root when src is not already absolute
		var imgUrl = /^https?:\/\//i.test(src) ? src : siteRoot + src;
		imgNo++;
		attachments.push({
			title: titleFixed + " : Scan image part " + imgNo,
			mimeType: "image/gif",
			url: imgUrl
		});
	}
	newItem.attachments = attachments;

	// finish
	newItem.complete();
}
|