122 lines
3.3 KiB
JavaScript
122 lines
3.3 KiB
JavaScript
{
    "translatorID":"b56f856e-934e-4b46-bc58-d61dccc9f32f",
    "translatorType":4,
    "label":"Mainichi Daily News",
    "creator":"Frank Bennett",
    "target":"^http://(?:search\\.)*mdn\\.mainichi\\.jp/(?:$|result\\?|mdnnews/|perspectives/|features/|arts/|travel/)",
    "minVersion":"2.0b7",
    "maxVersion":"",
    "priority":100,
    "inRepository":true,
    "lastUpdated":"2010-06-05 20:35:00"
}
|
|
|
|
// #################################
|
|
// #### Local utility functions ####
|
|
// #################################
|
|
|
|
// Recognises article URLs and captures, as group 1, the eight digits that
// start the article file name (scrapeAndParse reads them as YYYYMMDD).
// Written as a regex literal on purpose: inside a quoted string "\." collapses
// to a bare ".", which would let ANY character stand in for the dot before
// "html".
var itemRe = /.*\/([0-9]{8})[a-z][0-9][a-z][0-9]{2}[a-z][0-9][a-z]{2}[0-9]{6}c\.html/;
|
|
|
|
/**
 * Build the namespace resolver handed to doc.evaluate().
 *
 * @param {Object} doc - document whose root element's namespaceURI is read.
 * @returns {?Function} null when the root element has no namespace URI;
 *     otherwise a function mapping the prefix 'x' to that URI and every
 *     other prefix to null.
 */
var getResolver = function (doc) {
    var ns = doc.documentElement.namespaceURI;
    if (!ns) {
        return null;
    }
    return function (prefix) {
        return prefix == 'x' ? ns : null;
    };
};
|
|
|
|
/**
 * Normalise a headline string taken from the page.
 *
 * Steps, in order:
 *   1. collapse every run of whitespace (including non-breaking spaces and
 *      newlines) into a single space;
 *   2. drop everything from the first "|" onward;
 *   3. remove anything that looks like a markup tag (<...>);
 *   4. pass the result through Zotero.Utilities.unescapeHTML
 *      (presumably decodes HTML entities -- confirm against the Zotero API);
 *   5. trim leading and trailing whitespace.
 *
 * Trimming happens last so that a headline such as "Title | Site" comes back
 * as "Title" rather than "Title " with the space that preceded the "|".
 *
 * @param {string} str - raw text.
 * @returns {string} cleaned text.
 */
var cleanUp = function (str) {
    var ret;
    // Regex with the g flag instead of the non-standard third "flags"
    // argument of String.prototype.replace, which current engines ignore.
    ret = str.replace(/[\s\u00a0]+/g, " ");
    ret = ret.replace(/\|.*/, "").replace(/<[^>]+>/g, "");
    ret = Zotero.Utilities.unescapeHTML(ret);
    return ret.replace(/^[\s\u00a0]+|[\s\u00a0]+$/g, "");
};
|
|
|
|
|
|
// #########################
|
|
// ##### API functions #####
|
|
// #########################
|
|
|
|
/**
 * Classify the current page for Zotero.
 *
 * @param {Object} doc - page document; only doc.location.href is examined.
 * @param {string} url - page URL (not used here).
 * @returns {string} "newspaperArticle" when the document's address matches
 *     itemRe, "multiple" for every other page.
 */
var detectWeb = function (doc, url) {
    var isArticle = itemRe.test(doc.location.href);
    return isArticle ? "newspaperArticle" : "multiple";
};
|
|
|
|
/**
 * Zotero entry point: save the current article, or let the user pick
 * articles from a listing / search-result page.
 *
 * On a "multiple" page, candidate links are collected with an XPath chosen
 * by whether `url` is a search-result address; links whose href does not
 * match itemRe are ignored. If at least one candidate remains, the
 * href -> headline map is passed to Zotero.selectItems and every entry it
 * returns is handed to scrapeAndParse.
 *
 * On an article page, the first h2.NewsTitle element supplies the title;
 * nothing is saved when no such element exists.
 *
 * @param {Object} doc - page document (must provide evaluate()).
 * @param {string} url - address of the page.
 */
var doWeb = function (doc, url) {
    var type, nsResolver, availableItems, haveItems, xpath, found, nodes, myurl, items, title;
    nsResolver = getResolver(doc);
    type = detectWeb(doc, url);
    if (type === "multiple") {
        availableItems = {};
        // Tracked by hand: the non-standard Object.prototype.__count__ that
        // used to be consulted here is undefined in current engines, which
        // made this whole branch a silent no-op.
        haveItems = false;
        if (url.match(/^http:\/\/search\.mdn\.mainichi\.jp\/result\?/)) {
            xpath = '//div[@class="ResultTitle"]/a[contains(@href, "mdn.mainichi.jp")]';
        } else {
            xpath = '//h2[@class="NewsTitle"]/a[@href]|//ul[@class="Mark"]/li/a[@href]';
        }
        nodes = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
        for (found = nodes.iterateNext(); found; found = nodes.iterateNext()) {
            // Only links that point at article pages are offered.
            if (!itemRe.test(found.href)) {
                continue;
            }
            availableItems[found.href] = cleanUp(found.textContent);
            haveItems = true;
        }
        if (haveItems) {
            items = Zotero.selectItems(availableItems);
            for (myurl in items) {
                if (items.hasOwnProperty(myurl)) {
                    scrapeAndParse(myurl, availableItems[myurl]);
                }
            }
        }
    } else if (type === "newspaperArticle") {
        xpath = '//h2[@class="NewsTitle"]';
        nodes = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
        title = nodes.iterateNext();
        if (title) {
            title = cleanUp(title.textContent);
            scrapeAndParse(url, title);
        }
    }
};
|
|
|
|
// ############################
|
|
// ##### Scraper function #####
|
|
// ############################
|
|
|
|
/**
 * Create and complete one newspaperArticle item.
 *
 * Only the URL and the supplied title are used. When the URL matches itemRe,
 * the eight captured digits are reformatted as "YYYY-MM-DD" and stored as the
 * item date; otherwise no date is set. A text/html attachment pointing at the
 * same URL is added before the item is completed.
 *
 * @param {string} url - address of the article.
 * @param {string} title - headline to store as the item title.
 */
var scrapeAndParse = function (url, title) {
    var entry = new Zotero.Item("newspaperArticle");
    var hit = itemRe.exec(url);
    var stamp;

    entry.title = title;
    entry.publicationTitle = "Mainichi Daily News";
    entry.edition = "online edition";
    entry.url = url;

    if (hit) {
        stamp = hit[1];
        entry.date = stamp.slice(0, 4) + "-" + stamp.slice(4, 6) + "-" + stamp.slice(6, 8);
    }

    entry.attachments.push({
        title: "Mainichi Daily News snapshot",
        mimeType: "text/html",
        url: url
    });
    entry.complete();
};
|