zotero/translators/Japan Times Online.js
2010-09-28 06:59:58 +00:00

182 lines
5.1 KiB
JavaScript

{
"translatorID":"b56d756e-934e-4b46-bc58-d61dccc9f32f",
"translatorType":4,
"label":"Japan Times Online",
"creator":"Frank Bennett",
"target":"^http://(?:www|search)\\.japantimes\\.co\\.jp/(?:cgi-bin|gsearch|features|entertainment|sports|life|news|rss)",
"minVersion":"2.0b7",
"maxVersion":"",
"priority":100,
"inRepository":true,
"lastUpdated":"2010-09-28 07:00:00"
}
/*
Japan Times Online Translator
Copyright (C) 2009-2010 Frank Bennett, biercenator@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
// #################################
// #### Local utility functions ####
// #################################
// Matches the URL of a single item page on search.japantimes.co.jp: one of
// the listed section directories followed by a file name made of two
// lowercase letters, eight digits, two lowercase alphanumerics and ".html".
// The pattern is anchored at the start only, so anything may follow ".html".
//
// Written as a regex literal on purpose: inside a quoted string "\." collapses
// to a bare "." (any character), which made the host and extension checks
// looser than intended.
var itemRe = /^http:\/\/search\.japantimes\.co\.jp\/(?:cgi-bin|gsearch|features|entertainment|sports|life|news|rss)\/[a-z]{2}[0-9]{8}[a-z0-9]{2}\.html/;
/**
 * Build a namespace resolver suitable for passing to doc.evaluate().
 *
 * @param doc  Document whose root element's namespaceURI is inspected.
 * @return     null when the root element has no (or an empty) namespace URI;
 *             otherwise a function that maps the prefix 'x' to that URI and
 *             every other prefix to null.
 */
var getResolver = function (doc) {
    var rootNamespace = doc.documentElement.namespaceURI;
    // No namespace on the root element: no resolver is needed.
    if (!rootNamespace) {
        return null;
    }
    return function (prefix) {
        return prefix == 'x' ? rootNamespace : null;
    };
};
/**
 * Pull the text that immediately follows the first tag carrying
 * attribute="value" out of a chunk of raw markup.
 *
 * The captured text runs from the end of that tag up to the next "<", so
 * markup nested directly after the tag yields only the text before it.
 *
 * @param txt        Raw markup to search.
 * @param attribute  Attribute name, matched literally.
 * @param value      Attribute value (double-quoted in the markup), matched
 *                   literally.
 * @return           The captured text (possibly an empty string), or false
 *                   when no such tag is found.
 */
var getTagContent = function (txt, attribute, value) {
    // Escape regex metacharacters so that names and values such as "a.b"
    // are matched literally instead of being interpreted as patterns.
    var escapeForRegExp = function (str) {
        return String(str).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    };
    var rex = new RegExp(
        "<[^>]*" + escapeForRegExp(attribute) + "=\"" + escapeForRegExp(value) + "\"[^>]*>([^<]*)<"
    );
    var m = rex.exec(txt);
    return m ? m[1] : false;
};
/**
 * Scan raw markup for <tag ...>content</tag> pairs and collect those whose
 * given attribute holds an item URL (as judged by itemRe).
 *
 * @param txt        Raw markup to scan.
 * @param tag        Tag name to look for, e.g. "a".
 * @param attribute  Attribute whose double-quoted value is used as the key,
 *                   e.g. "href".
 * @return           Object mapping attribute value -> cleaned-up content.
 *                   The content is cut at the first "|", stripped of nested
 *                   tags, HTML-unescaped and trimmed. Empty object when
 *                   nothing matches.
 */
var getTagsWithAttributeAndContent = function (txt, tag, attribute) {
    var ret = {};
    var closeTag = "</" + tag + ">";
    var tagsrex = new RegExp("(<" + tag + "(?: [^>]*>|>)|</" + tag + ">)");
    var attribrex = new RegExp(' ' + attribute + '="([^"]+)"');
    // Splitting on a capturing pattern keeps the delimiters: odd slots hold
    // the opening/closing tags, even slots hold the text between them.
    var lst = txt.split(tagsrex);
    var pos, m, title;
    // Visit every tag slot rather than assuming a strict open/close rhythm;
    // a stray closing tag or an unclosed opening tag would otherwise shift
    // the alignment and hide every link that follows it.
    for (pos = 1; pos + 2 < lst.length; pos += 2) {
        if (lst[pos] === closeTag || lst[pos + 2] !== closeTag) {
            continue;
        }
        m = lst[pos].match(attribrex);
        if (!m || !itemRe.test(m[1])) {
            continue;
        }
        title = lst[pos + 1].replace(/\|.*/, "").replace(/<[^>]+>/g, "");
        // Trim so that cutting at "|" does not leave a dangling space.
        ret[m[1]] = Zotero.Utilities.unescapeHTML(title).replace(/^\s+|\s+$/g, "");
    }
    return ret;
};
// #########################
// ##### API functions #####
// #########################
/**
 * Classify the current page for Zotero.
 *
 * The decision is based on the document's own location (doc.location.href),
 * not on the url argument.
 *
 * @param doc  Document being inspected.
 * @param url  Unused here.
 * @return     "newspaperArticle" when the document location matches itemRe,
 *             "multiple" for every other page.
 */
var detectWeb = function (doc, url) {
    return itemRe.test(doc.location.href) ? "newspaperArticle" : "multiple";
};
/**
 * Zotero entry point: save the current article, or collect the article
 * links on a listing/search page, let the user pick, and save each pick.
 *
 * @param doc  Document being translated.
 * @param url  URL of that document; also decides whether the page is
 *             treated as a /gsearch/ page.
 */
var doWeb = function (doc, url) {
    var type, nsResolver, availableItems, xpath, found, nodes, headline, myurl, items, iframe, page, hasItems;
    nsResolver = getResolver(doc);
    type = detectWeb(doc, url);
    if (type === "newspaperArticle") {
        scrapeAndParse(url);
        return;
    }
    if (type !== "multiple") {
        return;
    }
    availableItems = {};
    if (url.match(/\/gsearch\//)) {
        //
        // For Google SafeSearch: the results are not in this document but
        // in the page loaded by the "googleSearchFrame" iframe, so fetch
        // that page's source and pick the links out of the raw markup.
        //
        xpath = '//iframe[@name="googleSearchFrame"]';
        iframe = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
        // The frame may be missing; in that case there is nothing to offer.
        if (iframe && iframe.src) {
            page = Zotero.Utilities.retrieveSource(iframe.src);
            if (page) {
                availableItems = getTagsWithAttributeAndContent(page, "a", "href");
            }
        }
    } else {
        xpath = '//a[contains(@href, "cgi-bin")]';
        nodes = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
        for (found = nodes.iterateNext(); found; found = nodes.iterateNext()) {
            if (!itemRe.test(found.href)) {
                continue;
            }
            // Collapse every run of whitespace (newlines and non-breaking
            // spaces included) to one space, then trim both ends.
            headline = found.text.replace(/\s+/g, " ").replace(/^ | $/g, "");
            //
            // Some headlines have a weird structure that yields two
            // entries, the second of which is blank. Skipping blank ones
            // loses nothing and keeps the real headline from being
            // overwritten.
            //
            if (!headline) {
                continue;
            }
            availableItems[found.href] = headline;
        }
    }
    // Portable emptiness check; the SpiderMonkey-only __count__ property no
    // longer exists and would make every result list look empty.
    hasItems = false;
    for (myurl in availableItems) {
        if (availableItems.hasOwnProperty(myurl)) {
            hasItems = true;
            break;
        }
    }
    if (!hasItems) {
        return;
    }
    items = Zotero.selectItems(availableItems);
    for (myurl in items) {
        if (items.hasOwnProperty(myurl)) {
            scrapeAndParse(myurl);
        }
    }
};
// ############################
// ##### Scraper function #####
// ############################
/**
 * Fetch one article page and save it as a Zotero newspaperArticle item.
 *
 * The date and title are read from the text following the tags with
 * id="date" and id="headline" in the page source; either field is left
 * unset when its tag is missing or empty. A snapshot attachment of the page
 * is always added.
 *
 * @param url  Address of the article page; also stored as the item URL.
 */
var scrapeAndParse = function (url) {
    var item, mytxt, val;
    // The values are cut out of raw markup, so decode HTML entities and
    // drop surrounding whitespace before storing them; this keeps saved
    // titles consistent with the unescaped titles shown for search results.
    var clean = function (raw) {
        return Zotero.Utilities.unescapeHTML(raw).replace(/^\s+|\s+$/g, "");
    };
    item = new Zotero.Item("newspaperArticle");
    mytxt = Zotero.Utilities.retrieveSource(url);
    item.publicationTitle = "Japan Times Online";
    item.ISSN = "0289-1956";
    item.url = url;
    val = getTagContent(mytxt, "id", "date");
    if (val) {
        val = clean(val);
        if (val) {
            item.date = val;
        }
    }
    val = getTagContent(mytxt, "id", "headline");
    if (val) {
        val = clean(val);
        if (val) {
            item.title = val;
        }
    }
    item.attachments.push({title:"Japan Times Online snapshot", mimeType:"text/html", url:url});
    item.complete();
};