-Adds fixed version of The Hindu translator.
This commit is contained in:
parent
fb40b61696
commit
89bdd9ee28
88
scrapers.sql
88
scrapers.sql
|
@ -22,7 +22,7 @@
|
|||
|
||||
|
||||
-- Set the following timestamp to the most recent scraper update date
|
||||
REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-05-08 20:30:00'));
|
||||
REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-05-08 21:00:00'));
|
||||
|
||||
REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b4.r1', '', '2008-03-21 20:00:00', '1', '100', '4', 'Amazon.com', 'Sean Takats and Michael Berkowitz', '^https?://(?:www\.)?amazon',
|
||||
'function detectWeb(doc, url) {
|
||||
|
@ -1089,6 +1089,91 @@ REPLACE INTO translators VALUES ('88915634-1af6-c134-0171-56fd198235ed', '1.0.0b
|
|||
Zotero.wait();
|
||||
}');
|
||||
|
||||
REPLACE INTO translators VALUES ('9499c586-d672-42d6-9ec4-ee9594dcc571', '1.0.0b4.r5', '', '2008-05-08 21:00:00', '0', '100', '4', 'The Hindu', 'Prashant Iyengar and Michael Berkowitz', 'http://(www.)?hindu.com',
|
||||
'function detectWeb(doc, url) {
|
||||
if (doc.evaluate(''//h2[@class="r"]/a[@class="l"]'', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
|
||||
return "multiple";
|
||||
} else {
|
||||
return "newspaperArticle";
|
||||
}
|
||||
}
|
||||
',
|
||||
'function regexMeta(str, item) {
|
||||
var re = /NAME\=\"([\w\W]*?)\"\s+CONTENT\=\"([\w\W]*?)\"/;
|
||||
var stuff = str.match(re);
|
||||
if (stuff)
|
||||
{
|
||||
if (stuff[1] == "PAGEHEAD") {
|
||||
item.section = stuff[2].split(/\s+/)[0];
|
||||
}
|
||||
if (stuff[1] == "ZONE") {
|
||||
item.place = stuff[2].split(/\s+/)[0];
|
||||
}
|
||||
if (stuff[1] == "PAGENUMBER") {
|
||||
item.pages = stuff[2].split(/\s+/)[0];
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
function doWeb(doc, url) {
|
||||
var arts = new Array();
|
||||
if (detectWeb(doc, url) == "multiple") {
|
||||
var xpath = ''//h2[@class="r"]/a[@class="l"]'';
|
||||
var links = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null);
|
||||
var link;
|
||||
var items = new Object();
|
||||
while (link = links.iterateNext()) {
|
||||
items[link.href] = link.textContent;
|
||||
}
|
||||
items = Zotero.selectItems(items);
|
||||
for (var i in items) {
|
||||
arts.push(i);
|
||||
}
|
||||
|
||||
} else { arts = [url]; }
|
||||
for each (var art in arts) {
|
||||
Zotero.debug(art);
|
||||
Zotero.Utilities.HTTP.doGet(art, function(text) {
|
||||
var newItem = new Zotero.Item("newspaperArticle");
|
||||
newItem.publicationTitle = "The Hindu";
|
||||
newItem.url = art;
|
||||
//title
|
||||
var t = /\<TITLE\>[\w\W]*\:([\w\W]*?)<\/TITLE/;
|
||||
newItem.title = Zotero.Utilities.unescapeHTML(Zotero.Utilities.capitalizeTitle(text.match(t)[1]));
|
||||
|
||||
var ti = /\<FONT color\=black\>(.*)?\<\/FONT\>/;
|
||||
newItem.date = text.match(ti)[1];
|
||||
|
||||
var auth = /\<font class\=storyhead[\w\W]*?justify\>([\w\W]*?)\<p\>/;
|
||||
if (text.match(auth))
|
||||
{
|
||||
//newItem.author=Zotero.Utilities.cleanAuthor(text.match(auth)[1]);
|
||||
cleanauth=Zotero.Utilities.cleanTags(text.match(auth)[1]);
|
||||
newItem.creators.push(Zotero.Utilities.cleanAuthor(cleanauth, "author"));
|
||||
|
||||
}
|
||||
|
||||
newItem.websiteTitle="The Hindu";
|
||||
newItem.edition="Online";
|
||||
|
||||
//hooray for real meta tags!
|
||||
var meta = /<META NAME[\w\W]*?\>/g;
|
||||
var metaTags = text.match(meta);
|
||||
for (var i = 0 ; i <metaTags.length ; i++) {
|
||||
regexMeta(metaTags[i], newItem);
|
||||
}
|
||||
newItem.complete();
|
||||
Zotero.done();
|
||||
});
|
||||
Zotero.wait();
|
||||
}
|
||||
}');
|
||||
|
||||
REPLACE INTO translators VALUES ('e8d40f4b-c4c9-41ca-a59f-cf4deb3d3dc5', '1.0.0b4.r5', '', '2008-05-08 20:30:00', '0', '100', '4', 'Business Standard', 'Prashant Iyengar and Michael Berkowitz', 'http://www.business-standard.com',
|
||||
'function detectWeb(doc, url) {
|
||||
if (url.match(/googlesearch/)) {
|
||||
|
@ -1113,7 +1198,6 @@ REPLACE INTO translators VALUES ('e8d40f4b-c4c9-41ca-a59f-cf4deb3d3dc5', '1.0.0b
|
|||
} else {
|
||||
arts = [url];
|
||||
}
|
||||
Zotero.debug(arts);
|
||||
Zotero.Utilities.processDocuments(arts, function(doc) {
|
||||
var newItem = new Zotero.Item("newspaperArticle");
|
||||
newItem.publicationTitle = "The Business Standard";
|
||||
|
|
Loading…
Reference in New Issue
Block a user