-Updates PLoS detectWeb() so that it no longer matches pages that are neither articles nor search results
-Adds Innovate Online translator
This commit is contained in:
parent
e02b3a1f5e
commit
647c47257d
96
scrapers.sql
96
scrapers.sql
|
@ -22,7 +22,7 @@
|
||||||
|
|
||||||
|
|
||||||
-- Set the following timestamp to the most recent scraper update date
|
-- Set the following timestamp to the most recent scraper update date
|
||||||
REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-01-06 23:55:00'));
|
REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-01-07 19:00:00'));
|
||||||
|
|
||||||
REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b4.r1', '', '2007-06-21 20:00:00', '1', '100', '4', 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon',
|
REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b4.r1', '', '2007-06-21 20:00:00', '1', '100', '4', 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon',
|
||||||
'function detectWeb(doc, url) {
|
'function detectWeb(doc, url) {
|
||||||
|
@ -2086,11 +2086,101 @@ function getData(ids){
|
||||||
Zotero.wait();
|
Zotero.wait();
|
||||||
}');
|
}');
|
||||||
|
|
||||||
REPLACE INTO translators VALUES ('bdae838b-3a58-461f-9e8a-142ed9de61dc', '1.0.0b4.r5', '', '2007-12-05 17:00:00', '0', '100', '4', 'PLoS Journals', 'Michael Berkowitz', '^http://[^.]+\.plosjournals\.org/',
|
-- Innovate Online translator row. The dots in the target regex are escaped so that they match only a literal "."
REPLACE INTO translators VALUES ('ca6e95d1-46b9-4535-885c-df0c2d4b7f7a', '1.0.0b4.r5', '', '2008-01-07 19:00:00', '0', '100', '4', 'Innovate Online', 'Michael Berkowitz', '^http://(www\.)?innovateonline\.info/',
|
||||||
|
'function detectWeb(doc, url) {
	// Classifies the page purely from its URL; doc is not inspected.
	// The article marker is checked first, so it wins if both markers are present.
	var isArticlePage = url.indexOf("view=article") != -1;
	if (isArticlePage) {
		return "journalArticle";
	}

	var isSearchPage = url.indexOf("view=search") != -1;
	if (isSearchPage) {
		return "multiple";
	}
	// Any other URL falls through and the function returns undefined.
}',
|
||||||
|
'function doWeb(doc, url) {
	/*
	 * Saves Innovate Online articles as journalArticle items.
	 *
	 * doc - DOM document of the page the translator was invoked on
	 * url - address of that page
	 *
	 * On a search page (URL contains "view=search") the results are offered
	 * through Zotero.selectItems() and an article URL is built for each chosen id.
	 * On any other page the current URL is used, with a trailing "highlight"
	 * parameter removed and the synopsis view swapped for the article view.
	 * Each URL is then loaded with processDocuments() and scraped; the abstract
	 * is fetched with a second request for the synopsis view of the same page.
	 */
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == ''x'') return namespace; else return null;
	} : null;
	var articleBase = ''http://innovateonline.info/index.php?view=article&id='';
	var newURIs = [];

	// Returns the text of the first node matching xpath in context, or null when nothing matches.
	function firstText(context, xpath) {
		var node = context.evaluate(xpath, context, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
		return node ? node.textContent : null;
	}

	if (url.indexOf("view=search") != -1) {
		var titles = [];
		var hrefs = [];
		var items = {};

		var xpath = ''//ul[@class="articles"]/li[@class="result"]/div[@class="header"]'';
		var names = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
		var next_item = names.iterateNext();
		while (next_item) {
			// NOTE(review): the title is read from the fourth line of the header text - confirm against the live markup.
			titles.push(next_item.textContent.split(/\n/)[3]);
			next_item = names.iterateNext();
		}

		var nextpath = ''//ul[@class="articles"]/li/@onclick'';
		var links = doc.evaluate(nextpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
		var next_link = links.iterateNext();
		while (next_link) {
			hrefs.push(next_link.textContent);
			next_link = links.iterateNext();
		}

		// Pair each title with the numeric id found in the onclick handler at the same position.
		// The two XPaths select different li sets, so only the overlapping range is paired and
		// handlers without digits are skipped instead of raising an error.
		var pairCount = Math.min(titles.length, hrefs.length);
		for (var i = 0; i < pairCount; i++) {
			var resultId = hrefs[i].match(/\d+/);
			if (resultId) {
				items[resultId[0]] = titles[i];
			}
		}
		items = Zotero.selectItems(items);

		for (var selectedId in items) {
			newURIs.push(articleBase + selectedId);
		}
	} else {
		var newURL = url;
		if (newURL.indexOf("highlight") != -1) {
			// Cut the URL just before the separator that precedes "highlight".
			newURL = newURL.substring(0, newURL.indexOf("highlight") - 1);
		}
		if (newURL.indexOf("action=synopsis") != -1) {
			newURL = newURL.replace("action=synopsis", "action=article");
		}
		newURIs.push(newURL);
	}
	Zotero.debug(newURIs);

	Zotero.Utilities.processDocuments(newURIs, function(newDoc) {
		var pageURL = newDoc.location.href;
		var newItem = new Zotero.Item("journalArticle");
		newItem.repository = "Innovate Online";
		newItem.publicationTitle = "Innovate";
		// Drops the first 10 characters of the page title (presumably a fixed site prefix - TODO confirm).
		newItem.title = newDoc.title.substring(10);

		var authors = newDoc.evaluate(''//div[@id="title"]/div[@class="author"]/a'', newDoc, nsResolver, XPathResult.ANY_TYPE, null);
		var author = authors.iterateNext();
		while (author) {
			newItem.creators.push(Zotero.Utilities.cleanAuthor(author.textContent, "author"));
			author = authors.iterateNext();
		}

		// Date, volume and issue are optional: a page without these nodes still yields an item.
		var dateText = firstText(newDoc, ''//div[@id="page"]/a/div[@class="title"]'');
		if (dateText !== null) {
			newItem.date = dateText;
		}

		var issueText = firstText(newDoc, ''//div[@id="page"]/a/div[@class="subtitle"]'');
		var voliss = issueText ? issueText.match(/Volume\s+(\d+).*Issue\s+(\d+)/) : null;
		if (voliss) {
			newItem.volume = voliss[1];
			newItem.issue = voliss[2];
		}

		var attachments = [
			{url:pageURL, title:"Innovate Online Snapshot", mimeType:"text/html"}
		];
		// The PDF link needs the article id, taken from the first run of digits in the page URL.
		var articleId = pageURL.match(/\d+/);
		if (articleId) {
			var PDFurl = "http://innovateonline.info/print.php?view=pdf&id=" + articleId[0];
			attachments.push({url:PDFurl, title:"Innovate Online PDF", mimeType:"application/pdf"});
		}
		newItem.attachments = attachments;

		Zotero.Utilities.HTTP.doGet(pageURL.replace("action=article", "action=synopsis"), function(text) {
			// The synopsis block may be absent (for example when the URL carries no action parameter);
			// in that case the item is saved without an abstract rather than being lost.
			var abs = text.match(/<div id=\"synopsis\">\n<p>(.*)<\/p>/);
			if (abs) {
				newItem.abstractNote = Zotero.Utilities.unescapeHTML(Zotero.Utilities.cleanTags(abs[1]));
			}
			newItem.complete();
		});
	}, function() {
		// Signal completion once every page has been handed to the scraper callback.
		Zotero.done();
	});
	Zotero.wait();
}');
|
||||||
|
|
||||||
|
REPLACE INTO translators VALUES ('bdae838b-3a58-461f-9e8a-142ed9de61dc', '1.0.0b4.r5', '', '2008-01-07 19:00:00', '0', '100', '4', 'PLoS Journals', 'Michael Berkowitz', '^http://[^.]+\.plosjournals\.org/',
|
||||||
'function detectWeb(doc, url) {
|
'function detectWeb(doc, url) {
|
||||||
if (doc.evaluate(''//div[@class="search"][@id="browseResults"]/ul/li/span/a'', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
|
if (doc.evaluate(''//div[@class="search"][@id="browseResults"]/ul/li/span/a'', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
|
||||||
return "multiple";
|
return "multiple";
|
||||||
} else {
|
} else if (url.indexOf("get-document") != -1) {
|
||||||
return "journalArticle";
|
return "journalArticle";
|
||||||
}
|
}
|
||||||
}',
|
}',
|
||||||
|
|
Loading…
Reference in New Issue
Block a user