-Updates PLoS detectWeb() so that it no longer reports a match on pages that are neither articles nor search results
-Adds Innovate Online translator
This commit is contained in:
parent
e02b3a1f5e
commit
647c47257d
96
scrapers.sql
96
scrapers.sql
|
@ -22,7 +22,7 @@
|
|||
|
||||
|
||||
-- Set the following timestamp to the most recent scraper update date
|
||||
REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-01-06 23:55:00'));
|
||||
REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-01-07 19:00:00'));
|
||||
|
||||
REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b4.r1', '', '2007-06-21 20:00:00', '1', '100', '4', 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon',
|
||||
'function detectWeb(doc, url) {
|
||||
|
@ -2086,11 +2086,101 @@ function getData(ids){
|
|||
Zotero.wait();
|
||||
}');
|
||||
|
||||
REPLACE INTO translators VALUES ('bdae838b-3a58-461f-9e8a-142ed9de61dc', '1.0.0b4.r5', '', '2007-12-05 17:00:00', '0', '100', '4', 'PLoS Journals', 'Michael Berkowitz', '^http://[^.]+\.plosjournals\.org/',
|
||||
-- Innovate Online translator.
-- The last two columns hold the JavaScript source of detectWeb() and doWeb() as SQL
-- string literals, so every single quote inside the JavaScript must be doubled.
REPLACE INTO translators VALUES ('ca6e95d1-46b9-4535-885c-df0c2d4b7f7a', '1.0.0b4.r5', '', '2008-01-07 19:00:00', '0', '100', '4', 'Innovate Online', 'Michael Berkowitz', '^http://(www\.)?innovateonline\.info/',
'function detectWeb(doc, url) {
	// The page type is decided purely from the "view" query parameter in the URL.
	// Any other page falls through and returns undefined.
	if (url.indexOf("view=article") != -1) {
		return "journalArticle";
	} else if (url.indexOf("view=search") != -1) {
		return "multiple";
	}
}',
'function doWeb(doc, url) {
	// Resolver that maps the "x" prefix to the namespace of the document; null when
	// the document has no namespace. It is handed to every evaluate() call below.
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == ''x'') return namespace; else return null;
	} : null;

	// Returns the text of the first node matching xpath in context, or null when
	// nothing matches, so that a missing element does not abort the whole item.
	function firstText(context, xpath) {
		var node = context.evaluate(xpath, context, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
		return node ? node.textContent : null;
	}

	var newURIs = new Array();

	if (url.indexOf("view=search") != -1) {
		// Search results: collect titles and article ids, then let the user choose.
		var titles = new Array();
		var hrefs = new Array();
		var items = new Object();

		var xpath = ''//ul[@class="articles"]/li[@class="result"]/div[@class="header"]'';
		var names = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
		var next_item = names.iterateNext();
		while (next_item) {
			// The title is taken from the fourth line of the header text.
			titles.push(next_item.textContent.split(/\n/)[3]);
			next_item = names.iterateNext();
		}

		var nextpath = ''//ul[@class="articles"]/li/@onclick'';
		var links = doc.evaluate(nextpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
		var next_link = links.iterateNext();
		while (next_link) {
			hrefs.push(next_link.textContent);
			next_link = links.iterateNext();
		}

		// Pair titles with onclick handlers by position. Stop at the shorter list and
		// skip handlers that contain no number instead of throwing.
		for (var i = 0; i < titles.length && i < hrefs.length; i++) {
			var idMatch = hrefs[i].match(/\d+/);
			if (idMatch) {
				items[idMatch[0]] = titles[i];
			}
		}

		items = Zotero.selectItems(items);
		if (!items) {
			// Nothing was selected: stop here rather than waiting for a done() that never comes.
			return true;
		}

		for (var i in items) {
			newURIs.push(''http://innovateonline.info/index.php?view=article&id='' + i);
		}
	} else {
		// Single article: normalise the current URL.
		var newURL = url;
		if (newURL.indexOf("highlight") != -1) {
			// Cut the URL just before the separator that precedes "highlight".
			newURL = newURL.substring(0, newURL.indexOf("highlight") -1);
		}
		if (newURL.indexOf("action=synopsis") != -1) {
			newURL = newURL.replace("action=synopsis", "action=article");
		}
		newURIs.push(newURL);
	}
	Zotero.debug(newURIs);

	Zotero.Utilities.processDocuments(newURIs, function(newDoc) {
		var newItem = new Zotero.Item("journalArticle");
		newItem.repository = "Innovate Online";
		newItem.publicationTitle = "Innovate";
		// Drops the first 10 characters of the page title (presumably a fixed site prefix - confirm).
		newItem.title = newDoc.title.substring(10);

		var authors = newDoc.evaluate(''//div[@id="title"]/div[@class="author"]/a'', newDoc, nsResolver, XPathResult.ANY_TYPE, null);
		var author = authors.iterateNext();
		while (author) {
			newItem.creators.push(Zotero.Utilities.cleanAuthor(author.textContent, "author"));
			author = authors.iterateNext();
		}

		var date = firstText(newDoc, ''//div[@id="page"]/a/div[@class="title"]'');
		if (date) {
			newItem.date = date;
		}

		var subtitle = firstText(newDoc, ''//div[@id="page"]/a/div[@class="subtitle"]'');
		var voliss = subtitle ? subtitle.match(/Volume\s+(\d+).*Issue\s+(\d+)/) : null;
		if (voliss) {
			newItem.volume = voliss[1];
			newItem.issue = voliss[2];
		}

		var pageURL = newDoc.location.href;
		newItem.attachments = [
			{url:pageURL, title:"Innovate Online Snapshot", mimeType:"text/html"}
		];
		// The first run of digits in the page URL is used as the article id for the PDF link.
		var id = pageURL.match(/\d+/);
		if (id) {
			newItem.attachments.push({url:"http://innovateonline.info/print.php?view=pdf&id=" + id[0], title:"Innovate Online PDF", mimeType:"application/pdf"});
		}

		// The abstract is read from the synopsis view of the same article.
		Zotero.Utilities.HTTP.doGet(pageURL.replace("action=article", "action=synopsis"), function(text) {
			var abs = text.match(/<div id=\"synopsis\">\n<p>(.*)<\/p>/);
			if (abs) {
				newItem.abstractNote = Zotero.Utilities.unescapeHTML(Zotero.Utilities.cleanTags(abs[1]));
			}
			// Complete the item even when no synopsis was found.
			newItem.complete();
		});
	// Zotero.done must actually be called to end the wait started below.
	// NOTE(review): the synopsis requests are asynchronous - confirm that done() cannot
	// fire before the last newItem.complete().
	}, function() {Zotero.done();});
	Zotero.wait();
}');
|
||||
|
||||
REPLACE INTO translators VALUES ('bdae838b-3a58-461f-9e8a-142ed9de61dc', '1.0.0b4.r5', '', '2008-01-07 19:00:00', '0', '100', '4', 'PLoS Journals', 'Michael Berkowitz', '^http://[^.]+\.plosjournals\.org/',
|
||||
'function detectWeb(doc, url) {
	// A browse/search results list on the page means multiple items.
	if (doc.evaluate(''//div[@class="search"][@id="browseResults"]/ul/li/span/a'', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
		return "multiple";
	// Otherwise only URLs containing "get-document" are treated as articles;
	// every other page returns undefined and is not detected.
	} else if (url.indexOf("get-document") != -1) {
		return "journalArticle";
	}
}',
|
||||
|
|
Loading…
Reference in New Issue
Block a user