- Fixes the Cell Press translator so that it (a) grabs the correct PDF and (b) saves results from full-text pages.
This commit is contained in:
parent
7c9b9a531b
commit
22e446dadf
19
scrapers.sql
19
scrapers.sql
|
@ -22,7 +22,7 @@
|
|||
|
||||
|
||||
-- Set the following timestamp to the most recent scraper update date
|
||||
REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-04-23 18:00:00'));
|
||||
REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-04-24 15:00:00'));
|
||||
|
||||
REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b4.r1', '', '2008-03-21 20:00:00', '1', '100', '4', 'Amazon.com', 'Sean Takats and Michael Berkowitz', '^https?://(?:www\.)?amazon',
|
||||
'function detectWeb(doc, url) {
|
||||
|
@ -4873,7 +4873,7 @@ function doWeb(doc, url) {
|
|||
|
||||
}');
|
||||
|
||||
REPLACE INTO translators VALUES ('f26cfb71-efd7-47ae-a28c-d4d8852096bd', '1.0.0b4.r5', '', '2008-02-14 23:15:00', '0', '99', '4', 'Cell Press', 'Michael Berkowitz', 'http://www.(cancercell|cell|cellhostandmicrobe|cellmetabolism|cellstemcell|chembiol|current-biology|developmentalcell|immunity|molecule|neuron|structure).(org|com)',
|
||||
REPLACE INTO translators VALUES ('f26cfb71-efd7-47ae-a28c-d4d8852096bd', '1.0.0b4.r5', '', '2008-04-24 15:00:00', '0', '99', '4', 'Cell Press', 'Michael Berkowitz', 'http://www.(cancercell|cell|cellhostandmicrobe|cellmetabolism|cellstemcell|chembiol|current-biology|developmentalcell|immunity|molecule|neuron|structure).(org|com)',
|
||||
'function detectWeb(doc, url) {
|
||||
if (url.indexOf("search/results?") != -1) {
|
||||
return "multiple";
|
||||
|
@ -4903,31 +4903,32 @@ REPLACE INTO translators VALUES ('f26cfb71-efd7-47ae-a28c-d4d8852096bd', '1.0.0b
|
|||
Zotero.Utilities.processDocuments(articles, function(newDoc) {
|
||||
var newItem = new Zotero.Item("journalArticle");
|
||||
newItem.title = newDoc.evaluate(''//h1[@class="article_title"]'', newDoc, null, XPathResult.ANY_TYPE, null).iterateNext().textContent;
|
||||
var voliss = newDoc.evaluate(''//div[@class="article_citation"]/p[1]'', newDoc, null, XPathResult.ANY_TYPE, null).iterateNext().textContent.split(".")[2].split(",");
|
||||
var voliss = newDoc.evaluate(''//div[contains(@class, "article_citation")]/p[1]'', newDoc, null, XPathResult.ANY_TYPE, null).iterateNext().textContent.split(".")[2].split(",");
|
||||
newItem.publicationTitle = voliss[0];
|
||||
newItem.volume = voliss[1].match(/\d+/)[0];
|
||||
newItem.pages = voliss[2];
|
||||
newItem.date = voliss[3];
|
||||
newItem.pages = Zotero.Utilities.trimInternal(voliss[2]);
|
||||
newItem.date = Zotero.Utilities.trimInternal(voliss[3]);
|
||||
newItem.abstractNote = newDoc.evaluate(''//div[@class="panelcontent article_summary"]/p[contains(text(), " ")]'', newDoc, null, XPathResult.ANY_TYPE, null).iterateNext().textContent;
|
||||
var authors = newDoc.evaluate(''//p[@class="authors"]'', newDoc, null, XPathResult.ANY_TYPE, null).iterateNext().textContent.split(",");
|
||||
for (var i in authors) {
|
||||
var next_author = authors[i];
|
||||
if (next_author.match(/[a-z]/)) {
|
||||
next_author = Zotero.Utilities.trimInternal(next_author.match(/[\w\s\.\-]+/)[0].replace(/\d/g, ""));
|
||||
next_author = Zotero.Utilities.trimInternal(next_author.replace(/\d/g, ""));
|
||||
if (next_author.substr(0, 3) == "and") {
|
||||
next_author = next_author.substr(4);
|
||||
}
|
||||
newItem.creators.push(Zotero.Utilities.cleanAuthor(next_author, "author"));
|
||||
}
|
||||
}
|
||||
var pdfx = ''//a[contains(text(), "PDF")]'';
|
||||
var pdfurl = newDoc.evaluate(pdfx, newDoc, null, XPathResult.ANY_TYPE, null).iterateNext().href;
|
||||
var newurl = newDoc.location.href;
|
||||
if (newurl.indexOf("abstract") != -1) {
|
||||
newurl = newurl.replace("abstract", "fulltext");
|
||||
}
|
||||
var uid = newurl.match(/uid=([^&]+)/)[1];
|
||||
var pdfx = ''//a[contains(text(), "PDF")][contains(@href, "'' + uid + ''")]'';
|
||||
var pdfurl = newDoc.evaluate(pdfx, newDoc, null, XPathResult.ANY_TYPE, null).iterateNext().href;
|
||||
newItem.attachments = [
|
||||
{url:url, title:"Cell Press Snapshot", mimeType:"text/html"},
|
||||
{url:newurl, title:"Cell Press Snapshot", mimeType:"text/html"},
|
||||
{url:pdfurl, title:"Cell Press Full Text PDF", mimeType:"application/pdf"}
|
||||
];
|
||||
newItem.complete();
|
||||
|
|
Loading…
Reference in New Issue
Block a user