Made the arXiv translator work with eprintweb.org as well
This commit is contained in:
parent
05b3cd8566
commit
e201c3b580
72
scrapers.sql
72
scrapers.sql
|
@ -1,4 +1,4 @@
|
||||||
-- 107
|
-- 108
|
||||||
|
|
||||||
-- ***** BEGIN LICENSE BLOCK *****
|
-- ***** BEGIN LICENSE BLOCK *****
|
||||||
--
|
--
|
||||||
|
@ -22,7 +22,7 @@
|
||||||
|
|
||||||
|
|
||||||
-- Set the following timestamp to the most recent scraper update date
|
-- Set the following timestamp to the most recent scraper update date
|
||||||
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-11-24 18:04:00'));
|
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-11-24 18:50:00'));
|
||||||
|
|
||||||
REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-11-21 22:30:00', 1, 100, 4, 'Amazon', 'Sean Takats', '^http://(?:www\.)amazon',
|
REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-11-21 22:30:00', 1, 100, 4, 'Amazon', 'Sean Takats', '^http://(?:www\.)amazon',
|
||||||
'function detectWeb(doc, url) {
|
'function detectWeb(doc, url) {
|
||||||
|
@ -4116,9 +4116,9 @@ function doWeb(doc, url) {
|
||||||
}
|
}
|
||||||
}');
|
}');
|
||||||
|
|
||||||
REPLACE INTO "translators" VALUES ('ecddda2e-4fc6-4aea-9f17-ef3b56d7377a', '2006-11-24 16:32:00', 1, 100, 4, 'arXiv.org', 'Simon Kornblith', '^http://(?:www\.)?arxiv\.org/(?:find/\w|abs/[^/]+/[0-9]+)',
|
REPLACE INTO "translators" VALUES ('ecddda2e-4fc6-4aea-9f17-ef3b56d7377a', '2006-11-24 18:50:00', 1, 100, 4, 'arXiv.org/eprintweb.org', 'Simon Kornblith', '^http://(?:www\.)?(?:arxiv\.org/(?:find/\w|abs/[^/]+/[0-9]+)|eprintweb.org/S/search)',
|
||||||
'function detectWeb(doc, url) {
|
'function detectWeb(doc, url) {
|
||||||
var searchRe = /http:\/\/(?:www\.)?arxiv\.org\/find/;
|
var searchRe = /^http:\/\/(?:www\.)?(?:arxiv\.org\/find|eprintweb.org\/S\/search$)/;
|
||||||
if(searchRe.test(url)) {
|
if(searchRe.test(url)) {
|
||||||
return "multiple";
|
return "multiple";
|
||||||
} else {
|
} else {
|
||||||
|
@ -4160,21 +4160,70 @@ REPLACE INTO "translators" VALUES ('ecddda2e-4fc6-4aea-9f17-ef3b56d7377a', '2006
|
||||||
function doWeb(doc, url) {
|
function doWeb(doc, url) {
|
||||||
var fetchIDs = new Array();
|
var fetchIDs = new Array();
|
||||||
|
|
||||||
var absRe = /http:\/\/(?:www\.)?arxiv\.org\/abs\/(.+)$/;
|
var arxivAbsRe = /^http:\/\/(?:www\.)?arxiv\.org\/abs\/(.+)$/;
|
||||||
var m = absRe.exec(url);
|
var eprintsAbsRe = /^http:\/\/(?:www\.)?eprintweb.org\/S\/search(.*)$/
|
||||||
|
|
||||||
|
var arxivM = arxivAbsRe.exec(url);
|
||||||
|
var eprintsM = eprintsAbsRe.exec(url);
|
||||||
|
|
||||||
if(m) {
|
|
||||||
// single
|
|
||||||
fetchIDs.push(m[1]);
|
|
||||||
} else{
|
|
||||||
// search
|
|
||||||
var namespace = doc.documentElement.namespaceURI;
|
var namespace = doc.documentElement.namespaceURI;
|
||||||
var nsResolver = namespace ? function(prefix) {
|
var nsResolver = namespace ? function(prefix) {
|
||||||
if (prefix == ''x'') return namespace; else return null;
|
if (prefix == ''x'') return namespace; else return null;
|
||||||
} : null;
|
} : null;
|
||||||
|
|
||||||
|
if(arxivM) {
|
||||||
|
// arxiv single
|
||||||
|
fetchIDs.push(arxivM[1]);
|
||||||
|
} else if(eprintsM && eprintsM[1]) {
|
||||||
|
// eprints single
|
||||||
|
if(url.indexOf("refs") != -1 || url.indexOf("cited") != -1) {
|
||||||
|
var id = doc.evaluate(''//td[@class="panel"]//td[@class="txt"]/b[2]'', doc, nsResolver,
|
||||||
|
Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null).iterateNext().textContent;
|
||||||
|
} else {
|
||||||
|
var id = doc.evaluate(''//td[@class="panel"]//td[@class="txt"]/b'', doc, nsResolver,
|
||||||
|
Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null).iterateNext().textContent;
|
||||||
|
id = id.replace("/ ", "/");
|
||||||
|
id = id.substr(0, id.indexOf(" "));
|
||||||
|
}
|
||||||
|
fetchIDs.push(id);
|
||||||
|
} else {
|
||||||
|
// search
|
||||||
var items = new Object();
|
var items = new Object();
|
||||||
|
|
||||||
|
if(eprintsM) {
|
||||||
|
// eprints search
|
||||||
|
|
||||||
|
// get ids and titles
|
||||||
|
var started = false;
|
||||||
|
var elmts = doc.evaluate(''//td[@class="panel"]/table/tbody/tr/td'', doc, nsResolver,
|
||||||
|
Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null);
|
||||||
|
var elmt, title, id;
|
||||||
|
while(elmt = elmts.iterateNext()) {
|
||||||
|
if(!started && elmt.className == "lti") {
|
||||||
|
// wait until first title to process
|
||||||
|
started = true;
|
||||||
|
title = elmt.textContent;
|
||||||
|
} else if(started) {
|
||||||
|
if(elmt.className == "lti") {
|
||||||
|
// finish previous item
|
||||||
|
items[id] = title;
|
||||||
|
title = null;
|
||||||
|
// grab title
|
||||||
|
title = elmt.textContent;
|
||||||
|
} else if(elmt.className == "txt") {
|
||||||
|
// get id
|
||||||
|
var tags = elmt.getElementsByTagName("b");
|
||||||
|
id = tags[0].textContent;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if(title) {
|
||||||
|
items[id] = title;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// arxiv search
|
||||||
|
|
||||||
// get IDs and titles
|
// get IDs and titles
|
||||||
var ids = doc.evaluate(''//div[@id="content"]/dl/dt'', doc, nsResolver,
|
var ids = doc.evaluate(''//div[@id="content"]/dl/dt'', doc, nsResolver,
|
||||||
Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null);
|
Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null);
|
||||||
|
@ -4190,6 +4239,7 @@ function doWeb(doc, url) {
|
||||||
|
|
||||||
items[realID] = realID + " - " + title.textContent;
|
items[realID] = realID + " - " + title.textContent;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
items = Zotero.selectItems(items);
|
items = Zotero.selectItems(items);
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user