Updated arXiv.org/eprintweb translator handles new article IDs

This commit is contained in:
Sean Takats 2007-04-13 15:51:34 +00:00
parent 81b4ce0069
commit ce53ec03a4

View File

@ -1,4 +1,4 @@
-- 215 -- 216
-- ***** BEGIN LICENSE BLOCK ***** -- ***** BEGIN LICENSE BLOCK *****
-- --
@ -22,7 +22,7 @@
-- Set the following timestamp to the most recent scraper update date -- Set the following timestamp to the most recent scraper update date
REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-04-05 19:45:00')); REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-04-13 16:05:00'));
REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b4.r1', '', '2007-03-21 15:26:54', '1', '100', '4', 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon', REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b4.r1', '', '2007-03-21 15:26:54', '1', '100', '4', 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
@ -4503,7 +4503,7 @@ function doWeb(doc, url) {
} }
}'); }');
REPLACE INTO translators VALUES ('ecddda2e-4fc6-4aea-9f17-ef3b56d7377a', '1.0.0b3.r1', '', '2007-04-02 14:55:00', '1', '100', '4', 'arXiv.org', 'Sean Takats', '^http://(?:www\.)?(?:arxiv\.org/(?:find/\w|list/\w|abs/[^/]+/[0-9]+)|eprintweb.org/S/(?:search|archive|article))', REPLACE INTO translators VALUES ('ecddda2e-4fc6-4aea-9f17-ef3b56d7377a', '1.0.0b3.r1', '', '2007-04-13 16:05:00', '1', '100', '4', 'arXiv.org', 'Sean Takats', '^http://(?:www\.)?(?:arxiv\.org/(?:find/\w|list/\w|abs/)|eprintweb.org/S/(?:search|archive|article)(?!.*refs$)(?!.*cited$))',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
var searchRe = /^http:\/\/(?:www\.)?(?:arxiv\.org\/(?:find|list)|eprintweb.org\/S\/(?:archive|search$))/; var searchRe = /^http:\/\/(?:www\.)?(?:arxiv\.org\/(?:find|list)|eprintweb.org\/S\/(?:archive|search$))/;
if(searchRe.test(url)) { if(searchRe.test(url)) {
@ -4518,10 +4518,13 @@ REPLACE INTO translators VALUES ('ecddda2e-4fc6-4aea-9f17-ef3b56d7377a', '1.0.0b
} }
function doWeb(doc, url) { function doWeb(doc, url) {
var eprintsMultRe = /^http:\/\/(?:www\.)?eprintweb.org\/S\/(?:search|archive)/; var eprintMultRe = /^http:\/\/(?:www\.)?eprintweb.org\/S\/(?:search|archive)/;
var eprintsM = eprintsMultRe.exec(url); var eprintMultM = eprintMultRe.exec(url);
var eprintSingRe = /^http:\/\/(?:www\.)?eprintweb.org\/S\/(?:article|article)/;
var eprintSingM = eprintSingRe.exec(url);
if (eprintsM) { if (eprintMultM) {
var elmtsXPath = ''//table/tbody/tr/td[@class="txt"]/a[text()="Abstract"]/../b''; var elmtsXPath = ''//table/tbody/tr/td[@class="txt"]/a[text()="Abstract"]/../b'';
var titlesXPath = ''//table/tbody/tr/td[@class="lti"]''; var titlesXPath = ''//table/tbody/tr/td[@class="lti"]'';
var titleNode = ''./text()''; var titleNode = ''./text()'';
@ -4547,23 +4550,23 @@ function doWeb(doc, url) {
var arXivCats = new Array(); var arXivCats = new Array();
var arXivIDs = new Array(); var arXivIDs = new Array();
var i=0; var i=0;
if (eprintsM){ if (eprintMultM){
do { do {
var newURI = doc.evaluate(''./text()'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; var newID = doc.evaluate(''./text()'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
newID = newID.replace(/arXiv:/, "");
newID = newID.replace(/\//g, "%2F");
availableItems[i] = doc.evaluate(titleNode, title, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; availableItems[i] = doc.evaluate(titleNode, title, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
var urlComponents = newURI.split("/"); arXivIDs[i] = newID;
arXivCats[i] = urlComponents[0].split(".")[0];
arXivIDs[i] = urlComponents[1];
i++; i++;
} while ((elmt = elmts.iterateNext()) && (title = titles.iterateNext())); } while ((elmt = elmts.iterateNext()) && (title = titles.iterateNext()));
} }
else{ else{
do { do {
var newURI = doc.evaluate(''./@href'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; var newID= elmt.textContent;
newID = newID.replace(/arXiv:/, "");
newID = newID.replace(/\//g, "%2F");
availableItems[i] = doc.evaluate(titleNode, title, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; availableItems[i] = doc.evaluate(titleNode, title, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
var urlComponents = newURI.split("/"); arXivIDs[i] = newID;
arXivCats[i] = urlComponents[urlComponents.length - 2].split(".")[0];
arXivIDs[i] = urlComponents[urlComponents.length - 1];
i++; i++;
} while ((elmt = elmts.iterateNext()) && (title = titles.iterateNext())); } while ((elmt = elmts.iterateNext()) && (title = titles.iterateNext()));
} }
@ -4572,26 +4575,24 @@ function doWeb(doc, url) {
return true; return true;
} }
for(var i in items) { for(var i in items) {
newURIs.push("http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai%3AarXiv.org%3A" + arXivCats[i] + "%2F" + arXivIDs[i] + "&metadataPrefix=oai_dc"); newURIs.push("http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai%3AarXiv.org%3A" + arXivIDs[i] + "&metadataPrefix=oai_dc");
} }
} }
else { else {
if (eprintsM){ if (eprintSingM){
var titleID = doc.evaluate(''//td[@class="panel"]//tr[1]/td[@class="txt"]/b/text()'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; var titleID = doc.evaluate(''//td[@class="ti"]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
var urlComponents = titleID.split(" "); var arXivID = doc.evaluate(''//tr[1]/td[@class="txt"]/b'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
urlComponents = urlComponents[0].split("/"); arXivID = arXivID.replace(/arXiv:/, "");
var arXivCat; arXivID = arXivID.replace(/\//g, "%2F");
var arXivID;
arXivCat = urlComponents[0].split(".")[0];
arXivID = urlComponents[1];
} else { } else {
var urlComponents = url.split("/"); var arXivID = doc.evaluate(''//title'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
var arXivCat; var titleRe = /\[([^\]]*)]/;
var arXivID; var m = titleRe.exec(arXivID);
arXivCat = urlComponents[urlComponents.length - 2].split(".")[0]; arXivID = m[1];
arXivID = urlComponents[urlComponents.length - 1]; arXivID = arXivID.replace(/\//g, "%2F");
} }
newURIs.push("http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai%3AarXiv.org%3A" + arXivCat + "%2F" + arXivID + "&metadataPrefix=oai_dc"); newURIs.push("http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai%3AarXiv.org%3A" + arXivID + "&metadataPrefix=oai_dc");
} }