Closes #516 for PubMed direct hits and refines Max Planck VL Library support
This commit is contained in:
parent
58235c6bf6
commit
2e1fa819ab
103
scrapers.sql
103
scrapers.sql
|
@ -1,4 +1,4 @@
|
||||||
-- 172
|
-- 173
|
||||||
|
|
||||||
-- ***** BEGIN LICENSE BLOCK *****
|
-- ***** BEGIN LICENSE BLOCK *****
|
||||||
--
|
--
|
||||||
|
@ -22,7 +22,7 @@
|
||||||
|
|
||||||
|
|
||||||
-- Set the following timestamp to the most recent scraper update date
|
-- Set the following timestamp to the most recent scraper update date
|
||||||
REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-02-07 02:10:00'));
|
REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-02-15 22:50:00'));
|
||||||
|
|
||||||
REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b3.r1', '', '2006-12-15 03:40:00', 1, 100, 4, 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon',
|
REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b3.r1', '', '2006-12-15 03:40:00', 1, 100, 4, 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon',
|
||||||
'function detectWeb(doc, url) {
|
'function detectWeb(doc, url) {
|
||||||
|
@ -2747,21 +2747,22 @@ REPLACE INTO translators VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '1.0.0b
|
||||||
}
|
}
|
||||||
}');
|
}');
|
||||||
|
|
||||||
REPLACE INTO translators VALUES ('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '1.0.0b3.r1', '', '2006-12-14 17:53:00', 1, 100, 12, 'NCBI PubMed', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?.*db=PubMed',
|
REPLACE INTO translators VALUES ('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '1.0.0b3.r1', '', '2007-02-15 22:50:00', '1', '100', '4', 'NCBI PubMed', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?.*db=PubMed',
|
||||||
'function detectWeb(doc, url) {
|
'function detectWeb(doc, url) {
|
||||||
var namespace = doc.documentElement.namespaceURI;
|
var namespace = doc.documentElement.namespaceURI;
|
||||||
var nsResolver = namespace ? function(prefix) {
|
var nsResolver = namespace ? function(prefix) {
|
||||||
if (prefix == ''x'') return namespace; else return null;
|
if (prefix == ''x'') return namespace; else return null;
|
||||||
} : null;
|
} : null;
|
||||||
|
|
||||||
if(doc.location.href.indexOf("list_uids=") >= 0) {
|
var uids = doc.evaluate(''//input[@name="uid"]'', doc,
|
||||||
|
nsResolver, XPathResult.ANY_TYPE, null);
|
||||||
|
if(uids.iterateNext()) {
|
||||||
|
if (uids.iterateNext()){
|
||||||
|
return "multiple";
|
||||||
|
}
|
||||||
return "journalArticle";
|
return "journalArticle";
|
||||||
} else if(doc.evaluate(''//div[@class="ResultSet"]/table/tbody'', doc,
|
|
||||||
nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
|
|
||||||
return "multiple";
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function getPMID(co) {
|
function getPMID(co) {
|
||||||
var coParts = co.split("&");
|
var coParts = co.split("&");
|
||||||
for each(part in coParts) {
|
for each(part in coParts) {
|
||||||
|
@ -2781,7 +2782,8 @@ function detectSearch(item) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}',
|
}
|
||||||
|
',
|
||||||
'function lookupPMIDs(ids, doc) {
|
'function lookupPMIDs(ids, doc) {
|
||||||
Zotero.wait();
|
Zotero.wait();
|
||||||
|
|
||||||
|
@ -2806,7 +2808,7 @@ function detectSearch(item) {
|
||||||
} else {
|
} else {
|
||||||
var url = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=pubmed&cmd=Retrieve&dopt=AbstractPlus&list_uids="+PMID;
|
var url = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=pubmed&cmd=Retrieve&dopt=AbstractPlus&list_uids="+PMID;
|
||||||
newItem.attachments.push({url:url, title:"PubMed Snapshot",
|
newItem.attachments.push({url:url, title:"PubMed Snapshot",
|
||||||
mimeType:"text/html"});
|
mimeType:"text/html"});
|
||||||
}
|
}
|
||||||
|
|
||||||
var article = citation.Article;
|
var article = citation.Article;
|
||||||
|
@ -2874,44 +2876,43 @@ function detectSearch(item) {
|
||||||
}
|
}
|
||||||
|
|
||||||
function doWeb(doc, url) {
|
function doWeb(doc, url) {
|
||||||
var uri = doc.location.href;
|
var namespace = doc.documentElement.namespaceURI;
|
||||||
var ids = new Array();
|
var nsResolver = namespace ? function(prefix) {
|
||||||
var idRegexp = /[\?\&]list_uids=([0-9\,]+)/;
|
if (prefix == ''x'') return namespace; else return null;
|
||||||
|
|
||||||
var m = idRegexp.exec(uri);
|
|
||||||
if(m) {
|
|
||||||
ids.push(m[1]);
|
|
||||||
|
|
||||||
lookupPMIDs(ids, doc);
|
|
||||||
} else {
|
|
||||||
var namespace = doc.documentElement.namespaceURI;
|
|
||||||
var nsResolver = namespace ? function(prefix) {
|
|
||||||
if (prefix == ''x'') return namespace; else return null;
|
|
||||||
} : null;
|
} : null;
|
||||||
|
var ids = new Array();
|
||||||
|
var uids = doc.evaluate(''//input[@name="uid"]'', doc,
|
||||||
|
nsResolver, XPathResult.ANY_TYPE, null);
|
||||||
|
var uid = uids.iterateNext();
|
||||||
|
if(uid) {
|
||||||
|
if (uids.iterateNext()){
|
||||||
|
var items = new Array();
|
||||||
|
var tableRows = doc.evaluate(''//div[@class="ResultSet"]/table/tbody'', doc,
|
||||||
|
nsResolver, XPathResult.ANY_TYPE, null);
|
||||||
|
var tableRow;
|
||||||
|
// Go through table rows
|
||||||
|
while(tableRow = tableRows.iterateNext()) {
|
||||||
|
var link = doc.evaluate(''.//a'', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
||||||
|
uid = doc.evaluate(''.//input[@name="uid"]'', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
||||||
|
var article = doc.evaluate(''./tr[2]/td[2]/text()[1]'', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
||||||
|
items[uid.value] = article.nodeValue;
|
||||||
|
}
|
||||||
|
|
||||||
var items = new Array();
|
items = Zotero.selectItems(items);
|
||||||
var tableRows = doc.evaluate(''//div[@class="ResultSet"]/table/tbody'', doc,
|
|
||||||
nsResolver, XPathResult.ANY_TYPE, null);
|
if(!items) {
|
||||||
var tableRow;
|
return true;
|
||||||
// Go through table rows
|
}
|
||||||
while(tableRow = tableRows.iterateNext()) {
|
|
||||||
var link = doc.evaluate(''.//a'', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
for(var i in items) {
|
||||||
var article = doc.evaluate(''./tr[2]/td[2]/text()[1]'', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
ids.push(i);
|
||||||
items[link.href] = article.nodeValue;
|
}
|
||||||
|
|
||||||
|
lookupPMIDs(ids);
|
||||||
|
} else {
|
||||||
|
ids.push(uid.value);
|
||||||
|
lookupPMIDs(ids, doc);
|
||||||
}
|
}
|
||||||
|
|
||||||
items = Zotero.selectItems(items);
|
|
||||||
|
|
||||||
if(!items) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
for(var i in items) {
|
|
||||||
var m = idRegexp.exec(i);
|
|
||||||
ids.push(m[1]);
|
|
||||||
}
|
|
||||||
|
|
||||||
lookupPMIDs(ids);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2920,6 +2921,7 @@ function doSearch(item) {
|
||||||
lookupPMIDs([getPMID(item.contextObject)]);
|
lookupPMIDs([getPMID(item.contextObject)]);
|
||||||
}');
|
}');
|
||||||
|
|
||||||
|
|
||||||
REPLACE INTO translators VALUES ('951c027d-74ac-47d4-a107-9c3069ab7b48', '1.0.0b3.r1', '', '2006-12-12 23:41:00', 1, 100, 4, 'Embedded RDF', 'Simon Kornblith', NULL,
|
REPLACE INTO translators VALUES ('951c027d-74ac-47d4-a107-9c3069ab7b48', '1.0.0b3.r1', '', '2006-12-12 23:41:00', 1, 100, 4, 'Embedded RDF', 'Simon Kornblith', NULL,
|
||||||
'function detectWeb(doc, url) {
|
'function detectWeb(doc, url) {
|
||||||
var metaTags = doc.getElementsByTagName("meta");
|
var metaTags = doc.getElementsByTagName("meta");
|
||||||
|
@ -6468,7 +6470,7 @@ function doWeb(doc, url) {
|
||||||
}');
|
}');
|
||||||
|
|
||||||
|
|
||||||
REPLACE INTO translators VALUES ('66928fe3-1e93-45a7-8e11-9df6de0a11b3', '1.0.0b3r1', '', '2007-02-06 02:10:00', '0', '100', '4', 'Max Planck VL Library', 'Sean Takats', 'http://vlp.mpiwg-berlin.mpg.de/library/',
|
REPLACE INTO translators VALUES ('66928fe3-1e93-45a7-8e11-9df6de0a11b3', '1.0.0b3r1', '', '2007-02-15 22:50:00', '0', '100', '4', 'Max Planck Institute for the History of Science: Virtual Laboratory Library', 'Sean Takats', 'http://vlp.mpiwg-berlin.mpg.de/library/',
|
||||||
'function detectWeb(doc, url){
|
'function detectWeb(doc, url){
|
||||||
var namespace = doc.documentElement.namespaceURI;
|
var namespace = doc.documentElement.namespaceURI;
|
||||||
var nsResolver = namespace ? function(prefix) {
|
var nsResolver = namespace ? function(prefix) {
|
||||||
|
@ -6478,7 +6480,7 @@ REPLACE INTO translators VALUES ('66928fe3-1e93-45a7-8e11-9df6de0a11b3', '1.0.0b
|
||||||
if (elmt){
|
if (elmt){
|
||||||
return "book";
|
return "book";
|
||||||
}
|
}
|
||||||
elmt = doc.evaluate(''//span[starts-with(@title, "lit")]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
elmt = doc.evaluate(''//span[starts-with(@title, "lit")] | //a[starts-with(@title, "lit")] | //p[starts-with(@title, "lit")]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
||||||
if (elmt){
|
if (elmt){
|
||||||
return "multiple";
|
return "multiple";
|
||||||
}
|
}
|
||||||
|
@ -6493,17 +6495,17 @@ REPLACE INTO translators VALUES ('66928fe3-1e93-45a7-8e11-9df6de0a11b3', '1.0.0b
|
||||||
var baseElmt = doc.evaluate(''//base[contains(@href, "/library/data/lit")]/@href'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
var baseElmt = doc.evaluate(''//base[contains(@href, "/library/data/lit")]/@href'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
||||||
if (baseElmt){
|
if (baseElmt){
|
||||||
var docID = baseElmt.nodeValue;
|
var docID = baseElmt.nodeValue;
|
||||||
var idRe = /lit[^\/]+/;
|
var idRe = /lit[0-9]+/;
|
||||||
var m = idRe.exec(docID);
|
var m = idRe.exec(docID);
|
||||||
uris.push("http://vlp.mpiwg-berlin.mpg.de/library/meta?id=" + m[0]);
|
uris.push("http://vlp.mpiwg-berlin.mpg.de/library/meta?id=" + m[0]);
|
||||||
} else {
|
} else {
|
||||||
var searchElmts = doc.evaluate(''//a[starts-with(@title, "lit")]'', doc, nsResolver, XPathResult.ANY_TYPE, null);
|
var searchElmts = doc.evaluate(''//span[starts-with(@title, "lit")] | //a[starts-with(@title, "lit")] | //p[starts-with(@title, "lit")]'', doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||||
var searchElmt;
|
var searchElmt;
|
||||||
var links = new Array();
|
var links = new Array();
|
||||||
var availableItems = new Array();
|
var availableItems = new Array();
|
||||||
var i = 0;
|
var i = 0;
|
||||||
while (searchElmt = searchElmts.iterateNext()){
|
while (searchElmt = searchElmts.iterateNext()){
|
||||||
availableItems[i] = searchElmt.textContent;
|
availableItems[i] = Zotero.Utilities.cleanString(searchElmt.textContent);
|
||||||
var docID = doc.evaluate(''./@title'', searchElmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
|
var docID = doc.evaluate(''./@title'', searchElmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
|
||||||
links.push("http://vlp.mpiwg-berlin.mpg.de/library/meta?id=" + docID);
|
links.push("http://vlp.mpiwg-berlin.mpg.de/library/meta?id=" + docID);
|
||||||
i++;
|
i++;
|
||||||
|
@ -6523,7 +6525,6 @@ REPLACE INTO translators VALUES ('66928fe3-1e93-45a7-8e11-9df6de0a11b3', '1.0.0b
|
||||||
translator.setTranslator("881f60f2-0802-411a-9228-ce5f47b64c7d");
|
translator.setTranslator("881f60f2-0802-411a-9228-ce5f47b64c7d");
|
||||||
translator.setString(text);
|
translator.setString(text);
|
||||||
translator.setHandler("itemDone", function(obj, item) {
|
translator.setHandler("itemDone", function(obj, item) {
|
||||||
// TODO item.attachments.push({url:"http://www.arxiv.org/pdf/" + articleID, mimeType:"application/pdf", title:"VL Library PDF"}
|
|
||||||
item.type = undefined;
|
item.type = undefined;
|
||||||
item.complete();
|
item.complete();
|
||||||
});
|
});
|
||||||
|
|
Loading…
Reference in New Issue
Block a user