diff --git a/scrapers.sql b/scrapers.sql index 793964c46..9475d7402 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -1,4 +1,4 @@ --- 57 +-- 58 -- Set the following timestamp to the most recent scraper update date REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-15 15:42:00')); @@ -2995,6 +2995,115 @@ REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006 Scholar.wait(); }'); +REPLACE INTO "translators" VALUES ('57a00950-f0d1-4b41-b6ba-44ff0fc30289', '2006-08-26 1:10:00', 4, 'Google Scholar', 'Simon Kornblith', '^http://scholar\.google\.com/scholar', +'function detectWeb(doc, url) { + /* Unconditionally returns "multiple"; neither doc nor url is inspected. */ return "multiple"; +}', +'function getList(urls, each, done) { + /* Requests the entries of urls one after another through Scholar.Utilities.HTTP.doGet, removing each entry from the array before requesting it. each, if given, is called with the value doGet passes to its callback for every request; done, if given, is called with that value once, after the last request. NOTE(review): the urls array is consumed, and an empty array would pass undefined to doGet - confirm callers never pass one. */ var url = urls.shift(); + Scholar.Utilities.HTTP.doGet(url, function(text) { + if(each) { + each(text); + } + + if(urls.length) { + getList(urls, each, done); + } else if(done) { + done(text); + } + }); +} + +function doWeb(doc, url) { + /* Collects result entries from p elements with class "g", passes the collected titles to Scholar.selectItems, then builds one scholar.ris citation URL per returned entry and runs each response through the import translator configured further down. Returns true early when selectItems returns a falsy value. */ var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var items = new Array(); + var relatedLinks = new Array(); + var links = new Array(); + var types = new Array(); + + var itemTypes = new Array(); + var attachments = new Array(); + + var elmts = doc.evaluate(''//p[@class="g"]'', doc, nsResolver, + XPathResult.ANY_TYPE, null); + var elmt; + var i=0; + while(elmt = elmts.iterateNext()) { + /* Entries without a "Related Articles" link are skipped. i only advances when a non-empty title was stored in items[i], so items, relatedLinks, links and types share indices. NOTE(review): a links[i] or types[i] value written for an entry whose title came back empty is not cleared before the next entry reuses index i - confirm this cannot leak into that entry. */ var isCitation = doc.evaluate("./font[1]/b[1]/text()[1]", elmt, nsResolver, + XPathResult.ANY_TYPE, null).iterateNext(); + var relatedLink = doc.evaluate(''.//a[font/text() = "Related Articles"]'', + elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if(relatedLink) { + relatedLinks[i] = relatedLink.href; + if(isCitation && isCitation.nodeValue == "[CITATION]") { + items[i] = Scholar.Utilities.getNodeString(doc, elmt, ''./text()|./b/text()'', nsResolver); + } else if(isCitation && isCitation.nodeValue == "[BOOK]") { + items[i] = Scholar.Utilities.getNodeString(doc, elmt, ''./text()|./b/text()'', nsResolver); + types[i] = "book"; + } else { + var link = doc.evaluate(''.//span[@class="w"]/a'', elmt, nsResolver, + XPathResult.ANY_TYPE, null).iterateNext(); + if(link) { + items[i] = link.textContent; + links[i] = link.href; + } + } + + if(items[i]) { + i++; + } + } + } + + items = Scholar.selectItems(items); + if(!items) { + return true; + } + + var relatedMatch = /[&?]q=related:([^&]+)/; + + var urls = new Array(); + for(var i in items) { + /* The keys of the value returned by selectItems are used as indices into relatedLinks, links and types. NOTE(review): exec returns null when the related link has no q=related: parameter, and m[1] would then throw - confirm every related link matches. */ var m = relatedMatch.exec(relatedLinks[i]); + urls.push("http://scholar.google.com/scholar.ris?hl=en&lr=&q=info:"+m[1]+"&output=citation&oi=citation"); + if(links[i]) { + attachments.push([{title:"Google Scholar Linked Page", type:"text/html", + url:links[i]}]); + } else { + attachments.push([]); + } + + if(types[i]) { // for books + itemTypes.push(types[i]); + } else { + itemTypes.push(null); + } + } + + var translator = Scholar.loadTranslator("import"); + 
translator.setTranslator("32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7"); + translator.setHandler("itemDone", function(obj, item) { + /* itemTypes and attachments were pushed in the same order as urls, so shifting one entry from each per completed item pairs them with that item. This assumes every fetched citation yields exactly one item - TODO confirm. */ var itemType = itemTypes.shift(); + if(itemType) { + item.itemType = itemType; + } + + item.attachments = attachments.shift(); + item.complete(); + }); + + /* Every fetched response is handed to the import translator as a string and translated; Scholar.done() is called after the last request. */ getList(urls, function(text) { + translator.setString(text); + translator.translate(); + }, function() { Scholar.done() }); + + Scholar.wait(); +}'); + REPLACE INTO "translators" VALUES ('9c335444-a562-4f88-b291-607e8f46a9bb', '2006-08-15 15:42:00', 4, 'Berkeley Library', 'Simon Kornblith', '^http://[^/]*berkeley.edu[^/]*/WebZ/(?:html/results.html|FETCH)\?.*sessionid=', 'function detectWeb(doc, url) { var resultsRegexp = /\/WebZ\/html\/results.html/i