From b8ddba3a67383eeddf55f12a47186db102b43a69 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Fri, 8 Sep 2006 01:59:22 +0000 Subject: [PATCH] CiteSeer translator --- .../content/scholar/xpcom/translate.js | 6 +- scrapers.sql | 103 +++++++++++++++++- 2 files changed, 102 insertions(+), 7 deletions(-) diff --git a/chrome/chromeFiles/content/scholar/xpcom/translate.js b/chrome/chromeFiles/content/scholar/xpcom/translate.js index 4221b4285..5ff86fef0 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/translate.js +++ b/chrome/chromeFiles/content/scholar/xpcom/translate.js @@ -1756,13 +1756,13 @@ Scholar.Translate.prototype._storageFunctions = function(read, write) { return me._storage; } } else { // block reading - this._sandbox.Scholar.read = function(amount) { + this._sandbox.Scholar.read = function(amount) {; if(me._storagePointer >= me._storageLength) { return false; } - if((me._storagePointer+amount) <= me._storageLength) { - me._storagePointer = me._storageLength; + if((me._storagePointer+amount) > me._storageLength) { + me._storagePointer = me._storageLength+1; return me._storage; } diff --git a/scrapers.sql b/scrapers.sql index 66699443e..1cefd48e7 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -1,4 +1,4 @@ --- 83 +-- 84 -- Set the following timestamp to the most recent scraper update date REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-31 22:44:00')); @@ -3886,6 +3886,100 @@ REPLACE INTO "translators" VALUES ('a07bb62a-4d2d-4d43-ba08-d9679a0122f8', '2006 Scholar.wait(); }'); +REPLACE INTO "translators" VALUES ('fa396dd4-7d04-4f99-95e1-93d6f355441d', '2006-09-07 18:30:00', 4, 'CiteSeer', 'Simon Kornblith', '^http://(?:citeseer\.ist\.psu\.edu/|citeseer\.csail\.mit\.edu/|citeseer\.ifi\.unizh\.ch/|citeseer\.comp\.nus\.edu\.sg/)', +'function detectWeb(doc, url) { + var searchRe = /http:\/\/[^\/]+\/ci?s/; + if(searchRe.test(url)) { + return "multiple"; + } else { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + if(doc.evaluate(''/html/body/span[@class="m"]/pre'', doc, nsResolver, + XPathResult.ANY_TYPE, null).iterateNext()) { + return "journalArticle"; + } + } +}', +'function scrape(doc) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + // figure out what attachments to add + var attachments = new Array(); + var results = doc.evaluate(''/html/body/span[@class="m"]/table[@class="h"]/tbody/tr/td[4]/center/font/a'', + doc, nsResolver, XPathResult.ANY_TYPE, null); + var elmt; + + var acceptableTypes = ["PDF", "PS", "PS.gz"]; + var mimeTypes = ["application/pdf", "application/postscript", "application/gzip"]; + while(elmt = results.iterateNext()) { + var kind = elmt.textContent.toString(); + var index = acceptableTypes.indexOf(kind); + if(index != -1) { + var attachment = {url:elmt.href, mimeType:mimeTypes[index], + title:"Full Text "+kind}; + if(kind == "PDF") { + attachment.downloadable = true; + } + attachments.push(attachment); + } + } + + var bibtex = doc.evaluate(''/html/body/span[@class="m"]/pre/text()'', doc, nsResolver, + XPathResult.ANY_TYPE, null).iterateNext(); + if(bibtex) { + var translator = Scholar.loadTranslator("import"); + translator.setTranslator("9cb70025-a888-4a29-a210-93ec52da40d4"); + translator.setString(bibtex.nodeValue.toString()); + translator.setHandler("itemDone", function(obj, item) { + if(item.url) { // add http to url + item.url = "http://"+item.url; + } + item.attachments = attachments; + item.attachments.push({document:doc, downloadable:false, + title:"CiteSeer Abstract"}); + + item.complete(); + }); + translator.translate(); + } else { + throw "No BibTeX found!"; + } +} + +function doWeb(doc, url) { + var searchRe = /http:\/\/([^\/]+)\/ci?s/; + var m = searchRe.exec(doc.location.href); + if(m) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var items = Scholar.Utilities.getItemArray(doc, doc, "^http://"+m[1]+"/[^/]+.html"); + items = Scholar.selectItems(items); + + if(!items) { + return true; + } + + var urls = new Array(); + for(var i in items) { + urls.push(i); + } + + Scholar.Utilities.processDocuments(urls, scrape, function() { Scholar.done(); }); + Scholar.wait(); + } else { + scrape(doc); + } +}'); + REPLACE INTO "translators" VALUES ('e07e9b8c-0e98-4915-bb5a-32a08cb2f365', '2006-08-07 11:36:00', 8, 'Open WorldCat', 'Simon Kornblith', 'http://partneraccess.oclc.org/', 'function detectSearch(item) { if(item.itemType == "book" || item.itemType == "bookSection") { @@ -5969,7 +6063,6 @@ var inputFieldMap = { publisher:"publisher" }; -// TODO: figure out if these are the best types for letter, interview, website var typeMap = { book:"book", bookSection:"inbook", @@ -5986,8 +6079,10 @@ var typeMap = { }; // supplements outputTypeMap for importing -// TODO: conference, inproceedings, techreport var inputTypeMap = { + inproceedings:"journalArticle", + conference:"journalArticle", + techreport:"book", booklet:"book", incollection:"bookSection", manual:"book", @@ -6219,7 +6314,7 @@ function beginRecord(type, closeChar) { } } -function doImport(attachments) { +function doImport() { // make regular expressions out of values var newArray = new Array(); for(var i in accentedCharacters) {