closes #83, figure out how to implement OpenURL
closes #76, implement extensible search/retrieval architecture for obtaining metadata. OpenURL COinS lookup is now implemented using a real search architecture system. At the moment, it works with Open WorldCat for books, CrossRef for journal articles (provided the COinS object contains a DOI or an ISSN), and PubMed when a PMID is available.
This commit is contained in:
parent
9e5c15423a
commit
216f0c7581
|
@ -148,7 +148,6 @@ Scholar.OpenURL = new function() {
|
|||
this.discoverResolvers = discoverResolvers;
|
||||
this.createContextObject = createContextObject;
|
||||
this.parseContextObject = parseContextObject;
|
||||
this.lookupContextObject = lookupContextObject;
|
||||
|
||||
/*
|
||||
* Returns a URL to look up an item in the OpenURL resolver
|
||||
|
@ -305,12 +304,16 @@ Scholar.OpenURL = new function() {
|
|||
/*
|
||||
* Generates an item in the format returned by item.fromArray() given an
|
||||
* OpenURL version 1.0 contextObject
|
||||
*
|
||||
* accepts an item array to fill, or creates and returns a new item array
|
||||
*/
|
||||
function parseContextObject(co) {
|
||||
function parseContextObject(co, item) {
|
||||
var coParts = co.split("&");
|
||||
|
||||
var item = new Array();
|
||||
item.creators = new Array();
|
||||
if(!item) {
|
||||
var item = new Array();
|
||||
item.creators = new Array();
|
||||
}
|
||||
|
||||
// get type
|
||||
item.itemType = _determineResourceType(coParts);
|
||||
|
@ -416,157 +419,6 @@ Scholar.OpenURL = new function() {
|
|||
return item;
|
||||
}
|
||||
|
||||
/*
|
||||
* Looks up additional information on an item in the format returned by
|
||||
* item.fromArray() in CrossRef or Open WorldCat given an OpenURL version
|
||||
* 1.0 contextObject
|
||||
*/
|
||||
function lookupContextObject(co, done, error) {
|
||||
// CrossRef requires a url_ver to work right
|
||||
if(co.indexOf("url_ver=Z39.88-2004") == -1) {
|
||||
co = "url_ver=Z39.88-2004&"+co;
|
||||
}
|
||||
|
||||
var type = _determineResourceType(co.split("&"));
|
||||
if(!type) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if(type == "journal") {
|
||||
// look up journals in CrossRef
|
||||
Scholar.Utilities.HTTP.doGet("http://www.crossref.org/openurl/?"+co+"&noredirect=true", null, function(req) {
|
||||
var items = _processCrossRef(req.responseText);
|
||||
done(items);
|
||||
});
|
||||
} else {
|
||||
// look up books in Open WorldCat
|
||||
Scholar.Utilities.HTTP.processDocuments(null, ["http://partneraccess.oclc.org/wcpa/servlet/OpenUrl?"+co], function(browser) {
|
||||
var doc = browser.contentDocument;
|
||||
// find new COinS in the Open WorldCat page
|
||||
items = _processOWC(doc);
|
||||
|
||||
if(items) { // we got a single item page; return the item
|
||||
done(items);
|
||||
} else { // assume we have a search results page
|
||||
var items = new Array();
|
||||
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == 'x') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
// first try to get only books
|
||||
var elmts = doc.evaluate('//table[@class="tableLayout"]/tbody/tr/td[@class="content"]/table[@class="tableResults"]/tbody/tr[td/img[@alt="Book"]]/td/div[@class="title"]/a', doc, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);
|
||||
var elmt = elmts.iterateNext();
|
||||
if(!elmt) { // if that fails, look for other options
|
||||
var elmts = doc.evaluate('//table[@class="tableLayout"]/tbody/tr/td[@class="content"]/table[@class="tableResults"]/tbody/tr[td/img[@alt="Book"]]/td/div[@class="title"]/a', doc, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);
|
||||
elmt = elmts.iterateNext()
|
||||
}
|
||||
|
||||
var urlsToProcess = new Array();
|
||||
do {
|
||||
urlsToProcess.push(elmt.href);
|
||||
} while(elmt = elmts.iterateNext());
|
||||
|
||||
Scholar.Utilities.HTTP.processDocuments(null, urlsToProcess, function(browser) {
|
||||
// per URL
|
||||
var newItems = _processOWC(browser.contentDocument);
|
||||
if(newItems) {
|
||||
items = items.concat(newItems);
|
||||
}
|
||||
}, function() { // done
|
||||
done(items);
|
||||
}, function() { // error
|
||||
error();
|
||||
});
|
||||
}
|
||||
}, null, function() {
|
||||
error();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Processes the XML format returned by CrossRef
|
||||
*/
|
||||
function _processCrossRef(xmlOutput) {
|
||||
xmlOutput = xmlOutput.replace(/<\?xml[^>]*\?>/, "");
|
||||
|
||||
// parse XML with E4X
|
||||
var qr = new Namespace("http://www.crossref.org/qrschema/2.0");
|
||||
try {
|
||||
var xml = new XML(xmlOutput);
|
||||
} catch(e) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// ensure status is valid
|
||||
var status = xml.qr::body.qr::query.@status.toString();
|
||||
if(status != "resolved" && status != "multiresolved") {
|
||||
return false;
|
||||
}
|
||||
|
||||
var query = xml.qr::body.qr::query;
|
||||
var item = new Array();
|
||||
item.creators = new Array();
|
||||
|
||||
// try to get a DOI
|
||||
item.DOI = query.qr::doi.(@type=="journal_article").toString();
|
||||
if(!item.DOI) {
|
||||
item.DOI = query.qr::doi.(@type=="book_title").toString();
|
||||
}
|
||||
if(!item.DOI) {
|
||||
item.DOI = query.qr::doi.(@type=="book_content").toString();
|
||||
}
|
||||
|
||||
// try to get an ISSN (no print/electronic preferences)
|
||||
item.ISSN = query.qr::issn.toString();
|
||||
// get title
|
||||
item.title = query.qr::article_title.toString();
|
||||
// get publicationTitle
|
||||
item.publicationTitle = query.qr::journal_title.toString();
|
||||
// get author
|
||||
item.creators.push(Scholar.Utilities.cleanAuthor(query.qr::author.toString(), "author", true));
|
||||
// get volume
|
||||
item.volume = query.qr::volume.toString();
|
||||
// get issue
|
||||
item.issue = query.qr::issue.toString();
|
||||
// get year
|
||||
item.date = query.qr::year.toString();
|
||||
// get edition
|
||||
item.edition = query.qr::edition_number.toString();
|
||||
// get first page
|
||||
item.pages = query.qr::first_page.toString();
|
||||
|
||||
return [item];
|
||||
}
|
||||
|
||||
/*
|
||||
* Parses a document object referring to an Open WorldCat entry for its
|
||||
* OpenURL contextObject, then returns an item generated from this
|
||||
* contextObject
|
||||
*/
|
||||
function _processOWC(doc) {
|
||||
var spanTags = doc.getElementsByTagName("span");
|
||||
for(var i=0; i<spanTags.length; i++) {
|
||||
var spanClass = spanTags[i].getAttribute("class");
|
||||
if(spanClass) {
|
||||
var spanClasses = spanClass.split(" ");
|
||||
if(Scholar.inArray("Z3988", spanClasses)) {
|
||||
var spanTitle = spanTags[i].getAttribute("title");
|
||||
var item = parseContextObject(spanTitle);
|
||||
if(item) {
|
||||
return [item];
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Determines the type of an OpenURL contextObject
|
||||
*/
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
* export
|
||||
* import
|
||||
* web
|
||||
* search
|
||||
*
|
||||
* a typical export process:
|
||||
* var translatorObj = new Scholar.Translate();
|
||||
|
@ -35,7 +36,10 @@
|
|||
* location - the location of the target (read-only; set with setLocation)
|
||||
* for import/export - this is an instance of nsILocalFile
|
||||
* for web - this is a URL
|
||||
* item - item to be used for searching (read-only; set with setItem)
|
||||
* path - the path to the target; for web, this is the same as location
|
||||
* saveItem - whether new items should be saved to the database. defaults to
|
||||
* true; set using second argument of constructor.
|
||||
*
|
||||
* PRIVATE PROPERTIES:
|
||||
*
|
||||
|
@ -49,6 +53,10 @@
|
|||
* _sandbox - sandbox in which translators will be executed
|
||||
* _streams - streams that need to be closed when execution is complete
|
||||
* _IDMap - a map from IDs as specified in Scholar.Item() to IDs of actual items
|
||||
* _parentTranslator - set when a translator is called from another translator.
|
||||
* among other things, disables passing of the translate
|
||||
* object to handlers and modifies complete() function on
|
||||
* returned items
|
||||
*
|
||||
* WEB-ONLY PRIVATE PROPERTIES:
|
||||
*
|
||||
|
@ -56,23 +64,41 @@
|
|||
* an EZProxy
|
||||
*/
|
||||
|
||||
Scholar.Translate = function(type) {
|
||||
Scholar.Translate = function(type, saveItem) {
|
||||
this.type = type;
|
||||
|
||||
// import = 001 = 1
|
||||
// export = 010 = 2
|
||||
// web = 100 = 4
|
||||
// import = 0001 = 1
|
||||
// export = 0010 = 2
|
||||
// web = 0100 = 4
|
||||
// search = 1000 = 8
|
||||
|
||||
// combination types determined by addition or bitwise OR
|
||||
// i.e., import+export = 1+2 = 3
|
||||
if(type == "import") {
|
||||
this._numericTypes = "1,3,5,7";
|
||||
} else if(type == "export") {
|
||||
this._numericTypes = "2,3,6,7";
|
||||
} else if(type == "web") {
|
||||
this._numericTypes = "4,5,6,7";
|
||||
this._numericTypes = "";
|
||||
for(var i=0; i<=1; i++) {
|
||||
for(var j=0; j<=1; j++) {
|
||||
for(var k=0; k<=1; k++) {
|
||||
if(type == "import") {
|
||||
this._numericTypes += ","+parseInt(i.toString()+j.toString()+k.toString()+"1", 2);
|
||||
} else if(type == "export") {
|
||||
this._numericTypes += ","+parseInt(i.toString()+j.toString()+"1"+k.toString(), 2);
|
||||
} else if(type == "web") {
|
||||
this._numericTypes += ","+parseInt(i.toString()+"1"+j.toString()+k.toString(), 2);
|
||||
} else if(type == "search") {
|
||||
this._numericTypes += ","+parseInt("1"+i.toString()+j.toString()+k.toString(), 2);
|
||||
} else {
|
||||
throw("invalid import type");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
this._numericTypes = this._numericTypes.substr(1);
|
||||
|
||||
if(saveItem === false) { // three equals signs means if it's left
|
||||
// undefined, this.saveItem will still be true
|
||||
this.saveItem = false;
|
||||
} else {
|
||||
throw("invalid import type");
|
||||
this.saveItem = true;
|
||||
}
|
||||
|
||||
this._handlers = new Array();
|
||||
|
@ -87,6 +113,13 @@ Scholar.Translate.prototype.setBrowser = function(browser) {
|
|||
this.setLocation(browser.contentDocument.location.href);
|
||||
}
|
||||
|
||||
/*
|
||||
* sets the item to be used for searching
|
||||
*/
|
||||
Scholar.Translate.prototype.setItem = function(item) {
|
||||
this.item = item;
|
||||
}
|
||||
|
||||
/*
|
||||
* sets the location to operate upon (file should be an nsILocalFile object or
|
||||
* web address)
|
||||
|
@ -112,12 +145,41 @@ Scholar.Translate.prototype.setLocation = function(location) {
|
|||
* accepts either the object from getTranslators() or an ID
|
||||
*/
|
||||
Scholar.Translate.prototype.setTranslator = function(translator) {
|
||||
if(typeof(translator) == "object") { // passed an object and not an ID
|
||||
translator = translator.translatorID;
|
||||
if(!translator) {
|
||||
throw("cannot set translator: invalid value");
|
||||
}
|
||||
|
||||
var sql = "SELECT * FROM translators WHERE translatorID = ? AND type IN ("+this._numericTypes+")";
|
||||
this.translator = Scholar.DB.rowQuery(sql, [translator]);
|
||||
if(typeof(translator) == "object") { // passed an object and not an ID
|
||||
if(translator.translatorID) {
|
||||
translator = [translator.translatorID];
|
||||
} else {
|
||||
// we have an associative array of translators
|
||||
if(this.type != "search") {
|
||||
throw("cannot set translator: a single translator must be specified when doing "+this.type+" translation");
|
||||
}
|
||||
// accept a list of objects
|
||||
for(var i in translator) {
|
||||
if(typeof(translator[i]) == "object") {
|
||||
if(translator[i].translatorID) {
|
||||
translator[i] = translator[i].translatorID;
|
||||
} else {
|
||||
throw("cannot set translator: must specify a single translator or a list of translators");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
translator = [translator];
|
||||
}
|
||||
|
||||
var where = "";
|
||||
for(var i in translator) {
|
||||
where += " OR translatorID = ?";
|
||||
}
|
||||
where = where.substr(4);
|
||||
|
||||
var sql = "SELECT * FROM translators WHERE "+where+" AND type IN ("+this._numericTypes+")";
|
||||
this.translator = Scholar.DB.query(sql, translator);
|
||||
if(!this.translator) {
|
||||
return false;
|
||||
}
|
||||
|
@ -145,13 +207,13 @@ Scholar.Translate.prototype.setTranslator = function(translator) {
|
|||
* returns: N/A
|
||||
*
|
||||
* itemDone
|
||||
* valid: import, web
|
||||
* valid: import, web, search
|
||||
* called: when an item has been processed; may be called asynchronously
|
||||
* passed: an item object (see Scholar.Item)
|
||||
* returns: N/A
|
||||
*
|
||||
* collectionDone
|
||||
* valid: import, web
|
||||
* valid: import
|
||||
* called: when a collection has been processed, after all items have been
|
||||
* added; may be called asynchronously
|
||||
* passed: a collection object (see Scholar.Collection)
|
||||
|
@ -187,7 +249,7 @@ Scholar.Translate.prototype.getTranslators = function() {
|
|||
var sql = "SELECT translatorID, label, target, detectCode FROM translators WHERE type IN ("+this._numericTypes+") ORDER BY target IS NULL";
|
||||
var translators = Scholar.DB.query(sql);
|
||||
|
||||
if(!this.location) {
|
||||
if(!this.location && !this.item) {
|
||||
return translators; // no need to see which can translate, because
|
||||
// we don't have a location yet (for export or
|
||||
// import dialog)
|
||||
|
@ -228,20 +290,21 @@ Scholar.Translate.prototype.displayOptions = function() {
|
|||
}
|
||||
|
||||
Scholar.Translate.prototype._loadTranslator = function() {
|
||||
if(!this._sandbox) {
|
||||
// create a new sandbox if none exists
|
||||
if(!this._sandbox || this.type == "search") {
|
||||
// create a new sandbox if none exists, or for searching (so that it's
|
||||
// bound to the correct url)
|
||||
this._generateSandbox();
|
||||
}
|
||||
|
||||
// parse detect code for the translator
|
||||
this._parseDetectCode(this.translator);
|
||||
this._parseDetectCode(this.translator[0]);
|
||||
|
||||
Scholar.debug("parsing code for "+this.translator.label);
|
||||
Scholar.debug("parsing code for "+this.translator[0].label);
|
||||
|
||||
try {
|
||||
Components.utils.evalInSandbox(this.translator.code, this._sandbox);
|
||||
Components.utils.evalInSandbox(this.translator[0].code, this._sandbox);
|
||||
} catch(e) {
|
||||
Scholar.debug(e+' in parsing code for '+this.translator.label);
|
||||
Scholar.debug(e+' in parsing code for '+this.translator[0].label);
|
||||
this._translationComplete(false);
|
||||
return false;
|
||||
}
|
||||
|
@ -254,17 +317,24 @@ Scholar.Translate.prototype._loadTranslator = function() {
|
|||
*/
|
||||
Scholar.Translate.prototype.translate = function() {
|
||||
this._IDMap = new Array();
|
||||
this._complete = false;
|
||||
|
||||
if(!this.location) {
|
||||
throw("cannot translate: no location specified");
|
||||
if(!this.translator || !this.translator.length) {
|
||||
throw("cannot translate: no translator specified");
|
||||
}
|
||||
|
||||
this._complete = false;
|
||||
if(!this.location && this.type != "search") {
|
||||
// searches operate differently, because we could have an array of
|
||||
// translators and have to go through each
|
||||
throw("cannot translate: no location specified");
|
||||
}
|
||||
|
||||
if(!this._loadTranslator()) {
|
||||
return;
|
||||
}
|
||||
|
||||
this._sandbox.Scholar.scraperName = this.translator[0].label;
|
||||
|
||||
var returnValue;
|
||||
if(this.type == "web") {
|
||||
returnValue = this._web();
|
||||
|
@ -272,7 +342,10 @@ Scholar.Translate.prototype.translate = function() {
|
|||
returnValue = this._import();
|
||||
} else if(this.type == "export") {
|
||||
returnValue = this._export();
|
||||
} else if(this.type == "search") {
|
||||
returnValue = this._search();
|
||||
}
|
||||
|
||||
if(!returnValue) {
|
||||
// failure
|
||||
this._translationComplete(false);
|
||||
|
@ -285,12 +358,31 @@ Scholar.Translate.prototype.translate = function() {
|
|||
/*
|
||||
* generates a sandbox for scraping/scraper detection
|
||||
*/
|
||||
Scholar.Translate._searchSandboxRegexp = new RegExp();
|
||||
Scholar.Translate._searchSandboxRegexp.compile("^http://[\\w.]+/");
|
||||
Scholar.Translate.prototype._generateSandbox = function() {
|
||||
var me = this;
|
||||
|
||||
if(this.type == "web") {
|
||||
// use real URL, not proxied version, to create sandbox
|
||||
this._sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
|
||||
if(this.type == "web" || this.type == "search") {
|
||||
// get sandbox URL
|
||||
var sandboxURL = "";
|
||||
if(this.type == "web") {
|
||||
// use real URL, not proxied version, to create sandbox
|
||||
sandboxURL = this.browser.contentDocument.location.href;
|
||||
} else {
|
||||
// generate sandbox for search by extracting domain from translator
|
||||
// target, if one exists
|
||||
if(this.translator && this.translator[0] && this.translator[0].target) {
|
||||
// so that web translators work too
|
||||
var tempURL = this.translator[0].target.replace(/\\/g, "").replace(/\^/g, "");
|
||||
var m = Scholar.Translate._searchSandboxRegexp.exec(tempURL);
|
||||
if(m) {
|
||||
sandboxURL = m[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
Scholar.debug("binding sandbox to "+sandboxURL);
|
||||
this._sandbox = new Components.utils.Sandbox(sandboxURL);
|
||||
this._sandbox.Scholar = new Object();
|
||||
|
||||
// add ingester utilities
|
||||
|
@ -300,27 +392,30 @@ Scholar.Translate.prototype._generateSandbox = function() {
|
|||
// set up selectItems handler
|
||||
this._sandbox.Scholar.selectItems = function(options) { return me._selectItems(options) };
|
||||
} else {
|
||||
// use null URL to create sanbox
|
||||
// use null URL to create sandbox
|
||||
this._sandbox = new Components.utils.Sandbox("");
|
||||
this._sandbox.Scholar = new Object();
|
||||
|
||||
this._sandbox.Scholar.Utilities = new Scholar.Utilities();
|
||||
}
|
||||
|
||||
if(this.type == "web" || this.type == "import") {
|
||||
|
||||
if(this.type == "export") {
|
||||
// add routines to retrieve items and collections
|
||||
this._sandbox.Scholar.nextItem = function() { return me._exportGetItem() };
|
||||
this._sandbox.Scholar.nextCollection = function() { return me._exportGetCollection() }
|
||||
} else {
|
||||
// add routines to add new items
|
||||
this._sandbox.Scholar.Item = Scholar.Translate.ScholarItem;
|
||||
// attach the function to be run when an item is done
|
||||
this._sandbox.Scholar.Item.prototype.complete = function() {me._itemDone(this)};
|
||||
|
||||
// add routines to add new collections
|
||||
this._sandbox.Scholar.Collection = Scholar.Translate.ScholarCollection;
|
||||
// attach the function to be run when a collection is done
|
||||
this._sandbox.Scholar.Collection.prototype.complete = function() {me._collectionDone(this)};
|
||||
} else if(this.type == "export") {
|
||||
// add routines to retrieve items and collections
|
||||
this._sandbox.Scholar.nextItem = function() { return me._exportGetItem() };
|
||||
this._sandbox.Scholar.nextCollection = function() { return me._exportGetCollection() };
|
||||
if(this.type == "import") {
|
||||
// add routines to add new collections
|
||||
this._sandbox.Scholar.Collection = Scholar.Translate.ScholarCollection;
|
||||
// attach the function to be run when a collection is done
|
||||
this._sandbox.Scholar.Collection.prototype.complete = function() {me._collectionDone(this)};
|
||||
}
|
||||
}
|
||||
|
||||
this._sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
|
||||
|
@ -334,33 +429,50 @@ Scholar.Translate.prototype._generateSandbox = function() {
|
|||
this._sandbox.Scholar.addOption = function(option, value) {me._addOption(option, value) };
|
||||
|
||||
// for loading other translators and accessing their methods
|
||||
var me = this;
|
||||
this._sandbox.Scholar.loadTranslator = function(type, translatorID) {
|
||||
var translation = new Scholar.Translate(type);
|
||||
// assign same handlers as for parent, because the done handler won't
|
||||
// get called anyway, and the itemDone/selectItems handlers should be
|
||||
// the same
|
||||
translation._handlers = me._handlers;
|
||||
// set the translator
|
||||
translation.setTranslator(translatorID);
|
||||
// load the translator into our sandbox
|
||||
translation._loadTranslator();
|
||||
// use internal io
|
||||
translation._initializeInternalIO();
|
||||
return translation._sandbox;
|
||||
var translation = new Scholar.Translate(type, (translatorID ? true : false));
|
||||
if(translatorID) {
|
||||
// assign same handlers as for parent, because the done handler won't
|
||||
// get called anyway, and the itemDone/selectItems handlers should be
|
||||
// the same
|
||||
translation._handlers = me._handlers;
|
||||
// set the translator
|
||||
translation.setTranslator(translatorID);
|
||||
// load the translator into our sandbox
|
||||
translation._loadTranslator();
|
||||
// use internal io
|
||||
translation._initializeInternalIO();
|
||||
return translation._sandbox;
|
||||
} else {
|
||||
// create a safe translator object, so that scrapers can't get
|
||||
// access to potentially harmful methods.
|
||||
if(type == "import" || type == "export") {
|
||||
throw("you must specify a translatorID for "+type+" translation");
|
||||
}
|
||||
|
||||
var safeTranslator = new Object();
|
||||
safeTranslator.setItem = function(arg) { return translation.setItem(arg) };
|
||||
safeTranslator.setBrowser = function(arg) { return translation.setBrowser(arg) };
|
||||
safeTranslator.setHandler = function(arg1, arg2) { translation.setHandler(arg1, arg2) };
|
||||
safeTranslator.setTranslator = function(arg) { return translation.setTranslator(arg) };
|
||||
safeTranslator.getTranslators = function() { return translation.getTranslators() };
|
||||
safeTranslator.translate = function() { return translation.translate() };
|
||||
translation._parentTranslator = me;
|
||||
|
||||
return safeTranslator;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Check to see if _scraper_ can scrape this document
|
||||
*/
|
||||
Scholar.Translate.prototype._canTranslate = function(translator) {
|
||||
var canTranslate = false;
|
||||
|
||||
Scholar.Translate.prototype._canTranslate = function(translator) {
|
||||
// Test location with regular expression
|
||||
// If this is slow, we could preload all scrapers and compile regular
|
||||
// expressions, so each check will be faster
|
||||
if(translator.target) {
|
||||
if(translator.target && this.type != "search") {
|
||||
var canTranslate = false;
|
||||
if(this.type == "web") {
|
||||
var regularExpression = new RegExp(translator.target, "i");
|
||||
} else {
|
||||
|
@ -370,6 +482,8 @@ Scholar.Translate.prototype._canTranslate = function(translator) {
|
|||
if(regularExpression.test(this.path)) {
|
||||
canTranslate = true;
|
||||
}
|
||||
} else {
|
||||
var canTranslate = true;
|
||||
}
|
||||
|
||||
// Test with JavaScript if available and didn't have a regular expression or
|
||||
|
@ -388,14 +502,21 @@ Scholar.Translate.prototype._canTranslate = function(translator) {
|
|||
}
|
||||
}
|
||||
|
||||
if(this._sandbox.detect) {
|
||||
if((this.type == "web" && this._sandbox.detectWeb) ||
|
||||
(this.type == "search" && this._sandbox.detectSearch) ||
|
||||
(this.type == "import" && this._sandbox.detectImport) ||
|
||||
(this.type == "export" && this._sandbox.detectExport)) {
|
||||
var returnValue;
|
||||
|
||||
try {
|
||||
if(this.type == "web") {
|
||||
returnValue = this._sandbox.detect(this.browser.contentDocument, this.location);
|
||||
returnValue = this._sandbox.detectWeb(this.browser.contentDocument, this.location);
|
||||
} else if(this.type == "search") {
|
||||
returnValue = this._sandbox.detectSearch(this.item);
|
||||
} else if(this.type == "import") {
|
||||
returnValue = this._sandbox.detect();
|
||||
returnValue = this._sandbox.detectImport();
|
||||
} else if(this.type == "export") {
|
||||
returnValue = this._sandbox.detectExport();
|
||||
}
|
||||
} catch(e) {
|
||||
Scholar.debug(e+' in executing detectCode for '+translator.label);
|
||||
|
@ -476,7 +597,7 @@ Scholar.Translate.prototype._addOption = function(option, value) {
|
|||
* called as wait() in translator code
|
||||
*/
|
||||
Scholar.Translate.prototype._enableAsynchronous = function() {
|
||||
me = this;
|
||||
var me = this;
|
||||
this._waitForCompletion = true;
|
||||
this._sandbox.Scholar.done = function() { me._translationComplete(true) };
|
||||
}
|
||||
|
@ -505,13 +626,20 @@ Scholar.Translate.prototype._translationComplete = function(returnValue) {
|
|||
if(!this._complete) {
|
||||
this._complete = true;
|
||||
|
||||
Scholar.debug("translation complete");
|
||||
|
||||
// call handler
|
||||
this._runHandler("done", returnValue);
|
||||
|
||||
// close open streams
|
||||
this._closeStreams();
|
||||
if(this.type == "search" && !this._itemsFound && this.translator.length > 1) {
|
||||
// if we're performing a search and didn't get any results, go on
|
||||
// to the next translator
|
||||
this.translator.shift();
|
||||
this.translate();
|
||||
} else {
|
||||
Scholar.debug("translation complete");
|
||||
|
||||
// call handler
|
||||
this._runHandler("done", returnValue);
|
||||
|
||||
// close open streams
|
||||
this._closeStreams();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -547,13 +675,23 @@ Scholar.Translate.prototype._closeStreams = function() {
|
|||
*/
|
||||
Scholar.Translate.prototype._itemDone = function(item) {
|
||||
Scholar.debug(item);
|
||||
if(!this.saveItem) { // if we're not supposed to save the item, just
|
||||
// return the item array
|
||||
|
||||
// if a parent sandbox exists, use complete() function from that sandbox
|
||||
if(this._parentTranslator) {
|
||||
var pt = this._parentTranslator;
|
||||
item.complete = function() { pt._itemDone(this) };
|
||||
Scholar.debug("done from parent sandbox");
|
||||
}
|
||||
this._runHandler("itemDone", item);
|
||||
return;
|
||||
}
|
||||
|
||||
// Get typeID, defaulting to "website"
|
||||
var type = (item.itemType ? item.itemType : "website");
|
||||
|
||||
Scholar.debug("type is "+type);
|
||||
if(type == "note") { // handle notes differently
|
||||
Scholar.debug("handling a note");
|
||||
var myID = Scholar.Notes.add(item.note);
|
||||
// re-retrieve the item
|
||||
var newItem = Scholar.Items.get(myID);
|
||||
|
@ -718,7 +856,11 @@ Scholar.Translate.prototype._runHandler = function(type, argument) {
|
|||
for(var i in this._handlers[type]) {
|
||||
Scholar.debug("running handler "+i+" for "+type);
|
||||
try {
|
||||
returnValue = this._handlers[type][i](this, argument);
|
||||
if(this._parentTranslator) {
|
||||
returnValue = this._handlers[type][i](null, argument);
|
||||
} else {
|
||||
returnValue = this._handlers[type][i](this, argument);
|
||||
}
|
||||
} catch(e) {
|
||||
Scholar.debug(e+' in handler '+i+' for '+type);
|
||||
}
|
||||
|
@ -734,7 +876,21 @@ Scholar.Translate.prototype._web = function() {
|
|||
try {
|
||||
this._sandbox.doWeb(this.browser.contentDocument, this.location);
|
||||
} catch(e) {
|
||||
Scholar.debug(e+' in executing code for '+this.translator.label);
|
||||
Scholar.debug(e+' in executing code for '+this.translator[0].label);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* does the actual search translation
|
||||
*/
|
||||
Scholar.Translate.prototype._search = function() {
|
||||
try {
|
||||
this._sandbox.doSearch(this.item);
|
||||
} catch(e) {
|
||||
Scholar.debug(e+' in executing code for '+this.translator[0].label);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -750,7 +906,7 @@ Scholar.Translate.prototype._import = function() {
|
|||
try {
|
||||
this._sandbox.doImport();
|
||||
} catch(e) {
|
||||
Scholar.debug(e+' in executing code for '+this.translator.label);
|
||||
Scholar.debug(e+' in executing code for '+this.translator[0].label);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -830,7 +986,7 @@ Scholar.Translate.prototype._export = function() {
|
|||
try {
|
||||
this._sandbox.doExport();
|
||||
} catch(e) {
|
||||
Scholar.debug(e+' in executing code for '+this.translator.label);
|
||||
Scholar.debug(e+' in executing code for '+this.translator[0].label);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
@ -321,8 +321,8 @@ Scholar.Utilities.Ingester.prototype.lookupContextObject = function(co, done, er
|
|||
return Scholar.OpenURL.lookupContextObject(co, done, error);
|
||||
}
|
||||
|
||||
Scholar.Utilities.Ingester.prototype.parseContextObject = function(co) {
|
||||
return Scholar.OpenURL.parseContextObject(co);
|
||||
Scholar.Utilities.Ingester.prototype.parseContextObject = function(co, item) {
|
||||
return Scholar.OpenURL.parseContextObject(co, item);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
353
scrapers.sql
353
scrapers.sql
|
@ -4,7 +4,7 @@
|
|||
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-07 01:09:00'));
|
||||
|
||||
REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-28 23:08:00', 4, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/|s/)',
|
||||
'function detect(doc, url) {
|
||||
'function detectWeb(doc, url) {
|
||||
var searchRe = new RegExp(''^http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/|s/)'');
|
||||
if(searchRe.test(doc.location.href)) {
|
||||
return "multiple";
|
||||
|
@ -123,7 +123,7 @@ function doWeb(doc, url) {
|
|||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-26 16:01:00', 4, 'WorldCat Scraper', 'Simon Kornblith', '^http://(?:new)?firstsearch\.oclc\.org/WebZ/',
|
||||
'function detect(doc, url) {
|
||||
'function detectWeb(doc, url) {
|
||||
if(doc.title == ''FirstSearch: WorldCat Detailed Record'') {
|
||||
return "book";
|
||||
} else if(doc.title == ''FirstSearch: WorldCat List of Records'') {
|
||||
|
@ -288,7 +288,7 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006
|
|||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('88915634-1af6-c134-0171-56fd198235ed', '2006-06-26 21:40:00', 4, 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi',
|
||||
'function detect(doc, url) {
|
||||
'function detectWeb(doc, url) {
|
||||
var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options;
|
||||
for(var i in export_options) {
|
||||
if(export_options[i].text == ''Latin1 MARC''
|
||||
|
@ -415,7 +415,7 @@ REPLACE INTO "translators" VALUES ('88915634-1af6-c134-0171-56fd198235ed', '2006
|
|||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-26 16:01:00', 4, 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse|search/)',
|
||||
'function detect(doc, url) {
|
||||
'function detectWeb(doc, url) {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
|
@ -590,7 +590,7 @@ function doWeb(doc, url) {
|
|||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-26 16:01:00', 4, 'History Cooperative Scraper', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.html$|cgi-bin/search.cgi)',
|
||||
'function detect(doc, url) {
|
||||
'function detectWeb(doc, url) {
|
||||
if(doc.title == "History Cooperative: Search Results") {
|
||||
return "multiple";
|
||||
} else {
|
||||
|
@ -657,7 +657,7 @@ function doWeb(doc, url) {
|
|||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-08-06 21:45:00', 4, 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)',
|
||||
'function detect(doc, url) {
|
||||
'function detectWeb(doc, url) {
|
||||
// First, check to see if the URL alone reveals InnoPAC, since some sites don''t reveal the MARC button
|
||||
var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$'');
|
||||
if(matchRegexp.test(doc.location.href)) {
|
||||
|
@ -837,7 +837,7 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006
|
|||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006-06-26 16:01:00', 4, 'SIRSI 2003+ Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
|
||||
'function detect(doc, url) {
|
||||
'function detectWeb(doc, url) {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
|
@ -964,7 +964,7 @@ function doWeb(doc, url) {
|
|||
');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-26 16:01:00', 4, 'ProQuest Scraper', 'Simon Kornblith', '^http://proquest\.umi\.com/pqdweb\?((?:.*\&)?did=.*&Fmt=[0-9]|(?:.*\&)Fmt=[0-9].*&did=|(?:.*\&)searchInterface=)',
|
||||
'function detect(doc, url) {
|
||||
'function detectWeb(doc, url) {
|
||||
if(doc.title == "Results") {
|
||||
return "magazineArticle";
|
||||
} else {
|
||||
|
@ -1147,7 +1147,7 @@ function doWeb(doc, url) {
|
|||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('6773a9af-5375-3224-d148-d32793884dec', '2006-06-26 16:01:00', 4, 'InfoTrac Scraper', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/',
|
||||
'function detect(doc, url) {
|
||||
'function detectWeb(doc, url) {
|
||||
if(doc.title.substring(0, 8) == "Article ") {
|
||||
return "magazineArticle";
|
||||
} else doc.title.substring(0, 10) == "Citations ") {
|
||||
|
@ -1273,7 +1273,7 @@ function doWeb(doc, url) {
|
|||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('b047a13c-fe5c-6604-c997-bef15e502b09', '2006-06-26 16:01:00', 4, 'LexisNexis Scraper', 'Simon Kornblith', '^http://web\.lexis-nexis\.com/universe/(?:document|doclist)',
|
||||
'function detect(doc, url) {
|
||||
'function detectWeb(doc, url) {
|
||||
var detailRe = new RegExp("^http://[^/]+/universe/document");
|
||||
if(detailRe.test(doc.location.href)) {
|
||||
return "newspaperArticle";
|
||||
|
@ -1377,7 +1377,7 @@ function doWeb(doc, url) {
|
|||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006-06-26 16:01:00', 4, 'Aleph Scraper', 'Simon Kornblith', '^http://[^/]+/F(?:/[A-Z0-9\-]+(?:\?.*)?$|\?func=find)',
|
||||
'function detect(doc, url) {
|
||||
'function detectWeb(doc, url) {
|
||||
var singleRe = new RegExp("^http://[^/]+/F/[A-Z0-9\-]+\?.*func=full-set-set.*\&format=[0-9]{3}");
|
||||
|
||||
if(singleRe.test(doc.location.href)) {
|
||||
|
@ -1468,7 +1468,7 @@ REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006
|
|||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006-06-26 16:01:00', 4, 'Dynix Scraper', 'Simon Kornblith', 'ipac\.jsp\?.*(?:uri=full=[0-9]|menu=search)',
|
||||
'function detect(doc, url) {
|
||||
'function detectWeb(doc, url) {
|
||||
var detailsRe = new RegExp(''ipac\.jsp\?.*uri=full=[0-9]'');
|
||||
if(detailsRe.test(doc.location.href)) {
|
||||
return "book";
|
||||
|
@ -1556,7 +1556,7 @@ REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006
|
|||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006-06-26 16:01:00', 4, 'VTLS Scraper', 'Simon Kornblith', '/chameleon(?:\?|$)',
|
||||
'function detect(doc, url) {
|
||||
'function detectWeb(doc, url) {
|
||||
var node = Scholar.Utilities.getNode(doc, doc, ''//tr[@class="intrRow"]/td/table/tbody/tr[th]'', null);
|
||||
if(node) {
|
||||
return "multiple";
|
||||
|
@ -1660,7 +1660,7 @@ REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006
|
|||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006-06-26 16:01:00', 4, 'DRA Scraper', 'Simon Kornblith', '/web2/tramp2\.exe/(?:see\_record/|authority\_hits/|goto/.*\?.*screen=Record\.html)',
|
||||
'function detect(doc, url) {
|
||||
'function detectWeb(doc, url) {
|
||||
if(doc.location.href.indexOf("/authority_hits") > 0) {
|
||||
return "multiple";
|
||||
} else {
|
||||
|
@ -1730,7 +1730,7 @@ REPLACE INTO "translators" VALUES ('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006
|
|||
|
||||
|
||||
REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 4, 'GEAC Scraper', 'Simon Kornblith', '/(?:GeacQUERY|(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))',
|
||||
'function detect(doc, url) {
|
||||
'function detectWeb(doc, url) {
|
||||
if(doc.location.href.indexOf("/GeacQUERY") > 0) {
|
||||
return "multiple";
|
||||
} else {
|
||||
|
@ -1818,7 +1818,7 @@ REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006
|
|||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-26 16:01:00', 4, 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
|
||||
'function detect(doc, url) {
|
||||
'function detectWeb(doc, url) {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
|
@ -1954,7 +1954,7 @@ REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006
|
|||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-26 16:01:00', 4, 'TLC/YouSeeMore Scraper', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?(?:.*LabelDisplay.*RecordNumber=[0-9]|Search|ItemTitles)',
|
||||
'function detect(doc, url) {
|
||||
'function detectWeb(doc, url) {
|
||||
var detailRe = new RegExp("TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]");
|
||||
if(detailRe.test(doc.location.href)) {
|
||||
return "book";
|
||||
|
@ -2052,7 +2052,7 @@ REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006
|
|||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '2006-06-26 16:01:00', 4, 'Project MUSE Scraper', 'Simon Kornblith', '^http://muse\.jhu\.edu/(?:journals/[^/]+/[^/]+/[^/]+\.html|search/pia.cgi)',
|
||||
'function detect(doc, url) {
|
||||
'function detectWeb(doc, url) {
|
||||
var searchRe = new RegExp("^http://[^/]+/search/pia\.cgi");
|
||||
if(searchRe.test(url)) {
|
||||
return "multiple";
|
||||
|
@ -2163,48 +2163,37 @@ REPLACE INTO "translators" VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '2006
|
|||
}
|
||||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-06-26 16:01:00', 4, 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)',
|
||||
'function detect(doc, url) {
|
||||
REPLACE INTO "translators" VALUES ('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-06-26 16:01:00', 12, 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)',
|
||||
'function detectWeb(doc, url) {
|
||||
if(doc.location.href.indexOf("list_uids=") >= 0) {
|
||||
return "journalArticle";
|
||||
} else {
|
||||
return "multiple";
|
||||
}
|
||||
}',
|
||||
'function doWeb(doc, url) {
|
||||
var uri = doc.location.href;
|
||||
var ids = new Array();
|
||||
var idRegexp = /[\?\&]list_uids=([0-9\,]+)/;
|
||||
|
||||
var m = idRegexp.exec(uri);
|
||||
if(m) {
|
||||
ids.push(m[1]);
|
||||
} else {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
var items = new Array();
|
||||
var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//div[@class="ResultSet"]/table/tbody'', nsResolver);
|
||||
// Go through table rows
|
||||
for(var i=0; i<tableRows.length; i++) {
|
||||
var link = Scholar.Utilities.getNode(doc, tableRows[i], ''.//a'', nsResolver);
|
||||
var article = Scholar.Utilities.getNode(doc, tableRows[i], ''./tr[2]/td[2]/text()[1]'', nsResolver);
|
||||
items[link.href] = article.nodeValue;
|
||||
}
|
||||
|
||||
items = Scholar.selectItems(items);
|
||||
|
||||
if(!items) {
|
||||
return true;
|
||||
}
|
||||
|
||||
for(var i in items) {
|
||||
var m = idRegexp.exec(i);
|
||||
ids.push(m[1]);
|
||||
}
|
||||
|
||||
/*
 * Extracts a PubMed ID from an OpenURL contextObject
 *
 * co is the contextObject as a string of "key=value" pairs joined by "&"
 *
 * returns the text that follows "info:pmid/" in the first rft_id value that
 * starts with that scheme, or null when no rft_id carries a PMID
 */
function getPMID(co) {
	if(!co) {
		return null;
	}
	
	var keyPrefix = "rft_id=";
	var scheme = "info:pmid/";
	var coParts = co.split("&");
	
	// indexed loop with declared locals; the earlier "for each(part in ...)"
	// form created "part" as an implicit global
	for(var i=0; i<coParts.length; i++) {
		var part = coParts[i];
		if(part.substr(0, keyPrefix.length) == keyPrefix) {
			// values are URL-escaped, so decode before checking the scheme
			var value = unescape(part.substr(keyPrefix.length));
			if(value.substr(0, scheme.length) == scheme) {
				return value.substr(scheme.length);
			}
		}
	}
	
	return null;
}
|
||||
|
||||
function detectSearch(item) {
|
||||
if(item.contextObject) {
|
||||
if(getPMID(item.contextObject)) {
|
||||
return "journalArticle";
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}',
|
||||
'function lookupPMIDs(ids) {
|
||||
Scholar.wait();
|
||||
|
||||
var newUri = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=PubMed&retmode=xml&rettype=citation&id="+ids.join(",");
|
||||
Scholar.Utilities.HTTPUtilities.doGet(newUri, null, function(text) {
|
||||
|
@ -2283,13 +2272,54 @@ REPLACE INTO "translators" VALUES ('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006
|
|||
}
|
||||
|
||||
Scholar.done();
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
function doWeb(doc, url) {
|
||||
var uri = doc.location.href;
|
||||
var ids = new Array();
|
||||
var idRegexp = /[\?\&]list_uids=([0-9\,]+)/;
|
||||
|
||||
Scholar.wait();
|
||||
var m = idRegexp.exec(uri);
|
||||
if(m) {
|
||||
ids.push(m[1]);
|
||||
} else {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
var items = new Array();
|
||||
var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//div[@class="ResultSet"]/table/tbody'', nsResolver);
|
||||
// Go through table rows
|
||||
for(var i=0; i<tableRows.length; i++) {
|
||||
var link = Scholar.Utilities.getNode(doc, tableRows[i], ''.//a'', nsResolver);
|
||||
var article = Scholar.Utilities.getNode(doc, tableRows[i], ''./tr[2]/td[2]/text()[1]'', nsResolver);
|
||||
items[link.href] = article.nodeValue;
|
||||
}
|
||||
|
||||
items = Scholar.selectItems(items);
|
||||
|
||||
if(!items) {
|
||||
return true;
|
||||
}
|
||||
|
||||
for(var i in items) {
|
||||
var m = idRegexp.exec(i);
|
||||
ids.push(m[1]);
|
||||
}
|
||||
}
|
||||
|
||||
lookupPMIDs(ids);
|
||||
}
|
||||
|
||||
function doSearch(item) {
|
||||
// pmid was defined earlier in detectSearch
|
||||
lookupPMIDs([getPMID(item.contextObject)]);
|
||||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006-06-26 16:41:00', 4, 'Embedded RDF Scraper', 'Simon Kornblith', NULL,
|
||||
'function detect(doc, url) {
|
||||
'function detectWeb(doc, url) {
|
||||
var metaTags = doc.getElementsByTagName("meta");
|
||||
|
||||
for(var i=0; i<metaTags.length; i++) {
|
||||
|
@ -2333,7 +2363,7 @@ REPLACE INTO "translators" VALUES ('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006
|
|||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('05d07af9-105a-4572-99f6-a8e231c0daef', '2006-08-07 01:09:00', 4, 'COinS Scraper', 'Simon Kornblith', NULL,
|
||||
'function detect(doc, url) {
|
||||
'function detectWeb(doc, url) {
|
||||
var spanTags = doc.getElementsByTagName("span");
|
||||
|
||||
var encounteredType = false;
|
||||
|
@ -2348,11 +2378,11 @@ REPLACE INTO "translators" VALUES ('05d07af9-105a-4572-99f6-a8e231c0daef', '2006
|
|||
// determine if it''s a valid type
|
||||
var coParts = spanTitle.split("&");
|
||||
var type = null
|
||||
for(var i in coParts) {
|
||||
if(coParts[i].substr(0, 12) == "rft_val_fmt=") {
|
||||
var format = unescape(coParts[i].substr(12));
|
||||
for(var j in coParts) {
|
||||
if(coParts[j].substr(0, 12) == "rft_val_fmt=") {
|
||||
var format = unescape(coParts[j].substr(12));
|
||||
if(format == "info:ofi/fmt:kev:mtx:journal") {
|
||||
var type = "journal";
|
||||
var type = "journalArticle";
|
||||
} else if(format == "info:ofi/fmt:kev:mtx:book") {
|
||||
if(Scholar.Utilities.inArray("rft.genre=bookitem", coParts)) {
|
||||
var type = "bookSection";
|
||||
|
@ -2384,45 +2414,43 @@ function retrieveNextCOinS(needFullItems, newItems) {
|
|||
var item = needFullItems.shift();
|
||||
|
||||
Scholar.Utilities.debugPrint("looking up contextObject");
|
||||
Scholar.Utilities.lookupContextObject(item.contextObject, function(items) {
|
||||
Scholar.Utilities.debugPrint(items);
|
||||
if(items) {
|
||||
newItems = newItems.concat(items);
|
||||
}
|
||||
retrieveNextCOinS(needFullItems, newItems);
|
||||
}, function() {
|
||||
Scholar.done(false);
|
||||
var search = Scholar.loadTranslator("search");
|
||||
search.setHandler("itemDone", function(obj, item) {
|
||||
newItems.push(item);
|
||||
});
|
||||
search.setHandler("done", function() {
|
||||
retrieveNextCOinS(needFullItems, newItems);
|
||||
});
|
||||
search.setItem(item);
|
||||
|
||||
// look for translators
|
||||
var translators = search.getTranslators();
|
||||
if(translators) {
|
||||
search.setTranslator(translators);
|
||||
search.translate();
|
||||
} else {
|
||||
retrieveNextCOinS(needFullItems, newItems);
|
||||
}
|
||||
} else {
|
||||
completeCOinS(newItems);
|
||||
Scholar.done(true);
|
||||
}
|
||||
}
|
||||
|
||||
// copies every enumerable property of itemArray onto a fresh Scholar.Item and
// saves it (data returned from Scholar.OpenURL.processContextObject does not
// have a complete() method, so it cannot be saved directly)
function addAsItem(itemArray) {
	var wrapped = new Scholar.Item();
	for(var field in itemArray) {
		wrapped[field] = itemArray[field];
	}
	wrapped.complete();
}
|
||||
|
||||
// saves all COinS objects
//
// newItems is an array of items that each provide a title and a complete()
// method. With more than one item, the titles are offered to the user through
// Scholar.selectItems() and only the chosen items are saved; a single item is
// saved directly; an empty array does nothing.
function completeCOinS(newItems) {
	if(newItems.length > 1) {
		var selectArray = new Array();
		
		// titles are keyed by item index so that the keys returned by
		// selectItems() can be mapped back to newItems
		for(var i in newItems) {
			selectArray[i] = newItems[i].title;
		}
		selectArray = Scholar.selectItems(selectArray);
		for(var i in selectArray) {
			// each item is saved exactly once through its own complete()
			newItems[i].complete();
		}
	} else if(newItems.length) {
		newItems[0].complete();
	}
}
|
||||
|
||||
|
@ -2438,8 +2466,8 @@ function doWeb(doc, url) {
|
|||
var spanClasses = spanClass.split(" ");
|
||||
if(Scholar.Utilities.inArray("Z3988", spanClasses)) {
|
||||
var spanTitle = spanTags[i].getAttribute("title");
|
||||
var newItem = Scholar.Utilities.parseContextObject(spanTitle);
|
||||
if(newItem) {
|
||||
var newItem = new Scholar.Item();
|
||||
if(Scholar.Utilities.parseContextObject(spanTitle, newItem)) {
|
||||
if(newItem.title && newItem.creators.length) {
|
||||
// title and creators are minimum data to avoid looking up
|
||||
newItems.push(newItem);
|
||||
|
@ -2463,7 +2491,7 @@ function doWeb(doc, url) {
|
|||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006-06-26 16:01:00', 4, 'Google Books Scraper', 'Simon Kornblith', '^http://books\.google\.com/books\?(.*vid=.*\&id=.*|.*q=.*)',
|
||||
'function detect(doc, url) {
|
||||
'function detectWeb(doc, url) {
|
||||
var re = new RegExp(''^http://books\\.google\\.com/books\\?vid=([^&]+).*\\&id=([^&]+)'', ''i'');
|
||||
if(re.test(doc.location.href)) {
|
||||
return "book";
|
||||
|
@ -2553,6 +2581,161 @@ REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006
|
|||
Scholar.wait();
|
||||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('e07e9b8c-0e98-4915-bb5a-32a08cb2f365', '2006-08-07 11:36:00', 8, 'Open WorldCat', 'Simon Kornblith', 'http://partneraccess.oclc.org/',
|
||||
'function detectSearch(item) {
|
||||
if(item.itemType == "book" || item.itemType == "bookSection") {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}',
|
||||
'// creates an item from an Open WorldCat document
|
||||
function processOWC(doc) {
|
||||
var spanTags = doc.getElementsByTagName("span");
|
||||
for(var i=0; i<spanTags.length; i++) {
|
||||
var spanClass = spanTags[i].getAttribute("class");
|
||||
if(spanClass) {
|
||||
var spanClasses = spanClass.split(" ");
|
||||
if(Scholar.Utilities.inArray("Z3988", spanClasses)) {
|
||||
var spanTitle = spanTags[i].getAttribute("title");
|
||||
var item = new Scholar.Item();
|
||||
if(Scholar.Utilities.parseContextObject(spanTitle, item)) {
|
||||
item.complete();
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
function doSearch(item) {
|
||||
if(item.contextObject) {
|
||||
var co = item.contextObject;
|
||||
} else {
|
||||
var co = Scholar.Utilities.createContextObject(item);
|
||||
}
|
||||
|
||||
Scholar.Utilities.processDocuments(null, ["http://partneraccess.oclc.org/wcpa/servlet/OpenUrl?"+co], function(browser) {
|
||||
var doc = browser.contentDocument;
|
||||
// find new COinS in the Open WorldCat page
|
||||
if(processOWC(doc)) { // we got a single item page
|
||||
Scholar.done();
|
||||
} else { // assume we have a search results page
|
||||
var items = new Array();
|
||||
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
// first try to get only books
|
||||
var elmts = doc.evaluate(''//table[@class="tableLayout"]/tbody/tr/td[@class="content"]/table[@class="tableResults"]/tbody/tr[td/img[@alt="Book"]]/td/div[@class="title"]/a'', doc, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);
|
||||
var elmt = elmts.iterateNext();
|
||||
if(!elmt) { // if that fails, look for other options
|
||||
var elmts = doc.evaluate(''//table[@class="tableLayout"]/tbody/tr/td[@class="content"]/table[@class="tableResults"]/tbody/tr[td/img[@alt="Book"]]/td/div[@class="title"]/a'', doc, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);
|
||||
elmt = elmts.iterateNext()
|
||||
}
|
||||
|
||||
var urlsToProcess = new Array();
|
||||
do {
|
||||
urlsToProcess.push(elmt.href);
|
||||
} while(elmt = elmts.iterateNext());
|
||||
|
||||
Scholar.Utilities.processDocuments(null, urlsToProcess, function(browser) {
|
||||
// per URL
|
||||
processOWC(browser.contentDocument);
|
||||
}, function() { // done
|
||||
Scholar.done();
|
||||
}, function() { // error
|
||||
Scholar.done(false);
|
||||
});
|
||||
}
|
||||
}, null, function() {
|
||||
error();
|
||||
});
|
||||
|
||||
Scholar.wait();
|
||||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('11645bd1-0420-45c1-badb-53fb41eeb753', '2006-08-07 18:17:00', 8, 'CrossRef', 'Simon Kornblith', 'http://partneraccess.oclc.org/',
|
||||
'function detectSearch(item) {
|
||||
if(item.itemType == "journal") {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}',
|
||||
'function processCrossRef(xmlOutput) {
|
||||
xmlOutput = xmlOutput.replace(/<\?xml[^>]*\?>/, "");
|
||||
|
||||
// parse XML with E4X
|
||||
var qr = new Namespace("http://www.crossref.org/qrschema/2.0");
|
||||
try {
|
||||
var xml = new XML(xmlOutput);
|
||||
} catch(e) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// ensure status is valid
|
||||
var status = xml.qr::query_result.qr::body.qr::query.@status.toString();
|
||||
if(status != "resolved" && status != "multiresolved") {
|
||||
return false;
|
||||
}
|
||||
|
||||
var query = xml.qr::query_result.qr::body.qr::query;
|
||||
var item = new Scholar.Item("journalArticle");
|
||||
|
||||
// try to get a DOI
|
||||
item.DOI = query.qr::doi.(@type=="journal_article").text().toString();
|
||||
if(!item.DOI) {
|
||||
item.DOI = query.qr::doi.(@type=="book_title").text().toString();
|
||||
}
|
||||
if(!item.DOI) {
|
||||
item.DOI = query.qr::doi.(@type=="book_content").text().toString();
|
||||
}
|
||||
|
||||
// try to get an ISSN (no print/electronic preferences)
|
||||
item.ISSN = query.qr::issn[0].text().toString();
|
||||
// get title
|
||||
item.title = query.qr::article_title.text().toString();
|
||||
// get publicationTitle
|
||||
item.publicationTitle = query.qr::journal_title.text().toString();
|
||||
// get author
|
||||
item.creators.push(Scholar.Utilities.cleanAuthor(query.qr::author.text().toString(), "author", true));
|
||||
// get volume
|
||||
item.volume = query.qr::volume.text().toString();
|
||||
// get issue
|
||||
item.issue = query.qr::issue.text().toString();
|
||||
// get year
|
||||
item.date = query.qr::year.text().toString();
|
||||
// get edition
|
||||
item.edition = query.qr::edition_number.text().toString();
|
||||
// get first page
|
||||
item.pages = query.qr::first_page.text().toString();
|
||||
item.complete();
|
||||
return true;
|
||||
}
|
||||
|
||||
function doSearch(item) {
|
||||
if(item.contextObject) {
|
||||
var co = item.contextObject;
|
||||
if(co.indexOf("url_ver=") == -1) {
|
||||
co = "url_ver=Z39.88-2004"+co;
|
||||
}
|
||||
} else {
|
||||
var co = Scholar.Utilities.createContextObject(item);
|
||||
}
|
||||
|
||||
Scholar.Utilities.HTTPUtilities.doGet("http://www.crossref.org/openurl/?"+co+"&noredirect=true", null, function(responseText) {
|
||||
processCrossRef(responseText);
|
||||
Scholar.done();
|
||||
});
|
||||
|
||||
Scholar.wait();
|
||||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('0e2235e7-babf-413c-9acf-f27cce5f059c', '2006-07-05 23:40:00', 3, 'MODS (XML)', 'Simon Kornblith', 'xml',
|
||||
'Scholar.addOption("exportNotes", true);
|
||||
Scholar.addOption("exportFileData", true);',
|
||||
|
|
Loading…
Reference in New Issue
Block a user