Closes #187: make Berkeley's library work.
Closes #186: stop translators from hanging when a document loads inside a frameset; we now check whether we can scrape each individual frame. All functions involving tabs have been vastly simplified because, in the process of figuring this out, I discovered Firefox 2's new tab events. If a translator throws an exception inside loadDocument(), doGet(), doPost(), or processDocuments(), a translate error message will appear and the translator will not hang.
This commit is contained in:
parent
009a4ad520
commit
51108446e3
|
@ -23,7 +23,6 @@ var Scholar_Ingester_Interface = function() {}
|
||||||
* loading
|
* loading
|
||||||
*/
|
*/
|
||||||
Scholar_Ingester_Interface.init = function() {
|
Scholar_Ingester_Interface.init = function() {
|
||||||
Scholar_Ingester_Interface.browsers = new Array();
|
|
||||||
Scholar_Ingester_Interface.browserData = new Object();
|
Scholar_Ingester_Interface.browserData = new Object();
|
||||||
Scholar_Ingester_Interface._scrapePopupShowing = false;
|
Scholar_Ingester_Interface._scrapePopupShowing = false;
|
||||||
Scholar.Ingester.ProxyMonitor.init();
|
Scholar.Ingester.ProxyMonitor.init();
|
||||||
|
@ -42,8 +41,10 @@ Scholar_Ingester_Interface.chromeLoad = function() {
|
||||||
Scholar_Ingester_Interface.statusImage = document.getElementById("scholar-status-image");
|
Scholar_Ingester_Interface.statusImage = document.getElementById("scholar-status-image");
|
||||||
|
|
||||||
// this gives us onLocationChange, for updating when tabs are switched/created
|
// this gives us onLocationChange, for updating when tabs are switched/created
|
||||||
Scholar_Ingester_Interface.tabBrowser.addProgressListener(Scholar_Ingester_Interface.Listener,
|
Scholar_Ingester_Interface.tabBrowser.addEventListener("TabClose",
|
||||||
Components.interfaces.nsIWebProgress.NOTIFY_LOCATION);
|
Scholar_Ingester_Interface.tabClose, false);
|
||||||
|
Scholar_Ingester_Interface.tabBrowser.addEventListener("TabSelect",
|
||||||
|
Scholar_Ingester_Interface.tabSelect, false);
|
||||||
// this is for pageshow, for updating the status of the book icon
|
// this is for pageshow, for updating the status of the book icon
|
||||||
Scholar_Ingester_Interface.appContent.addEventListener("pageshow",
|
Scholar_Ingester_Interface.appContent.addEventListener("pageshow",
|
||||||
Scholar_Ingester_Interface.contentLoad, true);
|
Scholar_Ingester_Interface.contentLoad, true);
|
||||||
|
@ -53,8 +54,7 @@ Scholar_Ingester_Interface.chromeLoad = function() {
|
||||||
* When chrome unloads, delete our document objects and remove our listeners
|
* When chrome unloads, delete our document objects and remove our listeners
|
||||||
*/
|
*/
|
||||||
Scholar_Ingester_Interface.chromeUnload = function() {
|
Scholar_Ingester_Interface.chromeUnload = function() {
|
||||||
delete Scholar_Ingester_Interface.browserData, Scholar_Ingester_Interface.browsers;
|
delete Scholar_Ingester_Interface.browserData;
|
||||||
this.tabBrowser.removeProgressListener(this);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -77,7 +77,7 @@ Scholar_Ingester_Interface.scrapeThisPage = function(saveLocation) {
|
||||||
}
|
}
|
||||||
|
|
||||||
var translate = new Scholar.Translate("web");
|
var translate = new Scholar.Translate("web");
|
||||||
translate.setBrowser(browser);
|
translate.setDocument(data.document);
|
||||||
// use first translator available
|
// use first translator available
|
||||||
translate.setTranslator(data.translators[0]);
|
translate.setTranslator(data.translators[0]);
|
||||||
translate.setHandler("select", Scholar_Ingester_Interface._selectItems);
|
translate.setHandler("select", Scholar_Ingester_Interface._selectItems);
|
||||||
|
@ -90,86 +90,69 @@ Scholar_Ingester_Interface.scrapeThisPage = function(saveLocation) {
|
||||||
/*
|
/*
|
||||||
* An event handler called when a new document is loaded. Creates a new document
|
* An event handler called when a new document is loaded. Creates a new document
|
||||||
* object, and updates the status of the capture icon
|
* object, and updates the status of the capture icon
|
||||||
|
|
||||||
*/
|
*/
|
||||||
Scholar_Ingester_Interface.contentLoad = function(event) {
|
Scholar_Ingester_Interface.contentLoad = function(event) {
|
||||||
if (event.originalTarget instanceof HTMLDocument) {
|
if(event.originalTarget instanceof HTMLDocument) {
|
||||||
// Stolen off the Mozilla extension developer's website, a routine to
|
var doc = event.originalTarget;
|
||||||
// determine the root document loaded from a frameset
|
var rootDoc = doc;
|
||||||
if (event.originalTarget.defaultView.frameElement) {
|
|
||||||
var doc = event.originalTarget;
|
// get the appropriate root document to check which browser we're on
|
||||||
while (doc.defaultView.frameElement) {
|
Scholar.debug("getting root document");
|
||||||
doc=doc.defaultView.frameElement.ownerDocument;
|
while(rootDoc.defaultView.frameElement) {
|
||||||
}
|
rootDoc = rootDoc.defaultView.frameElement.ownerDocument;
|
||||||
// Frame within a tab was loaded. doc is the root document of the frameset
|
|
||||||
} else {
|
|
||||||
var doc = event.originalTarget;
|
|
||||||
// Page was loaded. doc is the document that loaded.
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Figure out what browser this contentDocument is associated with
|
// Figure out what browser this contentDocument is associated with
|
||||||
var browser;
|
var browser;
|
||||||
|
Scholar.debug("getting browser");
|
||||||
for(var i=0; i<Scholar_Ingester_Interface.tabBrowser.browsers.length; i++) {
|
for(var i=0; i<Scholar_Ingester_Interface.tabBrowser.browsers.length; i++) {
|
||||||
if(doc == Scholar_Ingester_Interface.tabBrowser.browsers[i].contentDocument) {
|
if(rootDoc == Scholar_Ingester_Interface.tabBrowser.browsers[i].contentDocument) {
|
||||||
browser = Scholar_Ingester_Interface.tabBrowser.browsers[i];
|
browser = Scholar_Ingester_Interface.tabBrowser.browsers[i];
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(!browser) {
|
if(!browser) {
|
||||||
Scholar.debug("Could not find browser!");
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Scholar.debug("getting data");
|
||||||
// get data object
|
// get data object
|
||||||
var data = Scholar_Ingester_Interface._getData(browser);
|
var data = Scholar_Ingester_Interface._getData(browser);
|
||||||
|
|
||||||
|
// if there's already a scrapable page in the browser window, and it's
|
||||||
|
// still there, return
|
||||||
|
if(data.translators && data.translators.length && data.document.location) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
Scholar.debug("translating");
|
||||||
// get translators
|
// get translators
|
||||||
var translate = new Scholar.Translate("web");
|
var translate = new Scholar.Translate("web");
|
||||||
translate.setBrowser(browser);
|
translate.setDocument(doc);
|
||||||
data.translators = translate.getTranslators();
|
data.translators = translate.getTranslators();
|
||||||
// update status
|
// update status
|
||||||
Scholar_Ingester_Interface._updateStatus(data);
|
Scholar_Ingester_Interface._updateStatus(data);
|
||||||
|
// add document
|
||||||
|
if(data.translators && data.translators.length) {
|
||||||
|
data.document = doc;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Dummy event handlers for all the events we don't care about
|
* called when a tab is closed
|
||||||
*/
|
*/
|
||||||
Scholar_Ingester_Interface.Listener = function() {}
|
Scholar_Ingester_Interface.tabClose = function(event) {
|
||||||
Scholar_Ingester_Interface.Listener.onStatusChange = function() {}
|
// To execute if document object does not exist
|
||||||
Scholar_Ingester_Interface.Listener.onSecurityChange = function() {}
|
Scholar_Ingester_Interface._deleteData(event.target.linkedBrowser);
|
||||||
Scholar_Ingester_Interface.Listener.onProgressChange = function() {}
|
}
|
||||||
Scholar_Ingester_Interface.Listener.onStateChange = function() {}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* onLocationChange is called when tabs are switched. Use it to retrieve the
|
* called when a tab is switched
|
||||||
* appropriate status indicator for the current tab, and to free useless objects
|
|
||||||
*/
|
*/
|
||||||
Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) {
|
Scholar_Ingester_Interface.tabSelect = function(event) {
|
||||||
var browsers = Scholar_Ingester_Interface.tabBrowser.browsers;
|
|
||||||
|
|
||||||
// Remove document object of any browser that no longer exists
|
|
||||||
for (var i = 0; i < Scholar_Ingester_Interface.browsers.length; i++) {
|
|
||||||
var browser = Scholar_Ingester_Interface.browsers[i];
|
|
||||||
var exists = false;
|
|
||||||
|
|
||||||
for (var j = 0; j < browsers.length; j++) {
|
|
||||||
if (browser == browsers[j]) {
|
|
||||||
exists = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!exists) {
|
|
||||||
Scholar_Ingester_Interface.browsers.splice(i,1);
|
|
||||||
|
|
||||||
// To execute if document object does not exist
|
|
||||||
Scholar_Ingester_Interface._deleteDocument(browser);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var data = Scholar_Ingester_Interface._getData(Scholar_Ingester_Interface.tabBrowser.selectedBrowser);
|
var data = Scholar_Ingester_Interface._getData(Scholar_Ingester_Interface.tabBrowser.selectedBrowser);
|
||||||
Scholar_Ingester_Interface._updateStatus(data);
|
Scholar_Ingester_Interface._updateStatus(data);
|
||||||
|
|
||||||
// Make sure scrape progress is gone
|
// Make sure scrape progress is gone
|
||||||
Scholar_Ingester_Interface.Progress.kill();
|
Scholar_Ingester_Interface.Progress.kill();
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,8 +29,8 @@
|
||||||
* PUBLIC PROPERTIES:
|
* PUBLIC PROPERTIES:
|
||||||
*
|
*
|
||||||
* type - the text type of translator (set by constructor, should be read only)
|
* type - the text type of translator (set by constructor, should be read only)
|
||||||
* browser - the browser object to be used for web scraping (read-only; set
|
* document - the document object to be used for web scraping (read-only; set
|
||||||
* with setBrowser)
|
* with setDocument)
|
||||||
* translator - the translator currently in use (read-only; set with
|
* translator - the translator currently in use (read-only; set with
|
||||||
* setTranslator)
|
* setTranslator)
|
||||||
* location - the location of the target (read-only; set with setLocation)
|
* location - the location of the target (read-only; set with setLocation)
|
||||||
|
@ -115,9 +115,9 @@ Scholar.Translate = function(type, saveItem) {
|
||||||
/*
|
/*
|
||||||
* sets the browser to be used for web translation; also sets the location
|
* sets the browser to be used for web translation; also sets the location
|
||||||
*/
|
*/
|
||||||
Scholar.Translate.prototype.setBrowser = function(browser) {
|
Scholar.Translate.prototype.setDocument = function(doc) {
|
||||||
this.browser = browser;
|
this.document = doc;
|
||||||
this.setLocation(browser.contentDocument.location.href);
|
this.setLocation(doc.location.href);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -428,7 +428,7 @@ Scholar.Translate.prototype._generateSandbox = function() {
|
||||||
var sandboxURL = "";
|
var sandboxURL = "";
|
||||||
if(this.type == "web") {
|
if(this.type == "web") {
|
||||||
// use real URL, not proxied version, to create sandbox
|
// use real URL, not proxied version, to create sandbox
|
||||||
sandboxURL = this.browser.contentDocument.location.href;
|
sandboxURL = this.document.location.href;
|
||||||
} else {
|
} else {
|
||||||
// generate sandbox for search by extracting domain from translator
|
// generate sandbox for search by extracting domain from translator
|
||||||
// target, if one exists
|
// target, if one exists
|
||||||
|
@ -446,8 +446,8 @@ Scholar.Translate.prototype._generateSandbox = function() {
|
||||||
this._sandbox.Scholar = new Object();
|
this._sandbox.Scholar = new Object();
|
||||||
|
|
||||||
// add ingester utilities
|
// add ingester utilities
|
||||||
this._sandbox.Scholar.Utilities = new Scholar.Utilities.Ingester(this.locationIsProxied);
|
this._sandbox.Scholar.Utilities = new Scholar.Utilities.Ingester(this);
|
||||||
this._sandbox.Scholar.Utilities.HTTP = new Scholar.Utilities.Ingester.HTTP(this.locationIsProxied);
|
this._sandbox.Scholar.Utilities.HTTP = new Scholar.Utilities.Ingester.HTTP(this);
|
||||||
|
|
||||||
// set up selectItems handler
|
// set up selectItems handler
|
||||||
this._sandbox.Scholar.selectItems = function(options) { return me._selectItems(options) };
|
this._sandbox.Scholar.selectItems = function(options) { return me._selectItems(options) };
|
||||||
|
@ -584,7 +584,7 @@ Scholar.Translate.prototype._canTranslate = function(translator, ignoreExtension
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if(this.type == "web") {
|
if(this.type == "web") {
|
||||||
returnValue = this._sandbox.detectWeb(this.browser.contentDocument, this.location);
|
returnValue = this._sandbox.detectWeb(this.document, this.location);
|
||||||
} else if(this.type == "search") {
|
} else if(this.type == "search") {
|
||||||
returnValue = this._sandbox.detectSearch(this.search);
|
returnValue = this._sandbox.detectSearch(this.search);
|
||||||
} else if(this.type == "import") {
|
} else if(this.type == "import") {
|
||||||
|
@ -954,7 +954,7 @@ Scholar.Translate.prototype._runHandler = function(type, argument) {
|
||||||
*/
|
*/
|
||||||
Scholar.Translate.prototype._web = function() {
|
Scholar.Translate.prototype._web = function() {
|
||||||
try {
|
try {
|
||||||
this._sandbox.doWeb(this.browser.contentDocument, this.location);
|
this._sandbox.doWeb(this.document, this.location);
|
||||||
} catch(e) {
|
} catch(e) {
|
||||||
Scholar.debug(e+' in executing code for '+this.translator[0].label);
|
Scholar.debug(e+' in executing code for '+this.translator[0].label);
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -164,8 +164,8 @@ Scholar.Utilities.prototype.itemTypeExists = function(type) {
|
||||||
// Scholar.Utilities.Ingester extends Scholar.Utilities, offering additional
|
// Scholar.Utilities.Ingester extends Scholar.Utilities, offering additional
|
||||||
// classes relating to data extraction specifically from HTML documents.
|
// classes relating to data extraction specifically from HTML documents.
|
||||||
|
|
||||||
Scholar.Utilities.Ingester = function(proxiedURL) {
|
Scholar.Utilities.Ingester = function(translate, proxiedURL) {
|
||||||
this.proxiedURL = proxiedURL;
|
this.translate = translate;
|
||||||
}
|
}
|
||||||
|
|
||||||
Scholar.Utilities.Ingester.prototype = new Scholar.Utilities();
|
Scholar.Utilities.Ingester.prototype = new Scholar.Utilities();
|
||||||
|
@ -252,43 +252,62 @@ Scholar.Utilities.Ingester.prototype.parseContextObject = function(co, item) {
|
||||||
// Ingester adapters for Scholar.Utilities.HTTP to handle proxies
|
// Ingester adapters for Scholar.Utilities.HTTP to handle proxies
|
||||||
|
|
||||||
Scholar.Utilities.Ingester.prototype.loadDocument = function(url, succeeded, failed) {
|
Scholar.Utilities.Ingester.prototype.loadDocument = function(url, succeeded, failed) {
|
||||||
if(this.proxiedURL) {
|
this.processDocuments([ url ], succeeded, null, failed);
|
||||||
url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
|
|
||||||
}
|
|
||||||
Scholar.Utilities.HTTP.processDocuments(null, [ url ], succeeded, function() {}, failed);
|
|
||||||
}
|
}
|
||||||
Scholar.Utilities.Ingester.prototype.processDocuments = function(urls, processor, done, exception) {
|
Scholar.Utilities.Ingester.prototype.processDocuments = function(urls, processor, done, exception) {
|
||||||
if(this.proxiedURL) {
|
if(this.translate.locationIsProxied) {
|
||||||
for(i in urls) {
|
for(i in urls) {
|
||||||
urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]);
|
urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// unless the translator has proposed some way to handle an error, handle it
|
||||||
|
// by throwing a "scraping error" message
|
||||||
|
if(!exception) {
|
||||||
|
var translate = this.translate;
|
||||||
|
exception = function(e) {
|
||||||
|
Scholar.debug("an error occurred in code called by processDocuments: "+e);
|
||||||
|
translate._translationComplete(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Scholar.Utilities.HTTP.processDocuments(null, urls, processor, done, exception);
|
Scholar.Utilities.HTTP.processDocuments(null, urls, processor, done, exception);
|
||||||
}
|
}
|
||||||
|
|
||||||
Scholar.Utilities.Ingester.HTTP = function(proxiedURL) {
|
Scholar.Utilities.Ingester.HTTP = function(translate) {
|
||||||
this.proxiedURL = proxiedURL;
|
this.translate = translate;
|
||||||
}
|
}
|
||||||
|
|
||||||
Scholar.Utilities.Ingester.HTTP.prototype.doGet = function(url, onDone) {
|
Scholar.Utilities.Ingester.HTTP.prototype.doGet = function(url, onDone) {
|
||||||
if(this.proxiedURL) {
|
if(this.translate.locationIsProxied) {
|
||||||
url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
|
url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
|
||||||
}
|
}
|
||||||
Scholar.Utilities.HTTP.doGet(url, function(xmlhttp) { onDone(xmlhttp.responseText, xmlhttp) })
|
|
||||||
|
var translate = this.translate;
|
||||||
|
Scholar.Utilities.HTTP.doGet(url, function(xmlhttp) {
|
||||||
|
try {
|
||||||
|
onDone(xmlhttp.responseText, xmlhttp);
|
||||||
|
} catch(e) {
|
||||||
|
Scholar.debug("an error occurred in code called by doGet: "+e);
|
||||||
|
translate._translationComplete(false);
|
||||||
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
Scholar.Utilities.Ingester.HTTP.prototype.doPost = function(url, body, onDone) {
|
Scholar.Utilities.Ingester.HTTP.prototype.doPost = function(url, body, onDone) {
|
||||||
if(this.proxiedURL) {
|
if(this.translate.locationIsProxied) {
|
||||||
url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
|
url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
|
||||||
}
|
}
|
||||||
Scholar.Utilities.HTTP.doPost(url, body, function(xmlhttp) { onDone(xmlhttp.responseText, xmlhttp) })
|
|
||||||
}
|
|
||||||
|
|
||||||
Scholar.Utilities.Ingester.HTTP.prototype.doOptions = function(url, onDone) {
|
var translate = this.translate;
|
||||||
if(this.proxiedURL) {
|
Scholar.Utilities.HTTP.doPost(url, body, function(xmlhttp) {
|
||||||
url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
|
try {
|
||||||
}
|
onDone(xmlhttp.responseText, xmlhttp);
|
||||||
Scholar.Utilities.HTTP.doOptions(url, function(xmlhttp) { onDone(xmlhttp.responseText, xmlhttp) })
|
} catch(e) {
|
||||||
|
Scholar.debug("an error occurred in code called by doPost: "+e);
|
||||||
|
translate._translationComplete(false);
|
||||||
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// These are front ends for XMLHttpRequest. XMLHttpRequest can't actually be
|
// These are front ends for XMLHttpRequest. XMLHttpRequest can't actually be
|
||||||
|
@ -310,7 +329,7 @@ Scholar.Utilities.HTTP = new function() {
|
||||||
* doGet can be called as:
|
* doGet can be called as:
|
||||||
* Scholar.Utilities.HTTP.doGet(url, onDone)
|
* Scholar.Utilities.HTTP.doGet(url, onDone)
|
||||||
**/
|
**/
|
||||||
function doGet(url, onDone) {
|
function doGet(url, onDone, onError) {
|
||||||
Scholar.debug("HTTP GET "+url);
|
Scholar.debug("HTTP GET "+url);
|
||||||
if (this.browserIsOffline()){
|
if (this.browserIsOffline()){
|
||||||
return false;
|
return false;
|
||||||
|
@ -429,17 +448,14 @@ Scholar.Utilities.HTTP = new function() {
|
||||||
|
|
||||||
// Download complete
|
// Download complete
|
||||||
case 4:
|
case 4:
|
||||||
try {
|
if(onDone){
|
||||||
if (onDone){
|
onDone(xmlhttp);
|
||||||
onDone(xmlhttp);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch (e){
|
|
||||||
Scholar.debug(e, 2);
|
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Downloads and processes documents with processor()
|
// Downloads and processes documents with processor()
|
||||||
|
@ -456,62 +472,70 @@ Scholar.Utilities.HTTP.processDocuments = function(firstDoc, urls, processor, do
|
||||||
var hiddenBrowser = Scholar.Browser.createHiddenBrowser();
|
var hiddenBrowser = Scholar.Browser.createHiddenBrowser();
|
||||||
var prevUrl, url;
|
var prevUrl, url;
|
||||||
|
|
||||||
try {
|
if (urls.length == 0) {
|
||||||
if (urls.length == 0) {
|
if(firstDoc) {
|
||||||
if(firstDoc) {
|
processor(firstDoc, done);
|
||||||
processor(firstDoc, done);
|
} else {
|
||||||
} else {
|
done();
|
||||||
done();
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
return;
|
||||||
var urlIndex = -1;
|
|
||||||
var doLoad = function() {
|
|
||||||
urlIndex++;
|
|
||||||
if (urlIndex < urls.length) {
|
|
||||||
url = urls[urlIndex];
|
|
||||||
try {
|
|
||||||
Scholar.debug("loading "+url);
|
|
||||||
hiddenBrowser.loadURI(url);
|
|
||||||
} catch (e) {
|
|
||||||
Scholar.debug("Scholar.Utilities.Ingester.processDocuments doLoad: " + e, 2);
|
|
||||||
exception(e);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
hiddenBrowser.removeEventListener("load", onLoad, true);
|
|
||||||
if(!saveBrowser) {
|
|
||||||
Scholar.Browser.deleteHiddenBrowser(hiddenBrowser);
|
|
||||||
}
|
|
||||||
done();
|
|
||||||
}
|
|
||||||
};
|
|
||||||
var onLoad = function() {
|
|
||||||
Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded");
|
|
||||||
if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times
|
|
||||||
prevUrl = hiddenBrowser.contentDocument.location.href;
|
|
||||||
try {
|
|
||||||
processor(hiddenBrowser.contentDocument);
|
|
||||||
} catch (e) {
|
|
||||||
Scholar.debug("Scholar.Utilities.Ingester.processDocuments onLoad: " + e, 2);
|
|
||||||
exception(e);
|
|
||||||
}
|
|
||||||
doLoad();
|
|
||||||
}
|
|
||||||
};
|
|
||||||
var init = function() {
|
|
||||||
hiddenBrowser.addEventListener("load", onLoad, true);
|
|
||||||
|
|
||||||
if (firstDoc) {
|
|
||||||
processor(firstDoc, doLoad);
|
|
||||||
} else {
|
|
||||||
doLoad();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
init();
|
|
||||||
} catch (e) {
|
|
||||||
Scholar.debug("processDocuments: " + e);
|
|
||||||
exception(e);
|
|
||||||
}
|
}
|
||||||
|
var urlIndex = -1;
|
||||||
|
|
||||||
|
var removeListeners = function() {
|
||||||
|
hiddenBrowser.removeEventListener("load", onLoad, true);
|
||||||
|
if(!saveBrowser) {
|
||||||
|
Scholar.Browser.deleteHiddenBrowser(hiddenBrowser);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
var doLoad = function() {
|
||||||
|
urlIndex++;
|
||||||
|
if (urlIndex < urls.length) {
|
||||||
|
url = urls[urlIndex];
|
||||||
|
try {
|
||||||
|
Scholar.debug("loading "+url);
|
||||||
|
hiddenBrowser.loadURI(url);
|
||||||
|
} catch (e) {
|
||||||
|
removeListeners();
|
||||||
|
if(exception) {
|
||||||
|
exception(e);
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
|
throw(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
removeListeners();
|
||||||
|
done();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
var onLoad = function() {
|
||||||
|
Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded");
|
||||||
|
if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times
|
||||||
|
prevUrl = hiddenBrowser.contentDocument.location.href;
|
||||||
|
try {
|
||||||
|
processor(hiddenBrowser.contentDocument);
|
||||||
|
} catch (e) {
|
||||||
|
removeListeners();
|
||||||
|
if(exception) {
|
||||||
|
exception(e);
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
|
throw(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
doLoad();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
var init = function() {
|
||||||
|
hiddenBrowser.addEventListener("load", onLoad, true);
|
||||||
|
|
||||||
|
if (firstDoc) {
|
||||||
|
processor(firstDoc, doLoad);
|
||||||
|
} else {
|
||||||
|
doLoad();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
init();
|
||||||
}
|
}
|
127
scrapers.sql
127
scrapers.sql
|
@ -1,7 +1,7 @@
|
||||||
-- 48
|
-- 49
|
||||||
|
|
||||||
-- Set the following timestamp to the most recent scraper update date
|
-- Set the following timestamp to the most recent scraper update date
|
||||||
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-11 11:18:00'));
|
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-15 15:42:00'));
|
||||||
|
|
||||||
REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-08-11 11:18:00', 4, 'Amazon.com', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/|s/)',
|
REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-08-11 11:18:00', 4, 'Amazon.com', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/|s/)',
|
||||||
'function detectWeb(doc, url) {
|
'function detectWeb(doc, url) {
|
||||||
|
@ -112,7 +112,7 @@ function doWeb(doc, url) {
|
||||||
}
|
}
|
||||||
|
|
||||||
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
|
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
|
||||||
function() { Scholar.done(); }, function() {});
|
function() { Scholar.done(); }, null);
|
||||||
|
|
||||||
Scholar.wait();
|
Scholar.wait();
|
||||||
} else {
|
} else {
|
||||||
|
@ -646,7 +646,7 @@ function doWeb(doc, url) {
|
||||||
}
|
}
|
||||||
|
|
||||||
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
|
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
|
||||||
function() { Scholar.done(); }, function() {});
|
function() { Scholar.done(); }, null);
|
||||||
|
|
||||||
Scholar.wait();
|
Scholar.wait();
|
||||||
} else {
|
} else {
|
||||||
|
@ -763,7 +763,7 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006
|
||||||
newItem.complete();
|
newItem.complete();
|
||||||
|
|
||||||
Scholar.done();
|
Scholar.done();
|
||||||
}, function() {});
|
}, null);
|
||||||
} else { // Search results page
|
} else { // Search results page
|
||||||
// Require link to match this
|
// Require link to match this
|
||||||
var tagRegexp = new RegExp();
|
var tagRegexp = new RegExp();
|
||||||
|
@ -952,7 +952,7 @@ function doWeb(doc, url) {
|
||||||
}
|
}
|
||||||
|
|
||||||
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
|
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
|
||||||
function() { Scholar.done() }, function() {});
|
function() { Scholar.done() }, null);
|
||||||
|
|
||||||
Scholar.wait();
|
Scholar.wait();
|
||||||
}
|
}
|
||||||
|
@ -1127,7 +1127,7 @@ function doWeb(doc, url) {
|
||||||
}
|
}
|
||||||
|
|
||||||
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
|
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
|
||||||
function() { Scholar.done(); }, function() {});
|
function() { Scholar.done(); }, null);
|
||||||
|
|
||||||
Scholar.wait();
|
Scholar.wait();
|
||||||
} else {
|
} else {
|
||||||
|
@ -1136,7 +1136,7 @@ function doWeb(doc, url) {
|
||||||
if(m && (m[1] == "1" || m[1] == "2")) {
|
if(m && (m[1] == "1" || m[1] == "2")) {
|
||||||
scrape(doc);
|
scrape(doc);
|
||||||
} else if(m) {
|
} else if(m) {
|
||||||
Scholar.Utilities.loadDocument(doc.location.href.replace("Fmt="+m[1], "Fmt=1"), function(doc) { scrape(doc); Scholar.done(); }, function() {});
|
Scholar.Utilities.loadDocument(doc.location.href.replace("Fmt="+m[1], "Fmt=1"), function(doc) { scrape(doc); Scholar.done(); }, null);
|
||||||
Scholar.wait();
|
Scholar.wait();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1366,7 +1366,7 @@ function doWeb(doc, url) {
|
||||||
}
|
}
|
||||||
|
|
||||||
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
|
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
|
||||||
function() { Scholar.done(); }, function() {});
|
function() { Scholar.done(); }, null);
|
||||||
|
|
||||||
Scholar.wait();
|
Scholar.wait();
|
||||||
}
|
}
|
||||||
|
@ -1457,7 +1457,7 @@ REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006
|
||||||
newItem.source = uri;
|
newItem.source = uri;
|
||||||
record.translate(newItem);
|
record.translate(newItem);
|
||||||
newItem.complete();
|
newItem.complete();
|
||||||
}, function() { Scholar.done(); }, function() {});
|
}, function() { Scholar.done(); }, null);
|
||||||
|
|
||||||
Scholar.wait();
|
Scholar.wait();
|
||||||
}');
|
}');
|
||||||
|
@ -1544,7 +1544,7 @@ REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006
|
||||||
newItem.source = uri;
|
newItem.source = uri;
|
||||||
record.translate(newItem);
|
record.translate(newItem);
|
||||||
newItem.complete();
|
newItem.complete();
|
||||||
}, function() { Scholar.done() }, function() {});
|
}, function() { Scholar.done() }, null);
|
||||||
|
|
||||||
Scholar.wait();
|
Scholar.wait();
|
||||||
}');
|
}');
|
||||||
|
@ -1647,7 +1647,7 @@ REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006
|
||||||
newItem.source = uri;
|
newItem.source = uri;
|
||||||
record.translate(newItem);
|
record.translate(newItem);
|
||||||
newItem.complete();
|
newItem.complete();
|
||||||
}, function(){ Scholar.done(); }, function() {});
|
}, function(){ Scholar.done(); }, null);
|
||||||
|
|
||||||
Scholar.wait();
|
Scholar.wait();
|
||||||
}');
|
}');
|
||||||
|
@ -1721,8 +1721,7 @@ REPLACE INTO "translators" VALUES ('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006
|
||||||
Scholar.wait();
|
Scholar.wait();
|
||||||
}');
|
}');
|
||||||
|
|
||||||
|
REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 4, 'GEAC', 'Simon Kornblith', '/(?:GeacQUERY|GeacFETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))',
|
||||||
REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 4, 'GEAC', 'Simon Kornblith', '/(?:GeacQUERY|(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))',
|
|
||||||
'function detectWeb(doc, url) {
|
'function detectWeb(doc, url) {
|
||||||
if(doc.location.href.indexOf("/GeacQUERY") > 0) {
|
if(doc.location.href.indexOf("/GeacQUERY") > 0) {
|
||||||
return "multiple";
|
return "multiple";
|
||||||
|
@ -1804,7 +1803,7 @@ REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006
|
||||||
newItem.source = uri;
|
newItem.source = uri;
|
||||||
record.translate(newItem);
|
record.translate(newItem);
|
||||||
newItem.complete();
|
newItem.complete();
|
||||||
}, function() { Scholar.done(); }, function() {});
|
}, function() { Scholar.done(); }, null);
|
||||||
|
|
||||||
Scholar.wait();
|
Scholar.wait();
|
||||||
}');
|
}');
|
||||||
|
@ -2037,7 +2036,7 @@ REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006
|
||||||
newItem.source = uri;
|
newItem.source = uri;
|
||||||
record.translate(newItem);
|
record.translate(newItem);
|
||||||
newItem.complete();
|
newItem.complete();
|
||||||
}, function() {Scholar.done(); }, function() {});
|
}, function() {Scholar.done(); }, null);
|
||||||
|
|
||||||
Scholar.wait();
|
Scholar.wait();
|
||||||
}');
|
}');
|
||||||
|
@ -2568,7 +2567,79 @@ REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
newItem.complete();
|
newItem.complete();
|
||||||
}, function() { Scholar.done(); }, function() {});
|
}, function() { Scholar.done(); }, null);
|
||||||
|
|
||||||
|
Scholar.wait();
|
||||||
|
}');
|
||||||
|
|
||||||
|
REPLACE INTO "translators" VALUES ('9c335444-a562-4f88-b291-607e8f46a9bb', '2006-08-15 15:42:00', 4, 'Berkeley Library', 'Simon Kornblith', '^http://[^/]*berkeley.edu[^/]*/WebZ/(?:html/results.html|FETCH)\?.*sessionid=',
|
||||||
|
'function detectWeb(doc, url) {
|
||||||
|
var resultsRegexp = /\/WebZ\/html\/results.html/i
|
||||||
|
if(resultsRegexp.test(url)) {
|
||||||
|
return "multiple";
|
||||||
|
} else {
|
||||||
|
return "book";
|
||||||
|
}
|
||||||
|
}',
|
||||||
|
'function reformURL(url) {
|
||||||
|
return url.replace(/fmtclass=[^&]*/, "")+":fmtclass=marc";
|
||||||
|
}
|
||||||
|
|
||||||
|
function doWeb(doc, url) {
|
||||||
|
var resultsRegexp = /\/WebZ\/html\/results.html/i
|
||||||
|
|
||||||
|
if(resultsRegexp.test(url)) {
|
||||||
|
var items = Scholar.Utilities.getItemArray(doc, doc, "/WebZ/FETCH", "^[0-9]*$");
|
||||||
|
items = Scholar.selectItems(items);
|
||||||
|
|
||||||
|
if(!items) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
var urls = new Array();
|
||||||
|
for(var i in items) {
|
||||||
|
urls.push(reformURL(i));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
var urls = [reformURL(url)];
|
||||||
|
}
|
||||||
|
|
||||||
|
var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973");
|
||||||
|
|
||||||
|
Scholar.Utilities.processDocuments(urls, function(newDoc) {
|
||||||
|
Scholar.Utilities.debug(newDoc.getElementsByTagName("body")[0].innerHTML);
|
||||||
|
var uri = newDoc.location.href;
|
||||||
|
|
||||||
|
var namespace = newDoc.documentElement.namespaceURI;
|
||||||
|
var nsResolver = namespace ? function(prefix) {
|
||||||
|
if (prefix == ''x'') return namespace; else return null;
|
||||||
|
} : null;
|
||||||
|
|
||||||
|
var elmts = newDoc.evaluate(''//table/tbody/tr[@valign="top"]'',
|
||||||
|
newDoc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||||
|
|
||||||
|
var record = new marc.MARC_Record();
|
||||||
|
while(elmt = elmts.iterateNext()) {
|
||||||
|
var field = Scholar.Utilities.superCleanString(doc.evaluate(''./TD[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue);
|
||||||
|
var value = doc.evaluate(''./TD[2]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
|
||||||
|
var ind1 = value[4];
|
||||||
|
var ind2 = value[6];
|
||||||
|
value = Scholar.Utilities.cleanString(value.substr(6)).
|
||||||
|
replace(/\$([a-z0-9]) /g, record.subfield_delimiter+"$1");
|
||||||
|
if(value[0] != record.subfield_delimiter) {
|
||||||
|
value = record.subfield_delimiter+"a"+value;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(field != 0) {
|
||||||
|
record.add_field(field, ind1, ind2, value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var newItem = new Scholar.Item();
|
||||||
|
newItem.source = uri;
|
||||||
|
record.translate(newItem);
|
||||||
|
newItem.complete();
|
||||||
|
}, function() { Scholar.done(); }, null);
|
||||||
|
|
||||||
Scholar.wait();
|
Scholar.wait();
|
||||||
}');
|
}');
|
||||||
|
@ -2644,9 +2715,7 @@ function doSearch(item) {
|
||||||
Scholar.done(false);
|
Scholar.done(false);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}, function() {
|
}, null);
|
||||||
error();
|
|
||||||
});
|
|
||||||
|
|
||||||
Scholar.wait();
|
Scholar.wait();
|
||||||
}');
|
}');
|
||||||
|
@ -4604,7 +4673,16 @@ MARC_Record.prototype.get_field_subfields = function(tag) { // returns a two-dim
|
||||||
}
|
}
|
||||||
|
|
||||||
MARC_Record.prototype.add_field = function(tag,ind1,ind2,value) { // adds a field to the record
|
MARC_Record.prototype.add_field = function(tag,ind1,ind2,value) { // adds a field to the record
|
||||||
if (tag.length != 3) { return false; }
|
/*if(tag.length != 3) {
|
||||||
|
return false;
|
||||||
|
}*/
|
||||||
|
|
||||||
|
if (tag.length < 3) {
|
||||||
|
tag = Scholar.Utilities.lpad(tag.toString(),"0",3);
|
||||||
|
} else if(tag.length > 3) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
var F = new this.MARC_field(this,tag,ind1,ind2,value);
|
var F = new this.MARC_field(this,tag,ind1,ind2,value);
|
||||||
// adds pointer to list of fields
|
// adds pointer to list of fields
|
||||||
this.variable_fields[this.variable_fields.length] = F;
|
this.variable_fields[this.variable_fields.length] = F;
|
||||||
|
@ -4666,9 +4744,11 @@ MARC_Record.prototype._clean = function(value) {
|
||||||
}
|
}
|
||||||
|
|
||||||
MARC_Record.prototype._associateDBField = function(item, fieldNo, part, fieldName, execMe, arg1, arg2) {
|
MARC_Record.prototype._associateDBField = function(item, fieldNo, part, fieldName, execMe, arg1, arg2) {
|
||||||
|
|
||||||
if(!part) {
|
if(!part) {
|
||||||
part = ''a'';
|
part = ''a'';
|
||||||
}
|
}
|
||||||
|
|
||||||
var field = this.get_field_subfields(fieldNo);
|
var field = this.get_field_subfields(fieldNo);
|
||||||
Scholar.Utilities.debug(''Found ''+field.length+'' matches for ''+fieldNo+part);
|
Scholar.Utilities.debug(''Found ''+field.length+'' matches for ''+fieldNo+part);
|
||||||
if(field) {
|
if(field) {
|
||||||
|
@ -4685,6 +4765,7 @@ MARC_Record.prototype._associateDBField = function(item, fieldNo, part, fieldNam
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(value) {
|
if(value) {
|
||||||
|
this._gotField = true;
|
||||||
value = this._clean(value);
|
value = this._clean(value);
|
||||||
|
|
||||||
if(execMe) {
|
if(execMe) {
|
||||||
|
@ -4807,6 +4888,10 @@ MARC_Record.prototype.translate = function(item) {
|
||||||
|
|
||||||
// Set type
|
// Set type
|
||||||
item.itemType = "book";
|
item.itemType = "book";
|
||||||
|
|
||||||
|
if(!this._gotField) {
|
||||||
|
throw("tried to create a marc record with no fields!");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
MARC_Record.prototype._trim = function(s) { // eliminates blanks from both sides
|
MARC_Record.prototype._trim = function(s) { // eliminates blanks from both sides
|
||||||
|
|
Loading…
Reference in New Issue
Block a user