closes #187, make Berkeley's library work

closes #186, stop translators from hanging

When a document loads inside a frameset, we now check whether each individual frame can be scraped.
All functions involving tabs have been vastly simplified because, in the process of figuring this out, I discovered Firefox 2's new tab events (TabClose and TabSelect).
If a translator throws an exception inside loadDocument(), doGet(), doPost(), or processDocuments(), a translation error message will appear, and the translator will not hang.
This commit is contained in:
Simon Kornblith 2006-08-15 19:46:42 +00:00
parent 009a4ad520
commit 51108446e3
4 changed files with 262 additions and 170 deletions

View File

@ -23,7 +23,6 @@ var Scholar_Ingester_Interface = function() {}
* loading * loading
*/ */
Scholar_Ingester_Interface.init = function() { Scholar_Ingester_Interface.init = function() {
Scholar_Ingester_Interface.browsers = new Array();
Scholar_Ingester_Interface.browserData = new Object(); Scholar_Ingester_Interface.browserData = new Object();
Scholar_Ingester_Interface._scrapePopupShowing = false; Scholar_Ingester_Interface._scrapePopupShowing = false;
Scholar.Ingester.ProxyMonitor.init(); Scholar.Ingester.ProxyMonitor.init();
@ -42,8 +41,10 @@ Scholar_Ingester_Interface.chromeLoad = function() {
Scholar_Ingester_Interface.statusImage = document.getElementById("scholar-status-image"); Scholar_Ingester_Interface.statusImage = document.getElementById("scholar-status-image");
// this gives us onLocationChange, for updating when tabs are switched/created // this gives us onLocationChange, for updating when tabs are switched/created
Scholar_Ingester_Interface.tabBrowser.addProgressListener(Scholar_Ingester_Interface.Listener, Scholar_Ingester_Interface.tabBrowser.addEventListener("TabClose",
Components.interfaces.nsIWebProgress.NOTIFY_LOCATION); Scholar_Ingester_Interface.tabClose, false);
Scholar_Ingester_Interface.tabBrowser.addEventListener("TabSelect",
Scholar_Ingester_Interface.tabSelect, false);
// this is for pageshow, for updating the status of the book icon // this is for pageshow, for updating the status of the book icon
Scholar_Ingester_Interface.appContent.addEventListener("pageshow", Scholar_Ingester_Interface.appContent.addEventListener("pageshow",
Scholar_Ingester_Interface.contentLoad, true); Scholar_Ingester_Interface.contentLoad, true);
@ -53,8 +54,7 @@ Scholar_Ingester_Interface.chromeLoad = function() {
* When chrome unloads, delete our document objects and remove our listeners * When chrome unloads, delete our document objects and remove our listeners
*/ */
Scholar_Ingester_Interface.chromeUnload = function() { Scholar_Ingester_Interface.chromeUnload = function() {
delete Scholar_Ingester_Interface.browserData, Scholar_Ingester_Interface.browsers; delete Scholar_Ingester_Interface.browserData;
this.tabBrowser.removeProgressListener(this);
} }
/* /*
@ -77,7 +77,7 @@ Scholar_Ingester_Interface.scrapeThisPage = function(saveLocation) {
} }
var translate = new Scholar.Translate("web"); var translate = new Scholar.Translate("web");
translate.setBrowser(browser); translate.setDocument(data.document);
// use first translator available // use first translator available
translate.setTranslator(data.translators[0]); translate.setTranslator(data.translators[0]);
translate.setHandler("select", Scholar_Ingester_Interface._selectItems); translate.setHandler("select", Scholar_Ingester_Interface._selectItems);
@ -90,86 +90,69 @@ Scholar_Ingester_Interface.scrapeThisPage = function(saveLocation) {
/* /*
* An event handler called when a new document is loaded. Creates a new document * An event handler called when a new document is loaded. Creates a new document
* object, and updates the status of the capture icon * object, and updates the status of the capture icon
*/ */
Scholar_Ingester_Interface.contentLoad = function(event) { Scholar_Ingester_Interface.contentLoad = function(event) {
if (event.originalTarget instanceof HTMLDocument) { if(event.originalTarget instanceof HTMLDocument) {
// Stolen off the Mozilla extension developer's website, a routine to var doc = event.originalTarget;
// determine the root document loaded from a frameset var rootDoc = doc;
if (event.originalTarget.defaultView.frameElement) {
var doc = event.originalTarget; // get the appropriate root document to check which browser we're on
while (doc.defaultView.frameElement) { Scholar.debug("getting root document");
doc=doc.defaultView.frameElement.ownerDocument; while(rootDoc.defaultView.frameElement) {
} rootDoc = rootDoc.defaultView.frameElement.ownerDocument;
// Frame within a tab was loaded. doc is the root document of the frameset
} else {
var doc = event.originalTarget;
// Page was loaded. doc is the document that loaded.
} }
// Figure out what browser this contentDocument is associated with // Figure out what browser this contentDocument is associated with
var browser; var browser;
Scholar.debug("getting browser");
for(var i=0; i<Scholar_Ingester_Interface.tabBrowser.browsers.length; i++) { for(var i=0; i<Scholar_Ingester_Interface.tabBrowser.browsers.length; i++) {
if(doc == Scholar_Ingester_Interface.tabBrowser.browsers[i].contentDocument) { if(rootDoc == Scholar_Ingester_Interface.tabBrowser.browsers[i].contentDocument) {
browser = Scholar_Ingester_Interface.tabBrowser.browsers[i]; browser = Scholar_Ingester_Interface.tabBrowser.browsers[i];
break; break;
} }
} }
if(!browser) { if(!browser) {
Scholar.debug("Could not find browser!");
return; return;
} }
Scholar.debug("getting data");
// get data object // get data object
var data = Scholar_Ingester_Interface._getData(browser); var data = Scholar_Ingester_Interface._getData(browser);
// if there's already a scrapable page in the browser window, and it's
// still there, return
if(data.translators && data.translators.length && data.document.location) {
return;
}
Scholar.debug("translating");
// get translators // get translators
var translate = new Scholar.Translate("web"); var translate = new Scholar.Translate("web");
translate.setBrowser(browser); translate.setDocument(doc);
data.translators = translate.getTranslators(); data.translators = translate.getTranslators();
// update status // update status
Scholar_Ingester_Interface._updateStatus(data); Scholar_Ingester_Interface._updateStatus(data);
// add document
if(data.translators && data.translators.length) {
data.document = doc;
}
} }
} }
/* /*
* Dummy event handlers for all the events we don't care about * called when a tab is closed
*/ */
Scholar_Ingester_Interface.Listener = function() {} Scholar_Ingester_Interface.tabClose = function(event) {
Scholar_Ingester_Interface.Listener.onStatusChange = function() {} // To execute if document object does not exist
Scholar_Ingester_Interface.Listener.onSecurityChange = function() {} Scholar_Ingester_Interface._deleteData(event.target.linkedBrowser);
Scholar_Ingester_Interface.Listener.onProgressChange = function() {} }
Scholar_Ingester_Interface.Listener.onStateChange = function() {}
/* /*
* onLocationChange is called when tabs are switched. Use it to retrieve the * called when a tab is switched
* appropriate status indicator for the current tab, and to free useless objects
*/ */
Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) { Scholar_Ingester_Interface.tabSelect = function(event) {
var browsers = Scholar_Ingester_Interface.tabBrowser.browsers;
// Remove document object of any browser that no longer exists
for (var i = 0; i < Scholar_Ingester_Interface.browsers.length; i++) {
var browser = Scholar_Ingester_Interface.browsers[i];
var exists = false;
for (var j = 0; j < browsers.length; j++) {
if (browser == browsers[j]) {
exists = true;
break;
}
}
if (!exists) {
Scholar_Ingester_Interface.browsers.splice(i,1);
// To execute if document object does not exist
Scholar_Ingester_Interface._deleteDocument(browser);
}
}
var data = Scholar_Ingester_Interface._getData(Scholar_Ingester_Interface.tabBrowser.selectedBrowser); var data = Scholar_Ingester_Interface._getData(Scholar_Ingester_Interface.tabBrowser.selectedBrowser);
Scholar_Ingester_Interface._updateStatus(data); Scholar_Ingester_Interface._updateStatus(data);
// Make sure scrape progress is gone // Make sure scrape progress is gone
Scholar_Ingester_Interface.Progress.kill(); Scholar_Ingester_Interface.Progress.kill();
} }

View File

@ -29,8 +29,8 @@
* PUBLIC PROPERTIES: * PUBLIC PROPERTIES:
* *
* type - the text type of translator (set by constructor, should be read only) * type - the text type of translator (set by constructor, should be read only)
* browser - the browser object to be used for web scraping (read-only; set * document - the document object to be used for web scraping (read-only; set
* with setBrowser) * with setDocument)
* translator - the translator currently in use (read-only; set with * translator - the translator currently in use (read-only; set with
* setTranslator) * setTranslator)
* location - the location of the target (read-only; set with setLocation) * location - the location of the target (read-only; set with setLocation)
@ -115,9 +115,9 @@ Scholar.Translate = function(type, saveItem) {
/* /*
* sets the browser to be used for web translation; also sets the location * sets the browser to be used for web translation; also sets the location
*/ */
Scholar.Translate.prototype.setBrowser = function(browser) { Scholar.Translate.prototype.setDocument = function(doc) {
this.browser = browser; this.document = doc;
this.setLocation(browser.contentDocument.location.href); this.setLocation(doc.location.href);
} }
/* /*
@ -428,7 +428,7 @@ Scholar.Translate.prototype._generateSandbox = function() {
var sandboxURL = ""; var sandboxURL = "";
if(this.type == "web") { if(this.type == "web") {
// use real URL, not proxied version, to create sandbox // use real URL, not proxied version, to create sandbox
sandboxURL = this.browser.contentDocument.location.href; sandboxURL = this.document.location.href;
} else { } else {
// generate sandbox for search by extracting domain from translator // generate sandbox for search by extracting domain from translator
// target, if one exists // target, if one exists
@ -446,8 +446,8 @@ Scholar.Translate.prototype._generateSandbox = function() {
this._sandbox.Scholar = new Object(); this._sandbox.Scholar = new Object();
// add ingester utilities // add ingester utilities
this._sandbox.Scholar.Utilities = new Scholar.Utilities.Ingester(this.locationIsProxied); this._sandbox.Scholar.Utilities = new Scholar.Utilities.Ingester(this);
this._sandbox.Scholar.Utilities.HTTP = new Scholar.Utilities.Ingester.HTTP(this.locationIsProxied); this._sandbox.Scholar.Utilities.HTTP = new Scholar.Utilities.Ingester.HTTP(this);
// set up selectItems handler // set up selectItems handler
this._sandbox.Scholar.selectItems = function(options) { return me._selectItems(options) }; this._sandbox.Scholar.selectItems = function(options) { return me._selectItems(options) };
@ -584,7 +584,7 @@ Scholar.Translate.prototype._canTranslate = function(translator, ignoreExtension
try { try {
if(this.type == "web") { if(this.type == "web") {
returnValue = this._sandbox.detectWeb(this.browser.contentDocument, this.location); returnValue = this._sandbox.detectWeb(this.document, this.location);
} else if(this.type == "search") { } else if(this.type == "search") {
returnValue = this._sandbox.detectSearch(this.search); returnValue = this._sandbox.detectSearch(this.search);
} else if(this.type == "import") { } else if(this.type == "import") {
@ -954,7 +954,7 @@ Scholar.Translate.prototype._runHandler = function(type, argument) {
*/ */
Scholar.Translate.prototype._web = function() { Scholar.Translate.prototype._web = function() {
try { try {
this._sandbox.doWeb(this.browser.contentDocument, this.location); this._sandbox.doWeb(this.document, this.location);
} catch(e) { } catch(e) {
Scholar.debug(e+' in executing code for '+this.translator[0].label); Scholar.debug(e+' in executing code for '+this.translator[0].label);
return false; return false;

View File

@ -164,8 +164,8 @@ Scholar.Utilities.prototype.itemTypeExists = function(type) {
// Scholar.Utilities.Ingester extends Scholar.Utilities, offering additional // Scholar.Utilities.Ingester extends Scholar.Utilities, offering additional
// classes relating to data extraction specifically from HTML documents. // classes relating to data extraction specifically from HTML documents.
Scholar.Utilities.Ingester = function(proxiedURL) { Scholar.Utilities.Ingester = function(translate, proxiedURL) {
this.proxiedURL = proxiedURL; this.translate = translate;
} }
Scholar.Utilities.Ingester.prototype = new Scholar.Utilities(); Scholar.Utilities.Ingester.prototype = new Scholar.Utilities();
@ -252,43 +252,62 @@ Scholar.Utilities.Ingester.prototype.parseContextObject = function(co, item) {
// Ingester adapters for Scholar.Utilities.HTTP to handle proxies // Ingester adapters for Scholar.Utilities.HTTP to handle proxies
Scholar.Utilities.Ingester.prototype.loadDocument = function(url, succeeded, failed) { Scholar.Utilities.Ingester.prototype.loadDocument = function(url, succeeded, failed) {
if(this.proxiedURL) { this.processDocuments([ url ], succeeded, null, failed);
url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
}
Scholar.Utilities.HTTP.processDocuments(null, [ url ], succeeded, function() {}, failed);
} }
Scholar.Utilities.Ingester.prototype.processDocuments = function(urls, processor, done, exception) { Scholar.Utilities.Ingester.prototype.processDocuments = function(urls, processor, done, exception) {
if(this.proxiedURL) { if(this.translate.locationIsProxied) {
for(i in urls) { for(i in urls) {
urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]); urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]);
} }
} }
// unless the translator has proposed some way to handle an error, handle it
// by throwing a "scraping error" message
if(!exception) {
var translate = this.translate;
exception = function(e) {
Scholar.debug("an error occurred in code called by processDocuments: "+e);
translate._translationComplete(false);
}
}
Scholar.Utilities.HTTP.processDocuments(null, urls, processor, done, exception); Scholar.Utilities.HTTP.processDocuments(null, urls, processor, done, exception);
} }
Scholar.Utilities.Ingester.HTTP = function(proxiedURL) { Scholar.Utilities.Ingester.HTTP = function(translate) {
this.proxiedURL = proxiedURL; this.translate = translate;
} }
Scholar.Utilities.Ingester.HTTP.prototype.doGet = function(url, onDone) { Scholar.Utilities.Ingester.HTTP.prototype.doGet = function(url, onDone) {
if(this.proxiedURL) { if(this.translate.locationIsProxied) {
url = Scholar.Ingester.ProxyMonitor.properToProxy(url); url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
} }
Scholar.Utilities.HTTP.doGet(url, function(xmlhttp) { onDone(xmlhttp.responseText, xmlhttp) })
var translate = this.translate;
Scholar.Utilities.HTTP.doGet(url, function(xmlhttp) {
try {
onDone(xmlhttp.responseText, xmlhttp);
} catch(e) {
Scholar.debug("an error occurred in code called by doGet: "+e);
translate._translationComplete(false);
}
})
} }
Scholar.Utilities.Ingester.HTTP.prototype.doPost = function(url, body, onDone) { Scholar.Utilities.Ingester.HTTP.prototype.doPost = function(url, body, onDone) {
if(this.proxiedURL) { if(this.translate.locationIsProxied) {
url = Scholar.Ingester.ProxyMonitor.properToProxy(url); url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
} }
Scholar.Utilities.HTTP.doPost(url, body, function(xmlhttp) { onDone(xmlhttp.responseText, xmlhttp) })
}
Scholar.Utilities.Ingester.HTTP.prototype.doOptions = function(url, onDone) { var translate = this.translate;
if(this.proxiedURL) { Scholar.Utilities.HTTP.doPost(url, body, function(xmlhttp) {
url = Scholar.Ingester.ProxyMonitor.properToProxy(url); try {
} onDone(xmlhttp.responseText, xmlhttp);
Scholar.Utilities.HTTP.doOptions(url, function(xmlhttp) { onDone(xmlhttp.responseText, xmlhttp) }) } catch(e) {
Scholar.debug("an error occurred in code called by doPost: "+e);
translate._translationComplete(false);
}
})
} }
// These are front ends for XMLHttpRequest. XMLHttpRequest can't actually be // These are front ends for XMLHttpRequest. XMLHttpRequest can't actually be
@ -310,7 +329,7 @@ Scholar.Utilities.HTTP = new function() {
* doGet can be called as: * doGet can be called as:
* Scholar.Utilities.HTTP.doGet(url, onDone) * Scholar.Utilities.HTTP.doGet(url, onDone)
**/ **/
function doGet(url, onDone) { function doGet(url, onDone, onError) {
Scholar.debug("HTTP GET "+url); Scholar.debug("HTTP GET "+url);
if (this.browserIsOffline()){ if (this.browserIsOffline()){
return false; return false;
@ -429,17 +448,14 @@ Scholar.Utilities.HTTP = new function() {
// Download complete // Download complete
case 4: case 4:
try { if(onDone){
if (onDone){ onDone(xmlhttp);
onDone(xmlhttp);
}
}
catch (e){
Scholar.debug(e, 2);
} }
break; break;
} }
} }
} }
// Downloads and processes documents with processor() // Downloads and processes documents with processor()
@ -456,62 +472,70 @@ Scholar.Utilities.HTTP.processDocuments = function(firstDoc, urls, processor, do
var hiddenBrowser = Scholar.Browser.createHiddenBrowser(); var hiddenBrowser = Scholar.Browser.createHiddenBrowser();
var prevUrl, url; var prevUrl, url;
try { if (urls.length == 0) {
if (urls.length == 0) { if(firstDoc) {
if(firstDoc) { processor(firstDoc, done);
processor(firstDoc, done); } else {
} else { done();
done();
}
return;
} }
return;
var urlIndex = -1;
var doLoad = function() {
urlIndex++;
if (urlIndex < urls.length) {
url = urls[urlIndex];
try {
Scholar.debug("loading "+url);
hiddenBrowser.loadURI(url);
} catch (e) {
Scholar.debug("Scholar.Utilities.Ingester.processDocuments doLoad: " + e, 2);
exception(e);
}
} else {
hiddenBrowser.removeEventListener("load", onLoad, true);
if(!saveBrowser) {
Scholar.Browser.deleteHiddenBrowser(hiddenBrowser);
}
done();
}
};
var onLoad = function() {
Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded");
if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times
prevUrl = hiddenBrowser.contentDocument.location.href;
try {
processor(hiddenBrowser.contentDocument);
} catch (e) {
Scholar.debug("Scholar.Utilities.Ingester.processDocuments onLoad: " + e, 2);
exception(e);
}
doLoad();
}
};
var init = function() {
hiddenBrowser.addEventListener("load", onLoad, true);
if (firstDoc) {
processor(firstDoc, doLoad);
} else {
doLoad();
}
}
init();
} catch (e) {
Scholar.debug("processDocuments: " + e);
exception(e);
} }
var urlIndex = -1;
var removeListeners = function() {
hiddenBrowser.removeEventListener("load", onLoad, true);
if(!saveBrowser) {
Scholar.Browser.deleteHiddenBrowser(hiddenBrowser);
}
}
var doLoad = function() {
urlIndex++;
if (urlIndex < urls.length) {
url = urls[urlIndex];
try {
Scholar.debug("loading "+url);
hiddenBrowser.loadURI(url);
} catch (e) {
removeListeners();
if(exception) {
exception(e);
return;
} else {
throw(e);
}
}
} else {
removeListeners();
done();
}
};
var onLoad = function() {
Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded");
if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times
prevUrl = hiddenBrowser.contentDocument.location.href;
try {
processor(hiddenBrowser.contentDocument);
} catch (e) {
removeListeners();
if(exception) {
exception(e);
return;
} else {
throw(e);
}
}
doLoad();
}
};
var init = function() {
hiddenBrowser.addEventListener("load", onLoad, true);
if (firstDoc) {
processor(firstDoc, doLoad);
} else {
doLoad();
}
}
init();
} }

View File

@ -1,7 +1,7 @@
-- 48 -- 49
-- Set the following timestamp to the most recent scraper update date -- Set the following timestamp to the most recent scraper update date
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-11 11:18:00')); REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-15 15:42:00'));
REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-08-11 11:18:00', 4, 'Amazon.com', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/|s/)', REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-08-11 11:18:00', 4, 'Amazon.com', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/|s/)',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
@ -112,7 +112,7 @@ function doWeb(doc, url) {
} }
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) }, Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
function() { Scholar.done(); }, function() {}); function() { Scholar.done(); }, null);
Scholar.wait(); Scholar.wait();
} else { } else {
@ -646,7 +646,7 @@ function doWeb(doc, url) {
} }
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) }, Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
function() { Scholar.done(); }, function() {}); function() { Scholar.done(); }, null);
Scholar.wait(); Scholar.wait();
} else { } else {
@ -763,7 +763,7 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006
newItem.complete(); newItem.complete();
Scholar.done(); Scholar.done();
}, function() {}); }, null);
} else { // Search results page } else { // Search results page
// Require link to match this // Require link to match this
var tagRegexp = new RegExp(); var tagRegexp = new RegExp();
@ -952,7 +952,7 @@ function doWeb(doc, url) {
} }
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) }, Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
function() { Scholar.done() }, function() {}); function() { Scholar.done() }, null);
Scholar.wait(); Scholar.wait();
} }
@ -1127,7 +1127,7 @@ function doWeb(doc, url) {
} }
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) }, Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
function() { Scholar.done(); }, function() {}); function() { Scholar.done(); }, null);
Scholar.wait(); Scholar.wait();
} else { } else {
@ -1136,7 +1136,7 @@ function doWeb(doc, url) {
if(m && (m[1] == "1" || m[1] == "2")) { if(m && (m[1] == "1" || m[1] == "2")) {
scrape(doc); scrape(doc);
} else if(m) { } else if(m) {
Scholar.Utilities.loadDocument(doc.location.href.replace("Fmt="+m[1], "Fmt=1"), function(doc) { scrape(doc); Scholar.done(); }, function() {}); Scholar.Utilities.loadDocument(doc.location.href.replace("Fmt="+m[1], "Fmt=1"), function(doc) { scrape(doc); Scholar.done(); }, null);
Scholar.wait(); Scholar.wait();
} }
} }
@ -1366,7 +1366,7 @@ function doWeb(doc, url) {
} }
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) }, Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
function() { Scholar.done(); }, function() {}); function() { Scholar.done(); }, null);
Scholar.wait(); Scholar.wait();
} }
@ -1457,7 +1457,7 @@ REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006
newItem.source = uri; newItem.source = uri;
record.translate(newItem); record.translate(newItem);
newItem.complete(); newItem.complete();
}, function() { Scholar.done(); }, function() {}); }, function() { Scholar.done(); }, null);
Scholar.wait(); Scholar.wait();
}'); }');
@ -1544,7 +1544,7 @@ REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006
newItem.source = uri; newItem.source = uri;
record.translate(newItem); record.translate(newItem);
newItem.complete(); newItem.complete();
}, function() { Scholar.done() }, function() {}); }, function() { Scholar.done() }, null);
Scholar.wait(); Scholar.wait();
}'); }');
@ -1647,7 +1647,7 @@ REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006
newItem.source = uri; newItem.source = uri;
record.translate(newItem); record.translate(newItem);
newItem.complete(); newItem.complete();
}, function(){ Scholar.done(); }, function() {}); }, function(){ Scholar.done(); }, null);
Scholar.wait(); Scholar.wait();
}'); }');
@ -1721,8 +1721,7 @@ REPLACE INTO "translators" VALUES ('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006
Scholar.wait(); Scholar.wait();
}'); }');
REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 4, 'GEAC', 'Simon Kornblith', '/(?:GeacQUERY|GeacFETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))',
REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 4, 'GEAC', 'Simon Kornblith', '/(?:GeacQUERY|(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
if(doc.location.href.indexOf("/GeacQUERY") > 0) { if(doc.location.href.indexOf("/GeacQUERY") > 0) {
return "multiple"; return "multiple";
@ -1804,7 +1803,7 @@ REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006
newItem.source = uri; newItem.source = uri;
record.translate(newItem); record.translate(newItem);
newItem.complete(); newItem.complete();
}, function() { Scholar.done(); }, function() {}); }, function() { Scholar.done(); }, null);
Scholar.wait(); Scholar.wait();
}'); }');
@ -2037,7 +2036,7 @@ REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006
newItem.source = uri; newItem.source = uri;
record.translate(newItem); record.translate(newItem);
newItem.complete(); newItem.complete();
}, function() {Scholar.done(); }, function() {}); }, function() {Scholar.done(); }, null);
Scholar.wait(); Scholar.wait();
}'); }');
@ -2568,7 +2567,79 @@ REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006
} }
} }
newItem.complete(); newItem.complete();
}, function() { Scholar.done(); }, function() {}); }, function() { Scholar.done(); }, null);
Scholar.wait();
}');
REPLACE INTO "translators" VALUES ('9c335444-a562-4f88-b291-607e8f46a9bb', '2006-08-15 15:42:00', 4, 'Berkeley Library', 'Simon Kornblith', '^http://[^/]*berkeley.edu[^/]*/WebZ/(?:html/results.html|FETCH)\?.*sessionid=',
'function detectWeb(doc, url) {
var resultsRegexp = /\/WebZ\/html\/results.html/i
if(resultsRegexp.test(url)) {
return "multiple";
} else {
return "book";
}
}',
'function reformURL(url) {
return url.replace(/fmtclass=[^&]*/, "")+":fmtclass=marc";
}
function doWeb(doc, url) {
var resultsRegexp = /\/WebZ\/html\/results.html/i
if(resultsRegexp.test(url)) {
var items = Scholar.Utilities.getItemArray(doc, doc, "/WebZ/FETCH", "^[0-9]*$");
items = Scholar.selectItems(items);
if(!items) {
return true;
}
var urls = new Array();
for(var i in items) {
urls.push(reformURL(i));
}
} else {
var urls = [reformURL(url)];
}
var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973");
Scholar.Utilities.processDocuments(urls, function(newDoc) {
Scholar.Utilities.debug(newDoc.getElementsByTagName("body")[0].innerHTML);
var uri = newDoc.location.href;
var namespace = newDoc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null;
} : null;
var elmts = newDoc.evaluate(''//table/tbody/tr[@valign="top"]'',
newDoc, nsResolver, XPathResult.ANY_TYPE, null);
var record = new marc.MARC_Record();
while(elmt = elmts.iterateNext()) {
var field = Scholar.Utilities.superCleanString(doc.evaluate(''./TD[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue);
var value = doc.evaluate(''./TD[2]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
var ind1 = value[4];
var ind2 = value[6];
value = Scholar.Utilities.cleanString(value.substr(6)).
replace(/\$([a-z0-9]) /g, record.subfield_delimiter+"$1");
if(value[0] != record.subfield_delimiter) {
value = record.subfield_delimiter+"a"+value;
}
if(field != 0) {
record.add_field(field, ind1, ind2, value);
}
}
var newItem = new Scholar.Item();
newItem.source = uri;
record.translate(newItem);
newItem.complete();
}, function() { Scholar.done(); }, null);
Scholar.wait(); Scholar.wait();
}'); }');
@ -2644,9 +2715,7 @@ function doSearch(item) {
Scholar.done(false); Scholar.done(false);
}); });
} }
}, function() { }, null);
error();
});
Scholar.wait(); Scholar.wait();
}'); }');
@ -4604,7 +4673,16 @@ MARC_Record.prototype.get_field_subfields = function(tag) { // returns a two-dim
} }
MARC_Record.prototype.add_field = function(tag,ind1,ind2,value) { // adds a field to the record MARC_Record.prototype.add_field = function(tag,ind1,ind2,value) { // adds a field to the record
if (tag.length != 3) { return false; } /*if(tag.length != 3) {
return false;
}*/
if (tag.length < 3) {
tag = Scholar.Utilities.lpad(tag.toString(),"0",3);
} else if(tag.length > 3) {
return false;
}
var F = new this.MARC_field(this,tag,ind1,ind2,value); var F = new this.MARC_field(this,tag,ind1,ind2,value);
// adds pointer to list of fields // adds pointer to list of fields
this.variable_fields[this.variable_fields.length] = F; this.variable_fields[this.variable_fields.length] = F;
@ -4666,9 +4744,11 @@ MARC_Record.prototype._clean = function(value) {
} }
MARC_Record.prototype._associateDBField = function(item, fieldNo, part, fieldName, execMe, arg1, arg2) { MARC_Record.prototype._associateDBField = function(item, fieldNo, part, fieldName, execMe, arg1, arg2) {
if(!part) { if(!part) {
part = ''a''; part = ''a'';
} }
var field = this.get_field_subfields(fieldNo); var field = this.get_field_subfields(fieldNo);
Scholar.Utilities.debug(''Found ''+field.length+'' matches for ''+fieldNo+part); Scholar.Utilities.debug(''Found ''+field.length+'' matches for ''+fieldNo+part);
if(field) { if(field) {
@ -4685,6 +4765,7 @@ MARC_Record.prototype._associateDBField = function(item, fieldNo, part, fieldNam
} }
} }
if(value) { if(value) {
this._gotField = true;
value = this._clean(value); value = this._clean(value);
if(execMe) { if(execMe) {
@ -4807,6 +4888,10 @@ MARC_Record.prototype.translate = function(item) {
// Set type // Set type
item.itemType = "book"; item.itemType = "book";
if(!this._gotField) {
throw("tried to create a marc record with no fields!");
}
} }
MARC_Record.prototype._trim = function(s) { // eliminates blanks from both sides MARC_Record.prototype._trim = function(s) { // eliminates blanks from both sides