- Made ingester automatically create hidden browser objects, given a window object. This should make things much easier for both David and me.

- Multiple item detection code is now a part of the scraperJavaScript, rather than the scrapeDetectCode, and code to choose which items to add is part of Scholar.Ingester.Utilities, accessible from inside scrapers. The alternative approach would result in one request (or, in the case of JSTOR, three requests) per new item, while in some cases (e.g. Voyager) only one request is necessary to get all of the items.
This commit is contained in:
Simon Kornblith 2006-06-22 15:50:46 +00:00
parent 726364d091
commit 3890e5f122
5 changed files with 109 additions and 95 deletions

View File

@ -35,7 +35,6 @@ Scholar_Ingester_Interface.init = function() {
*/ */
Scholar_Ingester_Interface.chromeLoad = function() { Scholar_Ingester_Interface.chromeLoad = function() {
Scholar_Ingester_Interface.tabBrowser = document.getElementById("content"); Scholar_Ingester_Interface.tabBrowser = document.getElementById("content");
Scholar_Ingester_Interface.hiddenBrowser = document.getElementById("scholar-hidden-browser");
Scholar_Ingester_Interface.appContent = document.getElementById("appcontent"); Scholar_Ingester_Interface.appContent = document.getElementById("appcontent");
Scholar_Ingester_Interface.statusImage = document.getElementById("scholar-status-image"); Scholar_Ingester_Interface.statusImage = document.getElementById("scholar-status-image");
@ -61,21 +60,11 @@ Scholar_Ingester_Interface.chromeUnload = function() {
Scholar_Ingester_Interface.scrapeThisPage = function() { Scholar_Ingester_Interface.scrapeThisPage = function() {
var documentObject = Scholar_Ingester_Interface._getDocument(Scholar_Ingester_Interface.tabBrowser.selectedBrowser); var documentObject = Scholar_Ingester_Interface._getDocument(Scholar_Ingester_Interface.tabBrowser.selectedBrowser);
if(documentObject.scraper) { if(documentObject.scraper) {
if(documentObject.scrapeURLList) {
// In the case that there are multiple scrapable URLs, make the user choose
Scholar_Ingester_Interface.chooseURL(documentObject);
}
Scholar_Ingester_Interface.scrapeProgress = new Scholar_Ingester_Interface.Progress(window, Scholar_Ingester_Interface.tabBrowser.selectedBrowser.contentDocument, Scholar.getString("ingester.scraping")); Scholar_Ingester_Interface.scrapeProgress = new Scholar_Ingester_Interface.Progress(window, Scholar_Ingester_Interface.tabBrowser.selectedBrowser.contentDocument, Scholar.getString("ingester.scraping"));
documentObject.scrapePage(Scholar_Ingester_Interface._finishScraping); documentObject.scrapePage(Scholar_Ingester_Interface._finishScraping);
} }
} }
Scholar_Ingester_Interface.chooseURL = function(documentObject) {
Scholar.debug("chooseURL called");
var newDialog = window.openDialog("chrome://scholar/content/ingester/selectitems.xul",
"_blank","chrome,modal,centerscreen,resizable=yes", documentObject);
}
/* /*
* Updates the status of the capture icon to reflect the scrapability or lack * Updates the status of the capture icon to reflect the scrapability or lack
* thereof of the current page * thereof of the current page
@ -182,7 +171,7 @@ Scholar_Ingester_Interface._setDocument = function(browser) {
browser.setAttribute("scholar-key", key); browser.setAttribute("scholar-key", key);
} }
} }
Scholar_Ingester_Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser, Scholar_Ingester_Interface.hiddenBrowser); Scholar_Ingester_Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser, window);
Scholar_Ingester_Interface.browserDocuments[key].retrieveScraper(); Scholar_Ingester_Interface.browserDocuments[key].retrieveScraper();
} }
@ -203,7 +192,7 @@ Scholar_Ingester_Interface._deleteDocument = function(browser) {
/* /*
* Callback to be executed when scraping is complete * Callback to be executed when scraping is complete
*/ */
Scholar_Ingester_Interface._finishScraping = function(obj) { Scholar_Ingester_Interface._finishScraping = function(obj, returnValue) {
if(obj.items.length) { if(obj.items.length) {
try { // Encased in a try block to fix a as-of-yet unresolved issue try { // Encased in a try block to fix a as-of-yet unresolved issue
var item1 = obj.items[0]; var item1 = obj.items[0];
@ -243,12 +232,14 @@ Scholar_Ingester_Interface._finishScraping = function(obj) {
for(i in obj.items) { for(i in obj.items) {
obj.items[i].save(); obj.items[i].save();
} }
setTimeout(function() { Scholar_Ingester_Interface.scrapeProgress.fade() }, 2000);
} else if(returnValue) {
Scholar_Ingester_Interface.scrapeProgress.kill();
} else { } else {
Scholar_Ingester_Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeError")); Scholar_Ingester_Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeError"));
Scholar_Ingester_Interface.scrapeProgress.addDescription(Scholar.getString("ingester.scrapeErrorDescription")); Scholar_Ingester_Interface.scrapeProgress.addDescription(Scholar.getString("ingester.scrapeErrorDescription"));
}
setTimeout(function() { Scholar_Ingester_Interface.scrapeProgress.fade() }, 2000); setTimeout(function() { Scholar_Ingester_Interface.scrapeProgress.fade() }, 2000);
}
} }
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
@ -333,7 +324,6 @@ Scholar_Ingester_Interface.Progress.prototype.addDescription = function(descript
this.table.appendChild(tr); this.table.appendChild(tr);
} }
Scholar_Ingester_Interface.Progress.prototype.fade = function() { Scholar_Ingester_Interface.Progress.prototype.fade = function() {
// Icky, icky hack to keep objects // Icky, icky hack to keep objects
var me = this; var me = this;
@ -349,3 +339,8 @@ Scholar_Ingester_Interface.Progress.prototype.fade = function() {
// Begin fade // Begin fade
this._fader(); this._fader();
} }
/*
 * Hides the progress box immediately by setting its display style to none
 */
Scholar_Ingester_Interface.Progress.prototype.kill = function() {
	var boxStyle = this.div.style;
	boxStyle.display = 'none';
}

View File

@ -12,10 +12,4 @@
<hbox id="urlbar-icons"> <hbox id="urlbar-icons">
<image src="chrome://scholar/skin/treeitem-book.png" id="scholar-status-image" onclick="Scholar_Ingester_Interface.scrapeThisPage()" position="1" hidden="true"/> <image src="chrome://scholar/skin/treeitem-book.png" id="scholar-status-image" onclick="Scholar_Ingester_Interface.scrapeThisPage()" position="1" hidden="true"/>
</hbox> </hbox>
<window id="main-window">
<box style="visibility: collapse">
<browser id="scholar-hidden-browser" />
</box>
</window>
</overlay> </overlay>

View File

@ -19,26 +19,26 @@ Scholar_Ingester_Interface_SelectItems = function() {}
* loading * loading
*/ */
Scholar_Ingester_Interface_SelectItems.init = function() { Scholar_Ingester_Interface_SelectItems.init = function() {
this.documentObject = window.arguments[0]; this.io = window.arguments[0];
this.Scholar_Ingester_Interface = window.arguments[1];
this.listbox = document.getElementById("scholar-selectitems-links"); this.listbox = document.getElementById("scholar-selectitems-links");
for(i in this.documentObject.scrapeURLList) { // we could use a tree for this if we wanted to for(i in this.io.dataIn) { // we could use a tree for this if we wanted to
var itemNode = document.createElement("listitem"); var itemNode = document.createElement("listitem");
itemNode.setAttribute("type", "checkbox"); itemNode.setAttribute("type", "checkbox");
itemNode.setAttribute("value", i); itemNode.setAttribute("value", i);
itemNode.setAttribute("label", this.documentObject.scrapeURLList[i]); itemNode.setAttribute("label", this.io.dataIn[i]);
itemNode.setAttribute("checked", false); itemNode.setAttribute("checked", false);
this.listbox.appendChild(itemNode); this.listbox.appendChild(itemNode);
} }
} }
Scholar_Ingester_Interface_SelectItems.acceptSelection = function() { Scholar_Ingester_Interface_SelectItems.acceptSelection = function() {
// clear scrapeURLList this.io.dataOut = new Object();
this.documentObject.scrapeURLList = new Object();
// collect scrapeURLList from listbox // collect scrapeURLList from listbox
for(var i=0; i<this.listbox.length; i++) { for(var i=0; i<this.listbox.length; i++) {
var itemNode = this.listbox[i]; var itemNode = this.listbox[i];
this.documentObject.scrapeURLList[itemNode.getAttribute("value")] = itemNode.getAttribute("label"); this.io.dataOut[itemNode.getAttribute("value")] = itemNode.getAttribute("label");
} }
} }

View File

@ -4,6 +4,21 @@
Scholar.Ingester = new function() {} Scholar.Ingester = new function() {}
/*
 * Creates a new <browser> element in the given window's document and appends
 * it to the first <window> element found in that document
 *
 * myWindow - window whose document the browser is created in
 *
 * Returns the newly created browser element
 */
Scholar.Ingester.createHiddenBrowser = function(myWindow) {
	var chromeDocument = myWindow.document;
	var hiddenBrowser = chromeDocument.createElement("browser");
	
	// Attach the browser to the first <window> element of the document
	var rootWindow = chromeDocument.getElementsByTagName("window")[0];
	rootWindow.appendChild(hiddenBrowser);
	
	Scholar.debug("created hidden browser");
	return hiddenBrowser;
}
/*
 * Disposes of a hidden browser by detaching it from the document it was
 * appended to
 *
 * myBrowser - the browser element to remove; a missing or already detached
 *             browser is ignored
 */
Scholar.Ingester.deleteHiddenBrowser = function(myBrowser) {
	// Applying the delete operator to a function parameter does nothing (and
	// is a syntax error in strict code), so the element has to be removed
	// from its parent explicitly; otherwise every browser ever created stays
	// attached to the window
	if(myBrowser && myBrowser.parentNode) {
		myBrowser.parentNode.removeChild(myBrowser);
	}
	Scholar.debug("deleted hidden browser");
}
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
// //
// Scholar.Ingester.Model // Scholar.Ingester.Model
@ -48,8 +63,8 @@ Scholar.Ingester.Model.prototype.detachRepository = function() {}
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
// Scholar.Ingester.Utilities class, a set of methods to assist in data // Scholar.Ingester.Utilities class, a set of methods to assist in data
// extraction. Most code here was stolen directly from the Piggy Bank project. // extraction. Most code here was stolen directly from the Piggy Bank project.
Scholar.Ingester.Utilities = function(hiddenBrowser) { Scholar.Ingester.Utilities = function(myWindow) {
this._hiddenBrowser = hiddenBrowser; this.window = myWindow;
} }
// Adapter for Piggy Bank function to print debug messages; log level is // Adapter for Piggy Bank function to print debug messages; log level is
@ -115,7 +130,7 @@ Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succe
// exception - a function to execute if an exception occurs (exceptions are // exception - a function to execute if an exception occurs (exceptions are
// also logged in the Firefox Scholar log) // also logged in the Firefox Scholar log)
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) { Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
var hiddenBrowser = this._hiddenBrowser; var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window);
Scholar.debug("processDocuments called"); Scholar.debug("processDocuments called");
try { try {
@ -141,26 +156,23 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
exception(e); exception(e);
} }
} else { } else {
Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser);
hiddenBrowser.setTimeout(done, 10); hiddenBrowser.setTimeout(done, 10);
} }
}; };
var onLoad = function() { var onLoad = function() {
Scholar.debug("onLoad called"); Scholar.debug("onLoad called");
if(hiddenBrowser.id == "scholar-hidden-browser") {
hiddenBrowser.removeEventListener("load", onLoad, true); hiddenBrowser.removeEventListener("load", onLoad, true);
try { try {
var newHiddenBrowser = new Object(); var newHiddenBrowser = new Object();
Scholar.debug("new hidden browser");
newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument; newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow; newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
Scholar.debug("added attributes");
processor(newHiddenBrowser); processor(newHiddenBrowser);
Scholar.debug("called processor");
} catch (e) { } catch (e) {
Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2); Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
exception(e); exception(e);
} }
} doLoad();
}; };
var init = function() { var init = function() {
Scholar.debug("init called"); Scholar.debug("init called");
@ -302,6 +314,50 @@ Scholar.Ingester.Utilities.prototype.cleanTags = function(x) {
return x.replace(/<[^>]+>/g, ""); return x.replace(/<[^>]+>/g, "");
} }
/*
 * Opens the modal item selection dialog so that the user can choose which
 * of the supplied items to scrape
 *
 * itemList - handed to the dialog as io.dataIn
 *
 * Returns io.dataOut as left by the dialog, or null if the dialog never set it
 */
Scholar.Ingester.Utilities.prototype.selectItems = function(itemList) {
	// The dialog is handed this object as its first argument and reports the
	// selection back through dataOut
	var io = new Object();
	io.dataIn = itemList;
	io.dataOut = null;
	
	this.window.openDialog("chrome://scholar/content/ingester/selectitems.xul",
		"_blank","chrome,modal,centerscreen,resizable=yes", io);
	
	return io.dataOut;
}
/*
 * Grabs items based on URLs
 *
 * doc - document passed through to getNodeString
 * inHere - node whose <a> descendants are examined
 * urlRe - regular expression a link's href must match
 * rejectRe - optional regular expression; links whose cleaned text matches
 *            it are skipped. When omitted or empty, no text is rejected
 *
 * Returns an object mapping each matching href to its link text. Text from
 * several links that share an href is joined with spaces.
 */
Scholar.Ingester.Utilities.prototype.getItemArray = function(doc, inHere, urlRe, rejectRe) {
	var availableItems = new Object();	// Technically, associative arrays are objects
	
	// Require link to match this
	var tagRegexp = new RegExp(urlRe);
	
	// Do not allow text to match this. An empty pattern matches every string,
	// so a missing rejectRe must mean "no filter" rather than "reject all".
	var rejectRegexp = rejectRe ? new RegExp(rejectRe) : null;
	
	var links = inHere.getElementsByTagName("a");
	for(var i=0; i<links.length; i++) {
		var href = links[i].href;
		if(!tagRegexp.test(href)) {
			continue;
		}
		
		var text = this.getNodeString(doc, links[i], './/text()', null);
		if(!text) {
			continue;
		}
		
		text = this.cleanString(text);
		if(rejectRegexp && rejectRegexp.test(text)) {
			continue;
		}
		
		if(availableItems[href]) {
			availableItems[href] += " "+text;
		} else {
			availableItems[href] = text;
		}
	}
	
	return availableItems;
}
// These functions are for use by importMARCRecord. They're private, because, // These functions are for use by importMARCRecord. They're private, because,
// while they are useful, it's also nice if as many of our scrapers as possible // while they are useful, it's also nice if as many of our scrapers as possible
// are PiggyBank compatible, and if our scrapers used functions, that would // are PiggyBank compatible, and if our scrapers used functions, that would
@ -512,14 +568,14 @@ Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatu
/* /*
* Constructor for Document object * Constructor for Document object
*/ */
Scholar.Ingester.Document = function(browserWindow, hiddenBrowser){ Scholar.Ingester.Document = function(browserWindow, myWindow){
this.scraper = null; this.scraper = null;
this.browser = browserWindow; this.browser = browserWindow;
this.window = myWindow;
this.model = new Scholar.Ingester.Model(); this.model = new Scholar.Ingester.Model();
this.items = new Array(); this.items = new Array();
this._appSvc = Cc["@mozilla.org/appshell/appShellService;1"] this._appSvc = Cc["@mozilla.org/appshell/appShellService;1"]
.getService(Ci.nsIAppShellService); .getService(Ci.nsIAppShellService);
this._hiddenBrowser = hiddenBrowser;
this._generateSandbox(); this._generateSandbox();
} }
@ -596,17 +652,19 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
var scraperSandbox = this._sandbox; var scraperSandbox = this._sandbox;
try { try {
Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox); var returnValue = Components.utils.evalInSandbox("(function(){\n" +
this.scraper.scraperJavaScript +
"\n})()", scraperSandbox);
} catch(e) { } catch(e) {
Scholar.debug(e+' in scraperJavaScript for '+this.scraper.label); Scholar.debug(e+' in scraperJavaScript for '+this.scraper.label);
this._scrapePageComplete(); this._scrapePageComplete(false);
return; return;
} }
// If synchronous, call _scrapePageComplete(); // If synchronous, call _scrapePageComplete();
if(!this._waitForCompletion) { if(!this._waitForCompletion) {
Scholar.debug("is asynch"); Scholar.debug("is asynch");
this._scrapePageComplete(); this._scrapePageComplete(returnValue);
} }
} }
@ -637,10 +695,10 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
/* /*
* Called when scraping (synchronous or asynchronous) is complete * Called when scraping (synchronous or asynchronous) is complete
*/ */
Scholar.Ingester.Document.prototype._scrapePageComplete = function() { Scholar.Ingester.Document.prototype._scrapePageComplete = function(returnValue) {
this._updateDatabase(); this._updateDatabase();
if(this._scrapeCallback) { if(this._scrapeCallback) {
this._scrapeCallback(this); this._scrapeCallback(this, returnValue);
} }
} }
@ -651,7 +709,7 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() {
this._sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href); this._sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
this._sandbox.browser = this.browser; this._sandbox.browser = this.browser;
this._sandbox.doc = this._sandbox.browser.contentDocument; this._sandbox.doc = this._sandbox.browser.contentDocument;
this._sandbox.utilities = new Scholar.Ingester.Utilities(this._hiddenBrowser); this._sandbox.utilities = new Scholar.Ingester.Utilities(this.window);
this._sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this._appSvc.hiddenDOMWindow); this._sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this._appSvc.hiddenDOMWindow);
this._sandbox.window = this.window; this._sandbox.window = this.window;
this._sandbox.model = this.model; this._sandbox.model = this.model;

View File

@ -175,48 +175,7 @@ utilities.HTTPUtilities.doPost(newUri, ''exportselect=record&exporttype=plaintex
wait();'); wait();');
REPLACE INTO "scrapers" VALUES('88915634-1af6-c134-0171-56fd198235ed', '2006-06-21 22:44:00', 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi', REPLACE INTO "scrapers" VALUES('88915634-1af6-c134-0171-56fd198235ed', '2006-06-21 22:44:00', 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi',
'if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) { 'var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options;
// We have search results
var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null;
} : null;
var availableItems = new Object(); // Technically, associative arrays are objects
// Require link to match this
var tagRegexp = new RegExp();
tagRegexp.compile(''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti='');
// Do not allow text to match this
var rejectRegexp = new RegExp();
rejectRegexp.compile(''\[ [0-9]+ \]'');
var links = doc.getElementsByTagName("a");
for(var i=0; i<links.length; i++) {
if(tagRegexp.test(links[i].href)) {
var text = utilities.getNodeString(doc, links[i], ''.//text()'', nsResolver);
if(text) {
text = utilities.cleanString(text);
if(!rejectRegexp.test(text)) {
if(availableItems[links[i].href]) {
availableItems[links[i].href] += " "+text;
} else {
availableItems[links[i].href] = text;
}
}
}
}
}
if(availableItems) {
return availableItems;
} else {
return false;
}
}
var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options;
for(i in export_options) { for(i in export_options) {
if(export_options[i].text == ''Latin1 MARC'' if(export_options[i].text == ''Latin1 MARC''
|| export_options[i].text == ''Raw MARC'' || export_options[i].text == ''Raw MARC''
@ -233,6 +192,14 @@ var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) {
var items = utilities.getItemArray(doc, doc, ''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti='', ''\[ [0-9]+ \]'');
var items = utilities.selectItems(items);
if(!items) {
return true;
}
}
var uri = doc.location.href; var uri = doc.location.href;
var raw, unicode, latin1; var raw, unicode, latin1;