diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js
index 88a8ff468..ccf7e0e1a 100644
--- a/chrome/chromeFiles/content/scholar/ingester/browser.js
+++ b/chrome/chromeFiles/content/scholar/ingester/browser.js
@@ -165,7 +165,7 @@ Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject)
//////////////////////////////////////////////////////////////////////////////
//
-// Private Scholar.Ingester.Document methods
+// Private Scholar_Ingester_Interface methods
//
//////////////////////////////////////////////////////////////////////////////
diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
index 3090f2b88..cf620ce4a 100644
--- a/chrome/chromeFiles/content/scholar/xpcom/ingester.js
+++ b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
@@ -19,6 +19,48 @@ Scholar.Ingester.deleteHiddenBrowser = function(myBrowser) {
Scholar.debug("deleted hidden browser");
}
+/*
+ * Operates the ingester given only a URL
+ * url - URL to scrape
+ * complete - callback function to be executed if page grab completes
+ * (will be passed document object; obj.items contains array of
+ * *unsaved* items scraped; empty array indicates unscrapable page)
+ * error - callback function to be executed if an error occurs while loading the page
+ * myWindow - optional argument indicating the window to attach dialogs to. If no
+ *            window is given, Firefox Scholar uses the hidden DOM window and
+ *            will simply avoid scraping multiple pages
+ */
+Scholar.Ingester.ingestURL = function(url, complete, error, myWindow) {
+ var isHidden = false;
+ if(!myWindow) {
+ var myWindow = Components.classes["@mozilla.org/appshell/appShellService;1"]
+ .getService(Components.interfaces.nsIAppShellService)
+ .hiddenDOMWindow;
+ var isHidden = true;
+ }
+
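+	// succeeded() runs once the hidden browser has loaded the page: look up a
+	// scraper for the document and, if one matches, scrape it before cleanup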
+ var succeeded = function(browser) {
+ var myDoc = new Scholar.Ingester.Document(browser, myWindow, isHidden);
+ myDoc.retrieveScraper();
+ if(myDoc.scraper) {
+ myDoc.scrapePage(function(myDoc) {
+ Scholar.Ingester.deleteHiddenBrowser(browser);
+ complete(myDoc);
+ });
+ } else {
+ Scholar.Ingester.deleteHiddenBrowser(browser);
+ complete(myDoc);
+ }
+ }
+
+ var failed = function() {
+ Scholar.debug("Scholar.Ingester.ingestURL: could not ingest "+url);
+ error();
+ }
+
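+	// hand the URL to Scholar.Utilities.HTTP.processDocuments; succeeded() or
+	// failed() runs once the hidden browser has finished (or failed) loading it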
+ Scholar.Utilities.HTTP.processDocuments(null, [ url ], succeeded, function() {}, failed, true);
+}
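+
+// Usage sketch (hypothetical caller, illustrative URL): scrape a page in the
+// background and report how many items were found. complete() receives the
+// Document object; its items array holds the unsaved scraped items.
+//
+//   Scholar.Ingester.ingestURL("http://www.example.com/article.html",
+//       function(doc) { Scholar.debug("scraped "+doc.items.length+" item(s)"); },
+//       function() { Scholar.debug("could not load page"); });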
+
/////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.ProxyMonitor
@@ -195,10 +237,11 @@ Scholar.Ingester.Model.prototype.detachRepository = function() {}
/*
* Constructor for Document object
*/
-Scholar.Ingester.Document = function(browserWindow, myWindow){
- this.scraper = this.type = null;
- this.browser = browserWindow;
+Scholar.Ingester.Document = function(myBrowser, myWindow, isHidden) {
+ this.browser = myBrowser;
this.window = myWindow;
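+	// isHidden flags a document loaded in the hidden DOM window, where no
+	// dialogs (e.g. the selectItems window) can be shown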
+ this.isHidden = isHidden;
+ this.scraper = this.type = null;
this.model = new Scholar.Ingester.Model();
// Create separate URL to account for proxies
@@ -349,7 +392,7 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() {
this._sandbox.browser = this.browser;
this._sandbox.doc = this.browser.contentDocument;
this._sandbox.url = this.url;
- this._sandbox.utilities = new Scholar.Utilities.Ingester(this.window, this.proxiedURL);
+ this._sandbox.utilities = new Scholar.Utilities.Ingester(this.window, this.proxiedURL, this.isHidden);
this._sandbox.utilities.HTTPUtilities = new Scholar.Utilities.Ingester.HTTPUtilities(this.proxiedURL);
this._sandbox.window = this.window;
this._sandbox.model = this.model;
diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js
index c5b0b7b06..77d1589ed 100644
--- a/chrome/chromeFiles/content/scholar/xpcom/utilities.js
+++ b/chrome/chromeFiles/content/scholar/xpcom/utilities.js
@@ -133,9 +133,10 @@ Scholar.Utilities.prototype.cleanTags = function(x) {
// Scholar.Utilities.Ingester extends Scholar.Utilities, offering additional
// classes relating to data extraction specifically from HTML documents.
-Scholar.Utilities.Ingester = function(myWindow, proxiedURL) {
+Scholar.Utilities.Ingester = function(myWindow, proxiedURL, isHidden) {
this.window = myWindow;
this.proxiedURL = proxiedURL;
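+	// passed through from the calling Document; lets utilities skip UI when
+	// the ingester is running against the hidden DOM window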
+ this.isHidden = isHidden;
}
Scholar.Utilities.Ingester.prototype = new Scholar.Utilities();
@@ -154,95 +155,6 @@ Scholar.Utilities.Ingester.prototype.gatherElementsOnXPath = function(doc, paren
return elmts;
}
-// Loads a single document for a scraper, running succeeded() on success or
-// failed() on failure
-Scholar.Utilities.Ingester.prototype.loadDocument = function(url, browser, succeeded, failed) {
- Scholar.debug("loadDocument called");
- this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed);
-}
-
-// Downloads and processes documents with processor()
-// browser - a browser object
-// firstDoc - the first document to process with the processor (if null,
-// first document is processed without processor)
-// urls - an array of URLs to load
-// processor - a function to execute to process each document
-// done - a function to execute when all document processing is complete
-// exception - a function to execute if an exception occurs (exceptions are
-// also logged in the Scholar for Firefox log)
-Scholar.Utilities.Ingester.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
- var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window);
- var myWindow = this.window;
- var prevUrl, url;
- Scholar.debug("processDocuments called");
-
- try {
- if (urls.length == 0) {
- if(firstDoc) {
- processor(firstDoc, done);
- } else {
- done();
- }
- return;
- }
-
- var urlIndex = -1;
- var doLoad = function() {
- urlIndex++;
- if (urlIndex < urls.length) {
- url = urls[urlIndex];
- if(this.proxiedURL) {
- url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
- }
- try {
- Scholar.debug("loading "+url);
- hiddenBrowser.loadURI(url);
- } catch (e) {
- Scholar.debug("Scholar.Utilities.Ingester.processDocuments doLoad: " + e, 2);
- exception(e);
- }
- } else {
- hiddenBrowser.removeEventListener("load", onLoad, true);
- Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser);
- done();
- }
- };
- var onLoad = function() {
- Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded");
- if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times
- prevUrl = hiddenBrowser.contentDocument.location.href;
- try {
- var newHiddenBrowser = new Object();
- newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
- newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
- processor(newHiddenBrowser);
- } catch (e) {
- Scholar.debug("Scholar.Utilities.Ingester.processDocuments onLoad: " + e, 2);
- exception(e);
- }
- doLoad();
- }
- };
- var init = function() {
- Scholar.debug("init called");
- hiddenBrowser.addEventListener("load", onLoad, true);
-
- if (firstDoc) {
- Scholar.debug("processing");
- processor(firstDoc, doLoad);
- } else {
- Scholar.debug("doing load");
- doLoad();
- }
- }
-
- init();
- } catch (e) {
- Scholar.debug("processDocuments: " + e);
- exception(e);
- }
-}
-
// Appears to look for links in a document containing a certain substring (kind
// of like getItemArray, only with NO REGEXP FUNCTIONALITY)
Scholar.Utilities.Ingester.prototype.collectURLsWithSubstring = function(doc, substring) {
@@ -296,11 +208,15 @@ Scholar.Utilities.Ingester.prototype.getNodeString = function(doc, contextNode,
* Allows a user to select which items to scrape
*/
Scholar.Utilities.Ingester.prototype.selectItems = function(itemList) {
- // mozillazine made me do it! honest!
- var io = { dataIn:itemList, dataOut:null }
- var newDialog = this.window.openDialog("chrome://scholar/content/ingester/selectitems.xul",
- "_blank","chrome,modal,centerscreen,resizable=yes", io);
- return io.dataOut;
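+	// a modal dialog can only be opened when attached to a real browser window;
+	// when running hidden, skip the prompt and return null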
+ if(this.isHidden != true) {
+ // this is kinda ugly, mozillazine made me do it! honest!
+ var io = { dataIn:itemList, dataOut:null }
+ var newDialog = this.window.openDialog("chrome://scholar/content/ingester/selectitems.xul",
+ "_blank","chrome,modal,centerscreen,resizable=yes", io);
+ return io.dataOut;
+ } else {
+ return null;
+ }
}
/*
@@ -348,13 +264,6 @@ Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe,
return availableItems;
}
-/*
- * Handles OAI-PMH requests
- */
-Scholar.Utilities.Ingester.prototype.importOAIPMH = function(uri, model) {
-
-}
-
// These functions are for use by importMARCRecord. They're private, because,
// while they are useful, it's also nice if as many of our scrapers as possible
// are PiggyBank compatible, and if our scrapers used functions, that would
@@ -465,478 +374,19 @@ Scholar.Utilities.Ingester.prototype.importMARCRecord = function(record, uri, mo
model = model.addStatement(uri, prefixRDF + 'type', prefixDummy + "book", true);
}
-/*
- * END SCHOLAR FOR FIREFOX EXTENSIONS
- */
-
-// These are front ends for XMLHttpRequest. XMLHttpRequest can't actually be
-// accessed outside the sandbox, and even if it could, it wouldn't let scripts
-// access across domains, so everything's replicated here.
-// Scholar for Firefox Utilities
-// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
-// This code is licensed according to the GPL
-
-/////////////////////////////////////////////////////////////////
-//
-// Scholar.Utilities
-//
-/////////////////////////////////////////////////////////////////
-// Scholar.Utilities class, a set of methods to assist in data
-// extraction. Some of the code here was stolen directly from the Piggy Bank
-// project.
-
-Scholar.Utilities = function () {}
-
-// Adapter for Piggy Bank function to print debug messages; log level is
-// fixed at 4 (could change this)
-Scholar.Utilities.prototype.debugPrint = function(msg) {
- Scholar.debug(msg, 4);
-}
-
-// Appears to trim a string, chopping of newlines/spacing
-Scholar.Utilities.prototype.trimString = function(s) {
- var i = 0;
- var spaceChars = " \n\r\t" + String.fromCharCode(160) /* */;
- while (i < s.length) {
- var c = s.charAt(i);
- if (spaceChars.indexOf(c) < 0) {
- break;
- }
- i++;
- }
-
- s = s.substring(i);
-
- i = s.length;
- while (i > 0) {
- var c = s.charAt(i - 1);
- if (spaceChars.indexOf(c) < 0) {
- break;
- }
- i--;
- }
-
- return s.substring(0, i);
-}
-
-/*
- * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS
- * Functions below this point are extensions to the utilities provided by
- * Piggy Bank. When used in external code, the repository will need to add
- * a function definition when exporting in Piggy Bank format.
- */
-
-/*
- * Converts a JavaScript date object to an ISO-style date
- */
-Scholar.Utilities.prototype.dateToISO = function(jsDate) {
- var date = "";
- var year = jsDate.getFullYear().toString();
- var month = (jsDate.getMonth()+1).toString();
- var day = jsDate.getDate().toString();
-
- for(var i = year.length; i<4; i++) {
- date += "0";
- }
- date += year+"-";
-
- if(month.length == 1) {
- date += "0";
- }
- date += month+"-";
-
- if(day.length == 1) {
- date += "0";
- }
- date += day;
-
- return date;
-}
-
-/*
- * Cleans extraneous punctuation off an author name
- */
-Scholar.Utilities.prototype.cleanAuthor = function(author) {
- author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
- author = author.replace(/[\s\,\/\[\]\:\.]+$/, '');
- author = author.replace(/ +/, ' ');
- // Add period for initials
- if(author.substring(author.length-2, author.length-1) == " ") {
- author += ".";
- }
- var splitNames = author.split(', ');
- if(splitNames.length > 1) {
- author = splitNames[1]+' '+splitNames[0];
- }
- return author;
-}
-
-/*
- * Cleans whitespace off a string and replaces multiple spaces with one
- */
-Scholar.Utilities.prototype.cleanString = function(s) {
- s = s.replace(/[ \xA0]+/g, " ");
- return this.trimString(s);
-}
-
-/*
- * Cleans any non-word non-parenthesis characters off the ends of a string
- */
-Scholar.Utilities.prototype.superCleanString = function(x) {
- var x = x.replace(/^[^\w(]+/, "");
- return x.replace(/[^\w)]+$/, "");
-}
-
-/*
 - * Eliminates HTML tags, replacing <br>s with \ns
 - */
-Scholar.Utilities.prototype.cleanTags = function(x) {
-	x = x.replace(/<br[^>]*>/gi, "\n");
- return x.replace(/<[^>]+>/g, "");
-}
-
-// These functions are for use by importMARCRecord. They're private, because,
-// while they are useful, it's also nice if as many of our scrapers as possible
-// are PiggyBank compatible, and if our scrapers used functions, that would
-// break compatibility
-Scholar.Utilities.prototype._MARCCleanString = function(author) {
- author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
- author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
- return author.replace(/ +/, ' ');
-}
-
-Scholar.Utilities.prototype._MARCCleanNumber = function(author) {
- author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
- author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
- var regexp = /^[^ ]*/;
- var m = regexp.exec(author);
- if(m) {
- return m[0];
- }
-}
-Scholar.Utilities.prototype._MARCPullYear = function(text) {
- var pullRe = /[0-9]+/;
- var m = pullRe.exec(text);
- if(m) {
- return m[0];
- }
-}
-
-Scholar.Utilities.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) {
-	if(!part) {
-		part = 'a';
-	}
-	var field = record.get_field_subfields(fieldNo);
-	Scholar.debug('Found '+field.length+' matches for '+fieldNo+part);
-	if(field) {
-		for(i in field) {
-			var value;
-			for(var j=0; j<part.length; j++) {
[...]
-		if (href.indexOf(substring) >= 0 && !(addedURLs[href])) {
- urls.unshift(href);
- addedURLs[href] = true;
- }
- aElement = aElements.iterateNext();
- }
- return urls;
-}
-
-// For now, we're going to skip the getLLsFromAddresses function (which gets
-// latitude and longitude pairs from a series of addresses, but requires the
-// big mess of Java code that is the Piggy Bank server) and the geoHelper
-// tools (which rely on getLLsFromAddresses) since these are probably not
-// essential components for Scholar and would take a great deal of effort to
-// implement. We can, however, always implement them later.
-
-/*
- * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS
- */
-
-/*
- * Gets a given node (assumes only one value)
- */
-Scholar.Utilities.Ingester.prototype.getNode = function(doc, contextNode, xpath, nsResolver) {
- return doc.evaluate(xpath, contextNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null).iterateNext();
-}
-
-/*
- * Gets a given node as a string containing all child nodes
- */
-Scholar.Utilities.Ingester.prototype.getNodeString = function(doc, contextNode, xpath, nsResolver) {
- var elmts = this.gatherElementsOnXPath(doc, contextNode, xpath, nsResolver);
- var returnVar = "";
- for(var i=0; i