diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js
new file mode 100644
index 000000000..a88414d51
--- /dev/null
+++ b/chrome/chromeFiles/content/scholar/ingester/browser.js
@@ -0,0 +1,140 @@
+// Firefox Scholar Ingester Browser Functions
+// Utilities based on code taken from Greasemonkey
+// This code is licensed according to the GPL
+
+// Prepare the browser and collector instrumentation caches --------------------
+Scholar.Ingester.Interface = function() {}
+
+/*
+ * Creates the (initially empty) browser list and the key -> Document map,
+ * then registers chromeLoad/chromeUnload for the window's load/unload events
+ */
+Scholar.Ingester.Interface.init = function() {
+	Scholar.Ingester.Interface.browsers = new Array();
+	Scholar.Ingester.Interface.browserDocuments = new Object();
+	
+	window.addEventListener("load", Scholar.Ingester.Interface.chromeLoad, false);
+	window.addEventListener("unload", Scholar.Ingester.Interface.chromeUnload, false);
+}
+
+/*
+ * Window "load" handler: caches the elements with ids "content", "appcontent"
+ * and "scholar-status-image", then hooks up the location and content-load
+ * notifications
+ */
+Scholar.Ingester.Interface.chromeLoad = function() {
+	Scholar.Ingester.Interface.tabBrowser = document.getElementById("content");
+	Scholar.Ingester.Interface.appContent = document.getElementById("appcontent");
+	Scholar.Ingester.Interface.statusImage = document.getElementById("scholar-status-image");
+	
+	// this gives us onLocationChange
+	Scholar.Ingester.Interface.tabBrowser.addProgressListener(Scholar.Ingester.Interface.Listener,
+		Components.interfaces.nsIWebProgress.NOTIFY_LOCATION);
+	// this gives us DOMContentLoaded
+	Scholar.Ingester.Interface.appContent.addEventListener("DOMContentLoaded",
+		Scholar.Ingester.Interface.contentLoad, true);
+}
+
+/*
+ * Window "unload" handler: removes the progress listener added by chromeLoad
+ */
+Scholar.Ingester.Interface.chromeUnload = function() {
+	// This function is handed to addEventListener directly, so "this" is not
+	// Scholar.Ingester.Interface here; use the full path, and remove the same
+	// Listener object that chromeLoad registered
+	Scholar.Ingester.Interface.tabBrowser.removeProgressListener(Scholar.Ingester.Interface.Listener);
+}
+
+/*
+ * Returns the Scholar.Ingester.Document stored for a browser, looked up by the
+ * browser's "scholar-key" attribute, or false if none is stored
+ *
+ * (The original wrapped this in try { } finally {}, which catches nothing; an
+ * exception from getAttribute still propagates to the caller, as before)
+ */
+Scholar.Ingester.Interface.getDocument = function(browser) {
+	var key = browser.getAttribute("scholar-key");
+	if(Scholar.Ingester.Interface.browserDocuments[key]) {
+		return Scholar.Ingester.Interface.browserDocuments[key];
+	}
+	return false;
+}
+
+/*
+ * Creates a new Scholar.Ingester.Document for a browser, stores it under the
+ * browser's "scholar-key" attribute (assigning a key first if the browser has
+ * none) and asks the new document to retrieve its scraper
+ */
+Scholar.Ingester.Interface.setDocument = function(browser) {
+	var key = browser.getAttribute("scholar-key");
+	if(!key) {
+		// browser has not been tagged yet; use the current time as its key
+		key = (new Date()).getTime();
+		browser.setAttribute("scholar-key", key);
+	}
+	Scholar.Ingester.Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser);
+	Scholar.Ingester.Interface.browserDocuments[key].retrieveScraper();
+}
+
+/*
+ * Removes the Document stored for a browser; returns true if one was removed
+ * and false if none was stored
+ */
+Scholar.Ingester.Interface.deleteDocument = function(browser) {
+	var key = browser.getAttribute("scholar-key");
+	if(Scholar.Ingester.Interface.browserDocuments[key]) {
+		delete Scholar.Ingester.Interface.browserDocuments[key];
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Scrapes the page in the selected browser, if a Document with a scraper is
+ * stored for it
+ */
+Scholar.Ingester.Interface.scrapeThisPage = function() {
+	// getDocument returns false when nothing is stored for the browser
+	var doc = Scholar.Ingester.Interface.getDocument(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
+	if(doc && doc.scraper) {
+		doc.scrapePage();
+	}
+}
+
+/*
+ * Shows the colored capture icon when the browser's Document has a scraper,
+ * and the gray one otherwise
+ */
+Scholar.Ingester.Interface.updateStatus = function(browser) {
+	var doc = Scholar.Ingester.Interface.getDocument(browser);
+	// statusImage is addressed by its full path, like the rest of this file,
+	// so the function does not depend on how it is invoked
+	if(doc && doc.scraper) {
+		Scholar.Ingester.Interface.statusImage.src = "chrome://scholar/skin/capture_colored.png";
+	} else {
+		Scholar.Ingester.Interface.statusImage.src = "chrome://scholar/skin/capture_gray.png";
+	}
+}
+
+/*
+ * DOMContentLoaded handler: rebuilds the Document for the selected browser
+ * and refreshes the status icon
+ */
+Scholar.Ingester.Interface.contentLoad = function() {
+	// NOTE(review): this always uses the selected browser, not the browser
+	// whose content fired the event - confirm that is intended for pages
+	// that load in a background tab
+	Scholar.Ingester.Interface.setDocument(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
+	Scholar.Ingester.Interface.updateStatus(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
+}
+
+// Progress listener registered by chromeLoad; only onLocationChange does work
+Scholar.Ingester.Interface.Listener = function() {}
+Scholar.Ingester.Interface.Listener.onStatusChange = function() {}
+Scholar.Ingester.Interface.Listener.onSecurityChange = function() {}
+Scholar.Ingester.Interface.Listener.onProgressChange = function() {}
+Scholar.Ingester.Interface.Listener.onStateChange = function() {}
+Scholar.Ingester.Interface.Listener.onLocationChange = function() {
+	var browsers = Scholar.Ingester.Interface.tabBrowser.browsers;
+	
+	// Remove document object of any browser that no longer exists. The list is
+	// walked backwards so that splicing out an entry cannot shift an entry
+	// that has not been checked yet into a slot that was already visited
+	for (var i = Scholar.Ingester.Interface.browsers.length - 1; i >= 0; i--) {
+		var browser = Scholar.Ingester.Interface.browsers[i];
+		var exists = false;
+		
+		for (var j = 0; j < browsers.length; j++) {
+			if (browser == browsers[j]) {
+				exists = true;
+				break;
+			}
+		}
+		
+		if (!exists) {
+			Scholar.Ingester.Interface.browsers.splice(i,1);
+			
+			// To execute if document object does not exist
+			Scholar.Ingester.Interface.deleteDocument(browser);
+		}
+	}
+	
+	/*// Add a collector to any new browser
+	for (var i = 0; i < browsers.length; i++) {
+		var browser = browsers[i];
+		var exists = false;
+		
+		for (var j = 0; j < Scholar.Ingester.Interface.browsers.length; j++) {
+			if (browser == Scholar.Ingester.Interface.browsers[j]) {
+				exists = true;
+				break;
+			}
+		}
+		
+		if (!exists) {
+			Scholar.Ingester.Interface.browsers.splice(i,0,browser);
+			
+			// To execute if window is new
+		}
+	}*/
+	
+	Scholar.Ingester.Interface.updateStatus(
+		Scholar.Ingester.Interface.tabBrowser.selectedBrowser
+	);
+}
\ No newline at end of file
diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.xul b/chrome/chromeFiles/content/scholar/ingester/browser.xul
new file mode 100755
index 000000000..150b3c550
--- /dev/null
+++ b/chrome/chromeFiles/content/scholar/ingester/browser.xul
@@ -0,0 +1,23 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/chrome/chromeFiles/content/scholar/ingester/ingester.js b/chrome/chromeFiles/content/scholar/ingester/ingester.js
new file mode 100644
index 000000000..7295c68c1
--- /dev/null
+++ b/chrome/chromeFiles/content/scholar/ingester/ingester.js
@@ -0,0 +1,473 @@
+// Firefox Scholar Ingester
+// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
+// This code is licensed according to the GPL
+
+Scholar.Ingester = new function() {}
+
+/////////////////////////////////////////////////////////////////
+//
+// Scholar.Ingester.Model
+//
+/////////////////////////////////////////////////////////////////
+
+// Scholar.Ingester.Model, an object representing an RDF data model with
+// methods to add to that model. In Piggy Bank, this was implemented in Java,
+// but we don't want an enormous web server running with FS and don't
+// actually need one, so this version is much simpler.
+//
+// The Java version of this class can be viewed at
+// http://simile.mit.edu/repository/piggy-bank/trunk/src/java/edu/mit/simile/piggyBank/WorkingModel.java
+Scholar.Ingester.Model = function() {
+	// maps uri -> (rdfUri -> literal); filled in by addStatement
+	this.data = new Object();
+}
+
+// Records literal as the value of rdfUri for uri (replacing any value already
+// stored for that pair) and writes the statement to the debug log.
+//
+// Piggy Bank provides a fourth argument, one that determines if the third
+// argument is a literal or an RDF URI. Since our ontologies are
+// sufficiently restricted, we have no chance of confusing a literal and an
+// RDF URI and thus this is unnecessary.
+Scholar.Ingester.Model.prototype.addStatement = function(uri, rdfUri, literal) {
+	if(!this.data[uri]) this.data[uri] = new Object();
+	this.data[uri][rdfUri] = literal;
+	Scholar.debug(rdfUri+" for "+uri+" is "+literal);
+}
+
+// Additional functions added for compatibility purposes only
+// No idea if any scraper actually uses these, but just in case, they're
+// implemented so as not to throw an exception
+Scholar.Ingester.Model.prototype.addTag = function() {}
+Scholar.Ingester.Model.prototype.getRepository = function() {}
+Scholar.Ingester.Model.prototype.detachRepository = function() {}
+
+/////////////////////////////////////////////////////////////////
+//
+// Scholar.Ingester.Utilities
+//
+/////////////////////////////////////////////////////////////////
+// Scholar.Ingester.Utilities class, a set of methods to assist in data
+// extraction. Most code here was stolen directly from the Piggy Bank project.
+Scholar.Ingester.Utilities = function() {}
+
+// Piggy Bank-compatible debug printer; forwards msg to Scholar.debug with the
+// log level fixed at 4 (could change this)
+Scholar.Ingester.Utilities.prototype.debugPrint = function(msg) {
+	Scholar.debug(msg, 4);
+}
+
+// Returns s with leading and trailing spaces, newlines, carriage returns,
+// tabs and non-breaking spaces chopped off
+Scholar.Ingester.Utilities.prototype.trimString = function(s) {
+	var blanks = " \n\r\t" + String.fromCharCode(160) /* non-breaking space */;
+	var isBlank = function(ch) {
+		return blanks.indexOf(ch) >= 0;
+	};
+	
+	// first character that is not a blank
+	var start = 0;
+	var end = s.length;
+	while (start < end && isBlank(s.charAt(start))) {
+		start++;
+	}
+	
+	// one past the last character that is not a blank
+	while (end > start && isBlank(s.charAt(end - 1))) {
+		end--;
+	}
+	
+	return s.substring(start, end);
+}
+
+// Evaluates an XPath query against parentNode and returns the matching nodes
+// as an array, in the order the result iterator yields them
+Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, parentNode, xpath, nsResolver) {
+	var found = [];
+	var results = doc.evaluate(xpath, parentNode, nsResolver, XPathResult.ANY_TYPE,null);
+	
+	for (var node = results.iterateNext(); node; node = results.iterateNext()) {
+		found[found.length] = node;
+	}
+	return found;
+}
+
+// Loads a single document for a scraper by handing processDocuments a
+// one-element URL list: succeeded is passed as the processor, failed as the
+// exception handler, and the "done" callback does nothing
+Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succeeded, failed) {
+	var nothingToDo = function() {};
+	var urls = [ url ];
+	this.processDocuments(browser, null, urls, succeeded, nothingToDo, failed);
+}
+
+// Downloads and processes documents with processor()
+// browser - a browser object
+// firstDoc - the first document to process with the processor (if null,
+//            first document is processed without processor)
+// urls - an array of URLs to load
+// processor - a function to execute to process each document
+// done - a function to execute when all document processing is complete
+// exception - a function to execute if an exception occurs (exceptions are
+//             also logged in the Firefox Scholar log)
+Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
+	// (the browser argument is not used by this implementation; pages are
+	// loaded in the "hidden-browser" element of Scholar.Ingester.progressDialog)
+	try {
+		// nothing to download: process firstDoc (if any) and finish
+		if (urls.length == 0) {
+			if (firstDoc) {
+				processor(firstDoc, done);
+			} else {
+				done();
+			}
+			return;
+		}
+		
+		var urlIndex = -1;
+		// starts loading the next URL, or schedules done() once all are loaded
+		var doLoad = function() {
+			urlIndex++;
+			if (urlIndex < urls.length) {
+				try {
+					var url = urls[urlIndex];
+					var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
+					b.loadURI(url);
+				} catch (e) {
+					exception(e);
+					Scholar.debug("Scholar.Ingester.Utilities.processDocuments doLoad: " + e, 2);
+				}
+			} else {
+				window.setTimeout(done, 10);
+			}
+		};
+		// hands the freshly loaded document to processor, which calls doLoad
+		// to move on to the next URL
+		var onLoad = function() {
+			try {
+				var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser").selectedBrowser;
+				processor(b.contentDocument, doLoad);
+			} catch (e) {
+				exception(e);
+				Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
+			}
+		};
+		var init = function() {
+			// The listener has to be an object before handlers can be attached
+			// to it (the original assigned onStateChange on an undefined
+			// variable). The other handlers are no-ops, mirroring
+			// Scholar.Ingester.Interface.Listener.
+			var listener = new Object();
+			listener.onStatusChange = function() {};
+			listener.onSecurityChange = function() {};
+			listener.onProgressChange = function() {};
+			listener.onLocationChange = function() {};
+			listener.onStateChange = function(webProgress, request, stateFlags, status) {
+				// react only when the request for the current URL has stopped
+				if ((stateFlags & Components.interfaces.nsIWebProgressListener.STATE_STOP) > 0 &&
+					request.name == urls[urlIndex]) {
+					try {
+						Scholar.Ingester.progressDialog.setTimeout(onLoad, 10);
+					} catch (e) {
+						exception(e);
+						Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLocationChange: " + e, 2);
+					}
+				}
+			};
+			
+			var tb = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
+			// onStateChange is delivered for the NOTIFY_STATE_* flags;
+			// NOTIFY_STATUS only asks for onStatusChange
+			tb.addProgressListener(listener, Components.interfaces.nsIWebProgress.NOTIFY_STATE_ALL);
+			
+			if (firstDoc) {
+				processor(firstDoc, doLoad);
+			} else {
+				doLoad();
+			}
+		}
+		
+		// NOTE(review): "w" is not defined anywhere in this function, so this
+		// line throws and control goes to the catch block below. Presumably it
+		// should be the window that hosts "hidden-browser" - confirm and fix.
+		w.addEventListener("load", init, false);
+	} catch (e) {
+		exception(e);
+		// PB_Debug is a Piggy Bank object that does not exist here; log the
+		// same way as the handlers above
+		Scholar.debug("Scholar.Ingester.Utilities.processDocuments: " + e, 2);
+	}
+}
+
+// Collects the href of every <a> element in doc whose href contains
+// substring. Each URL is returned once; because URLs are added to the front
+// of the array, the result is in reverse document order.
+Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, substring) {
+	var urls = [];
+	// hrefs already collected; keyed by string, so an object rather than an array
+	var addedURLs = new Object();
+	
+	var aElements = doc.evaluate("//a", doc, null, XPathResult.ANY_TYPE,null);
+	var aElement = aElements.iterateNext();
+	while (aElement) {
+		var href = aElement.href;
+		if (href.indexOf(substring) >= 0 && !(addedURLs[href])) {
+			urls.unshift(href);
+			addedURLs[href] = true;
+		}
+		aElement = aElements.iterateNext();
+	}
+	return urls;
+}
+
+// For now, we're going to skip the getLLsFromAddresses function (which gets
+// latitude and longitude pairs from a series of addresses, but requires the
+// big mess of Java code that is the Piggy Bank server) and the geoHelper
+// tools (which rely on getLLsFromAddresses) since these are probably not
+// essential components for Scholar and would take a great deal of effort to
+// implement. We can, however, always implement them later.
+
+// It looks like these are simple front-ends for XMLHttpRequest. They're a
+// component of the Piggy Bank API, so they're implemented here.
+//
+// All three request methods send asynchronously, force the response MIME
+// type to text/xml and route readystatechange events to stateChange. The
+// handler is reached through the prototype because that is where stateChange
+// is defined; Scholar.Ingester.Utilities.HTTPUtilities.stateChange itself
+// does not exist.
+Scholar.Ingester.Utilities.HTTPUtilities = function() {}
+
+// Sends a GET request for url
+Scholar.Ingester.Utilities.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) {
+	var xmlhttp = new XMLHttpRequest();
+	
+	xmlhttp.open('GET', url, true);
+	xmlhttp.overrideMimeType("text/xml");
+	xmlhttp.onreadystatechange = function() {
+		Scholar.Ingester.Utilities.HTTPUtilities.prototype.stateChange(xmlhttp, onStatus, onDone);
+	};
+	xmlhttp.send(null);
+}
+
+// Sends a POST request to url with the given body
+Scholar.Ingester.Utilities.HTTPUtilities.prototype.doPost = function(url, body, onStatus, onDone) {
+	var xmlhttp = new XMLHttpRequest();
+	
+	xmlhttp.open('POST', url, true);
+	xmlhttp.overrideMimeType("text/xml");
+	xmlhttp.onreadystatechange = function() {
+		Scholar.Ingester.Utilities.HTTPUtilities.prototype.stateChange(xmlhttp, onStatus, onDone);
+	};
+	xmlhttp.send(body);
+}
+
+// Sends an OPTIONS request to url with the given body
+Scholar.Ingester.Utilities.HTTPUtilities.prototype.doOptions = function(url, body, onStatus, onDone) {
+	var xmlhttp = new XMLHttpRequest();
+	
+	xmlhttp.open('OPTIONS', url, true);
+	xmlhttp.overrideMimeType("text/xml");
+	xmlhttp.onreadystatechange = function() {
+		Scholar.Ingester.Utilities.HTTPUtilities.prototype.stateChange(xmlhttp, onStatus, onDone);
+	};
+	xmlhttp.send(body);
+}
+
+// readystatechange handler shared by doGet, doPost and doOptions
+// xmlhttp - the XMLHttpRequest whose state changed
+// onStatus - optional; called as onStatus(status, statusText, xmlhttp) when
+//            the status seen at readyState 2 is not 200, after which the
+//            request is aborted
+// onDone - optional; called as onDone(responseText, xmlhttp) at readyState 4
+// Exceptions thrown while handling either state are logged, not rethrown
+Scholar.Ingester.Utilities.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatus, onDone) {
+	switch (xmlhttp.readyState) {
+	
+		// Request not yet made
+		case 1:
+		break;
+		
+		// Contact established with server but nothing downloaded yet
+		case 2:
+			try {
+				// Check for HTTP status 200
+				if (xmlhttp.status != 200) {
+					// the request is only aborted when a status callback
+					// was supplied
+					if (onStatus) {
+						onStatus(
+							xmlhttp.status,
+							xmlhttp.statusText,
+							xmlhttp
+						);
+						xmlhttp.abort();
+					}
+				}
+			} catch (e) {
+				Scholar.debug(e, 2);
+			}
+		break;
+		
+		// Called multiple times while the download is in progress
+		case 3:
+		break;
+		
+		// Download complete
+		case 4:
+			try {
+				if (onDone) {
+					onDone(xmlhttp.responseText, xmlhttp);
+				}
+			} catch (e) {
+				Scholar.debug(e, 2);
+			}
+		break;
+	}
+}
+//////////////////////////////////////////////////////////////////////////////
+//
+// Scholar.Ingester.Document
+//
+//////////////////////////////////////////////////////////////////////////////
+
+/* Public properties:
+ * browser - browser window object of document
+ * model - data model for semantic scrapers
+ * scraper - best scraper to use to scrape page
+ *
+ * Private properties:
+ * _sandbox - sandbox for code execution
+ * _progressDialog - dialog showing scrape progress
+ */
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Public Scholar.Ingester.Document methods
+//
+//////////////////////////////////////////////////////////////////////////////
+
+/*
+ * Constructor for Document object
+ *
+ * Stores the browser, starts with no scraper and an empty model, then
+ * generates the sandbox
+ */
+Scholar.Ingester.Document = function(browserWindow){
+	this.browser = browserWindow;
+	this.scraper = null;
+	this.model = new Scholar.Ingester.Model();
+	this._generateSandbox();
+}
+
+/*
+ * Retrieves the best scraper to scrape a given page
+ */
+Scholar.Ingester.Document.prototype.retrieveScraper = function() { + Scholar.debug("Retrieving scrapers for "+this.browser.contentDocument.location.href); + var sql = 'SELECT * FROM scrapers ORDER BY scraperDetectCode IS NULL DESC'; + var scrapers = Scholar.DB.query(sql); + for(var i=0; i + + + + + + + + + + + + + + diff --git a/chrome/chromeFiles/skin/default/scholar/capture_colored.png b/chrome/chromeFiles/skin/default/scholar/capture_colored.png new file mode 100644 index 000000000..fedb86584 Binary files /dev/null and b/chrome/chromeFiles/skin/default/scholar/capture_colored.png differ diff --git a/chrome/chromeFiles/skin/default/scholar/capture_gray.png b/chrome/chromeFiles/skin/default/scholar/capture_gray.png new file mode 100644 index 000000000..a100833ed Binary files /dev/null and b/chrome/chromeFiles/skin/default/scholar/capture_gray.png differ