// NOTE(review): only the signature and the final lines of cleanTags are
// visible in this view; confirm against the complete file that no statements
// are missing between them.
Scholar.Utilities.prototype.cleanTags = function(x) {
	return x.replace(/<[^>]+>/g, "");
}

/*
 * END SCHOLAR FOR FIREFOX EXTENSIONS
 */

/////////////////////////////////////////////////////////////////
//
// Scholar.Utilities.Ingester
//
/////////////////////////////////////////////////////////////////
// Scholar.Utilities.Ingester extends Scholar.Utilities, offering additional
// classes relating to data extraction specifically from HTML documents.

/*
 * Constructor
 *
 * myWindow   - stored as this.window and handed to
 *              Scholar.Ingester.createHiddenBrowser() by processDocuments()
 * proxiedURL - when truthy, processDocuments() passes every URL through
 *              Scholar.Ingester.ProxyMonitor.properToProxy() before loading
 */
Scholar.Utilities.Ingester = function(myWindow, proxiedURL) {
	this.window = myWindow;
	this.proxiedURL = proxiedURL;
}

Scholar.Utilities.Ingester.prototype = new Scholar.Utilities();

/*
 * Takes an XPath query and returns the results
 *
 * doc        - document whose evaluate() method runs the query
 * parentNode - context node for the query
 * xpath      - the XPath expression
 * nsResolver - namespace resolver passed straight to evaluate() (may be null)
 *
 * Returns an array of the nodes yielded by the result iterator, in iteration
 * order; an empty array if there are none.
 */
Scholar.Utilities.Ingester.prototype.gatherElementsOnXPath = function(doc, parentNode, xpath, nsResolver) {
	var elmts = [];
	
	var iterator = doc.evaluate(xpath, parentNode, nsResolver,
		Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null);
	var elmt;
	while ((elmt = iterator.iterateNext())) {
		elmts.push(elmt);
	}
	return elmts;
}

// Loads a single document for a scraper, running succeeded() on success or
// failed() on failure
//
// This is a thin wrapper around processDocuments(): succeeded is used as the
// processor for the single URL, and the "done" callback is a no-op.
Scholar.Utilities.Ingester.prototype.loadDocument = function(url, browser, succeeded, failed) {
	Scholar.debug("loadDocument called");
	this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed);
}

// Downloads and processes documents with processor()
// browser - a browser object (not used by this implementation)
// firstDoc - the first document to process with the processor (if null,
//            first document is processed without processor)
// urls - an array of URLs to load
// processor - a function to execute to process each document. For firstDoc
//             it is called as processor(firstDoc, continuation) and must call
//             the continuation itself; for each loaded URL it is called with
//             a single object exposing contentDocument and contentWindow
// done - a function to execute when all document processing is complete
// exception - a function to execute if an exception occurs (exceptions are
//             also logged in the Scholar for Firefox log)
Scholar.Utilities.Ingester.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
	// doLoad() below is invoked as a plain function (directly and as a
	// callback), so "this" inside it is not this Ingester. Read the proxy
	// flag here, while "this" is still bound, so proxying actually happens.
	var proxiedURL = this.proxiedURL;
	var ownerWindow = this.window;
	var hiddenBrowser, prevUrl;
	Scholar.debug("processDocuments called");
	
	try {
		if (urls.length == 0) {
			// Nothing to fetch: no hidden browser is created on this path,
			// so there is nothing to clean up afterwards.
			if(firstDoc) {
				processor(firstDoc, done);
			} else {
				done();
			}
			return;
		}
		
		hiddenBrowser = Scholar.Ingester.createHiddenBrowser(ownerWindow);
		
		var urlIndex = -1;
		
		// Advances to the next URL, or tears down the hidden browser and
		// signals completion once every URL has been handled.
		var doLoad = function() {
			urlIndex++;
			if (urlIndex >= urls.length) {
				hiddenBrowser.removeEventListener("load", onLoad, true);
				Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser);
				done();
				return;
			}
			
			var url = urls[urlIndex];
			if(proxiedURL) {
				url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
			}
			try {
				Scholar.debug("loading "+url);
				hiddenBrowser.loadURI(url);
			} catch (e) {
				Scholar.debug("Scholar.Utilities.Ingester.processDocuments doLoad: " + e, 2);
				exception(e);
			}
		};
		
		// "load" listener for the hidden browser: runs the processor on the
		// freshly loaded document, then moves on to the next URL.
		var onLoad = function() {
			var loadedUrl = hiddenBrowser.contentDocument.location.href;
			Scholar.debug(loadedUrl+" has been loaded");
			if(loadedUrl == prevUrl) {
				// Just in case it fires too many times
				return;
			}
			prevUrl = loadedUrl;
			
			try {
				// Hand the processor a plain snapshot object rather than
				// the hidden browser element itself
				var newHiddenBrowser = {
					contentDocument: hiddenBrowser.contentDocument,
					contentWindow: hiddenBrowser.contentWindow
				};
				processor(newHiddenBrowser);
			} catch (e) {
				Scholar.debug("Scholar.Utilities.Ingester.processDocuments onLoad: " + e, 2);
				exception(e);
			}
			doLoad();
		};
		
		Scholar.debug("init called");
		hiddenBrowser.addEventListener("load", onLoad, true);
		
		if (firstDoc) {
			Scholar.debug("processing");
			processor(firstDoc, doLoad);
		} else {
			Scholar.debug("doing load");
			doLoad();
		}
	} catch (e) {
		Scholar.debug("processDocuments: " + e);
		exception(e);
	}
}

// Collects the href of every <a> element in a document whose href contains a
// given substring (kind of like getItemArray, only with NO REGEXP
// FUNCTIONALITY)
//
// Each distinct href is returned once. Matches are added with unshift(), so
// the returned array is in reverse document order.
Scholar.Utilities.Ingester.prototype.collectURLsWithSubstring = function(doc, substring) {
	var urls = [];
	// Set of hrefs already collected; own-property checks keep inherited
	// Object members from being mistaken for entries
	var addedURLs = {};
	
	var aElements = doc.evaluate("//a", doc, null,
		Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null);
	var aElement;
	while ((aElement = aElements.iterateNext())) {
		var href = aElement.href;
		if (href.indexOf(substring) >= 0
				&& !Object.prototype.hasOwnProperty.call(addedURLs, href)) {
			urls.unshift(href);
			addedURLs[href] = true;
		}
	}
	return urls;
}

// For now, we're going to skip the getLLsFromAddresses function (which gets
// latitude and longitude pairs from a series of addresses, but requires the
// big mess of Java code that is the Piggy Bank server) and the geoHelper
// tools (which rely on getLLsFromAddresses) since these are probably not
// essential components for Scholar and would take a great deal of effort to
// implement. We can, however, always implement them later.
+ +/* + * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS + */ + +/* + * Gets a given node (assumes only one value) + */ +Scholar.Utilities.Ingester.prototype.getNode = function(doc, contextNode, xpath, nsResolver) { + return doc.evaluate(xpath, contextNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null).iterateNext(); +} + +/* + * Gets a given node as a string containing all child nodes + */ +Scholar.Utilities.Ingester.prototype.getNodeString = function(doc, contextNode, xpath, nsResolver) { + var elmts = this.gatherElementsOnXPath(doc, contextNode, xpath, nsResolver); + var returnVar = ""; + for(var i=0; i 0) { + var c = s.charAt(i - 1); + if (spaceChars.indexOf(c) < 0) { + break; + } + i--; + } + + return s.substring(0, i); +} + +/* + * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS + * Functions below this point are extensions to the utilities provided by + * Piggy Bank. When used in external code, the repository will need to add + * a function definition when exporting in Piggy Bank format. 
+ */ + +/* + * Converts a JavaScript date object to an ISO-style date + */ +Scholar.Utilities.prototype.dateToISO = function(jsDate) { + var date = ""; + var year = jsDate.getFullYear().toString(); + var month = (jsDate.getMonth()+1).toString(); + var day = jsDate.getDate().toString(); + + for(var i = year.length; i<4; i++) { + date += "0"; + } + date += year+"-"; + + if(month.length == 1) { + date += "0"; + } + date += month+"-"; + + if(day.length == 1) { + date += "0"; + } + date += day; + + return date; +} + +/* + * Cleans extraneous punctuation off an author name + */ +Scholar.Utilities.prototype.cleanAuthor = function(author) { + author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); + author = author.replace(/[\s\,\/\[\]\:\.]+$/, ''); + author = author.replace(/ +/, ' '); + // Add period for initials + if(author.substring(author.length-2, author.length-1) == " ") { + author += "."; + } + var splitNames = author.split(', '); + if(splitNames.length > 1) { + author = splitNames[1]+' '+splitNames[0]; + } + return author; +} + +/* + * Cleans whitespace off a string and replaces multiple spaces with one + */ +Scholar.Utilities.prototype.cleanString = function(s) { + s = s.replace(/[ \xA0]+/g, " "); + return this.trimString(s); +} + +/* + * Cleans any non-word non-parenthesis characters off the ends of a string + */ +Scholar.Utilities.prototype.superCleanString = function(x) { + var x = x.replace(/^[^\w(]+/, ""); + return x.replace(/[^\w)]+$/, ""); +} + +/* + * Eliminates HTML tags, replacing
s with /ns + */ +Scholar.Utilities.prototype.cleanTags = function(x) { + x = x.replace(/]*>/gi, "\n"); + return x.replace(/<[^>]+>/g, ""); +} + // These functions are for use by importMARCRecord. They're private, because, // while they are useful, it's also nice if as many of our scrapers as possible // are PiggyBank compatible, and if our scrapers used functions, that would @@ -461,104 +928,186 @@ Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe, * END SCHOLAR FOR FIREFOX EXTENSIONS */ +// Ingester adapters for Scholar.Utilities.HTTP to handle proxies + +Scholar.Utilities.Ingester.HTTPUtilities = function(proxiedURL) { + this.proxiedURL = proxiedURL +} + +Scholar.Utilities.Ingester.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) { + if(this.proxiedURL) { + url = Scholar.Ingester.ProxyMonitor.properToProxy(url); + } + Scholar.Utilities.HTTP.doGet(url, onStatus, function(xmlhttp) { onDone(xmlhttp.responseText, xmlhttp) }) +} + +Scholar.Utilities.Ingester.HTTPUtilities.prototype.doPost = function(url, body, onStatus, onDone) { + if(this.proxiedURL) { + url = Scholar.Ingester.ProxyMonitor.properToProxy(url); + } + Scholar.Utilities.HTTP.doPost(url, body, onStatus, function(xmlhttp) { onDone(xmlhttp.responseText, xmlhttp) }) +} + +Scholar.Utilities.Ingester.HTTPUtilities.prototype.doOptions = function(url, onStatus, onDone) { + if(this.proxiedURL) { + url = Scholar.Ingester.ProxyMonitor.properToProxy(url); + } + Scholar.Utilities.HTTP.doOptions(url, onStatus, function(xmlhttp) { onDone(xmlhttp.responseText, xmlhttp) }) +} + // These are front ends for XMLHttpRequest. XMLHttpRequest can't actually be // accessed outside the sandbox, and even if it could, it wouldn't let scripts // access across domains, so everything's replicated here. 
/*
 * Scholar.Utilities.HTTP is a singleton exposing doGet(), doPost(),
 * doOptions() and browserIsOffline(). It performs no proxy rewriting of its
 * own; callers that need proxied URLs must rewrite them before calling in.
 */
Scholar.Utilities.HTTP = new function() {
	this.doGet = doGet;
	this.doPost = doPost;
	this.doOptions = doOptions;
	this.browserIsOffline = browserIsOffline;
	
	/**
	 * Send an HTTP GET request via XMLHTTPRequest
	 *
	 * url      - URL to request
	 * onStatus - optional; called as onStatus(status, statusText, xmlhttp)
	 *            when the response status is not 200
	 * onDone   - optional; called as onDone(xmlhttp) when the download is
	 *            complete. If onDone is omitted, the function passed as
	 *            onStatus is used as onDone instead.
	 *
	 * Returns false if browser is offline, true once the request is sent
	 **/
	function doGet(url, onStatus, onDone) {
		return _send('GET', url, null, onStatus, onDone);
	}
	
	/**
	 * Send an HTTP POST request via XMLHTTPRequest
	 *
	 * body is passed to XMLHttpRequest.send(); the callbacks behave as
	 * described for doGet().
	 *
	 * Returns false if browser is offline, true once the request is sent
	 **/
	function doPost(url, body, onStatus, onDone) {
		return _send('POST', url, body, onStatus, onDone);
	}
	
	/**
	 * Send an HTTP OPTIONS request via XMLHTTPRequest
	 *
	 * body is passed to XMLHttpRequest.send(); the callbacks behave as
	 * described for doGet(). For callers that omit the body, the form
	 * doOptions(url, onStatus, onDone) is also accepted: a function can never
	 * be a request body, so one in that position means the callbacks were
	 * passed one slot early.
	 *
	 * Returns false if browser is offline, true once the request is sent
	 **/
	function doOptions(url, body, onStatus, onDone) {
		if (typeof body == "function") {
			onDone = onStatus;
			onStatus = body;
			body = null;
		}
		return _send('OPTIONS', url, body, onStatus, onDone);
	}
	
	/**
	 * Returns the "offline" flag of the network IO service
	 **/
	function browserIsOffline() {
		return Components.classes["@mozilla.org/network/io-service;1"]
			.getService(Components.interfaces.nsIIOService).offline;
	}
	
	/**
	 * Shared implementation of the three request methods: opens an
	 * asynchronous request with the given method and routes ready-state
	 * changes to _stateChange().
	 *
	 * browserIsOffline() is called through the closure rather than through
	 * "this", so the public methods keep working when they are detached from
	 * the singleton (e.g. passed around as callbacks).
	 **/
	function _send(method, url, body, onStatus, onDone) {
		if (browserIsOffline()){
			return false;
		}
		
		var xmlhttp = Components.classes["@mozilla.org/xmlextras/xmlhttprequest;1"]
						.createInstance();
		
		xmlhttp.open(method, url, true);
		
		xmlhttp.onreadystatechange = function(){
			_stateChange(xmlhttp, onStatus, onDone);
		};
		
		xmlhttp.send(body);
		
		return true;
	}
	
	/**
	 * Ready-state handler shared by all requests
	 *
	 * Returns false when the status cannot be read at readyState 2 (treated
	 * as "no network"); in that case xmlhttp.noNetwork is set to true.
	 * Otherwise returns nothing.
	 **/
	function _stateChange(xmlhttp, onStatus, onDone){
		if(!onDone) {
			// Only one callback was supplied: treat it as onDone. The copy
			// has to happen before onStatus is cleared, otherwise both end
			// up null and the caller is never notified.
			onDone = onStatus;
			onStatus = null;
		}
		
		switch (xmlhttp.readyState){
			// Request not yet made
			case 1:
			break;
			
			// Contact established with server but nothing downloaded yet
			case 2:
				// Accessing status will throw an exception if no network connection
				try {
					xmlhttp.status;
				}
				catch (e){
					Scholar.debug('No network connection');
					xmlhttp.noNetwork = true;
					return false;
				}
				
				// Check for HTTP status 200
				if (xmlhttp.status != 200){
					Scholar.debug('XMLHTTPRequest received HTTP response code '
						+ xmlhttp.status);
					if(onStatus) {
						// A failing status callback is logged, not propagated
						try {
							onStatus(
								xmlhttp.status,
								xmlhttp.statusText,
								xmlhttp
							);
						} catch (e) {
							Scholar.debug(e, 2);
						}
					}
				}
			break;
			
			// Called multiple while downloading in progress
			case 3:
			break;
			
			// Download complete
			case 4:
				// A failing completion callback is logged, not propagated
				try {
					if (onDone){
						onDone(xmlhttp);
					}
				}
				catch (e){
					Scholar.debug(e, 2);
				}
			break;
		}
	}
}