From 4242c62b1b05e6248cfdf2b962c8d0eb46711588 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Mon, 26 Jun 2006 20:02:30 +0000 Subject: [PATCH] - Fix redundancy in utilities.js (I accidentally copied and pasted a much larger block of code than i meant to) - Move processDocuments, a function for loading a DOM representation of a document or set of documents, to Scholar.Utilities.HTTP - Add Scholar.Ingester.ingestURL, a simplified function to scrape a URL (closes #33) --- .../content/scholar/ingester/browser.js | 2 +- .../content/scholar/xpcom/ingester.js | 51 +- .../content/scholar/xpcom/utilities.js | 670 +++--------------- scrapers.sql | 4 +- 4 files changed, 152 insertions(+), 575 deletions(-) diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js index 88a8ff468..ccf7e0e1a 100644 --- a/chrome/chromeFiles/content/scholar/ingester/browser.js +++ b/chrome/chromeFiles/content/scholar/ingester/browser.js @@ -165,7 +165,7 @@ Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) ////////////////////////////////////////////////////////////////////////////// // -// Private Scholar.Ingester.Document methods +// Private Scholar_Ingester_Interface methods // ////////////////////////////////////////////////////////////////////////////// diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js index 3090f2b88..cf620ce4a 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/ingester.js +++ b/chrome/chromeFiles/content/scholar/xpcom/ingester.js @@ -19,6 +19,48 @@ Scholar.Ingester.deleteHiddenBrowser = function(myBrowser) { Scholar.debug("deleted hidden browser"); } +/* + * Operates the ingester given only a URL + * url - URL to scrape + * complete - callback function to be executed if page grab completes + * (will be passed document object; obj.items contains array of + * *unsaved* items scraped; empty array indicates unscrapable page) + * error - callback function to be executed if an error occurred loading page + * myWindow - optional argument indicating window to attach a dialog to. 
if no + * window is given, Firefox Scholar uses the hidden DOM window and + * will simply avoid scraping multiple pages + */ +Scholar.Ingester.ingestURL = function(url, complete, error, myWindow) { + var isHidden = false; + if(!myWindow) { + var myWindow = Components.classes["@mozilla.org/appshell/appShellService;1"] + .getService(Components.interfaces.nsIAppShellService) + .hiddenDOMWindow; + var isHidden = true; + } + + var succeeded = function(browser) { + var myDoc = new Scholar.Ingester.Document(browser, myWindow, isHidden); + myDoc.retrieveScraper(); + if(myDoc.scraper) { + myDoc.scrapePage(function(myDoc) { + Scholar.Ingester.deleteHiddenBrowser(browser); + complete(myDoc); + }); + } else { + Scholar.Ingester.deleteHiddenBrowser(browser); + complete(myDoc); + } + } + + var failed = function() { + Scholar.debug("Scholar.Ingester.ingestURL: could not ingest "+url); + error(); + } + + Scholar.Utilities.HTTP.processDocuments(null, [ url ], succeeded, function() {}, failed, true); +} + ///////////////////////////////////////////////////////////////// // // Scholar.Ingester.ProxyMonitor @@ -195,10 +237,11 @@ Scholar.Ingester.Model.prototype.detachRepository = function() {} /* * Constructor for Document object */ -Scholar.Ingester.Document = function(browserWindow, myWindow){ - this.scraper = this.type = null; - this.browser = browserWindow; +Scholar.Ingester.Document = function(myBrowser, myWindow, isHidden) { + this.browser = myBrowser; this.window = myWindow; + this.isHidden = isHidden; + this.scraper = this.type = null; this.model = new Scholar.Ingester.Model(); // Create separate URL to account for proxies @@ -349,7 +392,7 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() { this._sandbox.browser = this.browser; this._sandbox.doc = this.browser.contentDocument; this._sandbox.url = this.url; - this._sandbox.utilities = new Scholar.Utilities.Ingester(this.window, this.proxiedURL); + this._sandbox.utilities = new Scholar.Utilities.Ingester(this.window, this.proxiedURL, this.isHidden); this._sandbox.utilities.HTTPUtilities = new Scholar.Utilities.Ingester.HTTPUtilities(this.proxiedURL); this._sandbox.window = this.window; this._sandbox.model = this.model; diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js index c5b0b7b06..77d1589ed 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/utilities.js +++ b/chrome/chromeFiles/content/scholar/xpcom/utilities.js @@ -133,9 +133,10 @@ Scholar.Utilities.prototype.cleanTags = function(x) { // Scholar.Utilities.Ingester extends Scholar.Utilities, offering additional // classes relating to data extraction specifically from HTML documents. 
-Scholar.Utilities.Ingester = function(myWindow, proxiedURL) { +Scholar.Utilities.Ingester = function(myWindow, proxiedURL, isHidden) { this.window = myWindow; this.proxiedURL = proxiedURL; + this.isHidden = isHidden; } Scholar.Utilities.Ingester.prototype = new Scholar.Utilities(); @@ -154,95 +155,6 @@ Scholar.Utilities.Ingester.prototype.gatherElementsOnXPath = function(doc, paren return elmts; } -// Loads a single document for a scraper, running succeeded() on success or -// failed() on failure -Scholar.Utilities.Ingester.prototype.loadDocument = function(url, browser, succeeded, failed) { - Scholar.debug("loadDocument called"); - this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed); -} - -// Downloads and processes documents with processor() -// browser - a browser object -// firstDoc - the first document to process with the processor (if null, -// first document is processed without processor) -// urls - an array of URLs to load -// processor - a function to execute to process each document -// done - a function to execute when all document processing is complete -// exception - a function to execute if an exception occurs (exceptions are -// also logged in the Scholar for Firefox log) -Scholar.Utilities.Ingester.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) { - var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window); - var myWindow = this.window; - var prevUrl, url; - Scholar.debug("processDocuments called"); - - try { - if (urls.length == 0) { - if(firstDoc) { - processor(firstDoc, done); - } else { - done(); - } - return; - } - - var urlIndex = -1; - var doLoad = function() { - urlIndex++; - if (urlIndex < urls.length) { - url = urls[urlIndex]; - if(this.proxiedURL) { - url = Scholar.Ingester.ProxyMonitor.properToProxy(url); - } - try { - Scholar.debug("loading "+url); - hiddenBrowser.loadURI(url); - } catch (e) { - Scholar.debug("Scholar.Utilities.Ingester.processDocuments doLoad: " + e, 2); - exception(e); - } - } else { - hiddenBrowser.removeEventListener("load", onLoad, true); - Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser); - done(); - } - }; - var onLoad = function() { - Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded"); - if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times - prevUrl = hiddenBrowser.contentDocument.location.href; - try { - var newHiddenBrowser = new Object(); - newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument; - newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow; - processor(newHiddenBrowser); - } catch (e) { - Scholar.debug("Scholar.Utilities.Ingester.processDocuments onLoad: " + e, 2); - exception(e); - } - doLoad(); - } - }; - var init = function() { - Scholar.debug("init called"); - hiddenBrowser.addEventListener("load", onLoad, true); - - if (firstDoc) { - Scholar.debug("processing"); - processor(firstDoc, doLoad); - } else { - Scholar.debug("doing load"); - doLoad(); - } - } - - init(); - } catch (e) { - Scholar.debug("processDocuments: " + e); - exception(e); - } -} - // Appears to look for links in a document containing a certain substring (kind // of like getItemArray, only with NO REGEXP FUNCTIONALITY) Scholar.Utilities.Ingester.prototype.collectURLsWithSubstring = function(doc, substring) { @@ -296,11 +208,15 @@ Scholar.Utilities.Ingester.prototype.getNodeString = function(doc, contextNode, * Allows a user to select which items to scrape */ 
Scholar.Utilities.Ingester.prototype.selectItems = function(itemList) { - // mozillazine made me do it! honest! - var io = { dataIn:itemList, dataOut:null } - var newDialog = this.window.openDialog("chrome://scholar/content/ingester/selectitems.xul", - "_blank","chrome,modal,centerscreen,resizable=yes", io); - return io.dataOut; + if(this.isHidden != true) { + // this is kinda ugly, mozillazine made me do it! honest! + var io = { dataIn:itemList, dataOut:null } + var newDialog = this.window.openDialog("chrome://scholar/content/ingester/selectitems.xul", + "_blank","chrome,modal,centerscreen,resizable=yes", io); + return io.dataOut; + } else { + return null; + } } /* @@ -348,13 +264,6 @@ Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe, return availableItems; } -/* - * Handles OAI-PMH requests - */ -Scholar.Utilities.Ingester.prototype.importOAIPMH = function(uri, model) { - -} - // These functions are for use by importMARCRecord. They're private, because, // while they are useful, it's also nice if as many of our scrapers as possible // are PiggyBank compatible, and if our scrapers used functions, that would @@ -465,478 +374,19 @@ Scholar.Utilities.Ingester.prototype.importMARCRecord = function(record, uri, mo model = model.addStatement(uri, prefixRDF + 'type', prefixDummy + "book", true); } -/* - * END SCHOLAR FOR FIREFOX EXTENSIONS - */ - -// These are front ends for XMLHttpRequest. XMLHttpRequest can't actually be -// accessed outside the sandbox, and even if it could, it wouldn't let scripts -// access across domains, so everything's replicated here. -// Scholar for Firefox Utilities -// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed) -// This code is licensed according to the GPL - -///////////////////////////////////////////////////////////////// -// -// Scholar.Utilities -// -///////////////////////////////////////////////////////////////// -// Scholar.Utilities class, a set of methods to assist in data -// extraction. Some of the code here was stolen directly from the Piggy Bank -// project. - -Scholar.Utilities = function () {} - -// Adapter for Piggy Bank function to print debug messages; log level is -// fixed at 4 (could change this) -Scholar.Utilities.prototype.debugPrint = function(msg) { - Scholar.debug(msg, 4); -} - -// Appears to trim a string, chopping of newlines/spacing -Scholar.Utilities.prototype.trimString = function(s) { - var i = 0; - var spaceChars = " \n\r\t" + String.fromCharCode(160) /*   */; - while (i < s.length) { - var c = s.charAt(i); - if (spaceChars.indexOf(c) < 0) { - break; - } - i++; - } - - s = s.substring(i); - - i = s.length; - while (i > 0) { - var c = s.charAt(i - 1); - if (spaceChars.indexOf(c) < 0) { - break; - } - i--; - } - - return s.substring(0, i); -} - -/* - * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS - * Functions below this point are extensions to the utilities provided by - * Piggy Bank. When used in external code, the repository will need to add - * a function definition when exporting in Piggy Bank format. 
- */ - -/* - * Converts a JavaScript date object to an ISO-style date - */ -Scholar.Utilities.prototype.dateToISO = function(jsDate) { - var date = ""; - var year = jsDate.getFullYear().toString(); - var month = (jsDate.getMonth()+1).toString(); - var day = jsDate.getDate().toString(); - - for(var i = year.length; i<4; i++) { - date += "0"; - } - date += year+"-"; - - if(month.length == 1) { - date += "0"; - } - date += month+"-"; - - if(day.length == 1) { - date += "0"; - } - date += day; - - return date; -} - -/* - * Cleans extraneous punctuation off an author name - */ -Scholar.Utilities.prototype.cleanAuthor = function(author) { - author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); - author = author.replace(/[\s\,\/\[\]\:\.]+$/, ''); - author = author.replace(/ +/, ' '); - // Add period for initials - if(author.substring(author.length-2, author.length-1) == " ") { - author += "."; - } - var splitNames = author.split(', '); - if(splitNames.length > 1) { - author = splitNames[1]+' '+splitNames[0]; - } - return author; -} - -/* - * Cleans whitespace off a string and replaces multiple spaces with one - */ -Scholar.Utilities.prototype.cleanString = function(s) { - s = s.replace(/[ \xA0]+/g, " "); - return this.trimString(s); -} - -/* - * Cleans any non-word non-parenthesis characters off the ends of a string - */ -Scholar.Utilities.prototype.superCleanString = function(x) { - var x = x.replace(/^[^\w(]+/, ""); - return x.replace(/[^\w)]+$/, ""); -} - -/* - * Eliminates HTML tags, replacing
<br>s with /ns
- */
-Scholar.Utilities.prototype.cleanTags = function(x) {
- x = x.replace(/<br[^>]*>/gi, "\n");
- return x.replace(/<[^>]+>/g, "");
-}
-
-// These functions are for use by importMARCRecord. They're private, because,
-// while they are useful, it's also nice if as many of our scrapers as possible
-// are PiggyBank compatible, and if our scrapers used functions, that would
-// break compatibility
-Scholar.Utilities.prototype._MARCCleanString = function(author) {
- author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
- author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
- return author.replace(/ +/, ' ');
-}
-
-Scholar.Utilities.prototype._MARCCleanNumber = function(author) {
- author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
- author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
- var regexp = /^[^ ]*/;
- var m = regexp.exec(author);
- if(m) {
- return m[0];
- }
-}
-Scholar.Utilities.prototype._MARCPullYear = function(text) {
- var pullRe = /[0-9]+/;
- var m = pullRe.exec(text);
- if(m) {
- return m[0];
- }
-}
-
-Scholar.Utilities.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) {
- if(!part) {
- part = 'a';
- }
- var field = record.get_field_subfields(fieldNo);
- Scholar.debug('Found '+field.length+' matches for '+fieldNo+part);
- if(field) {
- for(i in field) {
- var value;
- for(var j=0; j= 0 && !(addedURLs[href])) {
- urls.unshift(href);
- addedURLs[href] = true;
- }
- aElement = aElements.iterateNext();
- }
- return urls;
-}
-
-// For now, we're going to skip the getLLsFromAddresses function (which gets
-// latitude and longitude pairs from a series of addresses, but requires the
-// big mess of Java code that is the Piggy Bank server) and the geoHelper
-// tools (which rely on getLLsFromAddresses) since these are probably not
-// essential components for Scholar and would take a great deal of effort to
-// implement. We can, however, always implement them later.
-
-/*
- * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS
- */
-
-/*
- * Gets a given node (assumes only one value)
- */
-Scholar.Utilities.Ingester.prototype.getNode = function(doc, contextNode, xpath, nsResolver) {
- return doc.evaluate(xpath, contextNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null).iterateNext();
-}
-
-/*
- * Gets a given node as a string containing all child nodes
- */
-Scholar.Utilities.Ingester.prototype.getNodeString = function(doc, contextNode, xpath, nsResolver) {
- var elmts = this.gatherElementsOnXPath(doc, contextNode, xpath, nsResolver);
- var returnVar = "";
- for(var i=0; i
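
For illustration, a minimal sketch of how the new Scholar.Ingester.ingestURL entry point might be driven from chrome code, based only on the signature and callback contract documented in this patch (url, complete, error, and an optional myWindow; the commit message states that doc.items holds the array of *unsaved* scraped items and that an empty array indicates an unscrapable page). The exampleURL, handleComplete, and handleError names are hypothetical and not part of the patch.

// Usage sketch; assumes the Scholar XPCOM object is loaded, as in the
// extension's own chrome code. Only ingestURL itself comes from this patch.
var exampleURL = "http://www.example.com/";

var handleComplete = function(doc) {
	// doc.items contains *unsaved* scraped items; an empty array means
	// no scraper could handle the page
	if(doc.items.length) {
		Scholar.debug("ingestURL returned "+doc.items.length+" unsaved item(s)");
	} else {
		Scholar.debug("ingestURL: page was not scrapable");
	}
}

var handleError = function() {
	Scholar.debug("ingestURL: page could not be loaded");
}

// Omitting the window argument makes ingestURL fall back to the hidden DOM
// window, so selectItems() returns null and multi-page scraping is avoided
Scholar.Ingester.ingestURL(exampleURL, handleComplete, handleError);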