diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js index 9f28a83b2..e150c71ee 100644 --- a/chrome/chromeFiles/content/scholar/ingester/browser.js +++ b/chrome/chromeFiles/content/scholar/ingester/browser.js @@ -41,8 +41,8 @@ Scholar_Ingester_Interface.chromeLoad = function() { // this gives us onLocationChange Scholar_Ingester_Interface.tabBrowser.addProgressListener(Scholar_Ingester_Interface.Listener, Components.interfaces.nsIWebProgress.NOTIFY_LOCATION); - // this gives us DOMContentLoaded - Scholar_Ingester_Interface.appContent.addEventListener("DOMContentLoaded", + // let's use load instead of DOMContentLoaded + Scholar_Ingester_Interface.appContent.addEventListener("load", Scholar_Ingester_Interface.contentLoad, true); } diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js index 1a48b586f..78e14d34d 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/ingester.js +++ b/chrome/chromeFiles/content/scholar/xpcom/ingester.js @@ -131,11 +131,13 @@ Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succe // also logged in the Firefox Scholar log) Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) { var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window); + var myWindow = this.window; + var prevUrl, url; Scholar.debug("processDocuments called"); try { if (urls.length == 0) { - if (firstDoc) { + if(firstDoc) { processor(firstDoc, done); } else { done(); @@ -148,7 +150,7 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD urlIndex++; if (urlIndex < urls.length) { try { - var url = urls[urlIndex]; + url = urls[urlIndex]; Scholar.debug("loading "+url); hiddenBrowser.loadURI(url); } catch (e) { @@ -156,23 +158,26 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD exception(e); } } else { + hiddenBrowser.removeEventListener("load", onLoad, true); Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser); - hiddenBrowser.setTimeout(done, 10); + done(); } }; var onLoad = function() { - Scholar.debug("onLoad called"); - hiddenBrowser.removeEventListener("load", onLoad, true); - try { - var newHiddenBrowser = new Object(); - newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument; - newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow; - processor(newHiddenBrowser); - } catch (e) { - Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2); - exception(e); + Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded"); + if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times + prevUrl = hiddenBrowser.contentDocument.location.href; + try { + var newHiddenBrowser = new Object(); + newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument; + newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow; + processor(newHiddenBrowser); + } catch (e) { + Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2); + exception(e); + } + doLoad(); } - doLoad(); }; var init = function() { Scholar.debug("init called"); @@ -332,23 +337,33 @@ Scholar.Ingester.Utilities.prototype.getItemArray = function(doc, inHere, urlRe, var availableItems = new Object(); // Technically, associative arrays are objects // Require link to match this - var tagRegexp = new RegExp(); - tagRegexp.compile(urlRe); + if(urlRe) { + var urlRegexp = new RegExp(); + urlRegexp.compile(urlRe); + } // Do not allow text to match this - var rejectRegexp = new RegExp(); - rejectRegexp.compile(rejectRe); + if(rejectRe) { + var rejectRegexp = new RegExp(); + rejectRegexp.compile(rejectRe); + } - var links = inHere.getElementsByTagName("a"); - for(var i=0; i= 4) { - newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4)); + var ISORe = /^[0-9]{4}-[0-9]{2}-[0-9]{2}$/ + if(ISORe.test(this.model.data[uri][prefixDC + 'date'][0])) { + newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4)); + } else { + var m; + var yearRe = /[0-9]{4}$/; + if(m = yearRe.exec(this.model.data[uri][prefixDC + 'date'][0])) { + newItem.setField("year", m[0]); + } + } } } diff --git a/scrapers.sql b/scrapers.sql index e5d0a6272..f28cb2ee1 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -1,9 +1,9 @@ --- 10 +-- 11 -- Set the following timestamp to the most recent scraper update date -REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-22 16:51:00')); +REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-22 22:58:00')); -REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-18 10:15:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/gp/product/', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; +REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; @@ -13,59 +13,98 @@ var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; -var uri = doc.location.href; - -// Retrieve authors -var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a''; -var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); -for (var i = 0; i < elmts.length; i++) { - var elmt = elmts[i]; +function scrape(doc) { + uri = doc.location.href; - model.addStatement(uri, prefixDC + ''creator'', utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here -} - -// Retrieve data from "Product Details" box -var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]/div[@class="content"]/ul/li''; -var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); -for (var i = 0; i < elmts.length; i++) { - var elmt = elmts[i]; - var attribute = utilities.cleanString(utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue); - if(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) { - var value = utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue); - if(attribute == "Publisher:") { - if(value.lastIndexOf("(") != -1) { - var jsDate = value.substring(value.lastIndexOf("(")+1, value.length-1); - jsDate = new Date(jsDate); - var date = utilities.dateToISO(jsDate); - - value = value.substring(0, value.lastIndexOf("(")-1); + // Retrieve authors + var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a''; + var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + for (var i = 0; i < elmts.length; i++) { + var elmt = elmts[i]; + + model.addStatement(uri, prefixDC + ''creator'', utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here + } + + // Retrieve data from "Product Details" box + var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]/div[@class="content"]/ul/li''; + var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + for (var i = 0; i < elmts.length; i++) { + var elmt = elmts[i]; + var attribute = utilities.cleanString(utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue); + if(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) { + var value = utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue); + if(attribute == "Publisher:") { + if(value.lastIndexOf("(") != -1) { + var date = value.substring(value.lastIndexOf("(")+1, value.length-1); + jsDate = new Date(date); + if(!isNaN(jsDate.valueOf())) { + date = utilities.dateToISO(jsDate); + } + + value = value.substring(0, value.lastIndexOf("(")-1); + } + if(value.lastIndexOf(";") != -1) { + var edition = value.substring(value.lastIndexOf(";")+2, value.length); + value = value.substring(0, value.lastIndexOf(";")); + } + model.addStatement(uri, prefixDC + ''publisher'', value); + model.addStatement(uri, prefixDC + ''date'', date); + model.addStatement(uri, prefixDC + ''hasVersion'', edition); + } else if(attribute == "Language:") { + model.addStatement(uri, prefixDC + ''language'', value); + } else if(attribute == "ISBN:") { + model.addStatement(uri, prefixDC + ''identifier'', ''ISBN ''+value); + } else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") { + model.addStatement(uri, prefixDummy + ''pages'', value.substring(0, value.indexOf(" "))); + model.addStatement(uri, prefixDC + ''medium'', attribute.substring(0, attribute.indexOf(":"))); } - if(value.lastIndexOf(";") != -1) { - var edition = value.substring(value.lastIndexOf(";")+2, value.length); - value = value.substring(0, value.lastIndexOf(";")); - } - model.addStatement(uri, prefixDC + ''publisher'', value); - model.addStatement(uri, prefixDC + ''date'', date); - model.addStatement(uri, prefixDC + ''hasVersion'', edition); - } else if(attribute == "Language:") { - model.addStatement(uri, prefixDC + ''language'', value); - } else if(attribute == "ISBN:") { - model.addStatement(uri, prefixDC + ''identifier'', ''ISBN ''+value); - } else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") { - model.addStatement(uri, prefixDummy + ''pages'', value.substring(0, value.indexOf(" "))); - model.addStatement(uri, prefixDC + ''medium'', attribute.substring(0, attribute.indexOf(":"))); } } + + var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]''; + var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + var title = utilities.cleanString(utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue); + if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) { + title = title.substring(0, title.lastIndexOf("(")-1); + } + model.addStatement(uri, prefixDC + ''title'', title); + model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); } -var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]''; -var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); -var title = utilities.cleanString(utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue); -if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) { - title = title.substring(0, title.lastIndexOf("(")-1); -} -model.addStatement(uri, prefixDC + ''title'', title); -model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);'); +var searchRe = new RegExp(''http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/)''); +var m = searchRe.exec(doc.location.href) +if(m) { + // Why can''t amazon use standard stylesheets + var xpath; + if(m == "gp/search/") { + xpath = ''//table[@class="searchresults"]''; + } else { + xpath = ''//table[@cellpadding="3"]''; + } + + var searchresults = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + var items = utilities.getItemArray(doc, searchresults, ''http://www\.amazon\.com/(gp/product/|exec/obidos/tg/detail/)'', ''^(Buy new|Hardcover|Paperback|Digital)$''); + items = utilities.selectItems(items); + + if(!items) { + return true; + } + + var uris = new Array(); + for(i in items) { + uris.push(i); + } + + utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) }, + function() { + utilities.debugPrint("look, done"); + done(); + }, function() {}); + + wait(); +} else { + scrape(doc); +}'); REPLACE INTO "scrapers" VALUES('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-18 11:02:00', 'WorldCat Scraper', 'Simon Kornblith', '^http://newfirstsearch\.oclc\.org/WebZ/', 'if(doc.title == ''FirstSearch: WorldCat Detailed Record'') {