From 064ecd17db3c8228d3db2e5a2b0d34b14700cd3c Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Fri, 11 Aug 2006 15:28:18 +0000 Subject: [PATCH] removes unnecessary pieces of piggy bank API from utilities and updates translators to abide by current translator guidelines --- .../content/scholar/xpcom/utilities.js | 105 +------- scrapers.sql | 225 +++++++++--------- 2 files changed, 120 insertions(+), 210 deletions(-) diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js index e01e170a5..385af17f9 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/utilities.js +++ b/chrome/chromeFiles/content/scholar/xpcom/utilities.js @@ -1,61 +1,21 @@ // Scholar for Firefox Utilities -// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed) -// This code is licensed according to the GPL ///////////////////////////////////////////////////////////////// // // Scholar.Utilities // ///////////////////////////////////////////////////////////////// -// Scholar.Utilities class, a set of methods to assist in data -// extraction. Some of the code here was stolen directly from the Piggy Bank -// project. Scholar.Utilities = function () {} -// Adapter for Piggy Bank function to print debug messages; log level is -// fixed at 4 (could change this) -Scholar.Utilities.prototype.debugPrint = function(msg) { +Scholar.Utilities.prototype.debug = function(msg) { Scholar.debug(msg, 4); } -// Appears to trim a string, chopping of newlines/spacing -Scholar.Utilities.prototype.trimString = function(s) { - var i = 0; - var spaceChars = " \n\r\t" + String.fromCharCode(160) /*   */; - while (i < s.length) { - var c = s.charAt(i); - if (spaceChars.indexOf(c) < 0) { - break; - } - i++; - } - - s = s.substring(i); - - i = s.length; - while (i > 0) { - var c = s.charAt(i - 1); - if (spaceChars.indexOf(c) < 0) { - break; - } - i--; - } - - return s.substring(0, i); -} - /* - * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS - * Functions below this point are extensions to the utilities provided by - * Piggy Bank. When used in external code, the repository will need to add - * a function definition when exporting in Piggy Bank format. + * Converts a JavaScript date object to an SQL-style date */ - -/* - * Converts a JavaScript date object to an ISO-style date - */ -Scholar.Utilities.prototype.dateToISO = function(jsDate) { +Scholar.Utilities.prototype.dateToSQL = function(jsDate) { var date = ""; var year = jsDate.getFullYear().toString(); var month = (jsDate.getMonth()+1).toString(); @@ -112,7 +72,8 @@ Scholar.Utilities.prototype.cleanAuthor = function(author, type, useComma) { */ Scholar.Utilities.prototype.cleanString = function(s) { s = s.replace(/[ \xA0]+/g, " "); - return this.trimString(s); + s = s.replace(/^\s+/, ""); + return s.replace(/\s+$/, ""); } /* @@ -223,43 +184,6 @@ Scholar.Utilities.Ingester.prototype.gatherElementsOnXPath = function(doc, paren return elmts; } -// Appears to look for links in a document containing a certain substring (kind -// of like getItemArray, only with NO REGEXP FUNCTIONALITY) -Scholar.Utilities.Ingester.prototype.collectURLsWithSubstring = function(doc, substring) { - var urls = []; - var addedURLs = []; - - var aElements = doc.evaluate("//a", doc, null, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null); - var aElement = aElements.iterateNext(); - while (aElement) { - var href = aElement.href; - if (href.indexOf(substring) >= 0 && !(addedURLs[href])) { - urls.unshift(href); - addedURLs[href] = true; - } - aElement = aElements.iterateNext(); - } - return urls; -} - -// For now, we're going to skip the getLLsFromAddresses function (which gets -// latitude and longitude pairs from a series of addresses, but requires the -// big mess of Java code that is the Piggy Bank server) and the geoHelper -// tools (which rely on getLLsFromAddresses) since these are probably not -// essential components for Scholar and would take a great deal of effort to -// implement. We can, however, always implement them later. - -/* - * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS - */ - -/* - * Gets a given node (assumes only one value) - */ -Scholar.Utilities.Ingester.prototype.getNode = function(doc, contextNode, xpath, nsResolver) { - return doc.evaluate(xpath, contextNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null).iterateNext(); -} - /* * Gets a given node as a string containing all child nodes */ @@ -325,10 +249,6 @@ Scholar.Utilities.Ingester.prototype.parseContextObject = function(co, item) { return Scholar.OpenURL.parseContextObject(co, item); } -/* - * END SCHOLAR FOR FIREFOX EXTENSIONS - */ - // Ingester adapters for Scholar.Utilities.HTTP to handle proxies Scholar.Utilities.Ingester.prototype.loadDocument = function(url, succeeded, failed) { @@ -337,11 +257,13 @@ Scholar.Utilities.Ingester.prototype.loadDocument = function(url, succeeded, fai } Scholar.Utilities.HTTP.processDocuments(null, [ url ], succeeded, function() {}, failed); } -Scholar.Utilities.Ingester.prototype.processDocuments = function(firstDoc, urls, processor, done, exception) { - for(i in urls) { - urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]); +Scholar.Utilities.Ingester.prototype.processDocuments = function(urls, processor, done, exception) { + if(this.proxiedURL) { + for(i in urls) { + urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]); + } } - Scholar.Utilities.HTTP.processDocuments(firstDoc, urls, processor, done, exception); + Scholar.Utilities.HTTP.processDocuments(null, urls, processor, done, exception); } Scholar.Utilities.Ingester.HTTPUtilities = function(proxiedURL) { @@ -615,10 +537,7 @@ Scholar.Utilities.HTTP.processDocuments = function(firstDoc, urls, processor, do if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times prevUrl = hiddenBrowser.contentDocument.location.href; try { - var newHiddenBrowser = new Object(); - newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument; - newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow; - processor(newHiddenBrowser); + processor(hiddenBrowser.contentDocument); } catch (e) { Scholar.debug("Scholar.Utilities.Ingester.processDocuments onLoad: " + e, 2); exception(e); diff --git a/scrapers.sql b/scrapers.sql index 4cc25101a..d225ec557 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -1,9 +1,9 @@ --- 41 +-- 42 -- Set the following timestamp to the most recent scraper update date -REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-08 17:12:00')); +REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-11 11:18:00')); -REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-28 23:08:00', 4, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/|s/)', +REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-08-11 11:18:00', 4, 'Amazon.com', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/|s/)', 'function detectWeb(doc, url) { var searchRe = new RegExp(''^http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/|s/)''); if(searchRe.test(doc.location.href)) { @@ -28,7 +28,7 @@ REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006 var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); for (var i = 0; i < elmts.length; i++) { var elmt = elmts[i]; - var author = Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue; + var author = doc.evaluate(''./text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue; newItem.creators.push(Scholar.Utilities.cleanAuthor(author, "author")); } @@ -40,15 +40,15 @@ REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006 for (var i = 0; i < elmts.length; i++) { try { var elmt = elmts[i]; - var attribute = Scholar.Utilities.cleanString(Scholar.Utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue); - if(Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) { - var value = Scholar.Utilities.cleanString(Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue); + var attribute = Scholar.Utilities.cleanString(doc.evaluate(''./B[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue); + if(doc.evaluate(''./text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { + var value = Scholar.Utilities.cleanString(doc.evaluate(''./text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue); if(attribute == "Publisher:") { if(value.lastIndexOf("(") != -1) { var date = value.substring(value.lastIndexOf("(")+1, value.length-1); jsDate = new Date(date); if(!isNaN(jsDate.valueOf())) { - date = Scholar.Utilities.dateToISO(jsDate); + date = Scholar.Utilities.dateToSQL(jsDate); } newItem.date = date; @@ -74,7 +74,7 @@ REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006 var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]''; var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); - var title = Scholar.Utilities.cleanString(Scholar.Utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue); + var title = Scholar.Utilities.cleanString(doc.evaluate(''./text()[1]'', elmts[0], nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue); if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) { title = title.substring(0, title.lastIndexOf("(")-1); } @@ -113,7 +113,7 @@ function doWeb(doc, url) { uris.push(i); } - Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) }, + Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) }, function() { Scholar.done(); }, function() {}); Scholar.wait(); @@ -122,7 +122,7 @@ function doWeb(doc, url) { } }'); -REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-26 16:01:00', 4, 'WorldCat Scraper', 'Simon Kornblith', '^http://(?:new)?firstsearch\.oclc\.org/WebZ/', +REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-26 16:01:00', 4, 'WorldCat', 'Simon Kornblith', '^http://(?:new)?firstsearch\.oclc\.org/WebZ/', 'function detectWeb(doc, url) { if(doc.title == ''FirstSearch: WorldCat Detailed Record'') { return "book"; @@ -195,7 +195,7 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006 } Scholar.Utilities.HTTPUtilities.doPost(newUri, ''exportselect=''+exportselect+''&exporttype=plaintext'', null, function(text) { - Scholar.Utilities.debugPrint(text); + Scholar.Utilities.debug(text); var lineRegexp = new RegExp(); lineRegexp.compile("^([\\w() ]+): *(.*)$"); @@ -240,17 +240,17 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006 newItem.creators.push(Scholar.Utilities.cleanAuthor(authors[j], "author", true)); } } else { - newItem.creators.push(Scholar.Utilities.trimString(match[2])); + newItem.creators.push(Scholar.Utilities.cleanString(match[2])); } } else if(match[1] == ''Publication'') { // Don''t even try to deal with this. The WorldCat metadata is of poor enough quality that this isn''t worth it. - match[2] = Scholar.Utilities.trimString(match[2]); + match[2] = Scholar.Utilities.cleanString(match[2]); if(match[2].substring(match[2].length-1) == '','') { match[2] = match[2].substring(0, match[2].length-1); } newItem.publisher = match[2]; /*} else if(match[1] == ''Language'') { - .addStatement(uri, prefixDC + ''language'', Scholar.Utilities.trimString(match[2]));*/ + .addStatement(uri, prefixDC + ''language'', Scholar.Utilities.cleanString(match[2]));*/ } else if(match[1] == ''Standard No'') { var identifiers = match[2].split(/ +/); var j=0; @@ -287,7 +287,7 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006 Scholar.wait(); }'); -REPLACE INTO "translators" VALUES ('88915634-1af6-c134-0171-56fd198235ed', '2006-06-26 21:40:00', 4, 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi', +REPLACE INTO "translators" VALUES ('88915634-1af6-c134-0171-56fd198235ed', '2006-06-26 21:40:00', 4, 'LOC/Voyager WebVoyage', 'Simon Kornblith', 'Pwebrecon\.cgi', 'function detectWeb(doc, url) { var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options; for(var i in export_options) { @@ -335,7 +335,7 @@ REPLACE INTO "translators" VALUES ('88915634-1af6-c134-0171-56fd198235ed', '2006 // Go through table rows for(var i=0; i= 0) { newItem.itemType = "magazineArticle"; @@ -1074,7 +1072,7 @@ REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006 } } } else if(field == "isbn" || field == "issn" || field == "issn/isbn") { - var value = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); + var value = doc.evaluate(''./TD[2]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); if(value) { var type; value = Scholar.Utilities.superCleanString(value.nodeValue); @@ -1110,7 +1108,7 @@ function doWeb(doc, url) { // Go through links for(var j=0; j]*>/gi); newItem.date = elementParts[1]; @@ -1369,14 +1367,14 @@ function doWeb(doc, url) { uris.push(i); } - Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) }, + Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) }, function() { Scholar.done(); }, function() {}); Scholar.wait(); } }'); -REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006-06-26 16:01:00', 4, 'Aleph Scraper', 'Simon Kornblith', '^http://[^/]+/F(?:/[A-Z0-9\-]+(?:\?.*)?$|\?func=find)', +REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006-06-26 16:01:00', 4, 'Aleph', 'Simon Kornblith', '^http://[^/]+/F(?:/[A-Z0-9\-]+(?:\?.*)?$|\?func=find)', 'function detectWeb(doc, url) { var singleRe = new RegExp("^http://[^/]+/F/[A-Z0-9\-]+\?.*func=full-set-set.*\&format=[0-9]{3}"); @@ -1425,8 +1423,7 @@ REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006 } var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); - Scholar.Utilities.processDocuments(null, newUris, function(newBrowser) { - var newDoc = newBrowser.contentDocument; + Scholar.Utilities.processDocuments(newUris, function(newDoc) { var uri = newDoc.location.href; var namespace = newDoc.documentElement.namespaceURI; @@ -1440,7 +1437,7 @@ REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006 var record = new marc.MARC_Record(); for(var i=0; i 0) { return "multiple"; @@ -1729,7 +1724,7 @@ REPLACE INTO "translators" VALUES ('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006 }'); -REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 4, 'GEAC Scraper', 'Simon Kornblith', '/(?:GeacQUERY|(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))', +REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 4, 'GEAC', 'Simon Kornblith', '/(?:GeacQUERY|(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))', 'function detectWeb(doc, url) { if(doc.location.href.indexOf("/GeacQUERY") > 0) { return "multiple"; @@ -1764,8 +1759,7 @@ REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006 var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); - Scholar.Utilities.processDocuments(null, uris, function(newBrowser) { - var newDoc = newBrowser.contentDocument; + Scholar.Utilities.processDocuments(uris, function(newDoc) { var uri = newDoc.location.href; var namespace = newDoc.documentElement.namespaceURI; @@ -1817,7 +1811,7 @@ REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006 Scholar.wait(); }'); -REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-26 16:01:00', 4, 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', +REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-26 16:01:00', 4, 'SIRSI -2003', 'Simon Kornblith', '/uhtbin/cgisirsi', 'function detectWeb(doc, url) { var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { @@ -1870,7 +1864,7 @@ REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006 var links = Scholar.Utilities.gatherElementsOnXPath(doc, elmts[i], ''.//a'', nsResolver); // Collect title - var myTd = Scholar.Utilities.getNode(doc, elmts[i], "./td[2]", nsResolver); + var myTd = doc.evaluate("./td[2]", elmts[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); var m = titleRe.exec(myTd.innerHTML); var title = unescapeHTML(m[1]); @@ -1895,9 +1889,9 @@ REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006 var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p'', nsResolver); for(var i=0; i heading so we can parse as XML text = text.replace(/<\?xml[^?]+\?>/, ""); @@ -3043,9 +3034,9 @@ function doImport() { var xml = new XML(text); for each(var mods in xml.m::mods) { - Scholar.Utilities.debugPrint("item is: "); + Scholar.Utilities.debug("item is: "); for(var i in mods) { - Scholar.Utilities.debugPrint(i+" = "+mods[i].toString()); + Scholar.Utilities.debug(i+" = "+mods[i].toString()); } var newItem = new Scholar.Item(); @@ -3863,8 +3854,8 @@ function doImport() { } else if(type == n.bib+"Memo") { // check to see if this note is independent var arcs = Scholar.RDF.getArcsIn(node); - Scholar.Utilities.debugPrint("working on a note"); - Scholar.Utilities.debugPrint(arcs); + Scholar.Utilities.debug("working on a note"); + Scholar.Utilities.debug(arcs); var skip = false; for each(var arc in arcs) { arc = Scholar.RDF.getResourceURI(arc); @@ -4239,7 +4230,7 @@ function doImport() { var tag = data = false; do { // first valid line is type line = Scholar.read(); - Scholar.Utilities.debugPrint(line); + Scholar.Utilities.debug(line); } while(line !== false && line.substr(0, 6) != "TY - "); var item = new Scholar.Item(); @@ -4258,7 +4249,7 @@ function doImport() { tag = line.substr(0,2); data = line.substr(6); - Scholar.Utilities.debugPrint("tag: ''"+tag+"''; data: ''"+data+"''"); + Scholar.Utilities.debug("tag: ''"+tag+"''; data: ''"+data+"''"); if(tag == "ER") { // ER signals end of reference // unset info @@ -4692,7 +4683,7 @@ MARC_Record.prototype._associateDBField = function(item, fieldNo, part, fieldNam part = ''a''; } var field = this.get_field_subfields(fieldNo); - Scholar.Utilities.debugPrint(''Found ''+field.length+'' matches for ''+fieldNo+part); + Scholar.Utilities.debug(''Found ''+field.length+'' matches for ''+fieldNo+part); if(field) { for(var i in field) { var value = false;