removes unnecessary pieces of piggy bank API from utilities and updates translators to abide by current translator guidelines

This commit is contained in:
Simon Kornblith 2006-08-11 15:28:18 +00:00
parent 1e8aa81c02
commit 064ecd17db
2 changed files with 120 additions and 210 deletions

View File

@ -1,61 +1,21 @@
// Scholar for Firefox Utilities // Scholar for Firefox Utilities
// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
// This code is licensed according to the GPL
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
// //
// Scholar.Utilities // Scholar.Utilities
// //
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
// Scholar.Utilities class, a set of methods to assist in data
// extraction. Some of the code here was stolen directly from the Piggy Bank
// project.
Scholar.Utilities = function () {} Scholar.Utilities = function () {}
// Adapter for Piggy Bank function to print debug messages; log level is Scholar.Utilities.prototype.debug = function(msg) {
// fixed at 4 (could change this)
Scholar.Utilities.prototype.debugPrint = function(msg) {
Scholar.debug(msg, 4); Scholar.debug(msg, 4);
} }
// Appears to trim a string, chopping of newlines/spacing
Scholar.Utilities.prototype.trimString = function(s) {
var i = 0;
var spaceChars = " \n\r\t" + String.fromCharCode(160) /*   */;
while (i < s.length) {
var c = s.charAt(i);
if (spaceChars.indexOf(c) < 0) {
break;
}
i++;
}
s = s.substring(i);
i = s.length;
while (i > 0) {
var c = s.charAt(i - 1);
if (spaceChars.indexOf(c) < 0) {
break;
}
i--;
}
return s.substring(0, i);
}
/* /*
* BEGIN SCHOLAR FOR FIREFOX EXTENSIONS * Converts a JavaScript date object to an SQL-style date
* Functions below this point are extensions to the utilities provided by
* Piggy Bank. When used in external code, the repository will need to add
* a function definition when exporting in Piggy Bank format.
*/ */
Scholar.Utilities.prototype.dateToSQL = function(jsDate) {
/*
* Converts a JavaScript date object to an ISO-style date
*/
Scholar.Utilities.prototype.dateToISO = function(jsDate) {
var date = ""; var date = "";
var year = jsDate.getFullYear().toString(); var year = jsDate.getFullYear().toString();
var month = (jsDate.getMonth()+1).toString(); var month = (jsDate.getMonth()+1).toString();
@ -112,7 +72,8 @@ Scholar.Utilities.prototype.cleanAuthor = function(author, type, useComma) {
*/ */
Scholar.Utilities.prototype.cleanString = function(s) { Scholar.Utilities.prototype.cleanString = function(s) {
s = s.replace(/[ \xA0]+/g, " "); s = s.replace(/[ \xA0]+/g, " ");
return this.trimString(s); s = s.replace(/^\s+/, "");
return s.replace(/\s+$/, "");
} }
/* /*
@ -223,43 +184,6 @@ Scholar.Utilities.Ingester.prototype.gatherElementsOnXPath = function(doc, paren
return elmts; return elmts;
} }
// Appears to look for links in a document containing a certain substring (kind
// of like getItemArray, only with NO REGEXP FUNCTIONALITY)
Scholar.Utilities.Ingester.prototype.collectURLsWithSubstring = function(doc, substring) {
var urls = [];
var addedURLs = [];
var aElements = doc.evaluate("//a", doc, null, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);
var aElement = aElements.iterateNext();
while (aElement) {
var href = aElement.href;
if (href.indexOf(substring) >= 0 && !(addedURLs[href])) {
urls.unshift(href);
addedURLs[href] = true;
}
aElement = aElements.iterateNext();
}
return urls;
}
// For now, we're going to skip the getLLsFromAddresses function (which gets
// latitude and longitude pairs from a series of addresses, but requires the
// big mess of Java code that is the Piggy Bank server) and the geoHelper
// tools (which rely on getLLsFromAddresses) since these are probably not
// essential components for Scholar and would take a great deal of effort to
// implement. We can, however, always implement them later.
/*
* BEGIN SCHOLAR FOR FIREFOX EXTENSIONS
*/
/*
* Gets a given node (assumes only one value)
*/
Scholar.Utilities.Ingester.prototype.getNode = function(doc, contextNode, xpath, nsResolver) {
return doc.evaluate(xpath, contextNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null).iterateNext();
}
/* /*
* Gets a given node as a string containing all child nodes * Gets a given node as a string containing all child nodes
*/ */
@ -325,10 +249,6 @@ Scholar.Utilities.Ingester.prototype.parseContextObject = function(co, item) {
return Scholar.OpenURL.parseContextObject(co, item); return Scholar.OpenURL.parseContextObject(co, item);
} }
/*
* END SCHOLAR FOR FIREFOX EXTENSIONS
*/
// Ingester adapters for Scholar.Utilities.HTTP to handle proxies // Ingester adapters for Scholar.Utilities.HTTP to handle proxies
Scholar.Utilities.Ingester.prototype.loadDocument = function(url, succeeded, failed) { Scholar.Utilities.Ingester.prototype.loadDocument = function(url, succeeded, failed) {
@ -337,11 +257,13 @@ Scholar.Utilities.Ingester.prototype.loadDocument = function(url, succeeded, fai
} }
Scholar.Utilities.HTTP.processDocuments(null, [ url ], succeeded, function() {}, failed); Scholar.Utilities.HTTP.processDocuments(null, [ url ], succeeded, function() {}, failed);
} }
Scholar.Utilities.Ingester.prototype.processDocuments = function(firstDoc, urls, processor, done, exception) { Scholar.Utilities.Ingester.prototype.processDocuments = function(urls, processor, done, exception) {
if(this.proxiedURL) {
for(i in urls) { for(i in urls) {
urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]); urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]);
} }
Scholar.Utilities.HTTP.processDocuments(firstDoc, urls, processor, done, exception); }
Scholar.Utilities.HTTP.processDocuments(null, urls, processor, done, exception);
} }
Scholar.Utilities.Ingester.HTTPUtilities = function(proxiedURL) { Scholar.Utilities.Ingester.HTTPUtilities = function(proxiedURL) {
@ -615,10 +537,7 @@ Scholar.Utilities.HTTP.processDocuments = function(firstDoc, urls, processor, do
if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times
prevUrl = hiddenBrowser.contentDocument.location.href; prevUrl = hiddenBrowser.contentDocument.location.href;
try { try {
var newHiddenBrowser = new Object(); processor(hiddenBrowser.contentDocument);
newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
processor(newHiddenBrowser);
} catch (e) { } catch (e) {
Scholar.debug("Scholar.Utilities.Ingester.processDocuments onLoad: " + e, 2); Scholar.debug("Scholar.Utilities.Ingester.processDocuments onLoad: " + e, 2);
exception(e); exception(e);

View File

@ -1,9 +1,9 @@
-- 41 -- 42
-- Set the following timestamp to the most recent scraper update date -- Set the following timestamp to the most recent scraper update date
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-08 17:12:00')); REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-11 11:18:00'));
REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-28 23:08:00', 4, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/|s/)', REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-08-11 11:18:00', 4, 'Amazon.com', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/|s/)',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
var searchRe = new RegExp(''^http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/|s/)''); var searchRe = new RegExp(''^http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/|s/)'');
if(searchRe.test(doc.location.href)) { if(searchRe.test(doc.location.href)) {
@ -28,7 +28,7 @@ REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006
var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
for (var i = 0; i < elmts.length; i++) { for (var i = 0; i < elmts.length; i++) {
var elmt = elmts[i]; var elmt = elmts[i];
var author = Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue; var author = doc.evaluate(''./text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
newItem.creators.push(Scholar.Utilities.cleanAuthor(author, "author")); newItem.creators.push(Scholar.Utilities.cleanAuthor(author, "author"));
} }
@ -40,15 +40,15 @@ REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006
for (var i = 0; i < elmts.length; i++) { for (var i = 0; i < elmts.length; i++) {
try { try {
var elmt = elmts[i]; var elmt = elmts[i];
var attribute = Scholar.Utilities.cleanString(Scholar.Utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue); var attribute = Scholar.Utilities.cleanString(doc.evaluate(''./B[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue);
if(Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) { if(doc.evaluate(''./text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
var value = Scholar.Utilities.cleanString(Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue); var value = Scholar.Utilities.cleanString(doc.evaluate(''./text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue);
if(attribute == "Publisher:") { if(attribute == "Publisher:") {
if(value.lastIndexOf("(") != -1) { if(value.lastIndexOf("(") != -1) {
var date = value.substring(value.lastIndexOf("(")+1, value.length-1); var date = value.substring(value.lastIndexOf("(")+1, value.length-1);
jsDate = new Date(date); jsDate = new Date(date);
if(!isNaN(jsDate.valueOf())) { if(!isNaN(jsDate.valueOf())) {
date = Scholar.Utilities.dateToISO(jsDate); date = Scholar.Utilities.dateToSQL(jsDate);
} }
newItem.date = date; newItem.date = date;
@ -74,7 +74,7 @@ REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006
var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]''; var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]'';
var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
var title = Scholar.Utilities.cleanString(Scholar.Utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue); var title = Scholar.Utilities.cleanString(doc.evaluate(''./text()[1]'', elmts[0], nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue);
if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) { if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) {
title = title.substring(0, title.lastIndexOf("(")-1); title = title.substring(0, title.lastIndexOf("(")-1);
} }
@ -113,7 +113,7 @@ function doWeb(doc, url) {
uris.push(i); uris.push(i);
} }
Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) }, Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
function() { Scholar.done(); }, function() {}); function() { Scholar.done(); }, function() {});
Scholar.wait(); Scholar.wait();
@ -122,7 +122,7 @@ function doWeb(doc, url) {
} }
}'); }');
REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-26 16:01:00', 4, 'WorldCat Scraper', 'Simon Kornblith', '^http://(?:new)?firstsearch\.oclc\.org/WebZ/', REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-26 16:01:00', 4, 'WorldCat', 'Simon Kornblith', '^http://(?:new)?firstsearch\.oclc\.org/WebZ/',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
if(doc.title == ''FirstSearch: WorldCat Detailed Record'') { if(doc.title == ''FirstSearch: WorldCat Detailed Record'') {
return "book"; return "book";
@ -195,7 +195,7 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006
} }
Scholar.Utilities.HTTPUtilities.doPost(newUri, ''exportselect=''+exportselect+''&exporttype=plaintext'', null, function(text) { Scholar.Utilities.HTTPUtilities.doPost(newUri, ''exportselect=''+exportselect+''&exporttype=plaintext'', null, function(text) {
Scholar.Utilities.debugPrint(text); Scholar.Utilities.debug(text);
var lineRegexp = new RegExp(); var lineRegexp = new RegExp();
lineRegexp.compile("^([\\w() ]+): *(.*)$"); lineRegexp.compile("^([\\w() ]+): *(.*)$");
@ -240,17 +240,17 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006
newItem.creators.push(Scholar.Utilities.cleanAuthor(authors[j], "author", true)); newItem.creators.push(Scholar.Utilities.cleanAuthor(authors[j], "author", true));
} }
} else { } else {
newItem.creators.push(Scholar.Utilities.trimString(match[2])); newItem.creators.push(Scholar.Utilities.cleanString(match[2]));
} }
} else if(match[1] == ''Publication'') { } else if(match[1] == ''Publication'') {
// Don''t even try to deal with this. The WorldCat metadata is of poor enough quality that this isn''t worth it. // Don''t even try to deal with this. The WorldCat metadata is of poor enough quality that this isn''t worth it.
match[2] = Scholar.Utilities.trimString(match[2]); match[2] = Scholar.Utilities.cleanString(match[2]);
if(match[2].substring(match[2].length-1) == '','') { if(match[2].substring(match[2].length-1) == '','') {
match[2] = match[2].substring(0, match[2].length-1); match[2] = match[2].substring(0, match[2].length-1);
} }
newItem.publisher = match[2]; newItem.publisher = match[2];
/*} else if(match[1] == ''Language'') { /*} else if(match[1] == ''Language'') {
.addStatement(uri, prefixDC + ''language'', Scholar.Utilities.trimString(match[2]));*/ .addStatement(uri, prefixDC + ''language'', Scholar.Utilities.cleanString(match[2]));*/
} else if(match[1] == ''Standard No'') { } else if(match[1] == ''Standard No'') {
var identifiers = match[2].split(/ +/); var identifiers = match[2].split(/ +/);
var j=0; var j=0;
@ -287,7 +287,7 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006
Scholar.wait(); Scholar.wait();
}'); }');
REPLACE INTO "translators" VALUES ('88915634-1af6-c134-0171-56fd198235ed', '2006-06-26 21:40:00', 4, 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi', REPLACE INTO "translators" VALUES ('88915634-1af6-c134-0171-56fd198235ed', '2006-06-26 21:40:00', 4, 'LOC/Voyager WebVoyage', 'Simon Kornblith', 'Pwebrecon\.cgi',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options; var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options;
for(var i in export_options) { for(var i in export_options) {
@ -335,7 +335,7 @@ REPLACE INTO "translators" VALUES ('88915634-1af6-c134-0171-56fd198235ed', '2006
// Go through table rows // Go through table rows
for(var i=0; i<tableRows.length; i++) { for(var i=0; i<tableRows.length; i++) {
// CHK is what we need to get it all as one file // CHK is what we need to get it all as one file
var input = Scholar.Utilities.getNode(doc, tableRows[i], ''./td/input[@name="CHK"]'', nsResolver); var input = doc.evaluate(''./td/input[@name="CHK"]'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
checkboxes[i] = input.value; checkboxes[i] = input.value;
var links = Scholar.Utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver); var links = Scholar.Utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver);
urls[i] = links[0].href; urls[i] = links[0].href;
@ -414,7 +414,7 @@ REPLACE INTO "translators" VALUES ('88915634-1af6-c134-0171-56fd198235ed', '2006
Scholar.wait(); Scholar.wait();
}'); }');
REPLACE INTO "translators" VALUES ('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-26 16:01:00', 4, 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse|search/)', REPLACE INTO "translators" VALUES ('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-26 16:01:00', 4, 'JSTOR', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse|search/)',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
var namespace = doc.documentElement.namespaceURI; var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) { var nsResolver = namespace ? function(prefix) {
@ -484,7 +484,7 @@ function doWeb(doc, url) {
// Go through links // Go through links
for(var j=0; j<links.length; j++) { for(var j=0; j<links.length; j++) {
if(tagRegexp.test(links[j].href)) { if(tagRegexp.test(links[j].href)) {
var text = Scholar.Utilities.getNode(doc, tableRows[i], ''.//strong/text()'', null); var text = doc.evaluate(''.//strong/text()'', tableRows[i], null, XPathResult.ANY_TYPE, null).iterateNext();
if(text && text.nodeValue) { if(text && text.nodeValue) {
text = Scholar.Utilities.cleanString(text.nodeValue); text = Scholar.Utilities.cleanString(text.nodeValue);
if(availableItems[links[j].href]) { if(availableItems[links[j].href]) {
@ -562,7 +562,7 @@ function doWeb(doc, url) {
if(isNaN(date.valueOf())) { if(isNaN(date.valueOf())) {
newItem.date = fieldContent; newItem.date = fieldContent;
} else { } else {
newItem.date = Scholar.Utilities.dateToISO(date); newItem.date = Scholar.Utilities.dateToSQL(date);
} }
} else if(fieldCode == "PP") { } else if(fieldCode == "PP") {
newItem.pages = fieldContent; newItem.pages = fieldContent;
@ -589,7 +589,7 @@ function doWeb(doc, url) {
Scholar.wait(); Scholar.wait();
}'); }');
REPLACE INTO "translators" VALUES ('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-26 16:01:00', 4, 'History Cooperative Scraper', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.html$|cgi-bin/search.cgi)', REPLACE INTO "translators" VALUES ('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-26 16:01:00', 4, 'History Cooperative', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.html$|cgi-bin/search.cgi)',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
if(doc.title == "History Cooperative: Search Results") { if(doc.title == "History Cooperative: Search Results") {
return "multiple"; return "multiple";
@ -647,7 +647,7 @@ function doWeb(doc, url) {
uris.push(i); uris.push(i);
} }
Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) }, Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
function() { Scholar.done(); }, function() {}); function() { Scholar.done(); }, function() {});
Scholar.wait(); Scholar.wait();
@ -656,7 +656,7 @@ function doWeb(doc, url) {
} }
}'); }');
REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-08-06 21:45:00', 4, 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)', REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-08-06 21:45:00', 4, 'InnoPAC', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
// First, check to see if the URL alone reveals InnoPAC, since some sites don''t reveal the MARC button // First, check to see if the URL alone reveals InnoPAC, since some sites don''t reveal the MARC button
var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$''); var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$'');
@ -709,9 +709,7 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006
var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973");
if(newUri) { // single page if(newUri) { // single page
Scholar.Utilities.loadDocument(newUri, function(newBrowser) { Scholar.Utilities.loadDocument(newUri, function(newDoc) {
newDoc = newBrowser.contentDocument;
var namespace = newDoc.documentElement.namespaceURI; var namespace = newDoc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) { var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null; if (prefix == ''x'') return namespace; else return null;
@ -720,7 +718,7 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006
var xpath = ''//pre''; var xpath = ''//pre'';
var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver);
var text = Scholar.Utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue; var text = doc.evaluate(''./text()[1]'', elmts[0], nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
var newItem = new Scholar.Item(); var newItem = new Scholar.Item();
newItem.source = uri; newItem.source = uri;
@ -781,7 +779,7 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006
// Go through table rows // Go through table rows
for(var i=0; i<tableRows.length; i++) { for(var i=0; i<tableRows.length; i++) {
// CHK is what we need to get it all as one file // CHK is what we need to get it all as one file
var input = Scholar.Utilities.getNode(doc, tableRows[i], ''./td/input[@type="checkbox"]'', nsResolver); var input = doc.evaluate(''./td/input[@type="checkbox"]'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
checkboxes[i] = input.name+"="+escape(input.value); checkboxes[i] = input.name+"="+escape(input.value);
var links = Scholar.Utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver); var links = Scholar.Utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver);
urls[i] = links[0].href; urls[i] = links[0].href;
@ -836,7 +834,7 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006
Scholar.wait(); Scholar.wait();
}'); }');
REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006-06-26 16:01:00', 4, 'SIRSI 2003+ Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006-06-26 16:01:00', 4, 'SIRSI 2003+', 'Simon Kornblith', '/uhtbin/cgisirsi',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
var namespace = doc.documentElement.namespaceURI; var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) { var nsResolver = namespace ? function(prefix) {
@ -872,12 +870,12 @@ REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006
for (var i = 0; i < elmts.length; i++) { for (var i = 0; i < elmts.length; i++) {
var elmt = elmts[i]; var elmt = elmts[i];
try { try {
var node = Scholar.Utilities.getNode(doc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver); var node = doc.evaluate(''./TD[1]/A[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(!node) { if(!node) {
var node = Scholar.Utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver); var node = doc.evaluate(''./TD[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
} }
if(node) { if(node) {
var field = Scholar.Utilities.superCleanString(Scholar.Utilities.getNode(doc, elmt, ''./TH[1]/text()[1]'', nsResolver).nodeValue); var field = Scholar.Utilities.superCleanString(doc.evaluate(''./TH[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue);
field = field.toLowerCase(); field = field.toLowerCase();
var value = Scholar.Utilities.superCleanString(node.nodeValue); var value = Scholar.Utilities.superCleanString(node.nodeValue);
if(field == "publisher") { if(field == "publisher") {
@ -907,7 +905,7 @@ REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006
} catch (e) {} } catch (e) {}
} }
var callNumber = Scholar.Utilities.getNode(doc, doc, ''//tr/td[1][@class="holdingslist"]/text()'', nsResolver); var callNumber = doc.evaluate(''//tr/td[1][@class="holdingslist"]/text()'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(callNumber && callNumber.nodeValue) { if(callNumber && callNumber.nodeValue) {
newItem.callNumber = callNumber.nodeValue; newItem.callNumber = callNumber.nodeValue;
} }
@ -930,7 +928,7 @@ function doWeb(doc, url) {
var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//td[@class="searchsum"]/table[//input[@value="Details"]]'', nsResolver); var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//td[@class="searchsum"]/table[//input[@value="Details"]]'', nsResolver);
// Go through table rows // Go through table rows
for(var i=1; i<tableRows.length; i++) { for(var i=1; i<tableRows.length; i++) {
var input = Scholar.Utilities.getNode(doc, tableRows[i], ''.//input[@value="Details"]'', nsResolver); var input = doc.evaluate(''.//input[@value="Details"]'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
checkboxes[i] = input.name; checkboxes[i] = input.name;
var text = Scholar.Utilities.getNodeString(doc, tableRows[i], ''.//label/strong//text()'', nsResolver); var text = Scholar.Utilities.getNodeString(doc, tableRows[i], ''.//label/strong//text()'', nsResolver);
if(text) { if(text) {
@ -948,14 +946,14 @@ function doWeb(doc, url) {
var m = hostRe.exec(doc.location.href); var m = hostRe.exec(doc.location.href);
var hitlist = doc.forms.namedItem("hitlist"); var hitlist = doc.forms.namedItem("hitlist");
var baseUrl = m[0]+hitlist.getAttribute("action")+"?first_hit="+hitlist.elements.namedItem("first_hit").value+"&last_hit="+hitlist.elements.namedItem("last_hit").value; var baseUrl = m[0]+hitlist.getAttribute("action")+"?first_hit="+hitlist.elements.namedItem("first_hit").value+"&last_hit="+hitlist.elements.namedItem("last_hit").value;
Scholar.Utilities.debugPrint(baseUrl); Scholar.Utilities.debug(baseUrl);
var uris = new Array(); var uris = new Array();
for(var i in items) { for(var i in items) {
uris.push(baseUrl+"&"+checkboxes[i]+"=Details"); uris.push(baseUrl+"&"+checkboxes[i]+"=Details");
} }
Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) }, Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
function() { Scholar.done() }, function() {}); function() { Scholar.done() }, function() {});
Scholar.wait(); Scholar.wait();
@ -963,7 +961,7 @@ function doWeb(doc, url) {
} }
'); ');
REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-26 16:01:00', 4, 'ProQuest Scraper', 'Simon Kornblith', '^http://proquest\.umi\.com/pqdweb\?((?:.*\&)?did=.*&Fmt=[0-9]|(?:.*\&)Fmt=[0-9].*&did=|(?:.*\&)searchInterface=)', REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-26 16:01:00', 4, 'ProQuest', 'Simon Kornblith', '^http://proquest\.umi\.com/pqdweb\?((?:.*\&)?did=.*&Fmt=[0-9]|(?:.*\&)Fmt=[0-9].*&did=|(?:.*\&)searchInterface=)',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
if(doc.title == "Results") { if(doc.title == "Results") {
return "magazineArticle"; return "magazineArticle";
@ -1010,29 +1008,29 @@ REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006
var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
for (var i = 0; i < elmts.length; i++) { for (var i = 0; i < elmts.length; i++) {
var elmt = elmts[i]; var elmt = elmts[i];
var field = Scholar.Utilities.superCleanString(Scholar.Utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue).toLowerCase(); var field = Scholar.Utilities.superCleanString(doc.evaluate(''./TD[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue).toLowerCase();
if(field == "publication title") { if(field == "publication title") {
var publication = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/A[1]/text()[1]'', nsResolver); var publication = doc.evaluate(''./TD[2]/A[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(publication.nodeValue) { if(publication.nodeValue) {
newItem.publicationTitle = Scholar.Utilities.superCleanString(publication.nodeValue); newItem.publicationTitle = Scholar.Utilities.superCleanString(publication.nodeValue);
} }
var place = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); var place = doc.evaluate(''./TD[2]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(place.nodeValue) { if(place.nodeValue) {
newItem.place = Scholar.Utilities.superCleanString(place.nodeValue); newItem.place = Scholar.Utilities.superCleanString(place.nodeValue);
} }
var date = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/A[2]/text()[1]'', nsResolver); var date = doc.evaluate(''./TD[2]/A[2]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(date.nodeValue) { if(date.nodeValue) {
date = date.nodeValue; date = date.nodeValue;
var jsDate = new Date(Scholar.Utilities.superCleanString(date)); var jsDate = new Date(Scholar.Utilities.superCleanString(date));
if(!isNaN(jsDate.valueOf())) { if(!isNaN(jsDate.valueOf())) {
date = Scholar.Utilities.dateToISO(jsDate); date = Scholar.Utilities.dateToSQL(jsDate);
} }
newItem.date = date; newItem.date = date;
} }
var moreInfo = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[2]'', nsResolver); var moreInfo = doc.evaluate(''./TD[2]/text()[2]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(moreInfo.nodeValue) { if(moreInfo.nodeValue) {
moreInfo = Scholar.Utilities.superCleanString(moreInfo.nodeValue); moreInfo = Scholar.Utilities.superCleanString(moreInfo.nodeValue);
var parts = moreInfo.split(";\xA0"); var parts = moreInfo.split(";\xA0");
@ -1060,10 +1058,10 @@ REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006
} }
} }
} else if(field == "source type") { } else if(field == "source type") {
var value = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); var value = doc.evaluate(''./TD[2]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(value.nodeValue) { if(value.nodeValue) {
value = Scholar.Utilities.superCleanString(value.nodeValue).toLowerCase(); value = Scholar.Utilities.superCleanString(value.nodeValue).toLowerCase();
Scholar.Utilities.debugPrint(value); Scholar.Utilities.debug(value);
if(value.indexOf("periodical") >= 0) { if(value.indexOf("periodical") >= 0) {
newItem.itemType = "magazineArticle"; newItem.itemType = "magazineArticle";
@ -1074,7 +1072,7 @@ REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006
} }
} }
} else if(field == "isbn" || field == "issn" || field == "issn/isbn") { } else if(field == "isbn" || field == "issn" || field == "issn/isbn") {
var value = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); var value = doc.evaluate(''./TD[2]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(value) { if(value) {
var type; var type;
value = Scholar.Utilities.superCleanString(value.nodeValue); value = Scholar.Utilities.superCleanString(value.nodeValue);
@ -1110,7 +1108,7 @@ function doWeb(doc, url) {
// Go through links // Go through links
for(var j=0; j<links.length; j++) { for(var j=0; j<links.length; j++) {
if(tagRegexp.test(links[j].href)) { if(tagRegexp.test(links[j].href)) {
var text = Scholar.Utilities.getNode(doc, tableRows[i], ''./a[@class="bold"]/text()'', null); var text = doc.evaluate(''./a[@class="bold"]/text()'', tableRows[i], null, XPathResult.ANY_TYPE, null).iterateNext();
if(text && text.nodeValue) { if(text && text.nodeValue) {
text = Scholar.Utilities.cleanString(text.nodeValue); text = Scholar.Utilities.cleanString(text.nodeValue);
items[links[j].href] = text; items[links[j].href] = text;
@ -1130,7 +1128,7 @@ function doWeb(doc, url) {
uris.push(i); uris.push(i);
} }
Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) }, Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
function() { Scholar.done(); }, function() {}); function() { Scholar.done(); }, function() {});
Scholar.wait(); Scholar.wait();
@ -1140,13 +1138,13 @@ function doWeb(doc, url) {
if(m && (m[1] == "1" || m[1] == "2")) { if(m && (m[1] == "1" || m[1] == "2")) {
scrape(doc); scrape(doc);
} else if(m) { } else if(m) {
Scholar.Utilities.loadDocument(doc.location.href.replace("Fmt="+m[1], "Fmt=1"), function(browser) { scrape(browser.contentDocument); Scholar.done(); }, function() {}); Scholar.Utilities.loadDocument(doc.location.href.replace("Fmt="+m[1], "Fmt=1"), function(doc) { scrape(doc); Scholar.done(); }, function() {});
Scholar.wait(); Scholar.wait();
} }
} }
}'); }');
REPLACE INTO "translators" VALUES ('6773a9af-5375-3224-d148-d32793884dec', '2006-06-26 16:01:00', 4, 'InfoTrac Scraper', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/', REPLACE INTO "translators" VALUES ('6773a9af-5375-3224-d148-d32793884dec', '2006-06-26 16:01:00', 4, 'InfoTrac', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
if(doc.title.substring(0, 8) == "Article ") { if(doc.title.substring(0, 8) == "Article ") {
return "magazineArticle"; return "magazineArticle";
@ -1249,9 +1247,9 @@ function doWeb(doc, url) {
var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''/html/body//table/tbody/tr/td[a/b]'', nsResolver); var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''/html/body//table/tbody/tr/td[a/b]'', nsResolver);
// Go through table rows // Go through table rows
for(var i=0; i<tableRows.length; i++) { for(var i=0; i<tableRows.length; i++) {
var link = Scholar.Utilities.getNode(doc, tableRows[i], ''./a'', nsResolver); var link = doc.evaluate(''./a'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
uris[i] = link.href; uris[i] = link.href;
var article = Scholar.Utilities.getNode(doc, link, ''./b/text()'', nsResolver); var article = doc.evaluate(''./b/text()'', link, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
items[i] = article.nodeValue; items[i] = article.nodeValue;
// Chop off final period // Chop off final period
if(items[i].substr(items[i].length-1) == ".") { if(items[i].substr(items[i].length-1) == ".") {
@ -1272,7 +1270,7 @@ function doWeb(doc, url) {
} }
}'); }');
REPLACE INTO "translators" VALUES ('b047a13c-fe5c-6604-c997-bef15e502b09', '2006-06-26 16:01:00', 4, 'LexisNexis Scraper', 'Simon Kornblith', '^http://web\.lexis-nexis\.com/universe/(?:document|doclist)', REPLACE INTO "translators" VALUES ('b047a13c-fe5c-6604-c997-bef15e502b09', '2006-06-26 16:01:00', 4, 'LexisNexis', 'Simon Kornblith', '^http://web\.lexis-nexis\.com/universe/(?:document|doclist)',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
var detailRe = new RegExp("^http://[^/]+/universe/document"); var detailRe = new RegExp("^http://[^/]+/universe/document");
if(detailRe.test(doc.location.href)) { if(detailRe.test(doc.location.href)) {
@ -1302,7 +1300,7 @@ REPLACE INTO "translators" VALUES ('b047a13c-fe5c-6604-c997-bef15e502b09', '2006
var m = dateRegexp.exec(centerElements[centerElements.length-1].innerHTML); var m = dateRegexp.exec(centerElements[centerElements.length-1].innerHTML);
if(m) { if(m) {
var jsDate = new Date(m[1]+" "+m[2]); var jsDate = new Date(m[1]+" "+m[2]);
newItem.date = Scholar.Utilities.dateToISO(jsDate); newItem.date = Scholar.Utilities.dateToSQL(jsDate);
} else { } else {
var elementParts = centerElements[centerElements.length-1].innerHTML.split(/<br[^>]*>/gi); var elementParts = centerElements[centerElements.length-1].innerHTML.split(/<br[^>]*>/gi);
newItem.date = elementParts[1]; newItem.date = elementParts[1];
@ -1369,14 +1367,14 @@ function doWeb(doc, url) {
uris.push(i); uris.push(i);
} }
Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) }, Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
function() { Scholar.done(); }, function() {}); function() { Scholar.done(); }, function() {});
Scholar.wait(); Scholar.wait();
} }
}'); }');
REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006-06-26 16:01:00', 4, 'Aleph Scraper', 'Simon Kornblith', '^http://[^/]+/F(?:/[A-Z0-9\-]+(?:\?.*)?$|\?func=find)', REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006-06-26 16:01:00', 4, 'Aleph', 'Simon Kornblith', '^http://[^/]+/F(?:/[A-Z0-9\-]+(?:\?.*)?$|\?func=find)',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
var singleRe = new RegExp("^http://[^/]+/F/[A-Z0-9\-]+\?.*func=full-set-set.*\&format=[0-9]{3}"); var singleRe = new RegExp("^http://[^/]+/F/[A-Z0-9\-]+\?.*func=full-set-set.*\&format=[0-9]{3}");
@ -1425,8 +1423,7 @@ REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006
} }
var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973");
Scholar.Utilities.processDocuments(null, newUris, function(newBrowser) { Scholar.Utilities.processDocuments(newUris, function(newDoc) {
var newDoc = newBrowser.contentDocument;
var uri = newDoc.location.href; var uri = newDoc.location.href;
var namespace = newDoc.documentElement.namespaceURI; var namespace = newDoc.documentElement.namespaceURI;
@ -1440,7 +1437,7 @@ REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006
var record = new marc.MARC_Record(); var record = new marc.MARC_Record();
for(var i=0; i<elmts.length; i++) { for(var i=0; i<elmts.length; i++) {
var elmt = elmts[i]; var elmt = elmts[i];
var field = Scholar.Utilities.superCleanString(Scholar.Utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue); var field = Scholar.Utilities.superCleanString(doc.evaluate(''./TD[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue);
var value = Scholar.Utilities.getNodeString(doc, elmt, ''./TD[2]//text()'', nsResolver); var value = Scholar.Utilities.getNodeString(doc, elmt, ''./TD[2]//text()'', nsResolver);
var value = value.replace(/\|([a-z]) /g, record.subfield_delimiter+"$1"); var value = value.replace(/\|([a-z]) /g, record.subfield_delimiter+"$1");
@ -1467,7 +1464,7 @@ REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006
Scholar.wait(); Scholar.wait();
}'); }');
REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006-06-26 16:01:00', 4, 'Dynix Scraper', 'Simon Kornblith', 'ipac\.jsp\?.*(?:uri=full=[0-9]|menu=search)', REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006-06-26 16:01:00', 4, 'Dynix', 'Simon Kornblith', 'ipac\.jsp\?.*(?:uri=full=[0-9]|menu=search)',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
var detailsRe = new RegExp(''ipac\.jsp\?.*uri=full=[0-9]''); var detailsRe = new RegExp(''ipac\.jsp\?.*uri=full=[0-9]'');
if(detailsRe.test(doc.location.href)) { if(detailsRe.test(doc.location.href)) {
@ -1511,8 +1508,7 @@ REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006
var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973");
Scholar.Utilities.processDocuments(null, uris, function(newBrowser) { Scholar.Utilities.processDocuments(uris, function(newDoc) {
var newDoc = newBrowser.contentDocument;
var uri = newDoc.location.href; var uri = newDoc.location.href;
var namespace = newDoc.documentElement.namespaceURI; var namespace = newDoc.documentElement.namespaceURI;
@ -1526,7 +1522,7 @@ REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006
var record = new marc.MARC_Record(); var record = new marc.MARC_Record();
for(var i=0; i<elmts.length; i++) { for(var i=0; i<elmts.length; i++) {
var elmt = elmts[i]; var elmt = elmts[i];
var field = Scholar.Utilities.superCleanString(Scholar.Utilities.getNode(newDoc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver).nodeValue); var field = Scholar.Utilities.superCleanString(newDoc.evaluate(''./TD[1]/A[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue);
var value = Scholar.Utilities.getNodeString(newDoc, elmt, ''./TD[2]/TABLE[1]/TBODY[1]/TR[1]/TD[1]/A[1]//text()'', nsResolver); var value = Scholar.Utilities.getNodeString(newDoc, elmt, ''./TD[2]/TABLE[1]/TBODY[1]/TR[1]/TD[1]/A[1]//text()'', nsResolver);
value = value.replace(/\$([a-z]) /g, record.subfield_delimiter+"$1"); value = value.replace(/\$([a-z]) /g, record.subfield_delimiter+"$1");
@ -1555,13 +1551,13 @@ REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006
Scholar.wait(); Scholar.wait();
}'); }');
REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006-06-26 16:01:00', 4, 'VTLS Scraper', 'Simon Kornblith', '/chameleon(?:\?|$)', REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006-06-26 16:01:00', 4, 'VTLS', 'Simon Kornblith', '/chameleon(?:\?|$)',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
var node = Scholar.Utilities.getNode(doc, doc, ''//tr[@class="intrRow"]/td/table/tbody/tr[th]'', null); var node = doc.evaluate(''//tr[@class="intrRow"]/td/table/tbody/tr[th]'', doc, null, XPathResult.ANY_TYPE, null).iterateNext();
if(node) { if(node) {
return "multiple"; return "multiple";
} }
var node = Scholar.Utilities.getNode(doc, doc, ''//a[text()="marc"]'', null); var node = doc.evaluate(''//a[text()="marc"]'', doc, null, XPathResult.ANY_TYPE, null).iterateNext();
if(node) { if(node) {
return "book"; return "book";
} }
@ -1602,7 +1598,7 @@ REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006
// Collect title information // Collect title information
var fields = Scholar.Utilities.gatherElementsOnXPath(doc, tableRows[i], ''./td/table/tbody/tr[th]'', nsResolver); var fields = Scholar.Utilities.gatherElementsOnXPath(doc, tableRows[i], ''./td/table/tbody/tr[th]'', nsResolver);
for(var j=0; j<fields.length; j++) { for(var j=0; j<fields.length; j++) {
var field = Scholar.Utilities.getNode(doc, fields[j], ''./th/text()'', nsResolver); var field = doc.evaluate(''./th/text()'', fields[j], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(field.nodeValue == "Title") { if(field.nodeValue == "Title") {
var value = Scholar.Utilities.getNodeString(doc, fields[j], ''./td//text()'', nsResolver); var value = Scholar.Utilities.getNodeString(doc, fields[j], ''./td//text()'', nsResolver);
if(value) { if(value) {
@ -1620,15 +1616,14 @@ REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006
} }
for(var i in items) { for(var i in items) {
Scholar.Utilities.debugPrint(i.replace(/function=[A-Z]{7}/, "function=MARCSCR")); Scholar.Utilities.debug(i.replace(/function=[A-Z]{7}/, "function=MARCSCR"));
newUris.push(i.replace(/function=[A-Z]{7}/, "function=MARCSCR")); newUris.push(i.replace(/function=[A-Z]{7}/, "function=MARCSCR"));
} }
} }
var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973");
Scholar.Utilities.processDocuments(null, newUris, function(newBrowser) { Scholar.Utilities.processDocuments(newUris, function(newDoc) {
var newDoc = newBrowser.contentDocument;
var uri = newDoc.location.href var uri = newDoc.location.href
var namespace = newDoc.documentElement.namespaceURI; var namespace = newDoc.documentElement.namespaceURI;
@ -1641,10 +1636,10 @@ REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006
var record = new marc.MARC_Record(); var record = new marc.MARC_Record();
for(var i=0; i<elmts.length; i++) { for(var i=0; i<elmts.length; i++) {
var elmt = elmts[i]; var elmt = elmts[i];
var field = Scholar.Utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue; var field = doc.evaluate(''./TD[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
var ind1 = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver).nodeValue; var ind1 = doc.evaluate(''./TD[2]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
var ind2 = Scholar.Utilities.getNode(doc, elmt, ''./TD[3]/text()[1]'', nsResolver).nodeValue; var ind2 = doc.evaluate(''./TD[3]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
var value = Scholar.Utilities.getNode(doc, elmt, ''./TD[4]/text()[1]'', nsResolver).nodeValue; var value = doc.evaluate(''./TD[4]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
value = value.replace(/\\([a-z]) /g, record.subfield_delimiter+"$1"); value = value.replace(/\\([a-z]) /g, record.subfield_delimiter+"$1");
record.add_field(field, ind1, ind2, value); record.add_field(field, ind1, ind2, value);
@ -1659,7 +1654,7 @@ REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006
Scholar.wait(); Scholar.wait();
}'); }');
REPLACE INTO "translators" VALUES ('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006-06-26 16:01:00', 4, 'DRA Scraper', 'Simon Kornblith', '/web2/tramp2\.exe/(?:see\_record/|authority\_hits/|goto/.*\?.*screen=Record\.html)', REPLACE INTO "translators" VALUES ('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006-06-26 16:01:00', 4, 'DRA', 'Simon Kornblith', '/web2/tramp2\.exe/(?:see\_record/|authority\_hits/|goto/.*\?.*screen=Record\.html)',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
if(doc.location.href.indexOf("/authority_hits") > 0) { if(doc.location.href.indexOf("/authority_hits") > 0) {
return "multiple"; return "multiple";
@ -1729,7 +1724,7 @@ REPLACE INTO "translators" VALUES ('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006
}'); }');
REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 4, 'GEAC Scraper', 'Simon Kornblith', '/(?:GeacQUERY|(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))', REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 4, 'GEAC', 'Simon Kornblith', '/(?:GeacQUERY|(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
if(doc.location.href.indexOf("/GeacQUERY") > 0) { if(doc.location.href.indexOf("/GeacQUERY") > 0) {
return "multiple"; return "multiple";
@ -1764,8 +1759,7 @@ REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006
var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973");
Scholar.Utilities.processDocuments(null, uris, function(newBrowser) { Scholar.Utilities.processDocuments(uris, function(newDoc) {
var newDoc = newBrowser.contentDocument;
var uri = newDoc.location.href; var uri = newDoc.location.href;
var namespace = newDoc.documentElement.namespaceURI; var namespace = newDoc.documentElement.namespaceURI;
@ -1817,7 +1811,7 @@ REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006
Scholar.wait(); Scholar.wait();
}'); }');
REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-26 16:01:00', 4, 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-26 16:01:00', 4, 'SIRSI -2003', 'Simon Kornblith', '/uhtbin/cgisirsi',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
var namespace = doc.documentElement.namespaceURI; var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) { var nsResolver = namespace ? function(prefix) {
@ -1870,7 +1864,7 @@ REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006
var links = Scholar.Utilities.gatherElementsOnXPath(doc, elmts[i], ''.//a'', nsResolver); var links = Scholar.Utilities.gatherElementsOnXPath(doc, elmts[i], ''.//a'', nsResolver);
// Collect title // Collect title
var myTd = Scholar.Utilities.getNode(doc, elmts[i], "./td[2]", nsResolver); var myTd = doc.evaluate("./td[2]", elmts[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
var m = titleRe.exec(myTd.innerHTML); var m = titleRe.exec(myTd.innerHTML);
var title = unescapeHTML(m[1]); var title = unescapeHTML(m[1]);
@ -1895,9 +1889,9 @@ REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006
var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p'', nsResolver); var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p'', nsResolver);
for(var i=0; i<elmts.length; i++) { for(var i=0; i<elmts.length; i++) {
var elmt = elmts[i]; var elmt = elmts[i];
var initialText = Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver); var initialText = doc.evaluate(''./text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(initialText && initialText.nodeValue && Scholar.Utilities.superCleanString(initialText.nodeValue) == "Viewing record") { if(initialText && initialText.nodeValue && Scholar.Utilities.superCleanString(initialText.nodeValue) == "Viewing record") {
recNumbers.push(Scholar.Utilities.getNode(doc, elmt, ''./b[1]/text()[1]'', nsResolver).nodeValue); recNumbers.push(doc.evaluate(''./b[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue);
break; break;
} }
} }
@ -1953,7 +1947,7 @@ REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006
Scholar.wait(); Scholar.wait();
}'); }');
REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-26 16:01:00', 4, 'TLC/YouSeeMore Scraper', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?(?:.*LabelDisplay.*RecordNumber=[0-9]|Search|ItemTitles)', REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-26 16:01:00', 4, 'TLC/YouSeeMore', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?(?:.*LabelDisplay.*RecordNumber=[0-9]|Search|ItemTitles)',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
var detailRe = new RegExp("TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]"); var detailRe = new RegExp("TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]");
if(detailRe.test(doc.location.href)) { if(detailRe.test(doc.location.href)) {
@ -1989,8 +1983,7 @@ REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006
var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973");
Scholar.Utilities.processDocuments(null, newUris, function(newBrowser) { Scholar.Utilities.processDocuments(newUris, function(newDoc) {
var newDoc = newBrowser.contentDocument;
var uri = newDoc.location.href; var uri = newDoc.location.href;
var namespace = newDoc.documentElement.namespaceURI; var namespace = newDoc.documentElement.namespaceURI;
@ -2006,8 +1999,8 @@ REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006
for(var i=0; i<elmts.length; i++) { for(var i=0; i<elmts.length; i++) {
var elmt = elmts[i]; var elmt = elmts[i];
tag = Scholar.Utilities.getNode(newDoc, elmt, ''./td[2]/tt[1]/text()[1]'', nsResolver).nodeValue; tag = newDoc.evaluate(''./td[2]/tt[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
var inds = Scholar.Utilities.getNode(newDoc, elmt, ''./td[3]/tt[1]/text()[1]'', nsResolver).nodeValue; var inds = newDoc.evaluate(''./td[3]/tt[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
tag = tag.replace(/[\r\n]/g, ""); tag = tag.replace(/[\r\n]/g, "");
if(tag.length == 1) { if(tag.length == 1) {
@ -2051,7 +2044,7 @@ REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006
Scholar.wait(); Scholar.wait();
}'); }');
REPLACE INTO "translators" VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '2006-06-26 16:01:00', 4, 'Project MUSE Scraper', 'Simon Kornblith', '^http://muse\.jhu\.edu/(?:journals/[^/]+/[^/]+/[^/]+\.html|search/pia.cgi)', REPLACE INTO "translators" VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '2006-06-26 16:01:00', 4, 'Project MUSE', 'Simon Kornblith', '^http://muse\.jhu\.edu/(?:journals/[^/]+/[^/]+/[^/]+\.html|search/pia.cgi)',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
var searchRe = new RegExp("^http://[^/]+/search/pia\.cgi"); var searchRe = new RegExp("^http://[^/]+/search/pia\.cgi");
if(searchRe.test(url)) { if(searchRe.test(url)) {
@ -2073,8 +2066,8 @@ REPLACE INTO "translators" VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '2006
// Go through table rows // Go through table rows
for(var i=0; i<tableRows.length; i++) { for(var i=0; i<tableRows.length; i++) {
// article_id is what we need to get it all as one file // article_id is what we need to get it all as one file
var input = Scholar.Utilities.getNode(doc, tableRows[i], ''./tbody/tr/td/input[@name="article_id"]'', nsResolver); var input = doc.evaluate(''./tbody/tr/td/input[@name="article_id"]'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
var link = Scholar.Utilities.getNode(doc, tableRows[i], ''.//b/i/a/text()'', nsResolver); var link = doc.evaluate(''.//b/i/a/text()'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(input && input.value && link && link.nodeValue) { if(input && input.value && link && link.nodeValue) {
items[input.value] = link.nodeValue; items[input.value] = link.nodeValue;
} }
@ -2163,7 +2156,7 @@ REPLACE INTO "translators" VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '2006
} }
}'); }');
REPLACE INTO "translators" VALUES ('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-08-07 21:55:00', 12, 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)', REPLACE INTO "translators" VALUES ('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-08-07 21:55:00', 12, 'PubMed', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
if(doc.location.href.indexOf("list_uids=") >= 0) { if(doc.location.href.indexOf("list_uids=") >= 0) {
return "journalArticle"; return "journalArticle";
@ -2239,7 +2232,7 @@ function detectSearch(item) {
var date = article.Journal.JournalIssue.PubDate.Month.text()+" "+article.Journal.JournalIssue.PubDate.Day.text()+", "+article.Journal.JournalIssue.PubDate.Year.text(); var date = article.Journal.JournalIssue.PubDate.Month.text()+" "+article.Journal.JournalIssue.PubDate.Day.text()+", "+article.Journal.JournalIssue.PubDate.Year.text();
var jsDate = new Date(date); var jsDate = new Date(date);
if(!isNaN(jsDate.valueOf())) { if(!isNaN(jsDate.valueOf())) {
date = Scholar.Utilities.dateToISO(jsDate); date = Scholar.Utilities.dateToSQL(jsDate);
} }
} else if(article.Journal.JournalIssue.PubDate.Month.text().toString() != "") { } else if(article.Journal.JournalIssue.PubDate.Month.text().toString() != "") {
var date = article.Journal.JournalIssue.PubDate.Month.text()+" "+article.Journal.JournalIssue.PubDate.Year.text(); var date = article.Journal.JournalIssue.PubDate.Month.text()+" "+article.Journal.JournalIssue.PubDate.Year.text();
@ -2293,8 +2286,8 @@ function doWeb(doc, url) {
var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//div[@class="ResultSet"]/table/tbody'', nsResolver); var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//div[@class="ResultSet"]/table/tbody'', nsResolver);
// Go through table rows // Go through table rows
for(var i=0; i<tableRows.length; i++) { for(var i=0; i<tableRows.length; i++) {
var link = Scholar.Utilities.getNode(doc, tableRows[i], ''.//a'', nsResolver); var link = doc.evaluate(''.//a'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
var article = Scholar.Utilities.getNode(doc, tableRows[i], ''./tr[2]/td[2]/text()[1]'', nsResolver); var article = doc.evaluate(''./tr[2]/td[2]/text()[1]'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
items[link.href] = article.nodeValue; items[link.href] = article.nodeValue;
} }
@ -2318,7 +2311,7 @@ function doSearch(item) {
lookupPMIDs([getPMID(item.contextObject)]); lookupPMIDs([getPMID(item.contextObject)]);
}'); }');
REPLACE INTO "translators" VALUES ('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006-06-26 16:41:00', 4, 'Embedded RDF Scraper', 'Simon Kornblith', NULL, REPLACE INTO "translators" VALUES ('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006-06-26 16:41:00', 4, 'Embedded RDF', 'Simon Kornblith', NULL,
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
var metaTags = doc.getElementsByTagName("meta"); var metaTags = doc.getElementsByTagName("meta");
@ -2347,7 +2340,7 @@ REPLACE INTO "translators" VALUES ('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006
foundTitle = true; foundTitle = true;
} }
translator.Scholar.RDF.addStatement(url, dc + tag.substr(3), value, true); translator.Scholar.RDF.addStatement(url, dc + tag.substr(3), value, true);
Scholar.Utilities.debugPrint(tag.substr(3) + " = " + value); Scholar.Utilities.debug(tag.substr(3) + " = " + value);
} else if(tag && value && (tag == "author" || tag == "author-personal")) { } else if(tag && value && (tag == "author" || tag == "author-personal")) {
translator.Scholar.RDF.addStatement(url, dc + "creator", value, true); translator.Scholar.RDF.addStatement(url, dc + "creator", value, true);
} else if(tag && value && tag == "author-corporate") { } else if(tag && value && tag == "author-corporate") {
@ -2362,7 +2355,7 @@ REPLACE INTO "translators" VALUES ('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006
translator.doImport(); translator.doImport();
}'); }');
REPLACE INTO "translators" VALUES ('05d07af9-105a-4572-99f6-a8e231c0daef', '2006-08-07 01:09:00', 4, 'COinS Scraper', 'Simon Kornblith', NULL, REPLACE INTO "translators" VALUES ('05d07af9-105a-4572-99f6-a8e231c0daef', '2006-08-07 01:09:00', 4, 'COinS', 'Simon Kornblith', NULL,
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
var spanTags = doc.getElementsByTagName("span"); var spanTags = doc.getElementsByTagName("span");
@ -2413,7 +2406,7 @@ function retrieveNextCOinS(needFullItems, newItems) {
if(needFullItems.length) { if(needFullItems.length) {
var item = needFullItems.shift(); var item = needFullItems.shift();
Scholar.Utilities.debugPrint("looking up contextObject"); Scholar.Utilities.debug("looking up contextObject");
var search = Scholar.loadTranslator("search"); var search = Scholar.loadTranslator("search");
search.setHandler("itemDone", function(obj, item) { search.setHandler("itemDone", function(obj, item) {
newItems.push(item); newItems.push(item);
@ -2490,7 +2483,7 @@ function doWeb(doc, url) {
} }
}'); }');
REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006-06-26 16:01:00', 4, 'Google Books Scraper', 'Simon Kornblith', '^http://books\.google\.com/books\?(.*vid=.*\&id=.*|.*q=.*)', REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006-06-26 16:01:00', 4, 'Google Books', 'Simon Kornblith', '^http://books\.google\.com/books\?(.*vid=.*\&id=.*|.*q=.*)',
'function detectWeb(doc, url) { 'function detectWeb(doc, url) {
var re = new RegExp(''^http://books\\.google\\.com/books\\?vid=([^&]+).*\\&id=([^&]+)'', ''i''); var re = new RegExp(''^http://books\\.google\\.com/books\\?vid=([^&]+).*\\&id=([^&]+)'', ''i'');
if(re.test(doc.location.href)) { if(re.test(doc.location.href)) {
@ -2526,8 +2519,7 @@ REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006
} }
} }
Scholar.Utilities.processDocuments(null, newUris, function(newBrowser) { Scholar.Utilities.processDocuments(newUris, function(newDoc) {
var newDoc = newBrowser.contentDocument;
var newItem = new Scholar.Item("book"); var newItem = new Scholar.Item("book");
newItem.source = newDoc.location.href; newItem.source = newDoc.location.href;
@ -2539,8 +2531,8 @@ REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006
var xpath = ''//table[@id="bib"]/tbody/tr''; var xpath = ''//table[@id="bib"]/tbody/tr'';
var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver);
for(var i = 0; i<elmts.length; i++) { for(var i = 0; i<elmts.length; i++) {
var field = Scholar.Utilities.getNode(newDoc, elmts[i], ''./td[1]//text()'', nsResolver); var field = newDoc.evaluate(''./td[1]//text()'', elmts[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
var value = Scholar.Utilities.getNode(newDoc, elmts[i], ''./td[2]//text()'', nsResolver); var value = newDoc.evaluate(''./td[2]//text()'', elmts[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(field && value) { if(field && value) {
field = Scholar.Utilities.superCleanString(field.nodeValue); field = Scholar.Utilities.superCleanString(field.nodeValue);
@ -2564,7 +2556,7 @@ REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006
jsDate = new Date(value); jsDate = new Date(value);
if(!isNaN(jsDate.valueOf())) { if(!isNaN(jsDate.valueOf())) {
date = Scholar.Utilities.dateToISO(jsDate); date = Scholar.Utilities.dateToSQL(jsDate);
} }
newItem.date = date; newItem.date = date;
@ -2618,8 +2610,7 @@ function doSearch(item) {
var co = Scholar.Utilities.createContextObject(item); var co = Scholar.Utilities.createContextObject(item);
} }
Scholar.Utilities.processDocuments(null, ["http://partneraccess.oclc.org/wcpa/servlet/OpenUrl?"+co], function(browser) { Scholar.Utilities.loadDocument("http://partneraccess.oclc.org/wcpa/servlet/OpenUrl?"+co, function(doc) {
var doc = browser.contentDocument;
// find new COinS in the Open WorldCat page // find new COinS in the Open WorldCat page
if(processOWC(doc)) { // we got a single item page if(processOWC(doc)) { // we got a single item page
Scholar.done(); Scholar.done();
@ -2644,16 +2635,16 @@ function doSearch(item) {
urlsToProcess.push(elmt.href); urlsToProcess.push(elmt.href);
} while(elmt = elmts.iterateNext()); } while(elmt = elmts.iterateNext());
Scholar.Utilities.processDocuments(null, urlsToProcess, function(browser) { Scholar.Utilities.processDocuments(urlsToProcess, function(doc) {
// per URL // per URL
processOWC(browser.contentDocument); processOWC(doc);
}, function() { // done }, function() { // done
Scholar.done(); Scholar.done();
}, function() { // error }, function() { // error
Scholar.done(false); Scholar.done(false);
}); });
} }
}, null, function() { }, function() {
error(); error();
}); });
@ -3031,7 +3022,7 @@ function doImport() {
while(read = Scholar.read(16384)) { while(read = Scholar.read(16384)) {
text += read; text += read;
} }
Scholar.Utilities.debugPrint("read in"); Scholar.Utilities.debug("read in");
// eliminate <?xml ?> heading so we can parse as XML // eliminate <?xml ?> heading so we can parse as XML
text = text.replace(/<\?xml[^?]+\?>/, ""); text = text.replace(/<\?xml[^?]+\?>/, "");
@ -3043,9 +3034,9 @@ function doImport() {
var xml = new XML(text); var xml = new XML(text);
for each(var mods in xml.m::mods) { for each(var mods in xml.m::mods) {
Scholar.Utilities.debugPrint("item is: "); Scholar.Utilities.debug("item is: ");
for(var i in mods) { for(var i in mods) {
Scholar.Utilities.debugPrint(i+" = "+mods[i].toString()); Scholar.Utilities.debug(i+" = "+mods[i].toString());
} }
var newItem = new Scholar.Item(); var newItem = new Scholar.Item();
@ -3863,8 +3854,8 @@ function doImport() {
} else if(type == n.bib+"Memo") { } else if(type == n.bib+"Memo") {
// check to see if this note is independent // check to see if this note is independent
var arcs = Scholar.RDF.getArcsIn(node); var arcs = Scholar.RDF.getArcsIn(node);
Scholar.Utilities.debugPrint("working on a note"); Scholar.Utilities.debug("working on a note");
Scholar.Utilities.debugPrint(arcs); Scholar.Utilities.debug(arcs);
var skip = false; var skip = false;
for each(var arc in arcs) { for each(var arc in arcs) {
arc = Scholar.RDF.getResourceURI(arc); arc = Scholar.RDF.getResourceURI(arc);
@ -4239,7 +4230,7 @@ function doImport() {
var tag = data = false; var tag = data = false;
do { // first valid line is type do { // first valid line is type
line = Scholar.read(); line = Scholar.read();
Scholar.Utilities.debugPrint(line); Scholar.Utilities.debug(line);
} while(line !== false && line.substr(0, 6) != "TY - "); } while(line !== false && line.substr(0, 6) != "TY - ");
var item = new Scholar.Item(); var item = new Scholar.Item();
@ -4258,7 +4249,7 @@ function doImport() {
tag = line.substr(0,2); tag = line.substr(0,2);
data = line.substr(6); data = line.substr(6);
Scholar.Utilities.debugPrint("tag: ''"+tag+"''; data: ''"+data+"''"); Scholar.Utilities.debug("tag: ''"+tag+"''; data: ''"+data+"''");
if(tag == "ER") { // ER signals end of reference if(tag == "ER") { // ER signals end of reference
// unset info // unset info
@ -4692,7 +4683,7 @@ MARC_Record.prototype._associateDBField = function(item, fieldNo, part, fieldNam
part = ''a''; part = ''a'';
} }
var field = this.get_field_subfields(fieldNo); var field = this.get_field_subfields(fieldNo);
Scholar.Utilities.debugPrint(''Found ''+field.length+'' matches for ''+fieldNo+part); Scholar.Utilities.debug(''Found ''+field.length+'' matches for ''+fieldNo+part);
if(field) { if(field) {
for(var i in field) { for(var i in field) {
var value = false; var value = false;