Removes unnecessary pieces of the Piggy Bank API from utilities and updates translators to abide by current translator guidelines

This commit is contained in:
Simon Kornblith 2006-08-11 15:28:18 +00:00
parent 1e8aa81c02
commit 064ecd17db
2 changed files with 120 additions and 210 deletions

View File

@ -1,61 +1,21 @@
// Scholar for Firefox Utilities
// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
// This code is licensed according to the GPL
/////////////////////////////////////////////////////////////////
//
// Scholar.Utilities
//
/////////////////////////////////////////////////////////////////
// Scholar.Utilities class, a set of methods to assist in data
// extraction. Some of the code here was stolen directly from the Piggy Bank
// project.
Scholar.Utilities = function () {}
// Adapter for Piggy Bank function to print debug messages; log level is
// fixed at 4 (could change this)
Scholar.Utilities.prototype.debugPrint = function(msg) {
Scholar.Utilities.prototype.debug = function(msg) {
Scholar.debug(msg, 4);
}
/*
 * Strips leading and trailing whitespace from a string.
 *
 * The characters removed are space, line feed, carriage return, tab and
 * the no-break space (character code 160). Whitespace in the interior of
 * the string is left alone.
 *
 * s - the string to trim
 *
 * Returns the trimmed string (empty if s held nothing but whitespace).
 */
Scholar.Utilities.prototype.trimString = function(s) {
	var whitespace = " \n\r\t" + String.fromCharCode(160);
	var start = 0;
	var end = s.length;

	// Advance past leading whitespace
	while (start < end && whitespace.indexOf(s.charAt(start)) >= 0) {
		start++;
	}

	// Back up over trailing whitespace, never crossing the start marker
	while (end > start && whitespace.indexOf(s.charAt(end - 1)) >= 0) {
		end--;
	}

	return s.substring(start, end);
}
/*
* BEGIN SCHOLAR FOR FIREFOX EXTENSIONS
* Functions below this point are extensions to the utilities provided by
* Piggy Bank. When used in external code, the repository will need to add
* a function definition when exporting in Piggy Bank format.
* Converts a JavaScript date object to an SQL-style date
*/
/*
* Converts a JavaScript date object to an ISO-style date
*/
Scholar.Utilities.prototype.dateToISO = function(jsDate) {
Scholar.Utilities.prototype.dateToSQL = function(jsDate) {
var date = "";
var year = jsDate.getFullYear().toString();
var month = (jsDate.getMonth()+1).toString();
@ -112,7 +72,8 @@ Scholar.Utilities.prototype.cleanAuthor = function(author, type, useComma) {
*/
Scholar.Utilities.prototype.cleanString = function(s) {
s = s.replace(/[ \xA0]+/g, " ");
return this.trimString(s);
s = s.replace(/^\s+/, "");
return s.replace(/\s+$/, "");
}
/*
@ -223,43 +184,6 @@ Scholar.Utilities.Ingester.prototype.gatherElementsOnXPath = function(doc, paren
return elmts;
}
/*
 * Collects the href of every <a> element in a document whose href contains
 * the given substring (plain substring matching only, no regular
 * expressions). Each distinct href is reported once.
 *
 * doc       - document to search; also used as the XPath context node
 * substring - text that must occur somewhere in the href
 *
 * Returns an array of hrefs. Every match is inserted at the front of the
 * array, so the result is in reverse order of discovery.
 */
Scholar.Utilities.Ingester.prototype.collectURLsWithSubstring = function(doc, substring) {
	var matches = [];
	var seen = [];	// keyed by href; truthy once that href has been collected

	var anchors = doc.evaluate("//a", doc, null, Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null);
	for (var anchor = anchors.iterateNext(); anchor; anchor = anchors.iterateNext()) {
		var target = anchor.href;
		if (target.indexOf(substring) < 0 || seen[target]) {
			continue;
		}
		seen[target] = true;
		matches.unshift(target);
	}

	return matches;
}
// For now, we're going to skip the getLLsFromAddresses function (which gets
// latitude and longitude pairs from a series of addresses, but requires the
// big mess of Java code that is the Piggy Bank server) and the geoHelper
// tools (which rely on getLLsFromAddresses) since these are probably not
// essential components for Scholar and would take a great deal of effort to
// implement. We can, however, always implement them later.
/*
* BEGIN SCHOLAR FOR FIREFOX EXTENSIONS
*/
/*
 * Evaluates an XPath expression and returns the first node it yields.
 *
 * doc         - document used to evaluate the expression
 * contextNode - node the expression is evaluated relative to
 * xpath       - XPath expression string
 * nsResolver  - namespace resolver handed through to doc.evaluate
 *
 * Returns whatever the first iterateNext() call on the result produces;
 * any further matches are ignored.
 */
Scholar.Utilities.Ingester.prototype.getNode = function(doc, contextNode, xpath, nsResolver) {
	var resultType = Components.interfaces.nsIDOMXPathResult.ANY_TYPE;
	var matches = doc.evaluate(xpath, contextNode, nsResolver, resultType, null);
	return matches.iterateNext();
}
/*
* Gets a given node as a string containing all child nodes
*/
@ -325,10 +249,6 @@ Scholar.Utilities.Ingester.prototype.parseContextObject = function(co, item) {
return Scholar.OpenURL.parseContextObject(co, item);
}
/*
* END SCHOLAR FOR FIREFOX EXTENSIONS
*/
// Ingester adapters for Scholar.Utilities.HTTP to handle proxies
Scholar.Utilities.Ingester.prototype.loadDocument = function(url, succeeded, failed) {
@ -337,11 +257,13 @@ Scholar.Utilities.Ingester.prototype.loadDocument = function(url, succeeded, fai
}
Scholar.Utilities.HTTP.processDocuments(null, [ url ], succeeded, function() {}, failed);
}
Scholar.Utilities.Ingester.prototype.processDocuments = function(firstDoc, urls, processor, done, exception) {
Scholar.Utilities.Ingester.prototype.processDocuments = function(urls, processor, done, exception) {
if(this.proxiedURL) {
for(i in urls) {
urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]);
}
Scholar.Utilities.HTTP.processDocuments(firstDoc, urls, processor, done, exception);
}
Scholar.Utilities.HTTP.processDocuments(null, urls, processor, done, exception);
}
Scholar.Utilities.Ingester.HTTPUtilities = function(proxiedURL) {
@ -615,10 +537,7 @@ Scholar.Utilities.HTTP.processDocuments = function(firstDoc, urls, processor, do
if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times
prevUrl = hiddenBrowser.contentDocument.location.href;
try {
var newHiddenBrowser = new Object();
newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
processor(newHiddenBrowser);
processor(hiddenBrowser.contentDocument);
} catch (e) {
Scholar.debug("Scholar.Utilities.Ingester.processDocuments onLoad: " + e, 2);
exception(e);

View File

@ -1,9 +1,9 @@
-- 41
-- 42
-- Set the following timestamp to the most recent scraper update date
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-08 17:12:00'));
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-11 11:18:00'));
REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-28 23:08:00', 4, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/|s/)',
REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-08-11 11:18:00', 4, 'Amazon.com', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/|s/)',
'function detectWeb(doc, url) {
var searchRe = new RegExp(''^http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/|s/)'');
if(searchRe.test(doc.location.href)) {
@ -28,7 +28,7 @@ REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006
var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
for (var i = 0; i < elmts.length; i++) {
var elmt = elmts[i];
var author = Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue;
var author = doc.evaluate(''./text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
newItem.creators.push(Scholar.Utilities.cleanAuthor(author, "author"));
}
@ -40,15 +40,15 @@ REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006
for (var i = 0; i < elmts.length; i++) {
try {
var elmt = elmts[i];
var attribute = Scholar.Utilities.cleanString(Scholar.Utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue);
if(Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) {
var value = Scholar.Utilities.cleanString(Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue);
var attribute = Scholar.Utilities.cleanString(doc.evaluate(''./B[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue);
if(doc.evaluate(''./text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
var value = Scholar.Utilities.cleanString(doc.evaluate(''./text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue);
if(attribute == "Publisher:") {
if(value.lastIndexOf("(") != -1) {
var date = value.substring(value.lastIndexOf("(")+1, value.length-1);
jsDate = new Date(date);
if(!isNaN(jsDate.valueOf())) {
date = Scholar.Utilities.dateToISO(jsDate);
date = Scholar.Utilities.dateToSQL(jsDate);
}
newItem.date = date;
@ -74,7 +74,7 @@ REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006
var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]'';
var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
var title = Scholar.Utilities.cleanString(Scholar.Utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue);
var title = Scholar.Utilities.cleanString(doc.evaluate(''./text()[1]'', elmts[0], nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue);
if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) {
title = title.substring(0, title.lastIndexOf("(")-1);
}
@ -113,7 +113,7 @@ function doWeb(doc, url) {
uris.push(i);
}
Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) },
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
function() { Scholar.done(); }, function() {});
Scholar.wait();
@ -122,7 +122,7 @@ function doWeb(doc, url) {
}
}');
REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-26 16:01:00', 4, 'WorldCat Scraper', 'Simon Kornblith', '^http://(?:new)?firstsearch\.oclc\.org/WebZ/',
REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-26 16:01:00', 4, 'WorldCat', 'Simon Kornblith', '^http://(?:new)?firstsearch\.oclc\.org/WebZ/',
'function detectWeb(doc, url) {
if(doc.title == ''FirstSearch: WorldCat Detailed Record'') {
return "book";
@ -195,7 +195,7 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006
}
Scholar.Utilities.HTTPUtilities.doPost(newUri, ''exportselect=''+exportselect+''&exporttype=plaintext'', null, function(text) {
Scholar.Utilities.debugPrint(text);
Scholar.Utilities.debug(text);
var lineRegexp = new RegExp();
lineRegexp.compile("^([\\w() ]+): *(.*)$");
@ -240,17 +240,17 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006
newItem.creators.push(Scholar.Utilities.cleanAuthor(authors[j], "author", true));
}
} else {
newItem.creators.push(Scholar.Utilities.trimString(match[2]));
newItem.creators.push(Scholar.Utilities.cleanString(match[2]));
}
} else if(match[1] == ''Publication'') {
// Don''t even try to deal with this. The WorldCat metadata is of poor enough quality that this isn''t worth it.
match[2] = Scholar.Utilities.trimString(match[2]);
match[2] = Scholar.Utilities.cleanString(match[2]);
if(match[2].substring(match[2].length-1) == '','') {
match[2] = match[2].substring(0, match[2].length-1);
}
newItem.publisher = match[2];
/*} else if(match[1] == ''Language'') {
.addStatement(uri, prefixDC + ''language'', Scholar.Utilities.trimString(match[2]));*/
.addStatement(uri, prefixDC + ''language'', Scholar.Utilities.cleanString(match[2]));*/
} else if(match[1] == ''Standard No'') {
var identifiers = match[2].split(/ +/);
var j=0;
@ -287,7 +287,7 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006
Scholar.wait();
}');
REPLACE INTO "translators" VALUES ('88915634-1af6-c134-0171-56fd198235ed', '2006-06-26 21:40:00', 4, 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi',
REPLACE INTO "translators" VALUES ('88915634-1af6-c134-0171-56fd198235ed', '2006-06-26 21:40:00', 4, 'LOC/Voyager WebVoyage', 'Simon Kornblith', 'Pwebrecon\.cgi',
'function detectWeb(doc, url) {
var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options;
for(var i in export_options) {
@ -335,7 +335,7 @@ REPLACE INTO "translators" VALUES ('88915634-1af6-c134-0171-56fd198235ed', '2006
// Go through table rows
for(var i=0; i<tableRows.length; i++) {
// CHK is what we need to get it all as one file
var input = Scholar.Utilities.getNode(doc, tableRows[i], ''./td/input[@name="CHK"]'', nsResolver);
var input = doc.evaluate(''./td/input[@name="CHK"]'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
checkboxes[i] = input.value;
var links = Scholar.Utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver);
urls[i] = links[0].href;
@ -414,7 +414,7 @@ REPLACE INTO "translators" VALUES ('88915634-1af6-c134-0171-56fd198235ed', '2006
Scholar.wait();
}');
REPLACE INTO "translators" VALUES ('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-26 16:01:00', 4, 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse|search/)',
REPLACE INTO "translators" VALUES ('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-26 16:01:00', 4, 'JSTOR', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse|search/)',
'function detectWeb(doc, url) {
var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
@ -484,7 +484,7 @@ function doWeb(doc, url) {
// Go through links
for(var j=0; j<links.length; j++) {
if(tagRegexp.test(links[j].href)) {
var text = Scholar.Utilities.getNode(doc, tableRows[i], ''.//strong/text()'', null);
var text = doc.evaluate(''.//strong/text()'', tableRows[i], null, XPathResult.ANY_TYPE, null).iterateNext();
if(text && text.nodeValue) {
text = Scholar.Utilities.cleanString(text.nodeValue);
if(availableItems[links[j].href]) {
@ -562,7 +562,7 @@ function doWeb(doc, url) {
if(isNaN(date.valueOf())) {
newItem.date = fieldContent;
} else {
newItem.date = Scholar.Utilities.dateToISO(date);
newItem.date = Scholar.Utilities.dateToSQL(date);
}
} else if(fieldCode == "PP") {
newItem.pages = fieldContent;
@ -589,7 +589,7 @@ function doWeb(doc, url) {
Scholar.wait();
}');
REPLACE INTO "translators" VALUES ('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-26 16:01:00', 4, 'History Cooperative Scraper', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.html$|cgi-bin/search.cgi)',
REPLACE INTO "translators" VALUES ('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-26 16:01:00', 4, 'History Cooperative', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.html$|cgi-bin/search.cgi)',
'function detectWeb(doc, url) {
if(doc.title == "History Cooperative: Search Results") {
return "multiple";
@ -647,7 +647,7 @@ function doWeb(doc, url) {
uris.push(i);
}
Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) },
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
function() { Scholar.done(); }, function() {});
Scholar.wait();
@ -656,7 +656,7 @@ function doWeb(doc, url) {
}
}');
REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-08-06 21:45:00', 4, 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)',
REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-08-06 21:45:00', 4, 'InnoPAC', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)',
'function detectWeb(doc, url) {
// First, check to see if the URL alone reveals InnoPAC, since some sites don''t reveal the MARC button
var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$'');
@ -709,9 +709,7 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006
var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973");
if(newUri) { // single page
Scholar.Utilities.loadDocument(newUri, function(newBrowser) {
newDoc = newBrowser.contentDocument;
Scholar.Utilities.loadDocument(newUri, function(newDoc) {
var namespace = newDoc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null;
@ -720,7 +718,7 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006
var xpath = ''//pre'';
var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver);
var text = Scholar.Utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue;
var text = doc.evaluate(''./text()[1]'', elmts[0], nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
var newItem = new Scholar.Item();
newItem.source = uri;
@ -781,7 +779,7 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006
// Go through table rows
for(var i=0; i<tableRows.length; i++) {
// CHK is what we need to get it all as one file
var input = Scholar.Utilities.getNode(doc, tableRows[i], ''./td/input[@type="checkbox"]'', nsResolver);
var input = doc.evaluate(''./td/input[@type="checkbox"]'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
checkboxes[i] = input.name+"="+escape(input.value);
var links = Scholar.Utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver);
urls[i] = links[0].href;
@ -836,7 +834,7 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006
Scholar.wait();
}');
REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006-06-26 16:01:00', 4, 'SIRSI 2003+ Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006-06-26 16:01:00', 4, 'SIRSI 2003+', 'Simon Kornblith', '/uhtbin/cgisirsi',
'function detectWeb(doc, url) {
var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
@ -872,12 +870,12 @@ REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006
for (var i = 0; i < elmts.length; i++) {
var elmt = elmts[i];
try {
var node = Scholar.Utilities.getNode(doc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver);
var node = doc.evaluate(''./TD[1]/A[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(!node) {
var node = Scholar.Utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver);
var node = doc.evaluate(''./TD[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
}
if(node) {
var field = Scholar.Utilities.superCleanString(Scholar.Utilities.getNode(doc, elmt, ''./TH[1]/text()[1]'', nsResolver).nodeValue);
var field = Scholar.Utilities.superCleanString(doc.evaluate(''./TH[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue);
field = field.toLowerCase();
var value = Scholar.Utilities.superCleanString(node.nodeValue);
if(field == "publisher") {
@ -907,7 +905,7 @@ REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006
} catch (e) {}
}
var callNumber = Scholar.Utilities.getNode(doc, doc, ''//tr/td[1][@class="holdingslist"]/text()'', nsResolver);
var callNumber = doc.evaluate(''//tr/td[1][@class="holdingslist"]/text()'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(callNumber && callNumber.nodeValue) {
newItem.callNumber = callNumber.nodeValue;
}
@ -930,7 +928,7 @@ function doWeb(doc, url) {
var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//td[@class="searchsum"]/table[//input[@value="Details"]]'', nsResolver);
// Go through table rows
for(var i=1; i<tableRows.length; i++) {
var input = Scholar.Utilities.getNode(doc, tableRows[i], ''.//input[@value="Details"]'', nsResolver);
var input = doc.evaluate(''.//input[@value="Details"]'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
checkboxes[i] = input.name;
var text = Scholar.Utilities.getNodeString(doc, tableRows[i], ''.//label/strong//text()'', nsResolver);
if(text) {
@ -948,14 +946,14 @@ function doWeb(doc, url) {
var m = hostRe.exec(doc.location.href);
var hitlist = doc.forms.namedItem("hitlist");
var baseUrl = m[0]+hitlist.getAttribute("action")+"?first_hit="+hitlist.elements.namedItem("first_hit").value+"&last_hit="+hitlist.elements.namedItem("last_hit").value;
Scholar.Utilities.debugPrint(baseUrl);
Scholar.Utilities.debug(baseUrl);
var uris = new Array();
for(var i in items) {
uris.push(baseUrl+"&"+checkboxes[i]+"=Details");
}
Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) },
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
function() { Scholar.done() }, function() {});
Scholar.wait();
@ -963,7 +961,7 @@ function doWeb(doc, url) {
}
');
REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-26 16:01:00', 4, 'ProQuest Scraper', 'Simon Kornblith', '^http://proquest\.umi\.com/pqdweb\?((?:.*\&)?did=.*&Fmt=[0-9]|(?:.*\&)Fmt=[0-9].*&did=|(?:.*\&)searchInterface=)',
REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-26 16:01:00', 4, 'ProQuest', 'Simon Kornblith', '^http://proquest\.umi\.com/pqdweb\?((?:.*\&)?did=.*&Fmt=[0-9]|(?:.*\&)Fmt=[0-9].*&did=|(?:.*\&)searchInterface=)',
'function detectWeb(doc, url) {
if(doc.title == "Results") {
return "magazineArticle";
@ -1010,29 +1008,29 @@ REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006
var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
for (var i = 0; i < elmts.length; i++) {
var elmt = elmts[i];
var field = Scholar.Utilities.superCleanString(Scholar.Utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue).toLowerCase();
var field = Scholar.Utilities.superCleanString(doc.evaluate(''./TD[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue).toLowerCase();
if(field == "publication title") {
var publication = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/A[1]/text()[1]'', nsResolver);
var publication = doc.evaluate(''./TD[2]/A[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(publication.nodeValue) {
newItem.publicationTitle = Scholar.Utilities.superCleanString(publication.nodeValue);
}
var place = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
var place = doc.evaluate(''./TD[2]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(place.nodeValue) {
newItem.place = Scholar.Utilities.superCleanString(place.nodeValue);
}
var date = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/A[2]/text()[1]'', nsResolver);
var date = doc.evaluate(''./TD[2]/A[2]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(date.nodeValue) {
date = date.nodeValue;
var jsDate = new Date(Scholar.Utilities.superCleanString(date));
if(!isNaN(jsDate.valueOf())) {
date = Scholar.Utilities.dateToISO(jsDate);
date = Scholar.Utilities.dateToSQL(jsDate);
}
newItem.date = date;
}
var moreInfo = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[2]'', nsResolver);
var moreInfo = doc.evaluate(''./TD[2]/text()[2]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(moreInfo.nodeValue) {
moreInfo = Scholar.Utilities.superCleanString(moreInfo.nodeValue);
var parts = moreInfo.split(";\xA0");
@ -1060,10 +1058,10 @@ REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006
}
}
} else if(field == "source type") {
var value = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
var value = doc.evaluate(''./TD[2]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(value.nodeValue) {
value = Scholar.Utilities.superCleanString(value.nodeValue).toLowerCase();
Scholar.Utilities.debugPrint(value);
Scholar.Utilities.debug(value);
if(value.indexOf("periodical") >= 0) {
newItem.itemType = "magazineArticle";
@ -1074,7 +1072,7 @@ REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006
}
}
} else if(field == "isbn" || field == "issn" || field == "issn/isbn") {
var value = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
var value = doc.evaluate(''./TD[2]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(value) {
var type;
value = Scholar.Utilities.superCleanString(value.nodeValue);
@ -1110,7 +1108,7 @@ function doWeb(doc, url) {
// Go through links
for(var j=0; j<links.length; j++) {
if(tagRegexp.test(links[j].href)) {
var text = Scholar.Utilities.getNode(doc, tableRows[i], ''./a[@class="bold"]/text()'', null);
var text = doc.evaluate(''./a[@class="bold"]/text()'', tableRows[i], null, XPathResult.ANY_TYPE, null).iterateNext();
if(text && text.nodeValue) {
text = Scholar.Utilities.cleanString(text.nodeValue);
items[links[j].href] = text;
@ -1130,7 +1128,7 @@ function doWeb(doc, url) {
uris.push(i);
}
Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) },
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
function() { Scholar.done(); }, function() {});
Scholar.wait();
@ -1140,13 +1138,13 @@ function doWeb(doc, url) {
if(m && (m[1] == "1" || m[1] == "2")) {
scrape(doc);
} else if(m) {
Scholar.Utilities.loadDocument(doc.location.href.replace("Fmt="+m[1], "Fmt=1"), function(browser) { scrape(browser.contentDocument); Scholar.done(); }, function() {});
Scholar.Utilities.loadDocument(doc.location.href.replace("Fmt="+m[1], "Fmt=1"), function(doc) { scrape(doc); Scholar.done(); }, function() {});
Scholar.wait();
}
}
}');
REPLACE INTO "translators" VALUES ('6773a9af-5375-3224-d148-d32793884dec', '2006-06-26 16:01:00', 4, 'InfoTrac Scraper', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/',
REPLACE INTO "translators" VALUES ('6773a9af-5375-3224-d148-d32793884dec', '2006-06-26 16:01:00', 4, 'InfoTrac', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/',
'function detectWeb(doc, url) {
if(doc.title.substring(0, 8) == "Article ") {
return "magazineArticle";
@ -1249,9 +1247,9 @@ function doWeb(doc, url) {
var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''/html/body//table/tbody/tr/td[a/b]'', nsResolver);
// Go through table rows
for(var i=0; i<tableRows.length; i++) {
var link = Scholar.Utilities.getNode(doc, tableRows[i], ''./a'', nsResolver);
var link = doc.evaluate(''./a'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
uris[i] = link.href;
var article = Scholar.Utilities.getNode(doc, link, ''./b/text()'', nsResolver);
var article = doc.evaluate(''./b/text()'', link, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
items[i] = article.nodeValue;
// Chop off final period
if(items[i].substr(items[i].length-1) == ".") {
@ -1272,7 +1270,7 @@ function doWeb(doc, url) {
}
}');
REPLACE INTO "translators" VALUES ('b047a13c-fe5c-6604-c997-bef15e502b09', '2006-06-26 16:01:00', 4, 'LexisNexis Scraper', 'Simon Kornblith', '^http://web\.lexis-nexis\.com/universe/(?:document|doclist)',
REPLACE INTO "translators" VALUES ('b047a13c-fe5c-6604-c997-bef15e502b09', '2006-06-26 16:01:00', 4, 'LexisNexis', 'Simon Kornblith', '^http://web\.lexis-nexis\.com/universe/(?:document|doclist)',
'function detectWeb(doc, url) {
var detailRe = new RegExp("^http://[^/]+/universe/document");
if(detailRe.test(doc.location.href)) {
@ -1302,7 +1300,7 @@ REPLACE INTO "translators" VALUES ('b047a13c-fe5c-6604-c997-bef15e502b09', '2006
var m = dateRegexp.exec(centerElements[centerElements.length-1].innerHTML);
if(m) {
var jsDate = new Date(m[1]+" "+m[2]);
newItem.date = Scholar.Utilities.dateToISO(jsDate);
newItem.date = Scholar.Utilities.dateToSQL(jsDate);
} else {
var elementParts = centerElements[centerElements.length-1].innerHTML.split(/<br[^>]*>/gi);
newItem.date = elementParts[1];
@ -1369,14 +1367,14 @@ function doWeb(doc, url) {
uris.push(i);
}
Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) },
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
function() { Scholar.done(); }, function() {});
Scholar.wait();
}
}');
REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006-06-26 16:01:00', 4, 'Aleph Scraper', 'Simon Kornblith', '^http://[^/]+/F(?:/[A-Z0-9\-]+(?:\?.*)?$|\?func=find)',
REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006-06-26 16:01:00', 4, 'Aleph', 'Simon Kornblith', '^http://[^/]+/F(?:/[A-Z0-9\-]+(?:\?.*)?$|\?func=find)',
'function detectWeb(doc, url) {
var singleRe = new RegExp("^http://[^/]+/F/[A-Z0-9\-]+\?.*func=full-set-set.*\&format=[0-9]{3}");
@ -1425,8 +1423,7 @@ REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006
}
var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973");
Scholar.Utilities.processDocuments(null, newUris, function(newBrowser) {
var newDoc = newBrowser.contentDocument;
Scholar.Utilities.processDocuments(newUris, function(newDoc) {
var uri = newDoc.location.href;
var namespace = newDoc.documentElement.namespaceURI;
@ -1440,7 +1437,7 @@ REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006
var record = new marc.MARC_Record();
for(var i=0; i<elmts.length; i++) {
var elmt = elmts[i];
var field = Scholar.Utilities.superCleanString(Scholar.Utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue);
var field = Scholar.Utilities.superCleanString(doc.evaluate(''./TD[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue);
var value = Scholar.Utilities.getNodeString(doc, elmt, ''./TD[2]//text()'', nsResolver);
var value = value.replace(/\|([a-z]) /g, record.subfield_delimiter+"$1");
@ -1467,7 +1464,7 @@ REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006
Scholar.wait();
}');
REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006-06-26 16:01:00', 4, 'Dynix Scraper', 'Simon Kornblith', 'ipac\.jsp\?.*(?:uri=full=[0-9]|menu=search)',
REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006-06-26 16:01:00', 4, 'Dynix', 'Simon Kornblith', 'ipac\.jsp\?.*(?:uri=full=[0-9]|menu=search)',
'function detectWeb(doc, url) {
var detailsRe = new RegExp(''ipac\.jsp\?.*uri=full=[0-9]'');
if(detailsRe.test(doc.location.href)) {
@ -1511,8 +1508,7 @@ REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006
var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973");
Scholar.Utilities.processDocuments(null, uris, function(newBrowser) {
var newDoc = newBrowser.contentDocument;
Scholar.Utilities.processDocuments(uris, function(newDoc) {
var uri = newDoc.location.href;
var namespace = newDoc.documentElement.namespaceURI;
@ -1526,7 +1522,7 @@ REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006
var record = new marc.MARC_Record();
for(var i=0; i<elmts.length; i++) {
var elmt = elmts[i];
var field = Scholar.Utilities.superCleanString(Scholar.Utilities.getNode(newDoc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver).nodeValue);
var field = Scholar.Utilities.superCleanString(newDoc.evaluate(''./TD[1]/A[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue);
var value = Scholar.Utilities.getNodeString(newDoc, elmt, ''./TD[2]/TABLE[1]/TBODY[1]/TR[1]/TD[1]/A[1]//text()'', nsResolver);
value = value.replace(/\$([a-z]) /g, record.subfield_delimiter+"$1");
@ -1555,13 +1551,13 @@ REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006
Scholar.wait();
}');
REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006-06-26 16:01:00', 4, 'VTLS Scraper', 'Simon Kornblith', '/chameleon(?:\?|$)',
REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006-06-26 16:01:00', 4, 'VTLS', 'Simon Kornblith', '/chameleon(?:\?|$)',
'function detectWeb(doc, url) {
var node = Scholar.Utilities.getNode(doc, doc, ''//tr[@class="intrRow"]/td/table/tbody/tr[th]'', null);
var node = doc.evaluate(''//tr[@class="intrRow"]/td/table/tbody/tr[th]'', doc, null, XPathResult.ANY_TYPE, null).iterateNext();
if(node) {
return "multiple";
}
var node = Scholar.Utilities.getNode(doc, doc, ''//a[text()="marc"]'', null);
var node = doc.evaluate(''//a[text()="marc"]'', doc, null, XPathResult.ANY_TYPE, null).iterateNext();
if(node) {
return "book";
}
@ -1602,7 +1598,7 @@ REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006
// Collect title information
var fields = Scholar.Utilities.gatherElementsOnXPath(doc, tableRows[i], ''./td/table/tbody/tr[th]'', nsResolver);
for(var j=0; j<fields.length; j++) {
var field = Scholar.Utilities.getNode(doc, fields[j], ''./th/text()'', nsResolver);
var field = doc.evaluate(''./th/text()'', fields[j], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(field.nodeValue == "Title") {
var value = Scholar.Utilities.getNodeString(doc, fields[j], ''./td//text()'', nsResolver);
if(value) {
@ -1620,15 +1616,14 @@ REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006
}
for(var i in items) {
Scholar.Utilities.debugPrint(i.replace(/function=[A-Z]{7}/, "function=MARCSCR"));
Scholar.Utilities.debug(i.replace(/function=[A-Z]{7}/, "function=MARCSCR"));
newUris.push(i.replace(/function=[A-Z]{7}/, "function=MARCSCR"));
}
}
var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973");
Scholar.Utilities.processDocuments(null, newUris, function(newBrowser) {
var newDoc = newBrowser.contentDocument;
Scholar.Utilities.processDocuments(newUris, function(newDoc) {
var uri = newDoc.location.href
var namespace = newDoc.documentElement.namespaceURI;
@ -1641,10 +1636,10 @@ REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006
var record = new marc.MARC_Record();
for(var i=0; i<elmts.length; i++) {
var elmt = elmts[i];
var field = Scholar.Utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue;
var ind1 = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver).nodeValue;
var ind2 = Scholar.Utilities.getNode(doc, elmt, ''./TD[3]/text()[1]'', nsResolver).nodeValue;
var value = Scholar.Utilities.getNode(doc, elmt, ''./TD[4]/text()[1]'', nsResolver).nodeValue;
var field = doc.evaluate(''./TD[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
var ind1 = doc.evaluate(''./TD[2]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
var ind2 = doc.evaluate(''./TD[3]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
var value = doc.evaluate(''./TD[4]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
value = value.replace(/\\([a-z]) /g, record.subfield_delimiter+"$1");
record.add_field(field, ind1, ind2, value);
@ -1659,7 +1654,7 @@ REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006
Scholar.wait();
}');
REPLACE INTO "translators" VALUES ('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006-06-26 16:01:00', 4, 'DRA Scraper', 'Simon Kornblith', '/web2/tramp2\.exe/(?:see\_record/|authority\_hits/|goto/.*\?.*screen=Record\.html)',
REPLACE INTO "translators" VALUES ('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006-06-26 16:01:00', 4, 'DRA', 'Simon Kornblith', '/web2/tramp2\.exe/(?:see\_record/|authority\_hits/|goto/.*\?.*screen=Record\.html)',
'function detectWeb(doc, url) {
if(doc.location.href.indexOf("/authority_hits") > 0) {
return "multiple";
@ -1729,7 +1724,7 @@ REPLACE INTO "translators" VALUES ('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006
}');
REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 4, 'GEAC Scraper', 'Simon Kornblith', '/(?:GeacQUERY|(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))',
REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 4, 'GEAC', 'Simon Kornblith', '/(?:GeacQUERY|(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))',
'function detectWeb(doc, url) {
if(doc.location.href.indexOf("/GeacQUERY") > 0) {
return "multiple";
@ -1764,8 +1759,7 @@ REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006
var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973");
Scholar.Utilities.processDocuments(null, uris, function(newBrowser) {
var newDoc = newBrowser.contentDocument;
Scholar.Utilities.processDocuments(uris, function(newDoc) {
var uri = newDoc.location.href;
var namespace = newDoc.documentElement.namespaceURI;
@ -1817,7 +1811,7 @@ REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006
Scholar.wait();
}');
REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-26 16:01:00', 4, 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-26 16:01:00', 4, 'SIRSI -2003', 'Simon Kornblith', '/uhtbin/cgisirsi',
'function detectWeb(doc, url) {
var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
@ -1870,7 +1864,7 @@ REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006
var links = Scholar.Utilities.gatherElementsOnXPath(doc, elmts[i], ''.//a'', nsResolver);
// Collect title
var myTd = Scholar.Utilities.getNode(doc, elmts[i], "./td[2]", nsResolver);
var myTd = doc.evaluate("./td[2]", elmts[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
var m = titleRe.exec(myTd.innerHTML);
var title = unescapeHTML(m[1]);
@ -1895,9 +1889,9 @@ REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006
var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p'', nsResolver);
for(var i=0; i<elmts.length; i++) {
var elmt = elmts[i];
var initialText = Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver);
var initialText = doc.evaluate(''./text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(initialText && initialText.nodeValue && Scholar.Utilities.superCleanString(initialText.nodeValue) == "Viewing record") {
recNumbers.push(Scholar.Utilities.getNode(doc, elmt, ''./b[1]/text()[1]'', nsResolver).nodeValue);
recNumbers.push(doc.evaluate(''./b[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue);
break;
}
}
@ -1953,7 +1947,7 @@ REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006
Scholar.wait();
}');
REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-26 16:01:00', 4, 'TLC/YouSeeMore Scraper', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?(?:.*LabelDisplay.*RecordNumber=[0-9]|Search|ItemTitles)',
REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-26 16:01:00', 4, 'TLC/YouSeeMore', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?(?:.*LabelDisplay.*RecordNumber=[0-9]|Search|ItemTitles)',
'function detectWeb(doc, url) {
var detailRe = new RegExp("TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]");
if(detailRe.test(doc.location.href)) {
@ -1989,8 +1983,7 @@ REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006
var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973");
Scholar.Utilities.processDocuments(null, newUris, function(newBrowser) {
var newDoc = newBrowser.contentDocument;
Scholar.Utilities.processDocuments(newUris, function(newDoc) {
var uri = newDoc.location.href;
var namespace = newDoc.documentElement.namespaceURI;
@ -2006,8 +1999,8 @@ REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006
for(var i=0; i<elmts.length; i++) {
var elmt = elmts[i];
tag = Scholar.Utilities.getNode(newDoc, elmt, ''./td[2]/tt[1]/text()[1]'', nsResolver).nodeValue;
var inds = Scholar.Utilities.getNode(newDoc, elmt, ''./td[3]/tt[1]/text()[1]'', nsResolver).nodeValue;
tag = newDoc.evaluate(''./td[2]/tt[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
var inds = newDoc.evaluate(''./td[3]/tt[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
tag = tag.replace(/[\r\n]/g, "");
if(tag.length == 1) {
@ -2051,7 +2044,7 @@ REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006
Scholar.wait();
}');
REPLACE INTO "translators" VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '2006-06-26 16:01:00', 4, 'Project MUSE Scraper', 'Simon Kornblith', '^http://muse\.jhu\.edu/(?:journals/[^/]+/[^/]+/[^/]+\.html|search/pia.cgi)',
REPLACE INTO "translators" VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '2006-06-26 16:01:00', 4, 'Project MUSE', 'Simon Kornblith', '^http://muse\.jhu\.edu/(?:journals/[^/]+/[^/]+/[^/]+\.html|search/pia.cgi)',
'function detectWeb(doc, url) {
var searchRe = new RegExp("^http://[^/]+/search/pia\.cgi");
if(searchRe.test(url)) {
@ -2073,8 +2066,8 @@ REPLACE INTO "translators" VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '2006
// Go through table rows
for(var i=0; i<tableRows.length; i++) {
// article_id is what we need to get it all as one file
var input = Scholar.Utilities.getNode(doc, tableRows[i], ''./tbody/tr/td/input[@name="article_id"]'', nsResolver);
var link = Scholar.Utilities.getNode(doc, tableRows[i], ''.//b/i/a/text()'', nsResolver);
var input = doc.evaluate(''./tbody/tr/td/input[@name="article_id"]'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
var link = doc.evaluate(''.//b/i/a/text()'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(input && input.value && link && link.nodeValue) {
items[input.value] = link.nodeValue;
}
@ -2163,7 +2156,7 @@ REPLACE INTO "translators" VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '2006
}
}');
REPLACE INTO "translators" VALUES ('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-08-07 21:55:00', 12, 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)',
REPLACE INTO "translators" VALUES ('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-08-07 21:55:00', 12, 'PubMed', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)',
'function detectWeb(doc, url) {
if(doc.location.href.indexOf("list_uids=") >= 0) {
return "journalArticle";
@ -2239,7 +2232,7 @@ function detectSearch(item) {
var date = article.Journal.JournalIssue.PubDate.Month.text()+" "+article.Journal.JournalIssue.PubDate.Day.text()+", "+article.Journal.JournalIssue.PubDate.Year.text();
var jsDate = new Date(date);
if(!isNaN(jsDate.valueOf())) {
date = Scholar.Utilities.dateToISO(jsDate);
date = Scholar.Utilities.dateToSQL(jsDate);
}
} else if(article.Journal.JournalIssue.PubDate.Month.text().toString() != "") {
var date = article.Journal.JournalIssue.PubDate.Month.text()+" "+article.Journal.JournalIssue.PubDate.Year.text();
@ -2293,8 +2286,8 @@ function doWeb(doc, url) {
var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//div[@class="ResultSet"]/table/tbody'', nsResolver);
// Go through table rows
for(var i=0; i<tableRows.length; i++) {
var link = Scholar.Utilities.getNode(doc, tableRows[i], ''.//a'', nsResolver);
var article = Scholar.Utilities.getNode(doc, tableRows[i], ''./tr[2]/td[2]/text()[1]'', nsResolver);
var link = doc.evaluate(''.//a'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
var article = doc.evaluate(''./tr[2]/td[2]/text()[1]'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
items[link.href] = article.nodeValue;
}
@ -2318,7 +2311,7 @@ function doSearch(item) {
lookupPMIDs([getPMID(item.contextObject)]);
}');
REPLACE INTO "translators" VALUES ('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006-06-26 16:41:00', 4, 'Embedded RDF Scraper', 'Simon Kornblith', NULL,
REPLACE INTO "translators" VALUES ('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006-06-26 16:41:00', 4, 'Embedded RDF', 'Simon Kornblith', NULL,
'function detectWeb(doc, url) {
var metaTags = doc.getElementsByTagName("meta");
@ -2347,7 +2340,7 @@ REPLACE INTO "translators" VALUES ('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006
foundTitle = true;
}
translator.Scholar.RDF.addStatement(url, dc + tag.substr(3), value, true);
Scholar.Utilities.debugPrint(tag.substr(3) + " = " + value);
Scholar.Utilities.debug(tag.substr(3) + " = " + value);
} else if(tag && value && (tag == "author" || tag == "author-personal")) {
translator.Scholar.RDF.addStatement(url, dc + "creator", value, true);
} else if(tag && value && tag == "author-corporate") {
@ -2362,7 +2355,7 @@ REPLACE INTO "translators" VALUES ('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006
translator.doImport();
}');
REPLACE INTO "translators" VALUES ('05d07af9-105a-4572-99f6-a8e231c0daef', '2006-08-07 01:09:00', 4, 'COinS Scraper', 'Simon Kornblith', NULL,
REPLACE INTO "translators" VALUES ('05d07af9-105a-4572-99f6-a8e231c0daef', '2006-08-07 01:09:00', 4, 'COinS', 'Simon Kornblith', NULL,
'function detectWeb(doc, url) {
var spanTags = doc.getElementsByTagName("span");
@ -2413,7 +2406,7 @@ function retrieveNextCOinS(needFullItems, newItems) {
if(needFullItems.length) {
var item = needFullItems.shift();
Scholar.Utilities.debugPrint("looking up contextObject");
Scholar.Utilities.debug("looking up contextObject");
var search = Scholar.loadTranslator("search");
search.setHandler("itemDone", function(obj, item) {
newItems.push(item);
@ -2490,7 +2483,7 @@ function doWeb(doc, url) {
}
}');
REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006-06-26 16:01:00', 4, 'Google Books Scraper', 'Simon Kornblith', '^http://books\.google\.com/books\?(.*vid=.*\&id=.*|.*q=.*)',
REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006-06-26 16:01:00', 4, 'Google Books', 'Simon Kornblith', '^http://books\.google\.com/books\?(.*vid=.*\&id=.*|.*q=.*)',
'function detectWeb(doc, url) {
var re = new RegExp(''^http://books\\.google\\.com/books\\?vid=([^&]+).*\\&id=([^&]+)'', ''i'');
if(re.test(doc.location.href)) {
@ -2526,8 +2519,7 @@ REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006
}
}
Scholar.Utilities.processDocuments(null, newUris, function(newBrowser) {
var newDoc = newBrowser.contentDocument;
Scholar.Utilities.processDocuments(newUris, function(newDoc) {
var newItem = new Scholar.Item("book");
newItem.source = newDoc.location.href;
@ -2539,8 +2531,8 @@ REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006
var xpath = ''//table[@id="bib"]/tbody/tr'';
var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver);
for(var i = 0; i<elmts.length; i++) {
var field = Scholar.Utilities.getNode(newDoc, elmts[i], ''./td[1]//text()'', nsResolver);
var value = Scholar.Utilities.getNode(newDoc, elmts[i], ''./td[2]//text()'', nsResolver);
var field = newDoc.evaluate(''./td[1]//text()'', elmts[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
var value = newDoc.evaluate(''./td[2]//text()'', elmts[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(field && value) {
field = Scholar.Utilities.superCleanString(field.nodeValue);
@ -2564,7 +2556,7 @@ REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006
jsDate = new Date(value);
if(!isNaN(jsDate.valueOf())) {
date = Scholar.Utilities.dateToISO(jsDate);
date = Scholar.Utilities.dateToSQL(jsDate);
}
newItem.date = date;
@ -2618,8 +2610,7 @@ function doSearch(item) {
var co = Scholar.Utilities.createContextObject(item);
}
Scholar.Utilities.processDocuments(null, ["http://partneraccess.oclc.org/wcpa/servlet/OpenUrl?"+co], function(browser) {
var doc = browser.contentDocument;
Scholar.Utilities.loadDocument("http://partneraccess.oclc.org/wcpa/servlet/OpenUrl?"+co, function(doc) {
// find new COinS in the Open WorldCat page
if(processOWC(doc)) { // we got a single item page
Scholar.done();
@ -2644,16 +2635,16 @@ function doSearch(item) {
urlsToProcess.push(elmt.href);
} while(elmt = elmts.iterateNext());
Scholar.Utilities.processDocuments(null, urlsToProcess, function(browser) {
Scholar.Utilities.processDocuments(urlsToProcess, function(doc) {
// per URL
processOWC(browser.contentDocument);
processOWC(doc);
}, function() { // done
Scholar.done();
}, function() { // error
Scholar.done(false);
});
}
}, null, function() {
}, function() {
error();
});
@ -3031,7 +3022,7 @@ function doImport() {
while(read = Scholar.read(16384)) {
text += read;
}
Scholar.Utilities.debugPrint("read in");
Scholar.Utilities.debug("read in");
// eliminate <?xml ?> heading so we can parse as XML
text = text.replace(/<\?xml[^?]+\?>/, "");
@ -3043,9 +3034,9 @@ function doImport() {
var xml = new XML(text);
for each(var mods in xml.m::mods) {
Scholar.Utilities.debugPrint("item is: ");
Scholar.Utilities.debug("item is: ");
for(var i in mods) {
Scholar.Utilities.debugPrint(i+" = "+mods[i].toString());
Scholar.Utilities.debug(i+" = "+mods[i].toString());
}
var newItem = new Scholar.Item();
@ -3863,8 +3854,8 @@ function doImport() {
} else if(type == n.bib+"Memo") {
// check to see if this note is independent
var arcs = Scholar.RDF.getArcsIn(node);
Scholar.Utilities.debugPrint("working on a note");
Scholar.Utilities.debugPrint(arcs);
Scholar.Utilities.debug("working on a note");
Scholar.Utilities.debug(arcs);
var skip = false;
for each(var arc in arcs) {
arc = Scholar.RDF.getResourceURI(arc);
@ -4239,7 +4230,7 @@ function doImport() {
var tag = data = false;
do { // first valid line is type
line = Scholar.read();
Scholar.Utilities.debugPrint(line);
Scholar.Utilities.debug(line);
} while(line !== false && line.substr(0, 6) != "TY - ");
var item = new Scholar.Item();
@ -4258,7 +4249,7 @@ function doImport() {
tag = line.substr(0,2);
data = line.substr(6);
Scholar.Utilities.debugPrint("tag: ''"+tag+"''; data: ''"+data+"''");
Scholar.Utilities.debug("tag: ''"+tag+"''; data: ''"+data+"''");
if(tag == "ER") { // ER signals end of reference
// unset info
@ -4692,7 +4683,7 @@ MARC_Record.prototype._associateDBField = function(item, fieldNo, part, fieldNam
part = ''a'';
}
var field = this.get_field_subfields(fieldNo);
Scholar.Utilities.debugPrint(''Found ''+field.length+'' matches for ''+fieldNo+part);
Scholar.Utilities.debug(''Found ''+field.length+'' matches for ''+fieldNo+part);
if(field) {
for(var i in field) {
var value = false;