Scrapable search results for SIRSI 2003+ scraper
This commit is contained in:
parent
9742283389
commit
83c36f330d
175
scrapers.sql
175
scrapers.sql
|
@ -1,7 +1,7 @@
|
||||||
-- 12
|
-- 12
|
||||||
|
|
||||||
-- Set the following timestamp to the most recent scraper update date
|
-- Set the following timestamp to the most recent scraper update date
|
||||||
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-23 10:11:00'));
|
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-23 12:17:00'));
|
||||||
|
|
||||||
REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
||||||
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
||||||
|
@ -661,7 +661,7 @@ if(newUri) {
|
||||||
|
|
||||||
wait();');
|
wait();');
|
||||||
|
|
||||||
REPLACE INTO "scrapers" VALUES('add7c71c-21f3-ee14-d188-caf9da12728b', '2006-06-12 09:58:00', 'SIRSI 2003+ Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
|
REPLACE INTO "scrapers" VALUES('add7c71c-21f3-ee14-d188-caf9da12728b', '2006-06-23 12:17:00', 'SIRSI 2003+ Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
|
||||||
'var namespace = doc.documentElement.namespaceURI;
|
'var namespace = doc.documentElement.namespaceURI;
|
||||||
var nsResolver = namespace ? function(prefix) {
|
var nsResolver = namespace ? function(prefix) {
|
||||||
if (prefix == ''x'') return namespace; else return null;
|
if (prefix == ''x'') return namespace; else return null;
|
||||||
|
@ -672,6 +672,12 @@ var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||||
if(elmts.length) {
|
if(elmts.length) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
var xpath = ''//td[@class="searchsum"]/table'';
|
||||||
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||||
|
if(elmts.length) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
return false;',
|
return false;',
|
||||||
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
||||||
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
||||||
|
@ -683,77 +689,122 @@ var nsResolver = namespace ? function(prefix) {
|
||||||
if (prefix == ''x'') return namespace; else return null;
|
if (prefix == ''x'') return namespace; else return null;
|
||||||
} : null;
|
} : null;
|
||||||
|
|
||||||
var uri = doc.location.href;
|
|
||||||
var data = new Object();
|
var data = new Object();
|
||||||
|
|
||||||
var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]'';
|
function scrape(doc) {
|
||||||
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
var uri = doc.location.href;
|
||||||
for (var i = 0; i < elmts.length; i++) {
|
|
||||||
var elmt = elmts[i];
|
|
||||||
try {
|
|
||||||
var node = utilities.getNode(doc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver);
|
|
||||||
if(!node) {
|
|
||||||
var node = utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver);
|
|
||||||
}
|
|
||||||
if(node) {
|
|
||||||
var field = utilities.superCleanString(utilities.getNode(doc, elmt, ''./TH[1]/text()[1]'', nsResolver).nodeValue);
|
|
||||||
field = field.toLowerCase();
|
|
||||||
var value = utilities.superCleanString(node.nodeValue);
|
|
||||||
var rdfUri = null;
|
|
||||||
if(field == "publisher") {
|
|
||||||
rdfUri = prefixDC + ''publisher'';
|
|
||||||
} else if(field == "pub date") {
|
|
||||||
rdfUri = prefixDC + ''year'';
|
|
||||||
|
|
||||||
var re = /[0-9]+/;
|
var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]'';
|
||||||
var m = re.exec(value);
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||||
value = m[0];
|
if(!elmts.length) {
|
||||||
} else if(field == "isbn") {
|
return false;
|
||||||
rdfUri = prefixDC + ''identifier'';
|
}
|
||||||
|
for (var i = 0; i < elmts.length; i++) {
|
||||||
var re = /^[0-9](?:[0-9X]+)/;
|
var elmt = elmts[i];
|
||||||
var m = re.exec(value);
|
try {
|
||||||
value = m[0];
|
var node = utilities.getNode(doc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver);
|
||||||
} else if(field == "title") {
|
if(!node) {
|
||||||
rdfUri = prefixDC + ''title'';
|
var node = utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver);
|
||||||
var titleParts = value.split(" / ");
|
|
||||||
value = titleParts[0];
|
|
||||||
} else if(field == "publication info") {
|
|
||||||
rdfUri = prefixDummy + ''place'';
|
|
||||||
var pubParts = value.split(" : ");
|
|
||||||
value = pubParts[0];
|
|
||||||
} else if(field == "personal author") {
|
|
||||||
rdfUri = prefixDC + ''creator'';
|
|
||||||
value = utilities.cleanAuthor(node.nodeValue);
|
|
||||||
} else if(field == "added author") {
|
|
||||||
rdfUri = prefixDC + ''contributor'';
|
|
||||||
value = utilities.cleanAuthor(node.nodeValue);
|
|
||||||
} else if(field == "corporate author") {
|
|
||||||
rdfUri = prefixDummy + ''corporateCreator'';
|
|
||||||
}
|
}
|
||||||
if(rdfUri) {
|
if(node) {
|
||||||
var insert = true;
|
var field = utilities.superCleanString(utilities.getNode(doc, elmt, ''./TH[1]/text()[1]'', nsResolver).nodeValue);
|
||||||
if(data && data[rdfUri]) {
|
field = field.toLowerCase();
|
||||||
for(j in data[rdfUri]) {
|
var value = utilities.superCleanString(node.nodeValue);
|
||||||
if(data[rdfUri][j] == value) {
|
var rdfUri = null;
|
||||||
insert = false;
|
if(field == "publisher") {
|
||||||
break;
|
rdfUri = prefixDC + ''publisher'';
|
||||||
|
} else if(field == "pub date") {
|
||||||
|
rdfUri = prefixDC + ''year'';
|
||||||
|
|
||||||
|
var re = /[0-9]+/;
|
||||||
|
var m = re.exec(value);
|
||||||
|
value = m[0];
|
||||||
|
} else if(field == "isbn") {
|
||||||
|
rdfUri = prefixDC + ''identifier'';
|
||||||
|
|
||||||
|
var re = /^[0-9](?:[0-9X]+)/;
|
||||||
|
var m = re.exec(value);
|
||||||
|
value = m[0];
|
||||||
|
} else if(field == "title") {
|
||||||
|
rdfUri = prefixDC + ''title'';
|
||||||
|
var titleParts = value.split(" / ");
|
||||||
|
value = titleParts[0];
|
||||||
|
} else if(field == "publication info") {
|
||||||
|
rdfUri = prefixDummy + ''place'';
|
||||||
|
var pubParts = value.split(" : ");
|
||||||
|
value = pubParts[0];
|
||||||
|
} else if(field == "personal author") {
|
||||||
|
rdfUri = prefixDC + ''creator'';
|
||||||
|
value = utilities.cleanAuthor(node.nodeValue);
|
||||||
|
} else if(field == "added author") {
|
||||||
|
rdfUri = prefixDC + ''contributor'';
|
||||||
|
value = utilities.cleanAuthor(node.nodeValue);
|
||||||
|
} else if(field == "corporate author") {
|
||||||
|
rdfUri = prefixDummy + ''corporateCreator'';
|
||||||
|
}
|
||||||
|
if(rdfUri) {
|
||||||
|
var insert = true;
|
||||||
|
if(data && data[rdfUri]) {
|
||||||
|
for(j in data[rdfUri]) {
|
||||||
|
if(data[rdfUri][j] == value) {
|
||||||
|
insert = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
} else if(!data[rdfUri]) {
|
||||||
|
data[rdfUri] = new Array();
|
||||||
|
}
|
||||||
|
if(insert) {
|
||||||
|
data[rdfUri].push(value);
|
||||||
|
model.addStatement(uri, rdfUri, value, true);
|
||||||
}
|
}
|
||||||
} else if(!data[rdfUri]) {
|
|
||||||
data[rdfUri] = new Array();
|
|
||||||
}
|
|
||||||
if(insert) {
|
|
||||||
data[rdfUri].push(value);
|
|
||||||
model.addStatement(uri, rdfUri, value, true);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
} catch (e) {}
|
||||||
} catch (e) {}
|
}
|
||||||
|
|
||||||
|
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
|
if(!scrape(doc)) {
|
||||||
|
var checkboxes = new Array();
|
||||||
|
var urls = new Array();
|
||||||
|
var availableItems = new Array();
|
||||||
|
|
||||||
|
var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''//td[@class="searchsum"]/table[//input[@value="Details"]]'', nsResolver);
|
||||||
|
// Go through table rows
|
||||||
|
for(var i=1; i<tableRows.length; i++) {
|
||||||
|
var input = utilities.getNode(doc, tableRows[i], ''.//input[@value="Details"]'', nsResolver);
|
||||||
|
checkboxes[i] = input.name;
|
||||||
|
var text = utilities.getNodeString(doc, tableRows[i], ''.//label/strong//text()'', nsResolver);
|
||||||
|
if(text) {
|
||||||
|
availableItems[i] = text;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var items = utilities.selectItems(availableItems);
|
||||||
|
|
||||||
|
if(!items) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
var hostRe = new RegExp("^http://[^/]+");
|
||||||
|
var m = hostRe.exec(doc.location.href);
|
||||||
|
var hitlist = doc.forms.namedItem("hitlist");
|
||||||
|
var baseUrl = m[0]+hitlist.getAttribute("action")+"?first_hit="+hitlist.elements.namedItem("first_hit").value+"&last_hit="+hitlist.elements.namedItem("last_hit").value;
|
||||||
|
utilities.debugPrint(baseUrl);
|
||||||
|
|
||||||
|
var uris = new Array();
|
||||||
|
for(i in items) {
|
||||||
|
uris.push(baseUrl+"&"+checkboxes[i]+"=Details");
|
||||||
|
}
|
||||||
|
|
||||||
|
utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) },
|
||||||
|
function() { done() }, function() {});
|
||||||
|
|
||||||
|
wait();
|
||||||
|
}
|
||||||
');
|
');
|
||||||
|
|
||||||
REPLACE INTO "scrapers" VALUES('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-18 09:58:00', 'ProQuest Scraper', 'Simon Kornblith', 'http://proquest\.umi\.com/pqdweb\?(?:.*\&)?did=', '',
|
REPLACE INTO "scrapers" VALUES('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-18 09:58:00', 'ProQuest Scraper', 'Simon Kornblith', 'http://proquest\.umi\.com/pqdweb\?(?:.*\&)?did=', '',
|
||||||
|
|
Loading…
Reference in New Issue
Block a user