
- Make Amazon scraper work with multiple documents - Fix bugs in processDocuments - Make Scholar.Ingester.Utilities.getItemArray() willing to take an array of DOM nodes to search for links, and finally take advantage of the fact that objects have no length
1545 lines
55 KiB
SQL
1545 lines
55 KiB
SQL
-- 11
|
|
|
|
-- Set the following timestamp to the most recent scraper update date
|
|
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-22 22:58:00'));
|
|
|
|
REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|
|
|
var namespace = doc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == ''x'') return namespace; else return null;
|
|
} : null;
|
|
|
|
function scrape(doc) {
|
|
uri = doc.location.href;
|
|
|
|
// Retrieve authors
|
|
var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a'';
|
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
|
for (var i = 0; i < elmts.length; i++) {
|
|
var elmt = elmts[i];
|
|
|
|
model.addStatement(uri, prefixDC + ''creator'', utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here
|
|
}
|
|
|
|
// Retrieve data from "Product Details" box
|
|
var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]/div[@class="content"]/ul/li'';
|
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
|
for (var i = 0; i < elmts.length; i++) {
|
|
var elmt = elmts[i];
|
|
var attribute = utilities.cleanString(utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue);
|
|
if(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) {
|
|
var value = utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue);
|
|
if(attribute == "Publisher:") {
|
|
if(value.lastIndexOf("(") != -1) {
|
|
var date = value.substring(value.lastIndexOf("(")+1, value.length-1);
|
|
jsDate = new Date(date);
|
|
if(!isNaN(jsDate.valueOf())) {
|
|
date = utilities.dateToISO(jsDate);
|
|
}
|
|
|
|
value = value.substring(0, value.lastIndexOf("(")-1);
|
|
}
|
|
if(value.lastIndexOf(";") != -1) {
|
|
var edition = value.substring(value.lastIndexOf(";")+2, value.length);
|
|
value = value.substring(0, value.lastIndexOf(";"));
|
|
}
|
|
model.addStatement(uri, prefixDC + ''publisher'', value);
|
|
model.addStatement(uri, prefixDC + ''date'', date);
|
|
model.addStatement(uri, prefixDC + ''hasVersion'', edition);
|
|
} else if(attribute == "Language:") {
|
|
model.addStatement(uri, prefixDC + ''language'', value);
|
|
} else if(attribute == "ISBN:") {
|
|
model.addStatement(uri, prefixDC + ''identifier'', ''ISBN ''+value);
|
|
} else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") {
|
|
model.addStatement(uri, prefixDummy + ''pages'', value.substring(0, value.indexOf(" ")));
|
|
model.addStatement(uri, prefixDC + ''medium'', attribute.substring(0, attribute.indexOf(":")));
|
|
}
|
|
}
|
|
}
|
|
|
|
var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]'';
|
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
|
var title = utilities.cleanString(utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue);
|
|
if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) {
|
|
title = title.substring(0, title.lastIndexOf("(")-1);
|
|
}
|
|
model.addStatement(uri, prefixDC + ''title'', title);
|
|
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
|
|
}
|
|
|
|
var searchRe = new RegExp(''http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/)'');
|
|
var m = searchRe.exec(doc.location.href)
|
|
if(m) {
|
|
// Why can''t amazon use standard stylesheets
|
|
var xpath;
|
|
if(m == "gp/search/") {
|
|
xpath = ''//table[@class="searchresults"]'';
|
|
} else {
|
|
xpath = ''//table[@cellpadding="3"]'';
|
|
}
|
|
|
|
var searchresults = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
|
var items = utilities.getItemArray(doc, searchresults, ''http://www\.amazon\.com/(gp/product/|exec/obidos/tg/detail/)'', ''^(Buy new|Hardcover|Paperback|Digital)$'');
|
|
items = utilities.selectItems(items);
|
|
|
|
if(!items) {
|
|
return true;
|
|
}
|
|
|
|
var uris = new Array();
|
|
for(i in items) {
|
|
uris.push(i);
|
|
}
|
|
|
|
utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) },
|
|
function() {
|
|
utilities.debugPrint("look, done");
|
|
done();
|
|
}, function() {});
|
|
|
|
wait();
|
|
} else {
|
|
scrape(doc);
|
|
}');
|
|
|
|
REPLACE INTO "scrapers" VALUES('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-18 11:02:00', 'WorldCat Scraper', 'Simon Kornblith', '^http://newfirstsearch\.oclc\.org/WebZ/',
|
|
'if(doc.title == ''FirstSearch: WorldCat Detailed Record'') {
|
|
return true;
|
|
}
|
|
return false;',
|
|
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|
|
|
var sessionRegexp = /(?:\?|\:)sessionid=([^?:]+)(?:\?|\:|$)/;
|
|
var numberRegexp = /(?:\?|\:)recno=([^?:]+)(?:\?|\:|$)/;
|
|
var resultsetRegexp = /(?:\?|\:)resultset=([^?:]+)(?:\?|\:|$)/;
|
|
var lineRegexp = /^([\w() ]+): *(.*)$/;
|
|
var publisherRegexp = /^(.*), (.*?),?$/;
|
|
|
|
var uri = doc.location.href;
|
|
|
|
var sMatch = sessionRegexp.exec(uri);
|
|
var sessionid = sMatch[1];
|
|
|
|
var nMatch = numberRegexp.exec(uri);
|
|
if(nMatch) {
|
|
var number = nMatch[1];
|
|
} else {
|
|
number = 1;
|
|
}
|
|
|
|
var rMatch = resultsetRegexp.exec(uri);
|
|
if(rMatch) {
|
|
var resultset = rMatch[1];
|
|
} else {
|
|
// It''s in an XPCNativeWrapper, so we have to do this black magic
|
|
resultset = doc.forms.namedItem(''main'').elements.namedItem(''resultset'').value;
|
|
}
|
|
|
|
var newUri = ''http://newfirstsearch.oclc.org/WebZ/DirectExport?numrecs=10:smartpage=directexport:entityexportnumrecs=10:entityexportresultset='' + resultset + '':entityexportrecno='' + number + '':sessionid='' + sessionid + '':entitypagenum=35:0'';
|
|
|
|
utilities.HTTPUtilities.doPost(newUri, ''exportselect=record&exporttype=plaintext'', null, function(text) {
|
|
var lines = text.split(''\n'');
|
|
for(var i=0;i<lines.length;i++) {
|
|
match = lineRegexp.exec(lines[i]);
|
|
if(match) {
|
|
if(match[1] == ''Title'') {
|
|
var title = match[2];
|
|
if(!lineRegexp.test(lines[i+1])) {
|
|
i++;
|
|
title += '' ''+lines[i];
|
|
}
|
|
if(title.substring(title.length-2) == " /") {
|
|
title = title.substring(0, title.length-2);
|
|
}
|
|
model.addStatement(uri, prefixDC + ''title'', title);
|
|
} else if(match[1] == ''Author(s)'') {
|
|
var authors = match[2].split('';'');
|
|
if(authors) {
|
|
model.addStatement(uri, prefixDC + ''creator'', utilities.cleanAuthor(authors[0]));
|
|
for(var j=1; j<authors.length; j+=2) {
|
|
if(authors[j-1].substring(0, 1) == ''('') {
|
|
j++;
|
|
}
|
|
model.addStatement(uri, prefixDC + ''creator'', utilities.cleanAuthor(authors[j]));
|
|
}
|
|
} else {
|
|
model.addStatement(uri, prefixDC + ''creator'', utilities.trimString(match[2]));
|
|
}
|
|
} else if(match[1] == ''Publication'') {
|
|
// Don''t even try to deal with this. The WorldCat metadata is of poor enough quality that this isn''t worth it.
|
|
match[2] = utilities.trimString(match[2]);
|
|
if(match[2].substring(match[2].length-1) == '','') {
|
|
match[2] = match[2].substring(0, match[2].length-1);
|
|
}
|
|
model.addStatement(uri, prefixDC + ''publisher'', match[2]);
|
|
} else if(match[1] == ''Language'') {
|
|
model.addStatement(uri, prefixDC + ''language'', utilities.trimString(match[2]));
|
|
} else if(match[1] == ''Standard No'') {
|
|
var identifiers = match[2].split(/ +/);
|
|
var j=0;
|
|
while(j<(identifiers.length-1)) {
|
|
var type = identifiers[j].substring(0, identifiers[j].length-1);
|
|
var lastChar;
|
|
var value;
|
|
|
|
j++;
|
|
while(j<identifiers.length && (lastChar = identifiers[j].substring(identifiers[j].length-1)) != '':'') {
|
|
if(identifiers[j].substring(0, 1) != ''('') {
|
|
if(lastChar == '';'') {
|
|
value = identifiers[j].substring(0, identifiers[j].length-1);
|
|
} else {
|
|
value = identifiers[j];
|
|
}
|
|
model.addStatement(uri, prefixDC + ''identifier'', type + '' '' + value);
|
|
}
|
|
j++;
|
|
}
|
|
}
|
|
} else if(match[1] == ''Year'') {
|
|
model.addStatement(uri, prefixDC + ''year'', match[2]);
|
|
}
|
|
}
|
|
}
|
|
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
|
|
|
|
done();
|
|
})
|
|
wait();');
|
|
|
|
REPLACE INTO "scrapers" VALUES('88915634-1af6-c134-0171-56fd198235ed', '2006-06-22 16:51:00', 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi',
|
|
'var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options;
|
|
for(i in export_options) {
|
|
if(export_options[i].text == ''Latin1 MARC''
|
|
|| export_options[i].text == ''Raw MARC''
|
|
|| export_options[i].text == ''UTF-8''
|
|
|| export_options[i].text == ''MARC (Unicode/UTF-8)''
|
|
|| export_options[i].text == ''MARC (non-Unicode/MARC-8)'') {
|
|
// We have an exportable single record
|
|
return true;
|
|
}
|
|
}
|
|
return false;',
|
|
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|
|
|
var uri = doc.location.href;
|
|
var postString = '''';
|
|
var form = doc.forms.namedItem(''frm'');
|
|
var newUri = form.action;
|
|
var multiple = false;
|
|
|
|
if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) {
|
|
multiple = true;
|
|
|
|
var availableItems = new Object(); // Technically, associative arrays are objects
|
|
|
|
var namespace = doc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == ''x'') return namespace; else return null;
|
|
} : null;
|
|
|
|
// Require link to match this
|
|
var tagRegexp = new RegExp();
|
|
tagRegexp.compile(''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti='');
|
|
// Do not allow text to match this
|
|
var rejectRegexp = new RegExp();
|
|
rejectRegexp.compile(''\[ [0-9]+ \]'');
|
|
|
|
var checkboxes = new Array();
|
|
var urls = new Array();
|
|
|
|
var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/table/tbody/tr[td/input[@type="checkbox"]]'', nsResolver);
|
|
// Go through table rows
|
|
for(var i=0; i<tableRows.length; i++) {
|
|
// CHK is what we need to get it all as one file
|
|
var input = utilities.getNode(doc, tableRows[i], ''./td/input[@name="CHK"]'', nsResolver);
|
|
checkboxes[i] = input.value;
|
|
var links = utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver);
|
|
urls[i] = links[0].href;
|
|
utilities.debugPrint(urls[i]+" = "+links[0].href);
|
|
// Go through links
|
|
for(var j=0; j<links.length; j++) {
|
|
if(tagRegexp.test(links[j].href)) {
|
|
var text = utilities.getNodeString(doc, links[j], ''.//text()'', null);
|
|
if(text) {
|
|
text = utilities.cleanString(text);
|
|
if(!rejectRegexp.test(text)) {
|
|
if(availableItems[i]) {
|
|
availableItems[i] += " "+text;
|
|
} else {
|
|
availableItems[i] = text;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
var items = utilities.selectItems(availableItems);
|
|
if(!items) {
|
|
return true;
|
|
}
|
|
|
|
// add arguments for items we need to grab
|
|
for(i in items) {
|
|
postString += "CHK="+checkboxes[i]+"&";
|
|
}
|
|
}
|
|
|
|
var raw, unicode, latin1;
|
|
|
|
for(i in form.elements) {
|
|
if(form.elements[i].type == ''HIDDEN'' || form.elements[i].type == ''hidden'') {
|
|
postString += escape(form.elements[i].name)+''=''+escape(form.elements[i].value)+''&'';
|
|
}
|
|
}
|
|
|
|
var export_options = form.elements.namedItem(''RD'').options;
|
|
for(i in export_options) {
|
|
if(export_options[i].text == ''Raw MARC''
|
|
|| export_options[i].text == ''MARC (non-Unicode/MARC-8)'') {
|
|
raw = i;
|
|
} if(export_options[i].text == ''Latin1 MARC'') {
|
|
latin1 = i;
|
|
} else if(export_options[i].text == ''UTF-8''
|
|
|| export_options[i].text == ''MARC (Unicode/UTF-8)'') {
|
|
unicode = i;
|
|
}
|
|
}
|
|
postString += ''RD=''+i+''&MAILADDY=&SAVE=Press+to+SAVE+or+PRINT'';
|
|
|
|
utilities.debugPrint(postString);
|
|
|
|
// No idea why this doesn''t work as post
|
|
utilities.HTTPUtilities.doGet(newUri+''?''+postString, null, function(text) {
|
|
var records = text.split("\x1D");
|
|
for(var i=0; i<(records.length-1); i++) {
|
|
if(multiple) {
|
|
utilities.debugPrint("uri = urls["+i+"]");
|
|
uri = urls[i];
|
|
utilities.debugPrint("my uri = "+uri);
|
|
}
|
|
var record = new MARC_Record();
|
|
record.load(records[i], "binary");
|
|
utilities.importMARCRecord(record, uri, model);
|
|
}
|
|
done();
|
|
})
|
|
wait();');
|
|
|
|
REPLACE INTO "scrapers" VALUES('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-18 11:02:00', 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse)',
|
|
'var namespace = doc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == ''x'') return namespace; else return null;
|
|
} : null;
|
|
|
|
// If this is a view page, find the link to the citation
|
|
var xpath = ''/html/body/div[@class="indent"]/center/font/p/a[@class="nav"]'';
|
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
|
if(!elmts.length) {
|
|
var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]'';
|
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
|
}
|
|
if(elmts && elmts.length) {
|
|
return true;
|
|
}
|
|
return false;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|
|
|
var namespace = doc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == ''x'') return namespace; else return null;
|
|
} : null;
|
|
|
|
var uri = doc.location.href;
|
|
|
|
// If this is a view page, find the link to the citation
|
|
var xpath = ''/html/body/div[@class="indent"]/center/font/p/a[@class="nav"]'';
|
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
|
if(!elmts.length) {
|
|
var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]'';
|
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
|
}
|
|
|
|
var saveCitation = elmts[0].href;
|
|
var viewSavedCitations = elmts[1].href;
|
|
saveCitation = saveCitation.replace(''citationAction=remove'', ''citationAction=save'');
|
|
|
|
// Parse save citation link
|
|
var importantCitationRegexp = /userID.*$/;
|
|
var match = importantCitationRegexp.exec(viewSavedCitations);
|
|
var postData = match[0]+''&citationAction=removeAll&confirmRemAll=on'';
|
|
utilities.HTTPUtilities.doPost(''http://www.jstor.org/browse'', postData, null, function() { // clear marked
|
|
utilities.HTTPUtilities.doGet(saveCitation, null, function() { // mark this
|
|
utilities.HTTPUtilities.doGet(''http://www.jstor.org/browse/citations.txt?exportAction=Save+as+Text+File&exportFormat=cm&''+match[0], null, function(text) {
|
|
// get marked
|
|
var lines = text.split("\n");
|
|
var haveStarted = false;
|
|
|
|
var data = new Object();
|
|
data[prefixDC + "title"] = new Array();
|
|
data[prefixDC + "creator"] = new Array();
|
|
data[prefixDummy + "publication"] = new Array();
|
|
data[prefixDummy + "volume"] = new Array();
|
|
data[prefixDummy + "number"] = new Array();
|
|
data[prefixDummy + "series"] = new Array();
|
|
data[prefixDC + "date"] = new Array();
|
|
data[prefixDummy + "pages"] = new Array();
|
|
data[prefixDC + "identifier"] = new Array();
|
|
data[prefixDC + "publisher"] = new Array();
|
|
|
|
var stableURL;
|
|
|
|
for(i in lines) {
|
|
if(haveStarted) {
|
|
var fieldCode = lines[i].substring(0, 2);
|
|
var fieldContent = utilities.cleanString(lines[i].substring(5));
|
|
|
|
if(lines[i].substring(2, 5) != " : ") {
|
|
break;
|
|
}
|
|
|
|
if(fieldCode == "TI") {
|
|
data[prefixDC + "title"].push(fieldContent);
|
|
} else if(fieldCode == "AU") {
|
|
var authors = fieldContent.split(";");
|
|
for(j in authors) {
|
|
var author = authors[j];
|
|
if(author) {
|
|
var splitNames = author.split('', '');
|
|
if(splitNames) {
|
|
author = splitNames[1]+'' ''+splitNames[0];
|
|
}
|
|
data[prefixDC + "creator"].push(author);
|
|
}
|
|
}
|
|
} else if(fieldCode == "SO") {
|
|
data[prefixDummy + "publication"].push(fieldContent);
|
|
} else if(fieldCode == "VO") {
|
|
data[prefixDummy + "volume"].push(fieldContent);
|
|
} else if(fieldCode == "NO") {
|
|
data[prefixDummy + "number"].push(fieldContent);
|
|
} else if(fieldCode == "SE") {
|
|
data[prefixDummy + "series"].push(fieldContent);
|
|
} else if(fieldCode == "DA") {
|
|
var date = new Date(fieldContent.replace(".", ""));
|
|
if(isNaN(date.valueOf())) {
|
|
data[prefixDC + "date"].push(fieldContent);
|
|
} else {
|
|
data[prefixDC + "date"].push(utilities.dateToISO(date));
|
|
}
|
|
} else if(fieldCode == "PP") {
|
|
data[prefixDummy + "pages"].push(fieldContent);
|
|
} else if(fieldCode == "EI") {
|
|
stableURL = fieldContent;
|
|
} else if(fieldCode == "IN") {
|
|
data[prefixDC + "identifier"].push("ISSN "+fieldContent);
|
|
} else if(fieldCode == "PB") {
|
|
data[prefixDC + "publisher"].push(fieldContent);
|
|
}
|
|
}
|
|
if(lines[i].substring(0,3) == "<1>") {
|
|
haveStarted = true;
|
|
}
|
|
}
|
|
|
|
// Loop through again so that we can add with the stableURL
|
|
model.addStatement(stableURL, prefixRDF + "type", prefixDummy + "journal", false);
|
|
for(i in data) {
|
|
if(data[i].length) {
|
|
for(j in data[i]) {
|
|
model.addStatement(stableURL, i, data[i][j]);
|
|
}
|
|
}
|
|
}
|
|
|
|
done();
|
|
})
|
|
})
|
|
});
|
|
|
|
wait();');
|
|
|
|
REPLACE INTO "scrapers" VALUES('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-18 11:02:00', 'History Cooperative Scraper', 'Simon Kornblith', '^http://www\.historycooperative\.org/journals/.+/.+/.+\.html', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|
|
|
var uri = doc.location.href;
|
|
|
|
var month, year;
|
|
|
|
var metaTags = doc.getElementsByTagName("meta");
|
|
|
|
function associateMeta(field, rdfUri) {
|
|
var field = metaTags.namedItem(field);
|
|
if(field) {
|
|
model.addStatement(uri, rdfUri, field.getAttribute("content"), false);
|
|
}
|
|
}
|
|
|
|
associateMeta("Title", prefixDC + "title");
|
|
associateMeta("Journal", prefixDummy + "publication");
|
|
associateMeta("Volume", prefixDummy + "volume");
|
|
associateMeta("Issue", prefixDummy + "number");
|
|
|
|
var author = metaTags.namedItem("Author");
|
|
if(author) {
|
|
var authors = author.getAttribute("content").split(" and ");
|
|
for(j in authors) {
|
|
model.addStatement(uri, prefixDC + "creator", authors[j], false);
|
|
}
|
|
}
|
|
|
|
var month = metaTags.namedItem("PublicationMonth");
|
|
var year = metaTags.namedItem("PublicationYear");
|
|
if(month && year) {
|
|
model.addStatement(uri, prefixDC + "date", month.getAttribute("content")+" "+year.getAttribute("content"), false);
|
|
}
|
|
|
|
model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
|
|
');
|
|
|
|
REPLACE INTO "scrapers" VALUES('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-06-18 16:55:00', 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)',
|
|
'// First, check to see if the URL alone reveals InnoPAC, since some sites don''t reveal the MARC button
|
|
var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$'');
|
|
if(matchRegexp.test(doc.location.href)) {
|
|
return true;
|
|
}
|
|
// Next, look for the MARC button
|
|
var namespace = doc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == ''x'') return namespace; else return null;
|
|
} : null;
|
|
|
|
var xpath = ''//a[img[@alt="MARC Display"]]'';
|
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
|
if(elmts.length) {
|
|
return true;
|
|
}
|
|
return false;
|
|
',
|
|
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|
|
|
var uri = doc.location.href;
|
|
var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$'');
|
|
var m = matchRegexp.exec(uri);
|
|
if(m) {
|
|
var newUri = m[1]+''marc''+m[2];
|
|
} else {
|
|
var namespace = doc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == ''x'') return namespace; else return null;
|
|
} : null;
|
|
|
|
var xpath = ''//a[img[@alt="MARC Display"]]'';
|
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
|
var newUri = elmts[0].href;
|
|
}
|
|
|
|
utilities.loadDocument(newUri, browser, function(newBrowser) {
|
|
newDoc = newBrowser.contentDocument;
|
|
|
|
var namespace = newDoc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == ''x'') return namespace; else return null;
|
|
} : null;
|
|
|
|
var xpath = ''//pre'';
|
|
var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver);
|
|
|
|
var text = utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue;
|
|
|
|
var record = new MARC_Record();
|
|
record.load(text, "MARC_PAC");
|
|
utilities.importMARCRecord(record, uri, model);
|
|
done();
|
|
}, function() {});
|
|
|
|
wait();');
|
|
|
|
REPLACE INTO "scrapers" VALUES('add7c71c-21f3-ee14-d188-caf9da12728b', '2006-06-12 09:58:00', 'SIRSI 2003+ Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
|
|
'var namespace = doc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == ''x'') return namespace; else return null;
|
|
} : null;
|
|
|
|
var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]'';
|
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
|
if(elmts.length) {
|
|
return true;
|
|
}
|
|
return false;',
|
|
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|
|
|
var namespace = doc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == ''x'') return namespace; else return null;
|
|
} : null;
|
|
|
|
var uri = doc.location.href;
|
|
var data = new Object();
|
|
|
|
var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]'';
|
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
|
for (var i = 0; i < elmts.length; i++) {
|
|
var elmt = elmts[i];
|
|
try {
|
|
var node = utilities.getNode(doc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver);
|
|
if(!node) {
|
|
var node = utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver);
|
|
}
|
|
if(node) {
|
|
var field = utilities.superCleanString(utilities.getNode(doc, elmt, ''./TH[1]/text()[1]'', nsResolver).nodeValue);
|
|
field = field.toLowerCase();
|
|
var value = utilities.superCleanString(node.nodeValue);
|
|
var rdfUri = null;
|
|
if(field == "publisher") {
|
|
rdfUri = prefixDC + ''publisher'';
|
|
} else if(field == "pub date") {
|
|
rdfUri = prefixDC + ''year'';
|
|
|
|
var re = /[0-9]+/;
|
|
var m = re.exec(value);
|
|
value = m[0];
|
|
} else if(field == "isbn") {
|
|
rdfUri = prefixDC + ''identifier'';
|
|
|
|
var re = /^[0-9](?:[0-9X]+)/;
|
|
var m = re.exec(value);
|
|
value = m[0];
|
|
} else if(field == "title") {
|
|
rdfUri = prefixDC + ''title'';
|
|
var titleParts = value.split(" / ");
|
|
value = titleParts[0];
|
|
} else if(field == "publication info") {
|
|
rdfUri = prefixDummy + ''place'';
|
|
var pubParts = value.split(" : ");
|
|
value = pubParts[0];
|
|
} else if(field == "personal author") {
|
|
rdfUri = prefixDC + ''creator'';
|
|
value = utilities.cleanAuthor(node.nodeValue);
|
|
} else if(field == "added author") {
|
|
rdfUri = prefixDC + ''contributor'';
|
|
value = utilities.cleanAuthor(node.nodeValue);
|
|
} else if(field == "corporate author") {
|
|
rdfUri = prefixDummy + ''corporateCreator'';
|
|
}
|
|
if(rdfUri) {
|
|
var insert = true;
|
|
if(data && data[rdfUri]) {
|
|
for(j in data[rdfUri]) {
|
|
if(data[rdfUri][j] == value) {
|
|
insert = false;
|
|
break;
|
|
}
|
|
}
|
|
} else if(!data[rdfUri]) {
|
|
data[rdfUri] = new Array();
|
|
}
|
|
if(insert) {
|
|
data[rdfUri].push(value);
|
|
model.addStatement(uri, rdfUri, value, true);
|
|
}
|
|
}
|
|
}
|
|
} catch (e) {}
|
|
|
|
}
|
|
|
|
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
|
|
');
|
|
|
|
REPLACE INTO "scrapers" VALUES('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-18 09:58:00', 'ProQuest Scraper', 'Simon Kornblith', 'http://proquest\.umi\.com/pqdweb\?(?:.*\&)?did=', '',
|
|
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|
|
|
var namespace = doc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == ''x'') return namespace; else return null;
|
|
} : null;
|
|
|
|
var uri = doc.location.href;
|
|
var data = new Object();
|
|
|
|
// Title
|
|
var xpath = ''/html/body/span[@class="textMedium"]/table/tbody/tr/td[@class="headerBlack"]/strong//text()'';
|
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
|
var title = "";
|
|
for (var i = 0; i < elmts.length; i++) {
|
|
var elmt = elmts[i];
|
|
title += elmt.nodeValue;
|
|
}
|
|
if(title) {
|
|
model.addStatement(uri, prefixDC + ''title'', title, true);
|
|
}
|
|
|
|
// Authors
|
|
var xpath = ''/html/body/span[@class="textMedium"]/table/tbody/tr/td[@class="textMedium"]/a/em'';
|
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
|
for (var i = 0; i < elmts.length; i++) {
|
|
var elmt = elmts[i];
|
|
|
|
// Dirty hack to fix highlighted words
|
|
var xpath = ''.//text()'';
|
|
var author = "";
|
|
var authorElmts = utilities.gatherElementsOnXPath(doc, elmt, xpath, nsResolver);
|
|
for (var j = 0; j < authorElmts.length; j++) {
|
|
var authorElmt = authorElmts[j];
|
|
author += authorElmt.nodeValue;
|
|
}
|
|
model.addStatement(uri, prefixDC + ''creator'', utilities.cleanAuthor(author), true);
|
|
}
|
|
|
|
// Other info
|
|
var xpath = ''/html/body/span[@class="textMedium"]/font/table/tbody/tr'';
|
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
|
for (var i = 0; i < elmts.length; i++) {
|
|
var elmt = elmts[i];
|
|
var field = utilities.superCleanString(utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue).toLowerCase();
|
|
if(field == "publication title") {
|
|
var publication = utilities.getNode(doc, elmt, ''./TD[2]/A[1]/text()[1]'', nsResolver);
|
|
if(publication.nodeValue) {
|
|
model.addStatement(uri, prefixDummy + ''publication'', utilities.superCleanString(publication.nodeValue), true);
|
|
}
|
|
var place = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
|
|
if(place.nodeValue) {
|
|
model.addStatement(uri, prefixDummy + ''place'', utilities.superCleanString(place.nodeValue), true);
|
|
}
|
|
var date = utilities.getNode(doc, elmt, ''./TD[2]/A[2]/text()[1]'', nsResolver);
|
|
if(date.nodeValue) {
|
|
var jsDate = new Date(utilities.superCleanString(date.nodeValue));
|
|
model.addStatement(uri, prefixDC + ''date'', utilities.dateToISO(jsDate), true);
|
|
}
|
|
var moreInfo = utilities.getNode(doc, elmt, ''./TD[2]/text()[2]'', nsResolver);
|
|
if(moreInfo.nodeValue) {
|
|
moreInfo = utilities.superCleanString(moreInfo.nodeValue);
|
|
var parts = moreInfo.split(";\xA0");
|
|
|
|
var issueRegexp = /^(\w+)\.(?: |\xA0)?(.+)$/
|
|
var issueInfo = parts[0].split(",\xA0");
|
|
for(j in issueInfo) {
|
|
var m = issueRegexp.exec(issueInfo[j]);
|
|
var info = m[1].toLowerCase();
|
|
if(info == "vol") {
|
|
model.addStatement(uri, prefixDummy + ''volume'', utilities.superCleanString(m[2]), true);
|
|
} else if(info == "iss" || info == "no") {
|
|
model.addStatement(uri, prefixDummy + ''number'', utilities.superCleanString(m[2]), true);
|
|
}
|
|
}
|
|
if(parts[1] && utilities.superCleanString(parts[1]).substring(0, 3).toLowerCase() == "pg.") {
|
|
var re = /[0-9\-]+/;
|
|
var m = re.exec(parts[1]);
|
|
|
|
if(m) {
|
|
model.addStatement(uri, prefixDummy + ''pages'', m[0], true);
|
|
}
|
|
}
|
|
}
|
|
} else if(field == "source type") {
|
|
var value = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
|
|
if(value.nodeValue) {
|
|
value = utilities.superCleanString(value.nodeValue).toLowerCase();
|
|
|
|
if(value == "periodical") {
|
|
model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
|
|
} else if(value == "newspaper") {
|
|
model.addStatement(uri, prefixRDF + "type", prefixDummy + "newspaper", false);
|
|
} else {
|
|
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
|
|
}
|
|
}
|
|
} else if(field == "isbn" || field == "issn" || field == "issn/isbn") {
|
|
var value = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
|
|
if(value) {
|
|
var type;
|
|
value = utilities.superCleanString(value.nodeValue);
|
|
if(value.length == 10 || value.length == 13) {
|
|
type = "ISBN";
|
|
} else if(value.length == 8) {
|
|
type = "ISSN";
|
|
}
|
|
if(type) {
|
|
model.addStatement(uri, prefixDC + "identifier", type+" "+value, false);
|
|
}
|
|
}
|
|
}
|
|
}');
|
|
|
|
REPLACE INTO "scrapers" VALUES('6773a9af-5375-3224-d148-d32793884dec', '2006-06-18 11:19:00', 'InfoTrac Scraper', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/',
|
|
'if(doc.title.substring(0, 8) == "Article ") {
|
|
return true;
|
|
}
|
|
return false;',
|
|
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|
|
|
var namespace = doc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == ''x'') return namespace; else return null;
|
|
} : null;
|
|
|
|
var uri = doc.location.href;
|
|
|
|
var xpath = ''/html/body//comment()'';
|
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
|
for (var i = 0; i < elmts.length; i++) {
|
|
var elmt = elmts[i];
|
|
var colon = elmt.nodeValue.indexOf(":");
|
|
var field = elmt.nodeValue.substring(1, colon).toLowerCase();
|
|
var value = elmt.nodeValue.substring(colon+1, elmt.nodeValue.length-1);
|
|
if(field == "title") {
|
|
model.addStatement(uri, prefixDC + "title", value, false);
|
|
} else if(field == "journal") {
|
|
model.addStatement(uri, prefixDummy + "publication", value, false);
|
|
} else if(field == "pi") {
|
|
parts = value.split(" ");
|
|
var date = "";
|
|
var isDate = true;
|
|
var rdfUri;
|
|
for(j in parts) {
|
|
firstChar = parts[j].substring(0, 1);
|
|
rdfUri = false;
|
|
|
|
if(firstChar == "v") {
|
|
rdfUri = prefixDummy + "volume";
|
|
} else if(firstChar == "i") {
|
|
rdfUri = prefixDummy + "number";
|
|
} else if(firstChar == "p") {
|
|
rdfUri = prefixDummy + "pages";
|
|
var pagesRegexp = /p(\w+)\((\w+)\)/;
|
|
var match = pagesRegexp.exec(parts[j]);
|
|
if(match) {
|
|
var finalPage = parseInt(match[1])+parseInt(match[2])
|
|
parts[j] = "p"+match[1]+"-"+finalPage.toString();
|
|
}
|
|
}
|
|
|
|
if(rdfUri) {
|
|
isDate = false;
|
|
if(parts[j] != "pNA") { // not a real page number
|
|
var content = parts[j].substring(1);
|
|
model.addStatement(uri, rdfUri, content, true);
|
|
}
|
|
} else if(isDate) {
|
|
date += " "+parts[j];
|
|
}
|
|
}
|
|
if(date != "") {
|
|
model.addStatement(uri, prefixDC + "date", date.substring(1), false);
|
|
}
|
|
} else if(field == "author") {
|
|
model.addStatement(uri, prefixDC + "creator", utilities.cleanAuthor(value), false);
|
|
}
|
|
}
|
|
model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);');
|
|
|
|
REPLACE INTO "scrapers" VALUES('b047a13c-fe5c-6604-c997-bef15e502b09', '2006-06-18 10:13:00', 'LexisNexis Scraper', 'Simon Kornblith', '^http://web\.lexis-nexis\.com/universe/document', NULL,
|
|
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|
|
|
var uri = doc.location.href;
|
|
|
|
var citationDataDiv;
|
|
var divs = doc.getElementsByTagName("div");
|
|
for(i in divs) {
|
|
if(divs[i].className == "bodytext") {
|
|
citationDataDiv = divs[i];
|
|
break;
|
|
}
|
|
}
|
|
|
|
centerElements = citationDataDiv.getElementsByTagName("center");
|
|
var elementParts = centerElements[0].innerHTML.split(/<br[^>]*>/gi);
|
|
model.addStatement(uri, prefixDummy + "publication", elementParts[elementParts.length-1], true);
|
|
|
|
var dateRegexp = /<br[^>]*>(?:<b>)?([A-Z][a-z]+)(?:<\/b>)? ([0-9]+, [0-9]{4})/;
|
|
var m = dateRegexp.exec(centerElements[centerElements.length-1].innerHTML);
|
|
if(m) {
|
|
var jsDate = new Date(m[1]+" "+m[2]);
|
|
model.addStatement(uri, prefixDC + "date", utilities.dateToISO(jsDate), true);
|
|
} else {
|
|
var elementParts = centerElements[centerElements.length-1].innerHTML.split(/<br[^>]*>/gi);
|
|
model.addStatement(uri, prefixDC + "date", elementParts[1], true);
|
|
}
|
|
|
|
var cutIndex = citationDataDiv.innerHTML.indexOf("<b>BODY:</b>");
|
|
if(cutIndex < 0) {
|
|
cutIndex = citationDataDiv.innerHTML.indexOf("<b>TEXT:</b>");
|
|
}
|
|
if(cutIndex > 0) {
|
|
citationData = citationDataDiv.innerHTML.substring(0, cutIndex);
|
|
} else {
|
|
citationData = citationDataDiv.innerHTML;
|
|
}
|
|
|
|
citationData = utilities.cleanTags(citationData);
|
|
|
|
var headlineRegexp = /\n(?:HEADLINE|TITLE|ARTICLE): ([^\n]+)\n/;
|
|
var m = headlineRegexp.exec(citationData);
|
|
if(m) {
|
|
model.addStatement(uri, prefixDC + "title", utilities.cleanTags(m[1]), true);
|
|
}
|
|
|
|
var bylineRegexp = /\nBYLINE: *(\w[\w\- ]+)/;
|
|
var m = bylineRegexp.exec(citationData);
|
|
if(m) {
|
|
if(m[1].substring(0, 3).toLowerCase() == "by ") {
|
|
m[1] = m[1].substring(3);
|
|
}
|
|
model.addStatement(uri, prefixDC + "creator", m[1], true);
|
|
model.addStatement(uri, prefixRDF + "type", prefixDummy + "newspaper", false);
|
|
} else {
|
|
model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
|
|
}
|
|
|
|
var authorRegexp = /\n(?:AUTHOR|NAME): ([^\n]+)\n/;
|
|
var m = authorRegexp.exec(citationData);
|
|
if(m) {
|
|
var authors = m[1].split(/, (?:and )?/);
|
|
for(i in authors) {
|
|
model.addStatement(uri, prefixDC + "creator", authors[i].replace(" *", ""), true);
|
|
}
|
|
}');
|
|
|
|
REPLACE INTO "scrapers" VALUES('cf87eca8-041d-b954-795a-2d86348999d5', '2006-06-21 09:55:00', 'Aleph Scraper', 'Simon Kornblith', 'func=full-set-set.*\&format=999', NULL,
|
|
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|
|
|
var uri = doc.location.href;
|
|
|
|
var newUri = uri.replace("&format=999", "&format=001");
|
|
|
|
utilities.loadDocument(newUri, browser, function(newBrowser) {
|
|
newDoc = newBrowser.contentDocument;
|
|
|
|
var namespace = newDoc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == ''x'') return namespace; else return null;
|
|
} : null;
|
|
|
|
var xpath = ''/html/body/table/tbody/tr[td[1][@class="td1"][@id="bold"]][td[2][@class="td1"]]'';
|
|
var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver);
|
|
var record = new MARC_Record();
|
|
for(var i=0; i<elmts.length; i++) {
|
|
var elmt = elmts[i];
|
|
var field = utilities.superCleanString(utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue);
|
|
var value = utilities.getNodeString(doc, elmt, ''./TD[2]//text()'', nsResolver);
|
|
var value = value.replace(/\|([a-z]) /g, record.subfield_delimiter+"$1");
|
|
|
|
if(field != "FMT" && field != "LDR") {
|
|
var ind1 = "";
|
|
var ind2 = "";
|
|
var code = field.substring(0, 3);
|
|
if(field.length > 3) {
|
|
var ind1 = field.charAt(3);
|
|
if(field.length > 4) {
|
|
var ind2 = field.charAt(4);
|
|
}
|
|
}
|
|
record.add_field(code, ind1, ind2, value);
|
|
}
|
|
}
|
|
|
|
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
|
|
utilities.importMARCRecord(record, uri, model);
|
|
done();
|
|
}, function() {});
|
|
|
|
wait();');
|
|
|
|
REPLACE INTO "scrapers" VALUES('774d7dc2-3474-2684-392c-f787789ec63d', '2006-06-21 09:55:00', 'Dynix Scraper', 'Simon Kornblith', 'ipac\.jsp\?.*uri=full=[0-9]', NULL,
|
|
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|
|
|
var uri = doc.location.href;
|
|
var newUri = uri+''&fullmarc=true'';
|
|
|
|
utilities.loadDocument(newUri, browser, function(newBrowser) {
|
|
newDoc = newBrowser.contentDocument;
|
|
|
|
var namespace = newDoc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == ''x'') return namespace; else return null;
|
|
} : null;
|
|
|
|
var xpath = ''//form/table[@class="tableBackground"]/tbody/tr/td/table[@class="tableBackground"]/tbody/tr[td[1]/a[@class="normalBlackFont1"]]'';
|
|
var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver);
|
|
var record = new MARC_Record();
|
|
for(var i=0; i<elmts.length; i++) {
|
|
var elmt = elmts[i];
|
|
var field = utilities.superCleanString(utilities.getNode(newDoc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver).nodeValue);
|
|
var value = utilities.getNodeString(newDoc, elmt, ''./TD[2]/TABLE[1]/TBODY[1]/TR[1]/TD[1]/A[1]//text()'', nsResolver);
|
|
value = value.replace(/\$([a-z]) /g, record.subfield_delimiter+"$1");
|
|
|
|
if(field != "FMT" && field != "LDR") {
|
|
var ind1 = "";
|
|
var ind2 = "";
|
|
var valRegexp = /^([0-9])([0-9])? (.*)$/;
|
|
var m = valRegexp.exec(value);
|
|
if(m) {
|
|
ind1 = m[1];
|
|
if(ind2) {
|
|
ind2 = m[2]
|
|
}
|
|
value = m[3];
|
|
}
|
|
record.add_field(field, ind1, ind2, value);
|
|
}
|
|
}
|
|
|
|
utilities.importMARCRecord(record, uri, model);
|
|
done();
|
|
}, function() {})
|
|
|
|
wait();');
|
|
|
|
REPLACE INTO "scrapers" VALUES('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006-06-18 11:19:00', 'VTLS Scraper', 'Simon Kornblith', 'chameleon\?.*function=(?:CARDSCR|INITREQ)', NULL,
|
|
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|
|
|
var uri = doc.location.href;
|
|
var newUri = uri.replace(/function=[A-Z]{7}/, "function=MARCSCR");
|
|
|
|
utilities.loadDocument(newUri, browser, function(newBrowser) {
|
|
newDoc = newBrowser.contentDocument;
|
|
|
|
var namespace = newDoc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == ''x'') return namespace; else return null;
|
|
} : null;
|
|
|
|
var xpath = ''//table[@class="outertable"]/tbody/tr[td[4]]'';
|
|
var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver);
|
|
var record = new MARC_Record();
|
|
for(var i=0; i<elmts.length; i++) {
|
|
var elmt = elmts[i];
|
|
var field = utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue;
|
|
var ind1 = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver).nodeValue;
|
|
var ind2 = utilities.getNode(doc, elmt, ''./TD[3]/text()[1]'', nsResolver).nodeValue;
|
|
var value = utilities.getNode(doc, elmt, ''./TD[4]/text()[1]'', nsResolver).nodeValue;
|
|
value = value.replace(/\\([a-z]) /g, record.subfield_delimiter+"$1");
|
|
|
|
record.add_field(field, ind1, ind2, value);
|
|
}
|
|
|
|
utilities.importMARCRecord(record, uri, model);
|
|
done();
|
|
}, function() {})
|
|
|
|
wait();');
|
|
|
|
REPLACE INTO "scrapers" VALUES('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006-06-18 11:19:00', 'DRA Scraper', 'Simon Kornblith', '/web2/tramp2\.exe/(?:see\_record/|authority\_hits/|goto/.*\?.*screen=Record\.html)',
|
|
'if(doc.location.href.indexOf("authority_hits") > 0) {
|
|
var body = doc.getElementsByTagName("body");
|
|
if(body[0].innerHTML.indexOf("ISBN") < 0) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;',
|
|
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|
|
|
var uri = doc.location.href;
|
|
var uriRegexp = /^(https?:\/\/.*\/web2\/tramp2\.exe\/)(?:goto|see\_record|authority\_hits)(\/.*)\?(?:screen=Record\.html\&)?(.*)$/i;
|
|
var m = uriRegexp.exec(uri);
|
|
if(uri.indexOf("authority_hits") < 0) {
|
|
var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc&"+m[3];
|
|
} else {
|
|
var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc";
|
|
}
|
|
|
|
utilities.HTTPUtilities.doGet(newUri, null, function(text) {
|
|
var record = new MARC_Record();
|
|
record.load(text, "binary");
|
|
utilities.importMARCRecord(record, uri, model);
|
|
done();
|
|
})
|
|
wait();');
|
|
|
|
|
|
REPLACE INTO "scrapers" VALUES('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-18 11:19:00', 'GEAC Scraper', 'Simon Kornblith', '/(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html)', NULL,
|
|
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|
|
|
var uri = doc.location.href;
|
|
var newUri = uri.replace(/([:&])next=html\/geacnffull.html/, "$1next=html/marc.html");
|
|
newUri = newUri.replace(/([:&])next=html\/record.html/, "$1next=html/marc.html");
|
|
|
|
utilities.loadDocument(newUri, browser, function(newBrowser) {
|
|
newDoc = newBrowser.contentDocument;
|
|
|
|
var namespace = newDoc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == ''x'') return namespace; else return null;
|
|
} : null;
|
|
|
|
var record = new MARC_Record();
|
|
|
|
var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, ''//pre/text()'', nsResolver);
|
|
var tag, ind1, ind2, content;
|
|
|
|
for(var i=0; i<elmts.length; i++) {
|
|
var line = elmts[i].nodeValue;
|
|
|
|
if(line.substring(0, 6) == " ") {
|
|
content += " "+line.substring(6);
|
|
continue;
|
|
} else {
|
|
if(tag) {
|
|
record.add_field(tag, ind1, ind2, content);
|
|
}
|
|
}
|
|
|
|
line = line.replace(/\xA0/g," "); // nbsp
|
|
line = line.replace(/_/g," ");
|
|
line = line.replace(/\t/g,"");
|
|
|
|
tag = line.substring(0, 3);
|
|
if(parseInt(tag) > 10) {
|
|
ind1 = line.substring(4, 5);
|
|
ind2 = line.substring(5, 6);
|
|
content = line.substring(7);
|
|
content = content.replace(/\$([a-z])(?: |$)/g, record.subfield_delimiter+"$1");
|
|
} else {
|
|
ind1 = "";
|
|
ind2 = "";
|
|
content = line.substring(4);
|
|
}
|
|
|
|
}
|
|
|
|
utilities.importMARCRecord(record, uri, model);
|
|
done();
|
|
}, function() {});
|
|
|
|
wait();');
|
|
|
|
REPLACE INTO "scrapers" VALUES('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-18 11:19:00', 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
|
|
'var namespace = doc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == ''x'') return namespace; else return null;
|
|
} : null;
|
|
|
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p/text()[1]'', nsResolver);
|
|
for(i in elmts) {
|
|
if(elmts[i].nodeValue == "\n\nViewing record\n") {
|
|
return true;
|
|
}
|
|
}
|
|
return false;',
|
|
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|
|
|
var namespace = doc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == ''x'') return namespace; else return null;
|
|
} : null;
|
|
|
|
var uri = doc.location.href;
|
|
var uriRegexp = /^(.*)(\/[0-9]+)$/;
|
|
var m = uriRegexp.exec(uri);
|
|
var newUri = m[1]+"/40";
|
|
|
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p'', nsResolver);
|
|
for(i in elmts) {
|
|
var elmt = elmts[i];
|
|
var initialText = utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver);
|
|
if(initialText.nodeValue == "\n\nViewing record\n") {
|
|
var recNumber = utilities.getNode(doc, elmt, ''./b[1]/text()[1]'', nsResolver).nodeValue;
|
|
}
|
|
}
|
|
|
|
utilities.HTTPUtilities.doPost(newUri, ''marks=''+recNumber+''&shadow=NO&format=FLAT+ASCII&sort=TITLE&vopt_elst=ALL&library=ALL&display_rule=ASCENDING&duedate_code=l&holdcount_code=t&DOWNLOAD_x=22&DOWNLOAD_y=12&address=&form_type='', null, function(text) {
|
|
var texts = text.split("<PRE>");
|
|
texts = texts[1].split("</PRE>");
|
|
text = texts[0];
|
|
var lines = text.split("\n");
|
|
|
|
var record = new MARC_Record();
|
|
|
|
var tag, ind1, ind2, content;
|
|
for(var i=0; i<lines.length; i++) {
|
|
var line = lines[i];
|
|
|
|
if(line.substr(0, 1) == "." && line.substr(4,2) == ". ") {
|
|
if(tag) {
|
|
content = content.replace(/\|([a-z])/g, record.subfield_delimiter+"$1");
|
|
record.add_field(tag, ind1, ind2, content);
|
|
}
|
|
} else {
|
|
content += " "+line.substring(6);
|
|
continue;
|
|
}
|
|
|
|
tag = line.substr(1, 3);
|
|
|
|
if(parseInt(tag) > 10) {
|
|
ind1 = line.substr(6, 1);
|
|
ind2 = line.substr(7, 1);
|
|
content = line.substr(8);
|
|
} else {
|
|
ind1 = "";
|
|
ind2 = "";
|
|
content = line.substring(6);
|
|
}
|
|
}
|
|
|
|
utilities.importMARCRecord(record, uri, model);
|
|
done();
|
|
})
|
|
wait();');
|
|
|
|
REPLACE INTO "scrapers" VALUES('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-18 11:19:00', 'TLC/YouSeeMore Scraper', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]', NULL,
|
|
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|
|
|
var namespace = doc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == ''x'') return namespace; else return null;
|
|
} : null;
|
|
|
|
var uri = doc.location.href;
|
|
var newUri = uri.replace("LabelDisplay", "MARCDisplay");
|
|
|
|
utilities.loadDocument(newUri, browser, function(newBrowser) {
|
|
newDoc = newBrowser.contentDocument;
|
|
|
|
var namespace = newDoc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == ''x'') return namespace; else return null;
|
|
} : null;
|
|
|
|
var record = new MARC_Record();
|
|
|
|
var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, ''/html/body/table/tbody/tr[td[4]]'', nsResolver);
|
|
var tag, ind1, ind2, content;
|
|
|
|
for(var i=0; i<elmts.length; i++) {
|
|
var elmt = elmts[i];
|
|
|
|
tag = utilities.getNode(newDoc, elmt, ''./td[2]/tt[1]/text()[1]'', nsResolver).nodeValue;
|
|
var inds = utilities.getNode(newDoc, elmt, ''./td[3]/tt[1]/text()[1]'', nsResolver).nodeValue;
|
|
|
|
tag = tag.replace(/[\r\n]/g, "");
|
|
if(tag.length == 1) {
|
|
tag = "00"+tag;
|
|
} else if(tag.length == 2) {
|
|
tag = "0"+tag;
|
|
}
|
|
inds = inds.replace(/[\r\n]/g, "");
|
|
|
|
// Get indicators, fix possible problems with s
|
|
ind1 = inds.substr(0, 1);
|
|
ind2 = inds.substr(1, 1);
|
|
if(ind1 == "\xA0") {
|
|
ind1 = "";
|
|
}
|
|
if(ind2 == "\xA0") {
|
|
ind2 = "";
|
|
}
|
|
|
|
var children = utilities.gatherElementsOnXPath(newDoc, elmt, ''./td[4]/tt[1]//text()'', nsResolver);
|
|
content = "";
|
|
if(children.length == 1) {
|
|
content = children[0].nodeValue;
|
|
} else {
|
|
for(var j=0; j<children.length; j+=2) {
|
|
var subfield = children[j].nodeValue.substr(1, 1);
|
|
var fieldContent = children[j+1].nodeValue;
|
|
content += record.subfield_delimiter+subfield+fieldContent;
|
|
}
|
|
}
|
|
|
|
record.add_field(tag, ind1, ind2, content);
|
|
}
|
|
|
|
utilities.importMARCRecord(record, uri, model);
|
|
done();
|
|
}, function() {});
|
|
|
|
wait();');
|
|
|
|
REPLACE INTO "scrapers" VALUES('c54d1932-73ce-dfd4-a943-109380e06574', '2006-06-18 11:19:00', 'Project MUSE Scraper', 'Simon Kornblith', '^http://muse\.jhu\.edu/journals/[^/]+/[^/]+/[^/]+\.html$', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|
|
|
var namespace = doc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == ''x'') return namespace; else return null;
|
|
} : null;
|
|
|
|
var uri = doc.location.href;
|
|
|
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, ''//comment()'', nsResolver);
|
|
for(i in elmts) {
|
|
if(elmts[i].nodeValue.substr(0, 10) == "HeaderData") {
|
|
var headerRegexp = /HeaderData((?:.|\n)*)\#\#EndHeaders/i
|
|
var m = headerRegexp.exec(elmts[i].nodeValue);
|
|
var headerData = m[1];
|
|
}
|
|
}
|
|
|
|
// Use E4X rather than DOM/XPath, because the Mozilla gods have decided not to
|
|
// expose DOM/XPath to sandboxed scripts
|
|
var newDOM = new XML(headerData);
|
|
|
|
function mapRDF(text, rdfUri) {
|
|
if(text) {
|
|
model.addStatement(uri, rdfUri, text, true);
|
|
}
|
|
}
|
|
|
|
mapRDF(newDOM.journal.text(), prefixDummy + "publication");
|
|
mapRDF(newDOM.volume.text(), prefixDummy + "volume");
|
|
mapRDF(newDOM.issue.text(), prefixDummy + "number");
|
|
mapRDF(newDOM.year.text(), prefixDummy + "year");
|
|
mapRDF(newDOM.pubdate.text(), prefixDC + "date");
|
|
mapRDF(newDOM.doctitle.text(), prefixDC + "title");
|
|
|
|
// Do ISSN
|
|
var issn = newDOM.issn.text();
|
|
if(issn) {
|
|
model.addStatement(uri, prefixDC + "identifier", "ISSN "+issn.replace(/[^0-9]/g, ""), true);
|
|
}
|
|
|
|
// Do pages
|
|
var fpage = newDOM.fpage.text();
|
|
var lpage = newDOM.lpage.text();
|
|
if(fpage != "") {
|
|
var pages = fpage;
|
|
if(lpage) {
|
|
pages += "-"+lpage;
|
|
}
|
|
model.addStatement(uri, prefixDummy + "pages", pages, true);
|
|
}
|
|
|
|
// Do authors
|
|
var elmts = newDOM.docauthor;
|
|
for(i in elmts) {
|
|
var fname = elmts[i].fname.text();
|
|
var surname = elmts[i].surname.text();
|
|
model.addStatement(uri, prefixDC + "creator", fname+" "+surname, true);
|
|
}
|
|
|
|
model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);');
|
|
|
|
REPLACE INTO "scrapers" VALUES('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-06-18 11:19:00', 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|
|
|
function mapRDF(text, rdfUri) {
|
|
if(text != "") {
|
|
model.addStatement(uri, rdfUri, text, true);
|
|
}
|
|
}
|
|
|
|
var uri = doc.location.href;
|
|
var newUri = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=PubMed&retmode=xml&rettype=citation&id=";
|
|
var idRegexp = /[\?\&]list_uids=([0-9\,]+)/;
|
|
var m = idRegexp.exec(uri);
|
|
newUri += m[1];
|
|
|
|
utilities.HTTPUtilities.doGet(newUri, null, function(text) {
|
|
// Remove xml parse instruction and doctype
|
|
text = text.replace(/<!DOCTYPE[^>]*>/, "").replace(/<\?xml[^>]*\?>/, "");
|
|
|
|
var xml = new XML(text);
|
|
|
|
for(var i=0; i<xml.PubmedArticle.length(); i++) {
|
|
var citation = xml.PubmedArticle[i].MedlineCitation;
|
|
|
|
if(citation.PMID.length()) {
|
|
model.addStatement(uri, prefixDC + "identifier", "PMID "+citation.PMID.text(), true);
|
|
}
|
|
|
|
var article = citation.Article;
|
|
if(article.ArticleTitle.length()) {
|
|
var title = article.ArticleTitle.text().toString();
|
|
if(title.substr(-1) == ".") {
|
|
title = title.substring(0, title.length-1);
|
|
}
|
|
model.addStatement(uri, prefixDC + "title", title, true);
|
|
}
|
|
|
|
if(article.Journal.length()) {
|
|
var issn = article.Journal.ISSN.text();
|
|
if(issn) {
|
|
model.addStatement(uri, prefixDC + "identifier", "ISSN "+issn.replace(/[^0-9]/g, ""), true);
|
|
}
|
|
|
|
if(article.Journal.Title.length()) {
|
|
model.addStatement(uri, prefixDummy + "publication", utilities.superCleanString(article.Journal.Title.text()), true);
|
|
} else if(citation.MedlineJournalInfo.MedlineTA.length()) {
|
|
model.addStatement(uri, prefixDummy + "publication", utilities.superCleanString(citation.MedlineJournalInfo.MedlineTA.text()), true);
|
|
}
|
|
|
|
if(article.Journal.JournalIssue.length()) {
|
|
mapRDF(article.Journal.JournalIssue.Volume.text(), prefixDummy + "volume");
|
|
mapRDF(article.Journal.JournalIssue.Issue.text(), prefixDummy + "number");
|
|
if(article.Journal.JournalIssue.PubDate.length()) {
|
|
model.addStatement(uri, prefixDC + "date", article.Journal.JournalIssue.PubDate.Day.text()+" "+article.Journal.JournalIssue.PubDate.Month.text()+" "+article.Journal.JournalIssue.PubDate.Year.text(), true);
|
|
}
|
|
}
|
|
}
|
|
|
|
if(article.AuthorList.length() && article.AuthorList.Author.length()) {
|
|
var authors = article.AuthorList.Author;
|
|
for(var j=0; j<authors.length(); j++) {
|
|
var lastName = authors[j].LastName.text();
|
|
var firstName = authors[j].FirstName.text();
|
|
if(firstName == "") {
|
|
var firstName = authors[j].ForeName.text();
|
|
}
|
|
if(firstName && lastName) {
|
|
model.addStatement(uri, prefixDC + "creator", firstName + " " + lastName);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
done();
|
|
})
|
|
|
|
model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
|
|
wait();');
|
|
|
|
REPLACE INTO "scrapers" VALUES('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006-06-20 10:52:00', 'Scraper for Dublin Core expressed as HTML META elements', 'Simon Kornblith', NULL,
|
|
'var metaTags = doc.getElementsByTagName("meta");
|
|
|
|
if(metaTags) {
|
|
for(var i=0; i<metaTags.length; i++) {
|
|
var tag = metaTags[i].getAttribute("name");
|
|
var value = metaTags[i].getAttribute("content");
|
|
if(tag && value && tag.substr(0, 3).toLowerCase() == "dc.") {
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
return false;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|
|
|
var uri = doc.location.href;
|
|
|
|
var metaTags = doc.getElementsByTagName("meta");
|
|
|
|
for(var i=0; i<metaTags.length; i++) {
|
|
var tag = metaTags[i].getAttribute("name");
|
|
var value = metaTags[i].getAttribute("content");
|
|
if(tag && value && tag.substr(0, 3).toLowerCase() == "dc.") {
|
|
var suffix = tag.substr(3);
|
|
if(suffix == "creator") {
|
|
// Everyone uses different methods of encoding the DC creator; clean them
|
|
value = utilities.cleanAuthor(value);
|
|
}
|
|
model.addStatement(uri, prefixDC + suffix, value, true);
|
|
}
|
|
}');
|
|
|
|
REPLACE INTO "scrapers" VALUES('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006-06-21 10:28:00', 'Google Books Scraper', 'Simon Kornblith', 'http://books\.google\.com/books\?vid=.*\&id=.*', NULL,
|
|
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|
|
|
var uri = doc.location.href;
|
|
var re = new RegExp(''http://books\\.google\\.com/books\\?vid=([^&]+).*\\&id=([^&]+)'', ''i'');
|
|
var urlParts = re.exec(uri);
|
|
var newUri = ''http://books.google.com/books?vid=''+urlParts[1]+''&id=''+urlParts[2];
|
|
|
|
utilities.debugPrint(newUri);
|
|
|
|
utilities.loadDocument(newUri, browser, function(newBrowser) {
|
|
newDoc = newBrowser.contentDocument;
|
|
|
|
var namespace = newDoc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == ''x'') return namespace; else return null;
|
|
} : null;
|
|
|
|
var xpath = ''/html/body/table/tbody/tr[3]/td[2][@class="content"]/div[@class="content"]/table/tbody/tr/td/p[@class="e"]/table/tbody/tr'';
|
|
var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver);
|
|
for(var i = 0; i<elmts.length; i++) {
|
|
var field = utilities.getNode(newDoc, elmts[i], ''./td[1]//text()'', nsResolver);
|
|
var value = utilities.getNode(newDoc, elmts[i], ''./td[2]//text()'', nsResolver);
|
|
|
|
if(field && value) {
|
|
field = utilities.cleanString(field.nodeValue);
|
|
value = utilities.cleanString(value.nodeValue);
|
|
if(field == "Title") {
|
|
model.addStatement(uri, prefixDC + ''title'', value);
|
|
} else if(field == "Author(s)") {
|
|
var authors = value.split(", ");
|
|
for(j in authors) {
|
|
model.addStatement(uri, prefixDC + ''creator'', authors[j]);
|
|
}
|
|
} else if(field == "Editor(s)") {
|
|
var authors = value.split(", ");
|
|
for(j in authors) {
|
|
model.addStatement(uri, prefixDummy + ''editor'', authors[j]);
|
|
}
|
|
} else if(field == "Publisher") {
|
|
model.addStatement(uri, prefixDC + ''publisher'', value);
|
|
} else if(field == "Publication Date") {
|
|
var date = value;
|
|
|
|
jsDate = new Date(value);
|
|
if(!isNaN(jsDate.valueOf())) {
|
|
date = utilities.dateToISO(jsDate);
|
|
}
|
|
|
|
model.addStatement(uri, prefixDC + ''date'', date);
|
|
} else if(field == "Format") {
|
|
model.addStatement(uri, prefixDC + ''medium'', value);
|
|
} else if(field == "ISBN") {
|
|
model.addStatement(uri, prefixDC + ''identifier'', ''ISBN ''+value);
|
|
}
|
|
}
|
|
}
|
|
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
|
|
|
|
done();
|
|
}, function() {});
|
|
|
|
wait();'); |