diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js
index 6f062bccc..ad768d915 100644
--- a/chrome/chromeFiles/content/scholar/ingester/browser.js
+++ b/chrome/chromeFiles/content/scholar/ingester/browser.js
@@ -35,6 +35,7 @@ Scholar.Ingester.Interface.init = function() {
*/
Scholar.Ingester.Interface.chromeLoad = function() {
Scholar.Ingester.Interface.tabBrowser = document.getElementById("content");
+ Scholar.Ingester.Interface.hiddenBrowser = document.getElementById("scholar-hidden-browser");
Scholar.Ingester.Interface.appContent = document.getElementById("appcontent");
Scholar.Ingester.Interface.statusImage = document.getElementById("scholar-status-image");
@@ -189,7 +190,7 @@ Scholar.Ingester.Interface._setDocument = function(browser) {
browser.setAttribute("scholar-key", key);
}
}
- Scholar.Ingester.Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser);
+ Scholar.Ingester.Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser, Scholar.Ingester.Interface.hiddenBrowser);
Scholar.Ingester.Interface.browserDocuments[key].retrieveScraper();
}
diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.xul b/chrome/chromeFiles/content/scholar/ingester/browser.xul
index 649a12371..d252a0416 100755
--- a/chrome/chromeFiles/content/scholar/ingester/browser.xul
+++ b/chrome/chromeFiles/content/scholar/ingester/browser.xul
@@ -19,4 +19,7 @@
+
+
+
diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
index 8404ddbbc..627f137d0 100644
--- a/chrome/chromeFiles/content/scholar/xpcom/ingester.js
+++ b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
@@ -48,7 +48,9 @@ Scholar.Ingester.Model.prototype.detachRepository = function() {}
/////////////////////////////////////////////////////////////////
// Scholar.Ingester.Utilities class, a set of methods to assist in data
// extraction. Most code here was stolen directly from the Piggy Bank project.
-Scholar.Ingester.Utilities = function() {}
+Scholar.Ingester.Utilities = function(hiddenBrowser) { // hiddenBrowser: browser element that processDocuments uses to load URLs
+ this.hiddenBrowser = hiddenBrowser; // kept on the instance; read back at the top of processDocuments
+}
// Adapter for Piggy Bank function to print debug messages; log level is
// fixed at 4 (could change this)
@@ -99,6 +101,7 @@ Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, paren
// Loads a single document for a scraper, running succeeded() on success or
// failed() on failure
Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succeeded, failed) {
+ Scholar.debug("loadDocument called");
this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed);
}
@@ -112,6 +115,9 @@ Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succe
// exception - a function to execute if an exception occurs (exceptions are
// also logged in the Firefox Scholar log)
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
+ var hiddenBrowser = this.hiddenBrowser;
+ Scholar.debug("processDocuments called");
+
try {
if (urls.length == 0) {
if (firstDoc) {
@@ -128,53 +134,51 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
if (urlIndex < urls.length) {
try {
var url = urls[urlIndex];
- var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
- b.loadURI(url);
+ Scholar.debug("loading "+url);
+ hiddenBrowser.loadURI(url);
} catch (e) {
- exception(e);
Scholar.debug("Scholar.Ingester.Utilities.processDocuments doLoad: " + e, 2);
+ exception(e);
}
} else {
- window.setTimeout(done, 10);
+ hiddenBrowser.setTimeout(done, 10);
}
};
var onLoad = function() {
- try {
- var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser").selectedBrowser;
- processor(b.contentDocument, doLoad);
- } catch (e) {
- exception(e);
- Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
+ Scholar.debug("onLoad called");
+ if(hiddenBrowser.id == "scholar-hidden-browser") { // only act when the hidden browser carries the expected id
+ hiddenBrowser.removeEventListener("DOMContentLoaded", onLoad, true); // NOTE(review): listener is dropped after the first load, so later entries of urls would never reach onLoad - confirm single-URL use is intended
+ try {
+ var newHiddenBrowser = new Object(); // plain object exposing only contentDocument and contentWindow to the processor
+ Scholar.debug("new hidden browser");
+ newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
+ newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
+ Scholar.debug("added attributes");
+ processor(newHiddenBrowser); // NOTE(review): doLoad is no longer passed here, unlike processor(firstDoc, doLoad) in init - confirm
+ Scholar.debug("called processor");
+ } catch (e) {
+ Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
+ exception(e); // report to the caller-supplied handler after logging
+ }
}
};
var init = function() {
- var listener;
- listener.onStateChange = function(webProgress, request, stateFlags, status) {
- if ((stateFlags & Components.interfaces.nsIWebProgressListener.STATE_STOP) > 0 &&
- request.name == urls[urlIndex]) {
- try {
- Scholar.Ingester.progressDialog.setTimeout(onLoad, 10);
- } catch (e) {
- exception(e);
- Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLocationChange: " + e, 2);
- }
- }
- };
-
- var tb = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
- tb.addProgressListener(listener, Components.interfaces.nsIWebProgress.NOTIFY_STATUS);
+ Scholar.debug("init called");
+ hiddenBrowser.addEventListener("DOMContentLoaded", onLoad, true); // onLoad runs when a document loaded in the hidden browser fires DOMContentLoaded (registered with useCapture = true)
if (firstDoc) {
+ Scholar.debug("processing"); // a first document was supplied: hand it to the processor before loading any URL
processor(firstDoc, doLoad);
} else {
+ Scholar.debug("doing load"); // no first document: start loading the URL list directly
doLoad();
}
}
- w.addEventListener("load", init, false);
+ init();
} catch (e) {
+ Scholar.debug("processDocuments: " + e);
exception(e);
- PB_Debug.print("processDocuments: " + e);
}
}
@@ -209,12 +213,18 @@ Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, su
// break compatibility
Scholar.Ingester.Utilities.prototype._MARCCleanString = function(author) {
author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
- return author.replace(/[\s\.\,\/\[\]\:]+$/, '');
+ author = author.replace(/[\s\.\,\/\[\]\:]+$/, ''); // strip trailing whitespace and MARC punctuation
+ return author.replace(/ +/g, ' '); // g flag: collapse every run of spaces, not only the first one
}
Scholar.Ingester.Utilities.prototype._MARCCleanAuthor = function(author) {
author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
- author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
+ author = author.replace(/[\s\,\/\[\]\:\.]+$/, '');
+ author = author.replace(/ +/g, ' '); // g flag: collapse every run of spaces, not only the first one
+ // Add period for initials
+ if(author.substring(author.length-2, author.length-1) == " ") { // second-to-last character is a space, i.e. the name ends in a one-character token
+ author += ".";
+ }
var splitNames = author.split(', ');
if(splitNames.length > 1) {
author = splitNames[1]+' '+splitNames[0];
@@ -222,6 +232,16 @@ Scholar.Ingester.Utilities.prototype._MARCCleanAuthor = function(author) {
return author;
}
+// Trims MARC punctuation from both ends, then returns the text before the first space
+Scholar.Ingester.Utilities.prototype._MARCCleanNumber = function(author) {
+ var trimmed = author.replace(/^[\s\.\,\/\[\]\:]+/, '').replace(/[\s\.\,\/\[\]\:]+$/, '');
+ var firstToken = /^[^ ]*/.exec(trimmed);
+ if(!firstToken) {
+ return undefined;
+ }
+ return firstToken[0];
+}
+
Scholar.Ingester.Utilities.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) {
if(!part) {
part = 'a';
@@ -253,27 +273,29 @@ Scholar.Ingester.Utilities.prototype._MARCAssociateField = function(record, uri,
// This is an extension to PiggyBank's architecture. It's here so that we don't
// need an enormous library for each scraper that wants to use MARC records
-Scholar.Ingester.Utilities.prototype.importMARCRecord = function(text, format, uri, model) {
+Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, model) {
var prefixDC = 'http://purl.org/dc/elements/1.1/';
var prefixDCMI = 'http://purl.org/dc/dcmitype/';
var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
- var record = new Scholar.Ingester.MARC_Record();
- record.load(text, format);
-
// Extract ISBNs
- model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanString, 'ISBN ');
+ model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanNumber, 'ISBN ');
// Extract ISSNs
- model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanString, 'ISBN ');
+ model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN ');
// Extract creators
model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this._MARCCleanAuthor);
model = this._MARCAssociateField(record, uri, model, '110', prefixDC + 'creator', this._MARCCleanString);
model = this._MARCAssociateField(record, uri, model, '111', prefixDC + 'creator', this._MARCCleanString);
model = this._MARCAssociateField(record, uri, model, '130', prefixDC + 'creator', this._MARCCleanString);
- if(!model.data[uri][prefixDC + 'creator']) {
+ model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this._MARCCleanAuthor);
+ model = this._MARCAssociateField(record, uri, model, '710', prefixDC + 'contributor', this._MARCCleanString);
+ model = this._MARCAssociateField(record, uri, model, '711', prefixDC + 'contributor', this._MARCCleanString);
+ model = this._MARCAssociateField(record, uri, model, '730', prefixDC + 'contributor', this._MARCCleanString);
+ if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'])) { // some LOC entries have no listed author, but have the author
+ // in the person subject field as the first entry
var field = record.get_field_subfields('600');
- if(field) {
- model = this.addStatement(uri, prefixDC + 'creator', this._MARCCleanAuthor(field[0]['a']));
+ if(field[0]) {
+ model.addStatement(uri, prefixDC + 'creator', this._MARCCleanAuthor(field[0]['a']));
}
}
// Extract title
@@ -403,12 +425,13 @@ Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatu
/*
* Constructor for Document object
*/
-Scholar.Ingester.Document = function(browserWindow){
+Scholar.Ingester.Document = function(browserWindow, hiddenBrowser){ // hiddenBrowser: browser element forwarded to the Utilities instance built in _generateSandbox
this.browser = browserWindow;
+ this.model = new Scholar.Ingester.Model(); // exposed to scrapers as sandbox.model by _generateSandbox
this.appSvc = Cc["@mozilla.org/appshell/appShellService;1"]
.getService(Ci.nsIAppShellService);
- this.scraper = null
- this.model = new Scholar.Ingester.Model();
+ this.scraper = null;
+ this.hiddenBrowser = hiddenBrowser; // must be assigned before _generateSandbox(), which reads this.hiddenBrowser
this._generateSandbox();
}
@@ -530,11 +553,13 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() {
this.sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
this.sandbox.browser = this.browser;
this.sandbox.doc = this.sandbox.browser.contentDocument;
- this.sandbox.utilities = new Scholar.Ingester.Utilities;
+ this.sandbox.utilities = new Scholar.Ingester.Utilities(this.hiddenBrowser);
this.sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this.appSvc.hiddenDOMWindow);
this.sandbox.window = this.window;
this.sandbox.model = this.model;
this.sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
+ this.sandbox.MARC_Record = Scholar.Ingester.MARC_Record;
+ this.sandbox.MARC_Record.prototype = new Scholar.Ingester.MARC_Record();
var me = this;
this.sandbox.wait = function(){ me._waitForCompletion = true; };
@@ -552,50 +577,90 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
for(var uri in this.model.data) {
- var newItem = Scholar.Items.getNewItemByType(1);
+ if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) {
+ var newItem = Scholar.Items.getNewItemByType(2);
+ } else {
+ var newItem = Scholar.Items.getNewItemByType(1);
+ }
newItem.setField("source", uri);
if(this.model.data[uri][prefixDC + 'title']) {
newItem.setField("title", this.model.data[uri][prefixDC + 'title'][0]);
}
- if(this.model.data[uri][prefixDC + 'publisher']) {
- newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher'][0]);
- }
- if(this.model.data[uri][prefixDC + 'year']) {
- if(this.model.data[uri][prefixDC + 'year'].length == 4) {
- newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]);
- } else {
- try {
- newItem.setField(this.model.data[uri][prefixDC + 'year'][0].substring(
- this.model.data[uri][prefixDC + 'year'][0].lastIndexOf(" ")+1,
- this.model.data[uri][prefixDC + 'year'][0].length));
- } catch(e) {}
- }
- }
- if(this.model.data[uri][prefixDC + 'edition']) {
- newItem.setField("edition", this.model.data[uri][prefixDC + 'edition'][0]);
- }
- if(this.model.data[uri][prefixDummy + 'series']) {
- newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]);
- }
- if(this.model.data[uri][prefixDummy + 'place']) {
- newItem.setField("place", this.model.data[uri][prefixDummy + 'place'][0]);
- }
- if(this.model.data[uri][prefixDC + 'identifier']) {
- for(i in this.model.data[uri][prefixDC + 'identifier']) {
- if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISBN') {
- newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5));
- break;
- }
- }
- }
+ var creatorIndex = 0;
if(this.model.data[uri][prefixDC + 'creator']) {
for(i in this.model.data[uri][prefixDC + 'creator']) {
var creator = this.model.data[uri][prefixDC + 'creator'][i];
var spaceIndex = creator.lastIndexOf(" ");
var lastName = creator.substring(spaceIndex+1, creator.length);
var firstName = creator.substring(0, spaceIndex);
+
+ newItem.setCreator(creatorIndex, firstName, lastName, 1);
+ creatorIndex++;
+ }
+ }
+ if(this.model.data[uri][prefixDC + 'contributor']) {
+ for(i in this.model.data[uri][prefixDC + 'contributor']) {
+ var creator = this.model.data[uri][prefixDC + 'contributor'][i];
+ var spaceIndex = creator.lastIndexOf(" ");
+ var lastName = creator.substring(spaceIndex+1, creator.length);
+ var firstName = creator.substring(0, spaceIndex);
- newItem.setCreator(i, firstName, lastName);
+ newItem.setCreator(creatorIndex, firstName, lastName, 2);
+ creatorIndex++;
+ }
+ }
+ if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) { // journal items: publication, volume, number, pages, ISSN
+ if(this.model.data[uri][prefixDummy + 'publication']) {
+ newItem.setField("publication", this.model.data[uri][prefixDummy + 'publication'][0]);
+ }
+ if(this.model.data[uri][prefixDummy + 'volume']) {
+ newItem.setField("volume", this.model.data[uri][prefixDummy + 'volume'][0]);
+ }
+ if(this.model.data[uri][prefixDummy + 'number']) {
+ newItem.setField("number", this.model.data[uri][prefixDummy + 'number'][0]);
+ }
+ if(this.model.data[uri][prefixDummy + 'pages']) {
+ newItem.setField("pages", this.model.data[uri][prefixDummy + 'pages'][0]);
+ }
+ if(this.model.data[uri][prefixDC + 'identifier']) {
+ for(var i in this.model.data[uri][prefixDC + 'identifier']) { // var keeps the loop index out of the global scope
+ if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISSN') {
+ newItem.setField("ISSN", this.model.data[uri][prefixDC + 'identifier'][i].substring(5)); // store the entry that matched, not entry 0
+ break;
+ }
+ }
+ }
+ } else { // every other item: publisher, year, edition, series, place, ISBN
+ if(this.model.data[uri][prefixDC + 'publisher']) {
+ newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher'][0]);
+ }
+ if(this.model.data[uri][prefixDC + 'year']) {
+ if(this.model.data[uri][prefixDC + 'year'][0].length == 4) { // test the length of the year string, not of the value array
+ newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]);
+ } else {
+ try { // longer value: keep only the text after the last space
+ newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0].substring(
+ this.model.data[uri][prefixDC + 'year'][0].lastIndexOf(" ")+1,
+ this.model.data[uri][prefixDC + 'year'][0].length));
+ } catch(e) {} // best effort: a value that cannot be sliced leaves the year unset
+ }
+ }
+ if(this.model.data[uri][prefixDC + 'edition']) {
+ newItem.setField("edition", this.model.data[uri][prefixDC + 'edition'][0]);
+ }
+ if(this.model.data[uri][prefixDummy + 'series']) {
+ newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]);
+ }
+ if(this.model.data[uri][prefixDummy + 'place']) {
+ newItem.setField("place", this.model.data[uri][prefixDummy + 'place'][0]);
+ }
+ if(this.model.data[uri][prefixDC + 'identifier']) {
+ for(var i in this.model.data[uri][prefixDC + 'identifier']) { // var keeps the loop index out of the global scope
+ if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISBN') {
+ newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][i].substring(5)); // store the entry that matched, not entry 0
+ break;
+ }
+ }
+ }
+ }
newItem.save();
diff --git a/chrome/chromeFiles/content/scholar/xpcom/marc.js b/chrome/chromeFiles/content/scholar/xpcom/marc.js
index 13ae7cfb2..6cf46d146 100644
--- a/chrome/chromeFiles/content/scholar/xpcom/marc.js
+++ b/chrome/chromeFiles/content/scholar/xpcom/marc.js
@@ -80,8 +80,7 @@ Scholar.Ingester.MARC_Record.prototype.load = function(s,f) { // loads record s
}
this.add_field(tag,ind1,ind2,value);
}
- }
- if (f == 'MARC_Harvard') {
+ } else if (f == 'MARC_Harvard') {
var linee = s.split('\n');
for (var i=0; i '008' && tag < '899') { // jumps low and high tags
+ if (tag != '040') this.add_field(tag,ind1,ind2,value);
+ }
+ }
+ this.add_field_005();
}
this.update_record_length();
@@ -310,7 +347,7 @@ Scholar.Ingester.MARC_Record.prototype.exists = function(tag) { // field existen
return false;
}
-function MARC_field(rec,tag,ind1,ind2,value) { // new MARC gield
+Scholar.Ingester.MARC_Record.prototype.MARC_field = function(rec,tag,ind1,ind2,value) { // new MARC field
this.tag = tag;
this.occ = rec.count_occ(tag)+1; // occurrence order no.
this.ind1 = ind1; if (this.ind1 == '') this.ind1 = ' ';
@@ -428,7 +465,7 @@ Scholar.Ingester.MARC_Record.prototype.get_field_subfields = function(tag) { //
Scholar.Ingester.MARC_Record.prototype.add_field = function(tag,ind1,ind2,value) { // adds a field to the record
if (tag.length != 3) { return false; }
- var F = new MARC_field(this,tag,ind1,ind2,value);
+ var F = new this.MARC_field(this,tag,ind1,ind2,value);
// adds pointer to list of fields
this.variable_fields[this.variable_fields.length] = F;
// adds the entry to the directory
diff --git a/scrapers.sql b/scrapers.sql
new file mode 100644
index 000000000..abb6c123f
--- /dev/null
+++ b/scrapers.sql
@@ -0,0 +1,1014 @@
+BEGIN TRANSACTION;
+DELETE FROM scrapers;
+INSERT INTO "scrapers" VALUES(1, NULL, NULL, 20060603002000, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/gp/product/', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+ if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+var getNode = function(doc, contextNode, xpath, nsResolver) {
+ return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
+}
+
+var cleanString = function(s) {
+ s = utilities.trimString(s);
+ return s.replace(/ +/g, " ");
+}
+
+var uri = doc.location.href;
+
+model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
+
+// Retrieve authors
+var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+for (var i = 0; i < elmts.length; i++) {
+ var elmt = elmts[i];
+
+ model.addStatement(uri, prefixDC + ''creator'', cleanString(getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here
+}
+
+// Retrieve data from "Product Details" box
+var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]/div[@class="content"]/ul/li'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+for (var i = 0; i < elmts.length; i++) {
+ var elmt = elmts[i];
+ var attribute = cleanString(getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue);
+ if(getNode(doc, elmt, ''./text()[1]'', nsResolver)) {
+ var value = cleanString(getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue);
+
+ if(attribute == "Publisher:") {
+ if(value.lastIndexOf("(") != -1) {
+ var date = value.substring(value.lastIndexOf("(")+1, value.length-1);
+ value = value.substring(0, value.lastIndexOf("(")-1);
+ }
+ if(value.lastIndexOf(";") != -1) {
+ var edition = value.substring(value.lastIndexOf(";")+2, value.length);
+ value = value.substring(0, value.lastIndexOf(";"));
+ }
+ model.addStatement(uri, prefixDC + ''publisher'', value);
+ model.addStatement(uri, prefixDC + ''date'', date);
+ model.addStatement(uri, prefixDC + ''hasVersion'', edition);
+ } else if(attribute == "Language:") {
+ model.addStatement(uri, prefixDC + ''language'', value);
+ } else if(attribute == "ISBN:") {
+ model.addStatement(uri, prefixDC + ''identifier'', ''ISBN ''+value);
+ } else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") {
+ model.addStatement(uri, prefixDummy + ''pages'', value.substring(0, value.indexOf(" ")));
+ model.addStatement(uri, prefixDC + ''medium'', attribute.substring(0, attribute.indexOf(":")));
+ }
+ }
+}
+
+var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+var title = cleanString(getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue);
+if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) {
+ title = title.substring(0, title.lastIndexOf("(")-1);
+}
+model.addStatement(uri, prefixDC + ''title'', title);');
+
+INSERT INTO "scrapers" VALUES(2, NULL, NULL, 20060603002000, 'WorldCat Scraper', 'Simon Kornblith', '^http://newfirstsearch\.oclc\.org/WebZ/',
+'if(doc.title == ''FirstSearch: WorldCat Detailed Record'') {
+ return true;
+}
+return false;',
+'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var sessionRegexp = /(?:\?|\:)sessionid=([^?:]+)(?:\?|\:|$)/;
+var numberRegexp = /(?:\?|\:)recno=([^?:]+)(?:\?|\:|$)/;
+var resultsetRegexp = /(?:\?|\:)resultset=([^?:]+)(?:\?|\:|$)/;
+var lineRegexp = /^([\w() ]+): *(.*)$/;
+var publisherRegexp = /^(.*), (.*?),?$/;
+
+var uri = doc.location.href;
+
+var sMatch = sessionRegexp.exec(uri);
+var sessionid = sMatch[1];
+
+var nMatch = numberRegexp.exec(uri);
+if(nMatch) {
+ var number = nMatch[1];
+} else {
+ number = 1;
+}
+
+var rMatch = resultsetRegexp.exec(uri);
+if(rMatch) {
+ var resultset = rMatch[1];
+} else {
+ // It''s in an XPCNativeWrapper, so we have to do this black magic
+ resultset = doc.forms.namedItem(''main'').elements.namedItem(''resultset'').value;
+}
+
+var newUri = ''http://newfirstsearch.oclc.org/WebZ/DirectExport?numrecs=10:smartpage=directexport:entityexportnumrecs=10:entityexportresultset='' + resultset + '':entityexportrecno='' + number + '':sessionid='' + sessionid + '':entitypagenum=35:0'';
+
+model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
+
+function cleanAuthor(author) {
+ author = author.replace(/^[\s\.\,\/\[\]\:]+/, '''');
+ author = author.replace(/[\s\,\/\[\]\:\.]+$/, '''');
+ // Add period for initials
+ if(author.substring(author.length-2, author.length-1) == " ") {
+ author += ".";
+ }
+ var splitNames = author.split('', '');
+ if(splitNames.length > 1) {
+ author = splitNames[1]+'' ''+splitNames[0];
+ }
+ return author;
+}
+
+utilities.HTTPUtilities.doPost(newUri, ''exportselect=record&exporttype=plaintext'', null, function(text) {
+ var lines = text.split(''\n'');
+ for(var i=0;i") {
+ haveStarted = true;
+ }
+ }
+
+ // Loop through again so that we can add with the stableURL
+ model.addStatement(stableURL, prefixRDF + "type", prefixDummy + "journal", false);
+ for(i in data) {
+ if(data[i].length) {
+ for(j in data[i]) {
+ model.addStatement(stableURL, i, data[i][j]);
+ }
+ }
+ }
+
+ done();
+ })
+ })
+});
+
+wait();');
+
+INSERT INTO "scrapers" VALUES(5, NULL, NULL, 20060603002000, 'History Cooperative Scraper', 'Simon Kornblith', '^http://www\.historycooperative\.org/journals/.+/.+/.+\.html', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var uri = doc.location.href;
+
+model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
+
+var month, year;
+
+var metaTags = doc.getElementsByTagName("meta");
+
+function associateMeta(field, rdfUri) { // copies the content attribute of the meta tag named by field into the model under rdfUri
+ var metaTag = metaTags.namedItem(field);
+ if(!metaTag) return; // no such meta tag: add nothing
+ var content = metaTag.getAttribute("content");
+ model.addStatement(uri, rdfUri, content, false);
+}
+
+associateMeta("Title", prefixDC + "title");
+associateMeta("Journal", prefixDummy + "publication");
+associateMeta("Volume", prefixDummy + "volume");
+associateMeta("Issue", prefixDummy + "number");
+
+var author = metaTags.namedItem("Author");
+if(author) {
+ var authors = author.getAttribute("content").split(" and ");
+ for(j in authors) {
+ model.addStatement(uri, prefixDC + "creator", authors[j], false);
+ }
+}
+
+var month = metaTags.namedItem("PublicationMonth");
+var year = metaTags.namedItem("PublicationYear");
+if(month && year) {
+ model.addStatement(uri, prefixDC + "date", month.getAttribute("content")+" "+year.getAttribute("content"), false);
+}
+');
+
+INSERT INTO "scrapers" VALUES(6, NULL, NULL, 20060603002000, 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/frameset\&FF=', NULL,
+'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var uri = doc.location.href;
+
+var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$'');
+var m = matchRegexp.exec(uri);
+var newUri = m[1]+''marc''+m[2];
+
+utilities.loadDocument(newUri, browser, function(newBrowser) { // runs once the MARC view at newUri has loaded in the hidden browser
+ var newDoc = newBrowser.contentDocument; // var keeps newDoc local instead of creating an implicit global
+
+ var namespace = newDoc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == ''x'') return namespace; else return null;
+ } : null;
+
+ var getNode = function(doc, contextNode, xpath, nsResolver) {
+ return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
+ }
+
+ var xpath = ''//pre'';
+ var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver);
+
+ var text = getNode(newDoc, elmts[0], ''./text()[1]'', nsResolver).nodeValue; // evaluate against newDoc, the document that owns elmts[0]
+
+ var record = new MARC_Record();
+ record.load(text, "MARC_PAC");
+ model = utilities.importMARCRecord(record, uri, model);
+ done();
+}, function() {})
+
+wait();');
+
+INSERT INTO "scrapers" VALUES(7, NULL, NULL, 20060603002000, 'SIRSI Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
+'var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+ if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+if(elmts.length) {
+ return true;
+}
+return false;',
+'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+ if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+var getNode = function(doc, contextNode, xpath, nsResolver) {
+ return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
+}
+
+function stringTrimmer(x) {
+ var x = x.replace(/^[^\w(]+/, "");
+ return x.replace(/[^\w)]+$/, "");
+}
+
+function getAnyNumber(x) {
+ var re = /[0-9]+/;
+ var m = re.exec(x);
+ if(m) {
+ return m[0];
+ }
+}
+
+function getISBN(x) { // leading digit followed by digits or X; undefined when x does not start that way
+ var found = /^[0-9](?:[0-9X]+)/.exec(x);
+ if(!found) {
+ return undefined;
+ }
+ return found[0];
+}
+
+function cleanAuthor(author) {
+ author = author.replace(/^[\s\.\,\/\[\]\:]+/, '''');
+ author = author.replace(/[\s\,\/\[\]\:\.]+$/, '''');
+ // Add period for initials
+ if(author.substring(author.length-2, author.length-1) == " ") {
+ author += ".";
+ }
+ var splitNames = author.split('', '');
+ if(splitNames.length > 1) {
+ author = splitNames[1]+'' ''+splitNames[0];
+ }
+ return author;
+}
+
+var uri = doc.location.href;
+var data = new Object();
+
+var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+for (var i = 0; i < elmts.length; i++) {
+ var elmt = elmts[i];
+ try {
+ var node = getNode(doc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver);
+ if(!node) {
+ var node = getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver);
+ }
+ if(node) {
+ var field = stringTrimmer(getNode(doc, elmt, ''./TH[1]/text()[1]'', nsResolver).nodeValue);
+ field = field.toLowerCase();
+ var value = stringTrimmer(node.nodeValue);
+ var rdfUri = null;
+ if(field == "publisher") { // map the lower-cased row label to an RDF property and normalise the value
+ rdfUri = prefixDC + ''publisher'';
+ } else if(field == "pub date") {
+ value = getAnyNumber(value);
+ rdfUri = value ? prefixDC + ''date'' : null; // no digits found: skip the row instead of storing an undefined date
+ } else if(field == "isbn") {
+ rdfUri = getISBN(value) ? prefixDC + ''identifier'' : null; // no ISBN-like prefix: skip the row instead of storing "ISBN undefined"
+ value = ''ISBN ''+getISBN(value);
+ } else if(field == "title") {
+ rdfUri = prefixDC + ''title'';
+ var titleParts = value.split(" / ");
+ value = titleParts[0];
+ } else if(field == "publication info") {
+ rdfUri = prefixDummy + ''place'';
+ var pubParts = value.split(" : ");
+ value = pubParts[0];
+ } else if(field == "personal author") {
+ rdfUri = prefixDC + ''creator'';
+ value = cleanAuthor(node.nodeValue);
+ } else if(field == "added author") {
+ rdfUri = prefixDC + ''contributor'';
+ value = cleanAuthor(node.nodeValue);
+ } else if(field == "corporate author") {
+ rdfUri = prefixDC + ''creator'';
+ }
+ if(rdfUri) {
+ var insert = true;
+ if(data && data[rdfUri]) {
+ for(j in data[rdfUri]) {
+ if(data[rdfUri][j] == value) {
+ insert = false;
+ break;
+ }
+ }
+ } else if(!data[rdfUri]) {
+ data[rdfUri] = new Array();
+ }
+ if(insert) {
+ data[rdfUri].push(value);
+ model.addStatement(uri, rdfUri, value, true);
+ }
+ }
+ }
+ } catch (e) {}
+
+}
+');
+
+INSERT INTO "scrapers" VALUES(8, NULL, NULL, 20060603002000, 'ProQuest Scraper', 'Simon Kornblith', 'http://proquest\.umi\.com/pqdweb\?(?:.*\&)?did=', '',
+'// ProQuest scraper: reads the title, the authors and the metadata table of a
+// document page and records each value as a statement about the page URI.
+// Every XPath lookup is null-checked so that a single missing cell does not
+// abort the whole scrape.
+var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+    if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+// Returns the first node matched by xpath below contextNode, or null when
+// nothing matches.
+var getNode = function(doc, contextNode, xpath, nsResolver) {
+    return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+}
+
+// Returns the nodeValue of the first node matched by xpath, or null when
+// nothing matches.
+var getNodeValue = function(doc, contextNode, xpath, nsResolver) {
+    var node = getNode(doc, contextNode, xpath, nsResolver);
+    return node ? node.nodeValue : null;
+}
+
+// Strips leading characters that are neither word characters nor "(" and
+// trailing characters that are neither word characters nor ")".
+function stringTrimmer(x) {
+    x = x.replace(/^[^\w(]+/, "");
+    return x.replace(/[^\w)]+$/, "");
+}
+
+// Returns the first run of digits and hyphens in x, or undefined if there is none.
+function getPageRange(x) {
+    var re = /[0-9\-]+/;
+    var m = re.exec(x);
+    if(m) {
+        return m[0];
+    }
+}
+
+// Trims punctuation from both ends of an author name and turns
+// "Last, First" into "First Last".
+function cleanAuthor(author) {
+    author = author.replace(/^[\s\.\,\/\[\]\:]+/, '''');
+    author = author.replace(/[\s\,\/\[\]\:\.]+$/, '''');
+    // Add period for initials
+    if(author.substring(author.length-2, author.length-1) == " ") {
+        author += ".";
+    }
+    var splitNames = author.split('', '');
+    if(splitNames.length > 1) {
+        author = splitNames[1]+'' ''+splitNames[0];
+    }
+    return author;
+}
+
+var uri = doc.location.href;
+
+// Title
+var xpath = ''/html/body/span[@class="textMedium"]/table/tbody/tr/td[@class="headerBlack"]/strong//text()'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+var title = "";
+for (var i = 0; i < elmts.length; i++) {
+    title += elmts[i].nodeValue;
+}
+if(title) {
+    model.addStatement(uri, prefixDC + ''title'', title, true);
+}
+
+// Authors
+var xpath = ''/html/body/span[@class="textMedium"]/table/tbody/tr/td[@class="textMedium"]/a/em'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+for (var i = 0; i < elmts.length; i++) {
+    // Dirty hack to fix highlighted words: concatenate every text node
+    // found below the author element
+    var author = "";
+    var authorElmts = utilities.gatherElementsOnXPath(doc, elmts[i], ''.//text()'', nsResolver);
+    for (var j = 0; j < authorElmts.length; j++) {
+        author += authorElmts[j].nodeValue;
+    }
+    if(author) {
+        model.addStatement(uri, prefixDC + ''creator'', cleanAuthor(author), true);
+    }
+}
+
+// Other info: one table row per field, label in the first cell
+var xpath = ''/html/body/span[@class="textMedium"]/font/table/tbody/tr'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+for (var i = 0; i < elmts.length; i++) {
+    var elmt = elmts[i];
+    var label = getNodeValue(doc, elmt, ''./TD[1]/text()[1]'', nsResolver);
+    if(!label) {
+        // row without a text label; nothing to classify
+        continue;
+    }
+    var field = stringTrimmer(label).toLowerCase();
+
+    if(field == "publication title") {
+        var publication = getNodeValue(doc, elmt, ''./TD[2]/A[1]/text()[1]'', nsResolver);
+        if(publication) {
+            model.addStatement(uri, prefixDummy + ''publication'', stringTrimmer(publication), true);
+        }
+        var place = getNodeValue(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
+        if(place) {
+            model.addStatement(uri, prefixDummy + ''place'', stringTrimmer(place), true);
+        }
+        var date = getNodeValue(doc, elmt, ''./TD[2]/A[2]/text()[1]'', nsResolver);
+        if(date) {
+            model.addStatement(uri, prefixDC + ''date'', stringTrimmer(date), true);
+        }
+        var moreInfo = getNodeValue(doc, elmt, ''./TD[2]/text()[2]'', nsResolver);
+        if(moreInfo) {
+            // first part holds comma-separated "Vol. x" / "Iss. y" entries,
+            // second part (if any) the page range
+            var parts = stringTrimmer(moreInfo).split(";\xA0");
+
+            var issueRegexp = /^(\w+)\.(?: |\xA0)?(.+)$/;
+            var issueInfo = parts[0].split(",\xA0");
+            for(var j = 0; j < issueInfo.length; j++) {
+                var m = issueRegexp.exec(issueInfo[j]);
+                if(!m) {
+                    // entry is not of the form "Label. value"
+                    continue;
+                }
+                var info = m[1].toLowerCase();
+                if(info == "vol") {
+                    model.addStatement(uri, prefixDummy + ''volume'', stringTrimmer(m[2]), true);
+                } else if(info == "iss" || info == "no") {
+                    model.addStatement(uri, prefixDummy + ''number'', stringTrimmer(m[2]), true);
+                }
+            }
+            if(parts[1] && stringTrimmer(parts[1]).substring(0, 3).toLowerCase() == "pg.") {
+                var pages = getPageRange(parts[1]);
+                if(pages) {
+                    model.addStatement(uri, prefixDummy + ''pages'', pages, true);
+                }
+            }
+        }
+    } else if(field == "source type") {
+        var sourceType = getNodeValue(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
+        if(sourceType) {
+            sourceType = stringTrimmer(sourceType).toLowerCase();
+
+            if(sourceType == "newspaper" || sourceType == "periodical") {
+                model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
+            } else {
+                model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
+            }
+        }
+    } else if(field == "isbn" || field == "issn" || field == "issn/isbn") {
+        var identifier = getNodeValue(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
+        if(identifier) {
+            // tell the two apart by length: 10 or 13 characters is an ISBN,
+            // 8 characters is an ISSN; anything else is ignored
+            var type;
+            identifier = stringTrimmer(identifier);
+            if(identifier.length == 10 || identifier.length == 13) {
+                type = "ISBN";
+            } else if(identifier.length == 8) {
+                type = "ISSN";
+            }
+            if(type) {
+                model.addStatement(uri, prefixDC + "identifier", type+" "+identifier, false);
+            }
+        }
+    }
+}');
+
+INSERT INTO "scrapers" VALUES(9, NULL, NULL, 20060603002000, 'InfoTrac Scraper', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/',
+'if(doc.title.substring(0, 8) == "Article ") {
+    return true;
+}
+return false;',
+'// InfoTrac scraper: the page carries its citation in HTML comments of the
+// form "field:value"; each recognised field becomes a statement about the
+// page URI.
+var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+    if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+// Trims punctuation from both ends of an author name and turns
+// "Last, First" into "First Last".
+function cleanAuthor(author) {
+    author = author.replace(/^[\s\.\,\/\[\]\:]+/, '''');
+    author = author.replace(/[\s\,\/\[\]\:\.]+$/, '''');
+    // Add period for initials
+    if(author.substring(author.length-2, author.length-1) == " ") {
+        author += ".";
+    }
+    var splitNames = author.split('', '');
+    if(splitNames.length > 1) {
+        author = splitNames[1]+'' ''+splitNames[0];
+    }
+    return author;
+}
+
+var uri = doc.location.href;
+
+var xpath = ''/html/body//comment()'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+for (var i = 0; i < elmts.length; i++) {
+    var comment = elmts[i].nodeValue;
+    var colon = comment.indexOf(":");
+    if(colon < 0) {
+        // not a "field:value" comment
+        continue;
+    }
+    // drop the first and the last character of the comment text
+    var field = comment.substring(1, colon).toLowerCase();
+    var value = comment.substring(colon+1, comment.length-1);
+
+    if(field == "title") {
+        model.addStatement(uri, prefixDC + "title", value, false);
+    } else if(field == "journal") {
+        model.addStatement(uri, prefixDummy + "publication", value, false);
+    } else if(field == "pi") {
+        // space-separated tokens: leading tokens form the date until the
+        // first token starting with v (volume), i (issue) or p (pages)
+        var parts = value.split(" ");
+        var date = "";
+        var isDate = true;
+        for(var j = 0; j < parts.length; j++) {
+            var part = parts[j];
+            var firstChar = part.substring(0, 1);
+            var rdfUri = false;
+
+            if(firstChar == "v") {
+                rdfUri = prefixDummy + "volume";
+            } else if(firstChar == "i") {
+                // NOTE(review): the ProQuest scraper stores the issue under
+                // "number" while this one uses "issue" - confirm which
+                // predicate the consumer expects
+                rdfUri = prefixDummy + "issue";
+            } else if(firstChar == "p") {
+                rdfUri = prefixDummy + "pages";
+                // "pSTART(COUNT)" becomes "pSTART-END"
+                // NOTE(review): END is computed as START+COUNT; confirm
+                // whether START+COUNT-1 is intended
+                var match = /p(\w+)\((\w+)\)/.exec(part);
+                if(match) {
+                    var finalPage = parseInt(match[1], 10) + parseInt(match[2], 10);
+                    // leave the token untouched when either number is not numeric
+                    if(!isNaN(finalPage)) {
+                        part = "p"+match[1]+"-"+finalPage.toString();
+                    }
+                }
+            }
+
+            if(rdfUri) {
+                isDate = false;
+                if(part != "pNA") { // not a real page number
+                    model.addStatement(uri, rdfUri, part.substring(1), true);
+                }
+            } else if(isDate) {
+                date += " "+part;
+            }
+        }
+        if(date != "") {
+            // substring(1) removes the separator added before the first token
+            model.addStatement(uri, prefixDC + "date", date.substring(1), false);
+        }
+    } else if(field == "author") {
+        model.addStatement(uri, prefixDC + "creator", cleanAuthor(value), false);
+    }
+}
+model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);');
+
+INSERT INTO "scrapers" VALUES(10, NULL, NULL, 20060603002000, 'LexisNexis Scraper', 'Simon Kornblith', '^http://web\.lexis-nexis\.com/universe/document', NULL,
+'// LexisNexis scraper: pulls publication, date, headline and authors out of
+// the div with class "bodytext" and records them as statements about the
+// page URI. Nothing is recorded when that div is missing.
+var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+// Converts <br> tags to newlines and removes every other tag.
+function clearTags(x) {
+    x = x.replace(/<br[^>]*>/gi, "\n");
+    return x.replace(/<[^>]+>/g, "");
+}
+
+var uri = doc.location.href;
+
+// Find the first div with class "bodytext". An indexed loop is used because
+// for-in over a node list also visits non-element properties.
+var citationDataDiv = null;
+var divs = doc.getElementsByTagName("div");
+for(var i = 0; i < divs.length; i++) {
+    if(divs[i].className == "bodytext") {
+        citationDataDiv = divs[i];
+        break;
+    }
+}
+
+if(citationDataDiv) {
+    var centerElements = citationDataDiv.getElementsByTagName("center");
+    if(centerElements.length > 0) {
+        // publication: last <br>-separated piece of the first centered block
+        var elementParts = centerElements[0].innerHTML.split(/<br[^>]*>/gi);
+        model.addStatement(uri, prefixDummy + "publication", elementParts[elementParts.length-1], true);
+
+        // date: taken from the last centered block, either as
+        // "<b>Month</b> day, year" after a <br>, or else as the second
+        // <br>-separated piece
+        var lastCenter = centerElements[centerElements.length-1].innerHTML;
+        var dateRegexp = /<br[^>]*><b>([A-Z][a-z]+)<\/b> ([0-9]+, [0-9]{4})/;
+        var m = dateRegexp.exec(lastCenter);
+        if(m) {
+            model.addStatement(uri, prefixDC + "date", m[1]+" "+m[2], true);
+        } else {
+            var dateParts = lastCenter.split(/<br[^>]*>/gi);
+            if(dateParts.length > 1) {
+                model.addStatement(uri, prefixDC + "date", dateParts[1], true);
+            }
+        }
+    }
+
+    // Only the text before the "BODY:" (or else "TEXT:") marker is searched
+    var cutIndex = citationDataDiv.innerHTML.indexOf("BODY:");
+    if(cutIndex < 0) {
+        cutIndex = citationDataDiv.innerHTML.indexOf("TEXT:");
+    }
+    var citationData;
+    if(cutIndex > 0) {
+        citationData = citationDataDiv.innerHTML.substring(0, cutIndex);
+    } else {
+        citationData = citationDataDiv.innerHTML;
+    }
+
+    citationData = clearTags(citationData);
+
+    var headlineRegexp = /\n(?:HEADLINE|TITLE|ARTICLE): ([^\n]+)\n/;
+    var m = headlineRegexp.exec(citationData);
+    if(m) {
+        model.addStatement(uri, prefixDC + "title", clearTags(m[1]), true);
+    }
+
+    var bylineRegexp = /\nBYLINE: *(\w[\w\- ]+)/;
+    var m = bylineRegexp.exec(citationData);
+    if(m) {
+        utilities.debugPrint(m[1].substring(0, 3).toLowerCase());
+        // drop a leading "By "
+        if(m[1].substring(0, 3).toLowerCase() == "by ") {
+            m[1] = m[1].substring(3);
+        }
+        model.addStatement(uri, prefixDC + "creator", m[1], true);
+    }
+
+    var authorRegexp = /\n(?:AUTHOR|NAME): ([^\n]+)\n/;
+    var m = authorRegexp.exec(citationData);
+    if(m) {
+        var authors = m[1].split(/, (?:and )?/);
+        for(var i = 0; i < authors.length; i++) {
+            model.addStatement(uri, prefixDC + "creator", authors[i].replace(" *", ""), true);
+        }
+    }
+
+    model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
+
+    utilities.debugPrint(citationData);
+}');
+
+-- Scraper 11 (Aleph). Pattern: "func=full-set-set" followed later by "&format=999".
+-- The scraper code replaces "&format=999" with "&format=001" in the page URL, loads
+-- that URL through utilities.loadDocument, fills a MARC_Record via record.add_field,
+-- passes it to utilities.importMARCRecord and then calls done(). wait() is called
+-- immediately after the load has been started.
+-- NOTE(review): the line "for(var i=0; i 3) {" below is not valid JavaScript. The loop
+-- condition and the definitions of field, value, code, ind1 and ind2 (all used in the
+-- following lines) appear to be missing - restore them from the upstream file.
+INSERT INTO "scrapers" VALUES(11, NULL, NULL, 20060603002000, 'Aleph Scraper', 'Simon Kornblith', 'func=full-set-set.*\&format=999', NULL,
+'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var uri = doc.location.href;
+
+var newUri = uri.replace("&format=999", "&format=001");
+utilities.debugPrint(newUri);
+
+function stringTrimmer(x) {
+ var x = x.replace(/^[^\w(]+/, "");
+ return x.replace(/[^\w)]+$/, "");
+}
+
+utilities.loadDocument(newUri, browser, function(newBrowser) {
+ newDoc = newBrowser.contentDocument;
+
+ var namespace = newDoc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == ''x'') return namespace; else return null;
+ } : null;
+
+ var getNode = function(doc, contextNode, xpath, nsResolver) {
+ return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
+ }
+
+ var xpath = ''/html/body/table/tbody/tr[td[1][@class="td1"][@id="bold"]][td[2][@class="td1"]]'';
+ var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver);
+ var record = new MARC_Record();
+ for(var i=0; i 3) {
+ var ind1 = field.charAt(3);
+ if(field.length > 4) {
+ var ind2 = field.charAt(4);
+ }
+ }
+ record.add_field(code, ind1, ind2, value);
+ }
+ }
+
+ model = utilities.importMARCRecord(record, uri, model);
+ done();
+}, function() {})
+
+wait();');
+
+
+INSERT INTO "scrapers" VALUES(12, NULL, NULL, 20060603002000, 'Dynix Scraper', 'Simon Kornblith', 'ipac\.jsp\?.*uri=full=[0-9]', NULL,
+'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var uri = doc.location.href;
+var newUri = uri+''&fullmarc=true'';
+utilities.debugPrint(newUri);
+
+function stringTrimmer(x) {
+ var x = x.replace(/^[^\w(]+/, "");
+ return x.replace(/[^\w)]+$/, "");
+}
+
+var getNode = function(doc, contextNode, xpath, nsResolver) {
+ return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
+}
+
+var getNodeString = function(doc, contextNode, xpath, nsResolver) {
+ var elmts = utilities.gatherElementsOnXPath(doc, contextNode, xpath, nsResolver);
+ var returnVar = "";
+ for(var i=0; i