The Voyager scraper now actually works on the search results page.
This commit is contained in:
parent
3890e5f122
commit
470f7c463f
|
@ -21,7 +21,7 @@ Scholar_Ingester_Interface_SelectItems = function() {}
|
||||||
Scholar_Ingester_Interface_SelectItems.init = function() {
|
Scholar_Ingester_Interface_SelectItems.init = function() {
|
||||||
this.io = window.arguments[0];
|
this.io = window.arguments[0];
|
||||||
this.Scholar_Ingester_Interface = window.arguments[1];
|
this.Scholar_Ingester_Interface = window.arguments[1];
|
||||||
this.listbox = document.getElementById("scholar-selectitems-links");
|
var listbox = document.getElementById("scholar-selectitems-links");
|
||||||
|
|
||||||
for(i in this.io.dataIn) { // we could use a tree for this if we wanted to
|
for(i in this.io.dataIn) { // we could use a tree for this if we wanted to
|
||||||
var itemNode = document.createElement("listitem");
|
var itemNode = document.createElement("listitem");
|
||||||
|
@ -29,16 +29,29 @@ Scholar_Ingester_Interface_SelectItems.init = function() {
|
||||||
itemNode.setAttribute("value", i);
|
itemNode.setAttribute("value", i);
|
||||||
itemNode.setAttribute("label", this.io.dataIn[i]);
|
itemNode.setAttribute("label", this.io.dataIn[i]);
|
||||||
itemNode.setAttribute("checked", false);
|
itemNode.setAttribute("checked", false);
|
||||||
this.listbox.appendChild(itemNode);
|
listbox.appendChild(itemNode);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Scholar_Ingester_Interface_SelectItems.acceptSelection = function() {
|
Scholar_Ingester_Interface_SelectItems.acceptSelection = function() {
|
||||||
|
var listbox = document.getElementById("scholar-selectitems-links");
|
||||||
|
|
||||||
|
var returnObject = false;
|
||||||
this.io.dataOut = new Object();
|
this.io.dataOut = new Object();
|
||||||
|
|
||||||
// collect scrapeURLList from listbox
|
// collect scrapeURLList from listbox
|
||||||
for(var i=0; i<this.listbox.length; i++) {
|
for(var i=0; i<listbox.childNodes.length; i++) {
|
||||||
var itemNode = this.listbox[i];
|
var itemNode = listbox.childNodes[i];
|
||||||
this.io.dataOut[itemNode.getAttribute("value")] = itemNode.getAttribute("label");
|
if(itemNode.getAttribute("checked") == "true") {
|
||||||
|
this.io.dataOut[itemNode.getAttribute("value")] = itemNode.getAttribute("label");
|
||||||
|
returnObject = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// What a hack! This makes code down the road much easier, because otherwise
|
||||||
|
// an empty object is still truthy, and we can't detect that it is empty because
|
||||||
|
// it has no length
|
||||||
|
if(!returnObject) {
|
||||||
|
this.io.dataOut = null;
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -10,7 +10,6 @@ Borrowed from Linky, originally MPL/GPL/LGPL (now GPL, and modified into oblivio
|
||||||
persist="width height screenX screenY"
|
persist="width height screenX screenY"
|
||||||
buttons="cancel,accept"
|
buttons="cancel,accept"
|
||||||
ondialogaccept="Scholar_Ingester_Interface_SelectItems.acceptSelection()"
|
ondialogaccept="Scholar_Ingester_Interface_SelectItems.acceptSelection()"
|
||||||
ondialogcancel="self.close()"
|
|
||||||
id="scholar-selectitems"
|
id="scholar-selectitems"
|
||||||
onload="Scholar_Ingester_Interface_SelectItems.init()">
|
onload="Scholar_Ingester_Interface_SelectItems.init()">
|
||||||
|
|
||||||
|
|
|
@ -700,6 +700,13 @@ Scholar.Ingester.Document.prototype._scrapePageComplete = function(returnValue)
|
||||||
if(this._scrapeCallback) {
|
if(this._scrapeCallback) {
|
||||||
this._scrapeCallback(this, returnValue);
|
this._scrapeCallback(this, returnValue);
|
||||||
}
|
}
|
||||||
|
// Get us ready for another scrape
|
||||||
|
delete this.model;
|
||||||
|
delete this.items;
|
||||||
|
this.model = new Scholar.Ingester.Model();
|
||||||
|
this.items = new Array();
|
||||||
|
// This is perhaps a bit paranoid, but we need to get the model redone anyway
|
||||||
|
this._generateSandbox();
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
97
scrapers.sql
97
scrapers.sql
|
@ -192,21 +192,72 @@ var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
||||||
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
||||||
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
||||||
|
|
||||||
|
var uri = doc.location.href;
|
||||||
|
var postString = '''';
|
||||||
|
var form = doc.forms.namedItem(''frm'');
|
||||||
|
var newUri = form.action;
|
||||||
|
var multiple = false;
|
||||||
|
|
||||||
if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) {
|
if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) {
|
||||||
var items = utilities.getItemArray(doc, doc, ''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti='', ''\[ [0-9]+ \]'');
|
multiple = true;
|
||||||
var items = utilities.selectItems(items);
|
|
||||||
|
var availableItems = new Object(); // Technically, associative arrays are objects
|
||||||
|
|
||||||
|
var namespace = doc.documentElement.namespaceURI;
|
||||||
|
var nsResolver = namespace ? function(prefix) {
|
||||||
|
if (prefix == ''x'') return namespace; else return null;
|
||||||
|
} : null;
|
||||||
|
|
||||||
|
// Require link to match this
|
||||||
|
var tagRegexp = new RegExp();
|
||||||
|
tagRegexp.compile(''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti='');
|
||||||
|
// Do not allow text to match this
|
||||||
|
var rejectRegexp = new RegExp();
|
||||||
|
rejectRegexp.compile(''\[ [0-9]+ \]'');
|
||||||
|
|
||||||
|
var checkboxes = new Array();
|
||||||
|
var urls = new Array();
|
||||||
|
|
||||||
|
var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/table/tbody/tr[td/input[@type="checkbox"]]'', nsResolver);
|
||||||
|
// Go through table rows
|
||||||
|
for(var i=0; i<tableRows.length; i++) {
|
||||||
|
// CHK is what we need to get it all as one file
|
||||||
|
var input = utilities.getNode(doc, tableRows[i], ''./td/input[@name="CHK"]'', nsResolver);
|
||||||
|
checkboxes[i] = input.value;
|
||||||
|
var links = utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver);
|
||||||
|
urls[i] = links[0].href;
|
||||||
|
utilities.debugPrint(urls[i]+" = "+links[0].href);
|
||||||
|
// Go through links
|
||||||
|
for(var j=0; j<links.length; j++) {
|
||||||
|
if(tagRegexp.test(links[j].href)) {
|
||||||
|
var text = utilities.getNodeString(doc, links[j], ''.//text()'', null);
|
||||||
|
if(text) {
|
||||||
|
text = utilities.cleanString(text);
|
||||||
|
if(!rejectRegexp.test(text)) {
|
||||||
|
if(availableItems[i]) {
|
||||||
|
availableItems[i] += " "+text;
|
||||||
|
} else {
|
||||||
|
availableItems[i] = text;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var items = utilities.selectItems(availableItems);
|
||||||
if(!items) {
|
if(!items) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// add arguments for items we need to grab
|
||||||
|
for(i in items) {
|
||||||
|
postString += "CHK="+checkboxes[i]+"&";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var uri = doc.location.href;
|
|
||||||
|
|
||||||
var raw, unicode, latin1;
|
var raw, unicode, latin1;
|
||||||
|
|
||||||
var form = doc.forms.namedItem(''frm'');
|
|
||||||
var newUri = form.action;
|
|
||||||
var postString = '''';
|
|
||||||
for(i in form.elements) {
|
for(i in form.elements) {
|
||||||
if(form.elements[i].type == ''HIDDEN'' || form.elements[i].type == ''hidden'') {
|
if(form.elements[i].type == ''HIDDEN'' || form.elements[i].type == ''hidden'') {
|
||||||
postString += escape(form.elements[i].name)+''=''+escape(form.elements[i].value)+''&'';
|
postString += escape(form.elements[i].name)+''=''+escape(form.elements[i].value)+''&'';
|
||||||
|
@ -227,11 +278,21 @@ for(i in export_options) {
|
||||||
}
|
}
|
||||||
postString += ''RD=''+i+''&MAILADDY=&SAVE=Press+to+SAVE+or+PRINT'';
|
postString += ''RD=''+i+''&MAILADDY=&SAVE=Press+to+SAVE+or+PRINT'';
|
||||||
|
|
||||||
|
utilities.debugPrint(postString);
|
||||||
|
|
||||||
// No idea why this doesn''t work as post
|
// No idea why this doesn''t work as post
|
||||||
utilities.HTTPUtilities.doGet(newUri+''?''+postString, null, function(text) {
|
utilities.HTTPUtilities.doGet(newUri+''?''+postString, null, function(text) {
|
||||||
var record = new MARC_Record();
|
var records = text.split("\x1D");
|
||||||
record.load(text, "binary");
|
for(var i=0; i<(records.length-1); i++) {
|
||||||
model = utilities.importMARCRecord(record, uri, model);
|
if(multiple) {
|
||||||
|
utilities.debugPrint("uri = urls["+i+"]");
|
||||||
|
uri = urls[i];
|
||||||
|
utilities.debugPrint("my uri = "+uri);
|
||||||
|
}
|
||||||
|
var record = new MARC_Record();
|
||||||
|
record.load(records[i], "binary");
|
||||||
|
utilities.importMARCRecord(record, uri, model);
|
||||||
|
}
|
||||||
done();
|
done();
|
||||||
})
|
})
|
||||||
wait();');
|
wait();');
|
||||||
|
@ -466,7 +527,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
|
||||||
|
|
||||||
var record = new MARC_Record();
|
var record = new MARC_Record();
|
||||||
record.load(text, "MARC_PAC");
|
record.load(text, "MARC_PAC");
|
||||||
model = utilities.importMARCRecord(record, uri, model);
|
utilities.importMARCRecord(record, uri, model);
|
||||||
done();
|
done();
|
||||||
}, function() {});
|
}, function() {});
|
||||||
|
|
||||||
|
@ -867,7 +928,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
|
||||||
}
|
}
|
||||||
|
|
||||||
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
|
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
|
||||||
model = utilities.importMARCRecord(record, uri, model);
|
utilities.importMARCRecord(record, uri, model);
|
||||||
done();
|
done();
|
||||||
}, function() {});
|
}, function() {});
|
||||||
|
|
||||||
|
@ -915,7 +976,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
model = utilities.importMARCRecord(record, uri, model);
|
utilities.importMARCRecord(record, uri, model);
|
||||||
done();
|
done();
|
||||||
}, function() {})
|
}, function() {})
|
||||||
|
|
||||||
|
@ -952,7 +1013,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
|
||||||
record.add_field(field, ind1, ind2, value);
|
record.add_field(field, ind1, ind2, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
model = utilities.importMARCRecord(record, uri, model);
|
utilities.importMARCRecord(record, uri, model);
|
||||||
done();
|
done();
|
||||||
}, function() {})
|
}, function() {})
|
||||||
|
|
||||||
|
@ -983,7 +1044,7 @@ if(uri.indexOf("authority_hits") < 0) {
|
||||||
utilities.HTTPUtilities.doGet(newUri, null, function(text) {
|
utilities.HTTPUtilities.doGet(newUri, null, function(text) {
|
||||||
var record = new MARC_Record();
|
var record = new MARC_Record();
|
||||||
record.load(text, "binary");
|
record.load(text, "binary");
|
||||||
model = utilities.importMARCRecord(record, uri, model);
|
utilities.importMARCRecord(record, uri, model);
|
||||||
done();
|
done();
|
||||||
})
|
})
|
||||||
wait();');
|
wait();');
|
||||||
|
@ -1042,7 +1103,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
model = utilities.importMARCRecord(record, uri, model);
|
utilities.importMARCRecord(record, uri, model);
|
||||||
done();
|
done();
|
||||||
}, function() {});
|
}, function() {});
|
||||||
|
|
||||||
|
@ -1120,7 +1181,7 @@ utilities.HTTPUtilities.doPost(newUri, ''marks=''+recNumber+''&shadow=NO&format=
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
model = utilities.importMARCRecord(record, uri, model);
|
utilities.importMARCRecord(record, uri, model);
|
||||||
done();
|
done();
|
||||||
})
|
})
|
||||||
wait();');
|
wait();');
|
||||||
|
@ -1191,7 +1252,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
|
||||||
record.add_field(tag, ind1, ind2, content);
|
record.add_field(tag, ind1, ind2, content);
|
||||||
}
|
}
|
||||||
|
|
||||||
model = utilities.importMARCRecord(record, uri, model);
|
utilities.importMARCRecord(record, uri, model);
|
||||||
done();
|
done();
|
||||||
}, function() {});
|
}, function() {});
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user