- Made the ingester automatically create hidden browser objects, given a window object. This should make things much easier for both David and me.
- Multiple-item detection code is now part of the scraperJavaScript, rather than the scrapeDetectCode, and the code to choose which items to add is part of Scholar.Ingester.Utilities, accessible from inside scrapers. The alternative approach would result in one request (or, in the case of JSTOR, three requests) per new item, whereas in some cases (e.g. Voyager) only one request is necessary to get all of the items.
This commit is contained in:
parent
726364d091
commit
3890e5f122
|
@ -35,7 +35,6 @@ Scholar_Ingester_Interface.init = function() {
|
||||||
*/
|
*/
|
||||||
Scholar_Ingester_Interface.chromeLoad = function() {
|
Scholar_Ingester_Interface.chromeLoad = function() {
|
||||||
Scholar_Ingester_Interface.tabBrowser = document.getElementById("content");
|
Scholar_Ingester_Interface.tabBrowser = document.getElementById("content");
|
||||||
Scholar_Ingester_Interface.hiddenBrowser = document.getElementById("scholar-hidden-browser");
|
|
||||||
Scholar_Ingester_Interface.appContent = document.getElementById("appcontent");
|
Scholar_Ingester_Interface.appContent = document.getElementById("appcontent");
|
||||||
Scholar_Ingester_Interface.statusImage = document.getElementById("scholar-status-image");
|
Scholar_Ingester_Interface.statusImage = document.getElementById("scholar-status-image");
|
||||||
|
|
||||||
|
@ -61,21 +60,11 @@ Scholar_Ingester_Interface.chromeUnload = function() {
|
||||||
Scholar_Ingester_Interface.scrapeThisPage = function() {
|
Scholar_Ingester_Interface.scrapeThisPage = function() {
|
||||||
var documentObject = Scholar_Ingester_Interface._getDocument(Scholar_Ingester_Interface.tabBrowser.selectedBrowser);
|
var documentObject = Scholar_Ingester_Interface._getDocument(Scholar_Ingester_Interface.tabBrowser.selectedBrowser);
|
||||||
if(documentObject.scraper) {
|
if(documentObject.scraper) {
|
||||||
if(documentObject.scrapeURLList) {
|
|
||||||
// In the case that there are multiple scrapable URLs, make the user choose
|
|
||||||
Scholar_Ingester_Interface.chooseURL(documentObject);
|
|
||||||
}
|
|
||||||
Scholar_Ingester_Interface.scrapeProgress = new Scholar_Ingester_Interface.Progress(window, Scholar_Ingester_Interface.tabBrowser.selectedBrowser.contentDocument, Scholar.getString("ingester.scraping"));
|
Scholar_Ingester_Interface.scrapeProgress = new Scholar_Ingester_Interface.Progress(window, Scholar_Ingester_Interface.tabBrowser.selectedBrowser.contentDocument, Scholar.getString("ingester.scraping"));
|
||||||
documentObject.scrapePage(Scholar_Ingester_Interface._finishScraping);
|
documentObject.scrapePage(Scholar_Ingester_Interface._finishScraping);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Scholar_Ingester_Interface.chooseURL = function(documentObject) {
|
|
||||||
Scholar.debug("chooseURL called");
|
|
||||||
var newDialog = window.openDialog("chrome://scholar/content/ingester/selectitems.xul",
|
|
||||||
"_blank","chrome,modal,centerscreen,resizable=yes", documentObject);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Updates the status of the capture icon to reflect the scrapability or lack
|
* Updates the status of the capture icon to reflect the scrapability or lack
|
||||||
* thereof of the current page
|
* thereof of the current page
|
||||||
|
@ -182,7 +171,7 @@ Scholar_Ingester_Interface._setDocument = function(browser) {
|
||||||
browser.setAttribute("scholar-key", key);
|
browser.setAttribute("scholar-key", key);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Scholar_Ingester_Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser, Scholar_Ingester_Interface.hiddenBrowser);
|
Scholar_Ingester_Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser, window);
|
||||||
Scholar_Ingester_Interface.browserDocuments[key].retrieveScraper();
|
Scholar_Ingester_Interface.browserDocuments[key].retrieveScraper();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -203,7 +192,7 @@ Scholar_Ingester_Interface._deleteDocument = function(browser) {
|
||||||
/*
|
/*
|
||||||
* Callback to be executed when scraping is complete
|
* Callback to be executed when scraping is complete
|
||||||
*/
|
*/
|
||||||
Scholar_Ingester_Interface._finishScraping = function(obj) {
|
Scholar_Ingester_Interface._finishScraping = function(obj, returnValue) {
|
||||||
if(obj.items.length) {
|
if(obj.items.length) {
|
||||||
try { // Encased in a try block to fix a as-of-yet unresolved issue
|
try { // Encased in a try block to fix a as-of-yet unresolved issue
|
||||||
var item1 = obj.items[0];
|
var item1 = obj.items[0];
|
||||||
|
@ -243,12 +232,14 @@ Scholar_Ingester_Interface._finishScraping = function(obj) {
|
||||||
for(i in obj.items) {
|
for(i in obj.items) {
|
||||||
obj.items[i].save();
|
obj.items[i].save();
|
||||||
}
|
}
|
||||||
|
setTimeout(function() { Scholar_Ingester_Interface.scrapeProgress.fade() }, 2000);
|
||||||
|
} else if(returnValue) {
|
||||||
|
Scholar_Ingester_Interface.scrapeProgress.kill();
|
||||||
} else {
|
} else {
|
||||||
Scholar_Ingester_Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeError"));
|
Scholar_Ingester_Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeError"));
|
||||||
Scholar_Ingester_Interface.scrapeProgress.addDescription(Scholar.getString("ingester.scrapeErrorDescription"));
|
Scholar_Ingester_Interface.scrapeProgress.addDescription(Scholar.getString("ingester.scrapeErrorDescription"));
|
||||||
}
|
|
||||||
|
|
||||||
setTimeout(function() { Scholar_Ingester_Interface.scrapeProgress.fade() }, 2000);
|
setTimeout(function() { Scholar_Ingester_Interface.scrapeProgress.fade() }, 2000);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
@ -333,7 +324,6 @@ Scholar_Ingester_Interface.Progress.prototype.addDescription = function(descript
|
||||||
this.table.appendChild(tr);
|
this.table.appendChild(tr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
Scholar_Ingester_Interface.Progress.prototype.fade = function() {
|
Scholar_Ingester_Interface.Progress.prototype.fade = function() {
|
||||||
// Icky, icky hack to keep objects
|
// Icky, icky hack to keep objects
|
||||||
var me = this;
|
var me = this;
|
||||||
|
@ -349,3 +339,8 @@ Scholar_Ingester_Interface.Progress.prototype.fade = function() {
|
||||||
// Begin fade
|
// Begin fade
|
||||||
this._fader();
|
this._fader();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Hides the progress display at once by setting the CSS display of its
 * div (this.div) to 'none'
 */
Scholar_Ingester_Interface.Progress.prototype.kill = function() {
	var progressStyle = this.div.style;
	progressStyle.display = 'none';
}
|
||||||
|
|
||||||
|
|
|
@ -12,10 +12,4 @@
|
||||||
<hbox id="urlbar-icons">
|
<hbox id="urlbar-icons">
|
||||||
<image src="chrome://scholar/skin/treeitem-book.png" id="scholar-status-image" onclick="Scholar_Ingester_Interface.scrapeThisPage()" position="1" hidden="true"/>
|
<image src="chrome://scholar/skin/treeitem-book.png" id="scholar-status-image" onclick="Scholar_Ingester_Interface.scrapeThisPage()" position="1" hidden="true"/>
|
||||||
</hbox>
|
</hbox>
|
||||||
|
|
||||||
<window id="main-window">
|
|
||||||
<box style="visibility: collapse">
|
|
||||||
<browser id="scholar-hidden-browser" />
|
|
||||||
</box>
|
|
||||||
</window>
|
|
||||||
</overlay>
|
</overlay>
|
||||||
|
|
|
@ -19,26 +19,26 @@ Scholar_Ingester_Interface_SelectItems = function() {}
|
||||||
* loading
|
* loading
|
||||||
*/
|
*/
|
||||||
Scholar_Ingester_Interface_SelectItems.init = function() {
|
Scholar_Ingester_Interface_SelectItems.init = function() {
|
||||||
this.documentObject = window.arguments[0];
|
this.io = window.arguments[0];
|
||||||
|
this.Scholar_Ingester_Interface = window.arguments[1];
|
||||||
this.listbox = document.getElementById("scholar-selectitems-links");
|
this.listbox = document.getElementById("scholar-selectitems-links");
|
||||||
|
|
||||||
for(i in this.documentObject.scrapeURLList) { // we could use a tree for this if we wanted to
|
for(i in this.io.dataIn) { // we could use a tree for this if we wanted to
|
||||||
var itemNode = document.createElement("listitem");
|
var itemNode = document.createElement("listitem");
|
||||||
itemNode.setAttribute("type", "checkbox");
|
itemNode.setAttribute("type", "checkbox");
|
||||||
itemNode.setAttribute("value", i);
|
itemNode.setAttribute("value", i);
|
||||||
itemNode.setAttribute("label", this.documentObject.scrapeURLList[i]);
|
itemNode.setAttribute("label", this.io.dataIn[i]);
|
||||||
itemNode.setAttribute("checked", false);
|
itemNode.setAttribute("checked", false);
|
||||||
this.listbox.appendChild(itemNode);
|
this.listbox.appendChild(itemNode);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Scholar_Ingester_Interface_SelectItems.acceptSelection = function() {
|
Scholar_Ingester_Interface_SelectItems.acceptSelection = function() {
|
||||||
// clear scrapeURLList
|
this.io.dataOut = new Object();
|
||||||
this.documentObject.scrapeURLList = new Object();
|
|
||||||
|
|
||||||
// collect scrapeURLList from listbox
|
// collect scrapeURLList from listbox
|
||||||
for(var i=0; i<this.listbox.length; i++) {
|
for(var i=0; i<this.listbox.length; i++) {
|
||||||
var itemNode = this.listbox[i];
|
var itemNode = this.listbox[i];
|
||||||
this.documentObject.scrapeURLList[itemNode.getAttribute("value")] = itemNode.getAttribute("label");
|
this.io.dataOut[itemNode.getAttribute("value")] = itemNode.getAttribute("label");
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -4,6 +4,21 @@
|
||||||
|
|
||||||
Scholar.Ingester = new function() {}
|
Scholar.Ingester = new function() {}
|
||||||
|
|
||||||
|
/*
 * Creates a new <browser> element in the document of the given window,
 * attaches it to that document and returns it.
 *
 * myWindow - window object whose document will own the new browser
 *
 * The browser is appended to the first <window> element of the document.
 * If the document contains no <window> element, it is appended to the
 * document's root element instead, rather than failing on an undefined
 * windows[0].
 */
Scholar.Ingester.createHiddenBrowser = function(myWindow) {
	var ownerDocument = myWindow.document;
	
	// Create a hidden browser
	var newHiddenBrowser = ownerDocument.createElement("browser");
	
	// Prefer the first <window> element; fall back to the root element
	var windows = ownerDocument.getElementsByTagName("window");
	var container = windows.length ? windows[0] : ownerDocument.documentElement;
	container.appendChild(newHiddenBrowser);
	
	Scholar.debug("created hidden browser");
	return newHiddenBrowser;
}
|
||||||
|
|
||||||
|
/*
 * Disposes of a browser element created by createHiddenBrowser by detaching
 * it from its parent node.
 *
 * myBrowser - the browser element to remove; a null/undefined value or an
 *             element that is not attached to a parent is tolerated
 *
 * The previous implementation used "delete myBrowser", which has no effect
 * on a function parameter, so the element stayed in the window's document
 * after every call.
 *
 * NOTE(review): the element is detached from the document here, so callers
 * should finish using it before calling this function - confirm call sites.
 */
Scholar.Ingester.deleteHiddenBrowser = function(myBrowser) {
	// Delete a hidden browser
	if(myBrowser && myBrowser.parentNode) {
		myBrowser.parentNode.removeChild(myBrowser);
	}
	Scholar.debug("deleted hidden browser");
}
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////
|
||||||
//
|
//
|
||||||
// Scholar.Ingester.Model
|
// Scholar.Ingester.Model
|
||||||
|
@ -48,8 +63,8 @@ Scholar.Ingester.Model.prototype.detachRepository = function() {}
|
||||||
/////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////
|
||||||
// Scholar.Ingester.Utilities class, a set of methods to assist in data
|
// Scholar.Ingester.Utilities class, a set of methods to assist in data
|
||||||
// extraction. Most code here was stolen directly from the Piggy Bank project.
|
// extraction. Most code here was stolen directly from the Piggy Bank project.
|
||||||
Scholar.Ingester.Utilities = function(hiddenBrowser) {
|
Scholar.Ingester.Utilities = function(myWindow) {
|
||||||
this._hiddenBrowser = hiddenBrowser;
|
this.window = myWindow;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Adapter for Piggy Bank function to print debug messages; log level is
|
// Adapter for Piggy Bank function to print debug messages; log level is
|
||||||
|
@ -115,7 +130,7 @@ Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succe
|
||||||
// exception - a function to execute if an exception occurs (exceptions are
|
// exception - a function to execute if an exception occurs (exceptions are
|
||||||
// also logged in the Firefox Scholar log)
|
// also logged in the Firefox Scholar log)
|
||||||
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
|
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
|
||||||
var hiddenBrowser = this._hiddenBrowser;
|
var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window);
|
||||||
Scholar.debug("processDocuments called");
|
Scholar.debug("processDocuments called");
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
@ -141,26 +156,23 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
|
||||||
exception(e);
|
exception(e);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser);
|
||||||
hiddenBrowser.setTimeout(done, 10);
|
hiddenBrowser.setTimeout(done, 10);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
var onLoad = function() {
|
var onLoad = function() {
|
||||||
Scholar.debug("onLoad called");
|
Scholar.debug("onLoad called");
|
||||||
if(hiddenBrowser.id == "scholar-hidden-browser") {
|
|
||||||
hiddenBrowser.removeEventListener("load", onLoad, true);
|
hiddenBrowser.removeEventListener("load", onLoad, true);
|
||||||
try {
|
try {
|
||||||
var newHiddenBrowser = new Object();
|
var newHiddenBrowser = new Object();
|
||||||
Scholar.debug("new hidden browser");
|
|
||||||
newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
|
newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
|
||||||
newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
|
newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
|
||||||
Scholar.debug("added attributes");
|
|
||||||
processor(newHiddenBrowser);
|
processor(newHiddenBrowser);
|
||||||
Scholar.debug("called processor");
|
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
|
Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
|
||||||
exception(e);
|
exception(e);
|
||||||
}
|
}
|
||||||
}
|
doLoad();
|
||||||
};
|
};
|
||||||
var init = function() {
|
var init = function() {
|
||||||
Scholar.debug("init called");
|
Scholar.debug("init called");
|
||||||
|
@ -302,6 +314,50 @@ Scholar.Ingester.Utilities.prototype.cleanTags = function(x) {
|
||||||
return x.replace(/<[^>]+>/g, "");
|
return x.replace(/<[^>]+>/g, "");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Allows a user to select which items to scrape
 *
 * itemList - the object handed to the selection dialog as io.dataIn
 *
 * Opens selectitems.xul as a modal dialog on this.window. Data is exchanged
 * through a single "io" object passed as the dialog argument: the dialog
 * reads io.dataIn and is expected to store its result in io.dataOut.
 *
 * Returns io.dataOut, which is still null if the dialog never assigned it.
 */
Scholar.Ingester.Utilities.prototype.selectItems = function(itemList) {
	// In/out parameter object shared with the dialog
	var io = { dataIn:itemList, dataOut:null };
	
	// The return value of openDialog is not needed, so it is not kept
	this.window.openDialog("chrome://scholar/content/ingester/selectitems.xul",
		"_blank","chrome,modal,centerscreen,resizable=yes", io);
	
	return io.dataOut;
}
|
||||||
|
|
||||||
|
/*
 * Grabs items based on URLs
 *
 * doc      - document passed through to this.getNodeString
 * inHere   - node whose descendant <a> elements are examined
 * urlRe    - pattern (string or RegExp) that a link's href must match
 * rejectRe - optional pattern (string or RegExp); links whose cleaned text
 *            matches it are skipped. When omitted or empty, no link is
 *            rejected because of its text.
 *
 * Returns an object whose keys are the hrefs of the accepted links and whose
 * values are the cleaned link texts. When several accepted links share an
 * href, their texts are joined with a single space.
 */
Scholar.Ingester.Utilities.prototype.getItemArray = function(doc, inHere, urlRe, rejectRe) {
	var availableItems = {};	// Technically, associative arrays are objects
	
	// Require link to match this
	var tagRegexp = new RegExp(urlRe);
	// Do not allow text to match this. An absent pattern must not be compiled:
	// an empty pattern matches every string and would reject every link.
	var rejectRegexp = rejectRe ? new RegExp(rejectRe) : null;
	
	var links = inHere.getElementsByTagName("a");
	for(var i=0; i<links.length; i++) {
		var href = links[i].href;
		if(!tagRegexp.test(href)) {
			continue;
		}
		
		var text = this.getNodeString(doc, links[i], './/text()', null);
		if(!text) {
			continue;
		}
		
		text = this.cleanString(text);
		if(rejectRegexp && rejectRegexp.test(text)) {
			continue;
		}
		
		// Several links may point at the same URL; concatenate their texts
		if(availableItems[href]) {
			availableItems[href] += " "+text;
		} else {
			availableItems[href] = text;
		}
	}
	
	return availableItems;
}
|
||||||
|
|
||||||
// These functions are for use by importMARCRecord. They're private, because,
|
// These functions are for use by importMARCRecord. They're private, because,
|
||||||
// while they are useful, it's also nice if as many of our scrapers as possible
|
// while they are useful, it's also nice if as many of our scrapers as possible
|
||||||
// are PiggyBank compatible, and if our scrapers used functions, that would
|
// are PiggyBank compatible, and if our scrapers used functions, that would
|
||||||
|
@ -512,14 +568,14 @@ Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatu
|
||||||
/*
|
/*
|
||||||
* Constructor for Document object
|
* Constructor for Document object
|
||||||
*/
|
*/
|
||||||
Scholar.Ingester.Document = function(browserWindow, hiddenBrowser){
|
Scholar.Ingester.Document = function(browserWindow, myWindow){
|
||||||
this.scraper = null;
|
this.scraper = null;
|
||||||
this.browser = browserWindow;
|
this.browser = browserWindow;
|
||||||
|
this.window = myWindow;
|
||||||
this.model = new Scholar.Ingester.Model();
|
this.model = new Scholar.Ingester.Model();
|
||||||
this.items = new Array();
|
this.items = new Array();
|
||||||
this._appSvc = Cc["@mozilla.org/appshell/appShellService;1"]
|
this._appSvc = Cc["@mozilla.org/appshell/appShellService;1"]
|
||||||
.getService(Ci.nsIAppShellService);
|
.getService(Ci.nsIAppShellService);
|
||||||
this._hiddenBrowser = hiddenBrowser;
|
|
||||||
this._generateSandbox();
|
this._generateSandbox();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -596,17 +652,19 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
|
||||||
|
|
||||||
var scraperSandbox = this._sandbox;
|
var scraperSandbox = this._sandbox;
|
||||||
try {
|
try {
|
||||||
Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox);
|
var returnValue = Components.utils.evalInSandbox("(function(){\n" +
|
||||||
|
this.scraper.scraperJavaScript +
|
||||||
|
"\n})()", scraperSandbox);
|
||||||
} catch(e) {
|
} catch(e) {
|
||||||
Scholar.debug(e+' in scraperJavaScript for '+this.scraper.label);
|
Scholar.debug(e+' in scraperJavaScript for '+this.scraper.label);
|
||||||
this._scrapePageComplete();
|
this._scrapePageComplete(false);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// If synchronous, call _scrapePageComplete();
|
// If synchronous, call _scrapePageComplete();
|
||||||
if(!this._waitForCompletion) {
|
if(!this._waitForCompletion) {
|
||||||
Scholar.debug("is asynch");
|
Scholar.debug("is asynch");
|
||||||
this._scrapePageComplete();
|
this._scrapePageComplete(returnValue);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -637,10 +695,10 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
|
||||||
/*
|
/*
|
||||||
* Called when scraping (synchronous or asynchronous) is complete
|
* Called when scraping (synchronous or asynchronous) is complete
|
||||||
*/
|
*/
|
||||||
Scholar.Ingester.Document.prototype._scrapePageComplete = function() {
|
Scholar.Ingester.Document.prototype._scrapePageComplete = function(returnValue) {
|
||||||
this._updateDatabase();
|
this._updateDatabase();
|
||||||
if(this._scrapeCallback) {
|
if(this._scrapeCallback) {
|
||||||
this._scrapeCallback(this);
|
this._scrapeCallback(this, returnValue);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -651,7 +709,7 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() {
|
||||||
this._sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
|
this._sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
|
||||||
this._sandbox.browser = this.browser;
|
this._sandbox.browser = this.browser;
|
||||||
this._sandbox.doc = this._sandbox.browser.contentDocument;
|
this._sandbox.doc = this._sandbox.browser.contentDocument;
|
||||||
this._sandbox.utilities = new Scholar.Ingester.Utilities(this._hiddenBrowser);
|
this._sandbox.utilities = new Scholar.Ingester.Utilities(this.window);
|
||||||
this._sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this._appSvc.hiddenDOMWindow);
|
this._sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this._appSvc.hiddenDOMWindow);
|
||||||
this._sandbox.window = this.window;
|
this._sandbox.window = this.window;
|
||||||
this._sandbox.model = this.model;
|
this._sandbox.model = this.model;
|
||||||
|
|
51
scrapers.sql
51
scrapers.sql
|
@ -175,48 +175,7 @@ utilities.HTTPUtilities.doPost(newUri, ''exportselect=record&exporttype=plaintex
|
||||||
wait();');
|
wait();');
|
||||||
|
|
||||||
REPLACE INTO "scrapers" VALUES('88915634-1af6-c134-0171-56fd198235ed', '2006-06-21 22:44:00', 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi',
|
REPLACE INTO "scrapers" VALUES('88915634-1af6-c134-0171-56fd198235ed', '2006-06-21 22:44:00', 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi',
|
||||||
'if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) {
|
'var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options;
|
||||||
// We have search results
|
|
||||||
|
|
||||||
var namespace = doc.documentElement.namespaceURI;
|
|
||||||
var nsResolver = namespace ? function(prefix) {
|
|
||||||
if (prefix == ''x'') return namespace; else return null;
|
|
||||||
} : null;
|
|
||||||
|
|
||||||
var availableItems = new Object(); // Technically, associative arrays are objects
|
|
||||||
|
|
||||||
// Require link to match this
|
|
||||||
var tagRegexp = new RegExp();
|
|
||||||
tagRegexp.compile(''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti='');
|
|
||||||
// Do not allow text to match this
|
|
||||||
var rejectRegexp = new RegExp();
|
|
||||||
rejectRegexp.compile(''\[ [0-9]+ \]'');
|
|
||||||
|
|
||||||
var links = doc.getElementsByTagName("a");
|
|
||||||
for(var i=0; i<links.length; i++) {
|
|
||||||
if(tagRegexp.test(links[i].href)) {
|
|
||||||
var text = utilities.getNodeString(doc, links[i], ''.//text()'', nsResolver);
|
|
||||||
if(text) {
|
|
||||||
text = utilities.cleanString(text);
|
|
||||||
if(!rejectRegexp.test(text)) {
|
|
||||||
if(availableItems[links[i].href]) {
|
|
||||||
availableItems[links[i].href] += " "+text;
|
|
||||||
} else {
|
|
||||||
availableItems[links[i].href] = text;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if(availableItems) {
|
|
||||||
return availableItems;
|
|
||||||
} else {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options;
|
|
||||||
for(i in export_options) {
|
for(i in export_options) {
|
||||||
if(export_options[i].text == ''Latin1 MARC''
|
if(export_options[i].text == ''Latin1 MARC''
|
||||||
|| export_options[i].text == ''Raw MARC''
|
|| export_options[i].text == ''Raw MARC''
|
||||||
|
@ -233,6 +192,14 @@ var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
||||||
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
||||||
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
||||||
|
|
||||||
|
if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) {
|
||||||
|
var items = utilities.getItemArray(doc, doc, ''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti='', ''\[ [0-9]+ \]'');
|
||||||
|
var items = utilities.selectItems(items);
|
||||||
|
if(!items) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
var uri = doc.location.href;
|
var uri = doc.location.href;
|
||||||
|
|
||||||
var raw, unicode, latin1;
|
var raw, unicode, latin1;
|
||||||
|
|
Loading…
Reference in New Issue
Block a user