diff --git a/chrome/chromeFiles/content/scholar/fileInterface.js b/chrome/chromeFiles/content/scholar/fileInterface.js index 6afc26fba..7d7bead70 100644 --- a/chrome/chromeFiles/content/scholar/fileInterface.js +++ b/chrome/chromeFiles/content/scholar/fileInterface.js @@ -1,5 +1,6 @@ Scholar_File_Interface = new function() { this.exportFile = exportFile; + this.importFile = importFile; /* * Creates Scholar.Translate instance and shows file picker for file export @@ -23,4 +24,41 @@ Scholar_File_Interface = new function() { translation.translate(); } } + + /* + * Creates Scholar.Translate instance and shows file picker for file import + */ + function importFile() { + var translation = new Scholar.Translate("import"); + var translators = translation.getTranslators(); + + const nsIFilePicker = Components.interfaces.nsIFilePicker; + var fp = Components.classes["@mozilla.org/filepicker;1"] + .createInstance(nsIFilePicker); + fp.init(window, "Import", nsIFilePicker.modeOpen); + for(var i in translators) { + fp.appendFilter(translators[i].label, "*."+translators[i].target); + } + + var rv = fp.show(); + if (rv == nsIFilePicker.returnOK || rv == nsIFilePicker.returnReplace) { + translation.setLocation(fp.file); + // get translators again, bc now we can check against the file + translators = translation.getTranslators(); + if(translators.length) { + // TODO: display a list of available translators + translation.setTranslator(translators[0]); + translation.setHandler("itemDone", _importItemDone); + translation.translate(); + } + } + } + + /* + * Saves items after they've been imported. We could have a nice little + * "items imported" indicator, too. + */ + function _importItemDone(obj, item) { + item.save(); + } } \ No newline at end of file diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js index 1e6a25c96..1645c6fee 100644 --- a/chrome/chromeFiles/content/scholar/ingester/browser.js +++ b/chrome/chromeFiles/content/scholar/ingester/browser.js @@ -25,8 +25,7 @@ Scholar_Ingester_Interface._scrapeProgress = new Array(); */ Scholar_Ingester_Interface.init = function() { Scholar_Ingester_Interface.browsers = new Array(); - Scholar_Ingester_Interface.browserDocuments = new Object(); - Scholar_Ingester_Interface.browserUris = new Array(); + Scholar_Ingester_Interface.browserData = new Object(); Scholar_Ingester_Interface._scrapePopupShowing = false; Scholar.Ingester.ProxyMonitor.init(); @@ -54,7 +53,7 @@ Scholar_Ingester_Interface.chromeLoad = function() { * When chrome unloads, delete our document objects and remove our listeners */ Scholar_Ingester_Interface.chromeUnload = function() { - delete Scholar_Ingester_Interface.browserDocuments; + delete Scholar_Ingester_Interface.browserData, Scholar_Ingester_Interface.browsers; this.tabBrowser.removeProgressListener(this); } @@ -62,30 +61,20 @@ Scholar_Ingester_Interface.chromeUnload = function() { * Scrapes a page (called when the capture icon is clicked) */ Scholar_Ingester_Interface.scrapeThisPage = function(saveLocation) { - var documentObject = Scholar_Ingester_Interface._getDocument(Scholar_Ingester_Interface.tabBrowser.selectedBrowser); - if(documentObject.scraper) { - var scrapeProgress = new Scholar_Ingester_Interface.Progress(window); - Scholar_Ingester_Interface._scrapeProgress.push(scrapeProgress); - documentObject.scrapePage(function(obj, returnValue) { Scholar_Ingester_Interface._finishScraping(obj, returnValue, scrapeProgress, saveLocation) }); - } -} - -/* - * Updates the status of the capture icon to reflect the scrapability or lack - * thereof of the current page - */ -Scholar_Ingester_Interface.updateStatus = function() { - var documentObject = Scholar_Ingester_Interface._getDocument(Scholar_Ingester_Interface.tabBrowser.selectedBrowser); - if(documentObject && documentObject.scraper) { - if(documentObject.type == "multiple") { - // Use folder icon for multiple types, for now - Scholar_Ingester_Interface.statusImage.src = "chrome://scholar/skin/treesource-collection.png"; - } else { - Scholar_Ingester_Interface.statusImage.src = "chrome://scholar/skin/treeitem-"+documentObject.type+".png"; - } - Scholar_Ingester_Interface.statusImage.hidden = false; - } else { - Scholar_Ingester_Interface.statusImage.hidden = true; + var browser = Scholar_Ingester_Interface.tabBrowser.selectedBrowser; + var data = Scholar_Ingester_Interface._getData(browser); + + if(data.translators && data.translators.length) { + Scholar_Ingester_Interface.Progress.show(); + + var translate = new Scholar.Translate("web"); + translate.setBrowser(browser); + // use first translator available + translate.setTranslator(data.translators[0]); + translate.setHandler("select", Scholar_Ingester_Interface._selectItems); + translate.setHandler("itemDone", Scholar_Ingester_Interface._itemDone); + translate.setHandler("done", Scholar_Ingester_Interface._finishScraping); + translate.translate(); } } @@ -122,8 +111,14 @@ Scholar_Ingester_Interface.contentLoad = function(event) { return; } - Scholar_Ingester_Interface._setDocument(browser); - Scholar_Ingester_Interface.updateStatus(); + // get data object + var data = Scholar_Ingester_Interface._getData(browser); + // get translators + var translate = new Scholar.Translate("web"); + translate.setBrowser(browser); + data.translators = translate.getTranslators(); + // update status + Scholar_Ingester_Interface._updateStatus(data); } } @@ -162,13 +157,12 @@ Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) Scholar_Ingester_Interface._deleteDocument(browser); } } - Scholar_Ingester_Interface.updateStatus(); + + var data = Scholar_Ingester_Interface._getData(Scholar_Ingester_Interface.tabBrowser.selectedBrowser); + Scholar_Ingester_Interface._updateStatus(data); // Make sure scrape progress is gone - var scrapeProgress; - while(scrapeProgress = Scholar_Ingester_Interface._scrapeProgress.pop()) { - scrapeProgress.kill(); - } + Scholar_Ingester_Interface.Progress.kill(); } Scholar_Ingester_Interface.hidePopup = function(collectionID) { @@ -219,95 +213,101 @@ Scholar_Ingester_Interface.showPopup = function(collectionID, parentElement) { ////////////////////////////////////////////////////////////////////////////// /* - * Gets a document object given a browser window object + * Gets a data object given a browser window object * * NOTE: Browser objects are associated with document objects via keys generated * from the time the browser object is opened. I'm not sure if this is the * appropriate mechanism for handling this, but it's what PiggyBank used and it * appears to work. + * + * Currently, the data object contains only one property: "translators," which + * is an array of translators that should work with the given page as returned + * from Scholar.Translate.getTranslator() */ -Scholar_Ingester_Interface._getDocument = function(browser) { +Scholar_Ingester_Interface._getData = function(browser) { try { var key = browser.getAttribute("scholar-key"); - if(Scholar_Ingester_Interface.browserDocuments[key]) { - return Scholar_Ingester_Interface.browserDocuments[key]; + if(Scholar_Ingester_Interface.browserData[key]) { + return Scholar_Ingester_Interface.browserData[key]; } - } finally {} - return false; -} - -/* - * Creates a new document object for a browser window object, attempts to - * retrieve appropriate scraper - */ -Scholar_Ingester_Interface._setDocument = function(browser) { - try { - var key = browser.getAttribute("scholar-key"); } finally { if(!key) { var key = (new Date()).getTime(); browser.setAttribute("scholar-key", key); + Scholar_Ingester_Interface.browserData[key] = new Array(); + return Scholar_Ingester_Interface.browserData[key]; } } - - // Only re-load the scraper if it's a new document - //if(Scholar_Ingester_Interface.browserUris[key] != browser.contentDocument.location.href) { - Scholar_Ingester_Interface.browserUris[key] = browser.contentDocument.location.href; - Scholar_Ingester_Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser, window); - Scholar_Ingester_Interface.browserDocuments[key].retrieveScraper(); - //} } /* * Deletes the document object associated with a given browser window object */ -Scholar_Ingester_Interface._deleteDocument = function(browser) { +Scholar_Ingester_Interface._deleteData = function(browser) { try { var key = browser.getAttribute("scholar-key"); - if(Scholar_Ingester_Interface.browserDocuments[key]) { - delete Scholar_Ingester_Interface.browserDocuments[key]; + if(Scholar_Ingester_Interface.browserData[key]) { + delete Scholar_Ingester_Interface.browserData[key]; return true; } } finally {} return false; } +/* + * Updates the status of the capture icon to reflect the scrapability or lack + * thereof of the current page + */ +Scholar_Ingester_Interface._updateStatus = function(data) { + if(data.translators && data.translators.length) { + var itemType = data.translators[0].itemType; + if(itemType == "multiple") { + // Use folder icon for multiple types, for now + Scholar_Ingester_Interface.statusImage.src = "chrome://scholar/skin/treesource-collection.png"; + } else { + Scholar_Ingester_Interface.statusImage.src = "chrome://scholar/skin/treeitem-"+itemType+".png"; + } + Scholar_Ingester_Interface.statusImage.hidden = false; + } else { + Scholar_Ingester_Interface.statusImage.hidden = true; + } +} + +/* + * Callback to be executed when an item has been finished + */ +Scholar_Ingester_Interface._itemDone = function(obj, item) { + var title = item.getField("title"); + var icon = "chrome://scholar/skin/treeitem-"+Scholar.ItemTypes.getName(item.getField("itemTypeID"))+".png" + Scholar_Ingester_Interface.Progress.addLines([title], [icon]); + item.save(); +} + +/* + * called when a user is supposed to select items + */ +Scholar_Ingester_Interface._selectItems = function(obj, itemList) { + // this is kinda ugly, mozillazine made me do it! honest! + var io = { dataIn:itemList, dataOut:null } + var newDialog = window.openDialog("chrome://scholar/content/ingester/selectitems.xul", + "_blank","chrome,modal,centerscreen,resizable=yes", io); + + if(!io.dataOut) { // user selected no items, so kill the progress indicatior + Scholar_Ingester_Interface.Progress.kill(); + } + + return io.dataOut; +} + /* * Callback to be executed when scraping is complete */ -Scholar_Ingester_Interface._finishScraping = function(obj, returnValue, scrapeProgress, saveLocation) { - if(obj.items.length) { - scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete")); - - // Display title and creators - var labels = new Array(); - var icons = new Array(); - for(var i in obj.items) { - labels.push(obj.items[i].getField("title")); - icons.push("chrome://scholar/skin/treeitem-"+Scholar.ItemTypes.getName(obj.items[i].getField("itemTypeID"))+".png"); - } - scrapeProgress.addLines(labels, icons); - - // Get collection if the user used the drop-down menu - if(saveLocation) { - var saveCollection = Scholar.Collections.get(saveLocation); - } - // Save items - for(i in obj.items) { - obj.items[i].save(); - if(saveLocation) { - saveCollection.addItem(obj.items[i].getID()); - } - } - - setTimeout(function() { scrapeProgress.fade() }, 2500); - } else if(returnValue) { - scrapeProgress.kill(); - } else { - scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeError")); - scrapeProgress.addDescription(Scholar.getString("ingester.scrapeErrorDescription")); - setTimeout(function() { scrapeProgress.fade() }, 2500); +Scholar_Ingester_Interface._finishScraping = function(obj, returnValue) { + if(!returnValue) { + Scholar_Ingester_Interface.Progress.changeHeadline(Scholar.getString("ingester.scrapeError")); + Scholar_Ingester_Interface.Progress.addDescription(Scholar.getString("ingester.scrapeErrorDescription")); } + Scholar_Ingester_Interface.Progress.fade(); } ////////////////////////////////////////////////////////////////////////////// @@ -317,99 +317,126 @@ Scholar_Ingester_Interface._finishScraping = function(obj, returnValue, scrapePr ////////////////////////////////////////////////////////////////////////////// // Handles the display of a div showing progress in scraping - -Scholar_Ingester_Interface.Progress = function(myWindow) { - this.openerWindow = myWindow; - this.progressWindow = myWindow.openDialog("chrome://scholar/chrome/ingester/progress.xul", "", "chrome,dialog=no,titlebar=no,popup=yes"); - var me = this; - this.progressWindow.addEventListener("load", function() { me.windowLoaded() }, false); +Scholar_Ingester_Interface.Progress = new function() { + var _windowLoaded = false; + var _windowLoading = false; + // keep track of all of these things in case they're called before we're + // done loading the progress window + var _loadDescription = null; + var _loadLines = new Array(); + var _loadIcons = new Array(); + var _loadHeadline = Scholar.getString("ingester.scraping"); - this._loadDescription = null; - this._loadLines = new Array(); - this._loadIcons = new Array(); - this._loadHeadline = Scholar.getString("ingester.scraping"); -} - -Scholar_Ingester_Interface.Progress.prototype.windowLoaded = function() { - this._windowLoaded = true; - this._move(); + this.show = show; + this.changeHeadline = changeHeadline; + this.addLines = addLines; + this.addDescription = addDescription; + this.fade = fade; + this.kill = kill; - this.changeHeadline(this._loadHeadline); - this.addLines(this._loadLines, this._loadIcons); - if(this._loadDescription) { - this.addDescription(this._loadDescription); + function show() { + if(_windowLoading || _windowLoaded) { // already loading or loaded + return false; + } + _progressWindow = window.openDialog("chrome://scholar/chrome/ingester/progress.xul", "", "chrome,dialog=no,titlebar=no,popup=yes"); + _progressWindow.addEventListener("load", _onWindowLoaded, false); + _windowLoading = true; } -} - -Scholar_Ingester_Interface.Progress.prototype.changeHeadline = function(headline) { - if(this._windowLoaded) { - this.progressWindow.document.getElementById("scholar-progress-text-headline").value = headline; - } else { - this._loadHeadline = headline; + + function changeHeadline(headline) { + if(_windowLoaded) { + _progressWindow.document.getElementById("scholar-progress-text-headline").value = headline; + } else { + _loadHeadline = headline; + } } -} - -Scholar_Ingester_Interface.Progress.prototype.addLines = function(label, icon) { - if(this._windowLoaded) { - for(i in label) { - var newLabel = this.progressWindow.document.createElement("label"); - newLabel.setAttribute("class", "scholar-progress-item-label"); - newLabel.setAttribute("crop", "end"); - newLabel.setAttribute("value", label[i]); + + function addLines(label, icon) { + if(_windowLoaded) { + for(i in label) { + var newLabel = _progressWindow.document.createElement("label"); + newLabel.setAttribute("class", "scholar-progress-item-label"); + newLabel.setAttribute("crop", "end"); + newLabel.setAttribute("value", label[i]); + + var newImage = _progressWindow.document.createElement("image"); + newImage.setAttribute("class", "scholar-progress-item-icon"); + newImage.setAttribute("src", icon[i]); + + var newHB = _progressWindow.document.createElement("hbox"); + newHB.setAttribute("class", "scholar-progress-item-hbox"); + newHB.setAttribute("valign", "center"); + newHB.appendChild(newImage); + newHB.appendChild(newLabel); + + _progressWindow.document.getElementById("scholar-progress-text-box").appendChild(newHB); + } - var newImage = this.progressWindow.document.createElement("image"); - newImage.setAttribute("class", "scholar-progress-item-icon"); - newImage.setAttribute("src", icon[i]); - - var newHB = this.progressWindow.document.createElement("hbox"); + _move(); + } else { + _loadLines = _loadLines.concat(label); + _loadIcons = _loadIcons.concat(icon); + } + } + + function addDescription(text) { + if(_windowLoaded) { + var newHB = _progressWindow.document.createElement("hbox"); newHB.setAttribute("class", "scholar-progress-item-hbox"); - newHB.setAttribute("valign", "center"); - newHB.appendChild(newImage); - newHB.appendChild(newLabel); + var newDescription = _progressWindow.document.createElement("description"); + newDescription.setAttribute("class", "scholar-progress-description"); + var newText = _progressWindow.document.createTextNode(text); - this.progressWindow.document.getElementById("scholar-progress-text-box").appendChild(newHB); + newDescription.appendChild(newText); + newHB.appendChild(newDescription); + _progressWindow.document.getElementById("scholar-progress-text-box").appendChild(newHB); + + _move(); + } else { + _loadDescription = text; + } + } + + function fade() { + setTimeout(_timeout, 2500); + } + + function kill() { + _windowLoaded = false; + try { + _progressWindow.close(); + } catch(ex) {} + } + + function _onWindowLoaded() { + _windowLoading = false; + _windowLoaded = true; + + _move(); + // do things we delayed because the winodw was loading + changeHeadline(_loadHeadline); + addLines(_loadLines, _loadIcons); + if(_loadDescription) { + addDescription(_loadDescription); } - this._move(); - } else { - this._loadLines = this._loadLines.concat(label); - this._loadIcons = this._loadIcons.concat(icon); + // reset parameters + _loadDescription = null; + _loadLines = new Array(); + _loadIcons = new Array(); + _loadHeadline = Scholar.getString("ingester.scraping") + } + + function _move() { + _progressWindow.sizeToContent(); + _progressWindow.moveTo( + window.screenX + window.outerWidth - _progressWindow.outerWidth - 30, + window.screenY + window.outerHeight - _progressWindow.outerHeight + ); + } + + function _timeout() { + kill(); // could check to see if we're really supposed to fade yet + // (in case multiple scrapers are operating at once) } } - -Scholar_Ingester_Interface.Progress.prototype.addDescription = function(text) { - if(this._windowLoaded) { - var newHB = this.progressWindow.document.createElement("hbox"); - newHB.setAttribute("class", "scholar-progress-item-hbox"); - var newDescription = this.progressWindow.document.createElement("description"); - newDescription.setAttribute("class", "scholar-progress-description"); - var newText = this.progressWindow.document.createTextNode(text); - - newDescription.appendChild(newText); - newHB.appendChild(newDescription); - this.progressWindow.document.getElementById("scholar-progress-text-box").appendChild(newHB); - - this._move(); - } else { - this._loadDescription = text; - } -} - -Scholar_Ingester_Interface.Progress.prototype._move = function() { - this.progressWindow.sizeToContent(); - this.progressWindow.moveTo( - this.openerWindow.screenX + this.openerWindow.outerWidth - this.progressWindow.outerWidth - 30, - this.openerWindow.screenY + this.openerWindow.outerHeight - this.progressWindow.outerHeight - ); -} - -Scholar_Ingester_Interface.Progress.prototype.fade = function() { - this.kill(); -} - -Scholar_Ingester_Interface.Progress.prototype.kill = function() { - try { - this.progressWindow.close(); - } catch(ex) {} -} - diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js index 9e1c8c100..5dbf43132 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/ingester.js +++ b/chrome/chromeFiles/content/scholar/xpcom/ingester.js @@ -19,47 +19,6 @@ Scholar.Ingester.deleteHiddenBrowser = function(myBrowser) { Scholar.debug("deleted hidden browser"); } -/* - * Operates the ingester given only a URL - * url - URL to scrape - * complete - callback function to be executed if page grab completes - * (will be passed document object; obj.items contains array of - * *unsaved* items scraped; empty array indicates unscrapable page) - * error - callback function to be executed if an error occurred loading page - * myWindow - optional argument indicating window to attach a dialog to. if no - * window is given, Firefox Scholar uses the hidden DOM window and - * will simply avoid scraping multiple pages - */ -Scholar.Ingester.ingestURL = function(url, complete, error, myWindow) { - var isHidden = false; - if(!myWindow) { - var myWindow = Components.classes["@mozilla.org/appshell/appShellService;1"] - .getService(Components.interfaces.nsIAppShellService) - .hiddenDOMWindow; - var isHidden = true; - } - - var succeeded = function(browser) { - var myDoc = new Scholar.Ingester.Document(browser, myWindow, isHidden); - if(myDoc.retrieveTranslator()) { - myDoc.scrapePage(function(myDoc) { - Scholar.Ingester.deleteHiddenBrowser(browser); - complete(myDoc); - }); - } else { - Scholar.Ingester.deleteHiddenBrowser(browser); - complete(myDoc); - } - } - - var failed = function() { - Scholar.debug("Scholar.Ingester.ingestURL: could not ingest "+url); - error(); - } - - Scholar.Utilities.HTTP.processDocuments(null, [ url ], succeeded, function() {}, failed, true); -} - ///////////////////////////////////////////////////////////////// // // Scholar.Ingester.ProxyMonitor @@ -101,54 +60,56 @@ Scholar.Ingester.ProxyMonitor = new function() { function observe(channel) { channel.QueryInterface(Components.interfaces.nsIHttpChannel); - if(channel.getResponseHeader("Server") == "EZproxy") { - // We're connected to an EZproxy - if(channel.responseStatus != "302") { - return; - } - - Scholar.debug(channel.URI.spec); - // We should be able to scrape the URL out of this - var m = _ezProxyRe.exec(channel.URI.spec); - if(!m) { - return; - } - - // Found URL - var variable = m[1]; - var properURL = m[2]; - if(variable.toLowerCase() == "qurl") { - properURL = unescape(properURL); - } - var properURI = _parseURL(properURL); - if(!properURI) { - return; - } - - // Get the new URL - var newURL = channel.getResponseHeader("Location"); - if(!newURL) { - return; - } - var newURI = _parseURL(newURL); - if(!newURI) { - return; - } - - if(channel.URI.host == newURI.host && channel.URI.port != newURI.port) { - // Different ports but the same server means EZproxy active - - Scholar.debug("EZProxy: host "+newURI.hostPort+" is really "+properURI.hostPort); - // Initialize variables here so people who never use EZProxies - // don't get the (very very minor) speed hit - if(!_mapFromProxy) { - _mapFromProxy = new Object(); - _mapToProxy = new Object(); + try { + if(channel.getResponseHeader("Server") == "EZproxy") { + // We're connected to an EZproxy + if(channel.responseStatus != "302") { + return; + } + + Scholar.debug(channel.URI.spec); + // We should be able to scrape the URL out of this + var m = _ezProxyRe.exec(channel.URI.spec); + if(!m) { + return; + } + + // Found URL + var variable = m[1]; + var properURL = m[2]; + if(variable.toLowerCase() == "qurl") { + properURL = unescape(properURL); + } + var properURI = _parseURL(properURL); + if(!properURI) { + return; + } + + // Get the new URL + var newURL = channel.getResponseHeader("Location"); + if(!newURL) { + return; + } + var newURI = _parseURL(newURL); + if(!newURI) { + return; + } + + if(channel.URI.host == newURI.host && channel.URI.port != newURI.port) { + // Different ports but the same server means EZproxy active + + Scholar.debug("EZProxy: host "+newURI.hostPort+" is really "+properURI.hostPort); + // Initialize variables here so people who never use EZProxies + // don't get the (very very minor) speed hit + if(!_mapFromProxy) { + _mapFromProxy = new Object(); + _mapToProxy = new Object(); + } + _mapFromProxy[newURI.hostPort] = properURI.hostPort; + _mapToProxy[properURI.hostPort] = newURI.hostPort; } - _mapFromProxy[newURI.hostPort] = properURI.hostPort; - _mapToProxy[properURI.hostPort] = newURI.hostPort; } - } + } catch(e) {} } /* @@ -195,394 +156,4 @@ Scholar.Ingester.ProxyMonitor = new function() { var uri = ioService.newURI(url, null, null); return uri; } -} - -///////////////////////////////////////////////////////////////// -// -// Scholar.Ingester.Model -// -///////////////////////////////////////////////////////////////// - -// Scholar.Ingester.Model, an object representing an RDF data model with -// methods to add to that model. In Piggy Bank, this was implemented in Java, -// but seeing as we don't really want an enormous web server running with FS, -// but we don't actually need that, so it's much simpler. -// -// The Java version of this class can be viewed at -// http://simile.mit.edu/repository/piggy-bank/trunk/src/java/edu/mit/simile/piggyBank/WorkingModel.java -Scholar.Ingester.Model = function() { - this.data = new Object(); -} - -// Piggy Bank provides a fourth argument, one that determines if the third -// argument is a literal or an RDF URI. Since our ontologies are -// sufficiently restricted, we have no chance of confusing a literal and an -// RDF URI and thus this is unnecessary. -Scholar.Ingester.Model.prototype.addStatement = function(uri, rdfUri, literal) { - if(!this.data[uri]) this.data[uri] = new Object(); - if(!this.data[uri][rdfUri]) { - this.data[uri][rdfUri] = new Array(); - } - this.data[uri][rdfUri].push(literal); - Scholar.debug(rdfUri+" for "+uri+" is "+literal); -} - -// Additional functions added for compatibility purposes only -// No idea if any scraper actually uses these, but just in case, they're -// implemented so as not to throw an exception -Scholar.Ingester.Model.prototype.addTag = function() {} -Scholar.Ingester.Model.prototype.getRepository = function() {} -Scholar.Ingester.Model.prototype.detachRepository = function() {} - -////////////////////////////////////////////////////////////////////////////// -// -// Scholar.Ingester.Document -// -////////////////////////////////////////////////////////////////////////////// - -/* THIS CODE IS GOING AWAY - * eventually, all ingesting will be part of a unified API in Scholar.Translate. - * until then, Scholar.Ingester.Document reigns supreme. - * - * Public properties: - * browser - browser window object of document - * model - data model for semantic scrapers - * scraper - best scraper to use to scrape page - * items - items returned after page is scraped - * window - window, for creating new hidden browsers - * url - url, as passed through proxy system - * type - type of item that will be scraped (set after retrieveScraper() is - * called) - * - * Private properties: - * _sandbox - sandbox for code execution - * _scrapeCallback - callback function to be executed when scraping is complete - */ - -////////////////////////////////////////////////////////////////////////////// -// -// Public Scholar.Ingester.Document methods -// -////////////////////////////////////////////////////////////////////////////// - -/* - * Constructor for Document object - */ -Scholar.Ingester.Document = function(myBrowser, myWindow, isHidden) { - this.browser = myBrowser; - this.window = myWindow; - this.isHidden = isHidden; - this.scraper = this.type = null; - this.model = new Scholar.Ingester.Model(); - - // Create separate URL to account for proxies - this.url = Scholar.Ingester.ProxyMonitor.proxyToProper(this.browser.contentDocument.location.href); - if(this.url != this.browser.contentDocument.location.href) { - this.proxiedURL = true; - } - - this.items = new Array(); - this._generateSandbox(); -} - -/* - * Retrieves the best scraper to scrape a given page - */ -Scholar.Ingester.Document.prototype.retrieveScraper = function() { - Scholar.debug("Retrieving scrapers for "+this.url); - - var sql = 'SELECT * FROM translators WHERE type = 3 ORDER BY target IS NULL ASC'; - var scrapers = Scholar.DB.query(sql); - for(var i=0; i= 4) { - var ISORe = /^[0-9]{4}-[0-9]{2}-[0-9]{2}$/ - if(ISORe.test(this.model.data[uri][prefixDC + 'date'][0])) { - newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4)); - } else { - var m; - var yearRe = /[0-9]{4}$/; - if(m = yearRe.exec(this.model.data[uri][prefixDC + 'date'][0])) { - newItem.setField("year", m[0]); - } - } - } - } - - // Handle ISBNs/ISSNs/Call Numbers - if(this.model.data[uri][prefixDC + 'identifier']) { - var oldIndex = -1; - var needISSN = Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("ISSN"), typeID); - var needISBN = Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("ISBN"), typeID); - for(i in this.model.data[uri][prefixDC + 'identifier']) { - prefix = this.model.data[uri][prefixDC + 'identifier'][i].substr(0, this.model.data[uri][prefixDC + 'identifier'][i].indexOf(" ")); - if(needISSN && prefix == 'ISSN') { - newItem.setField("ISSN", this.model.data[uri][prefixDC + 'identifier'][i].substring(5)); - needISSN = false; - } - if(needISBN && prefix == 'ISBN') { - newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][i].substring(5)); - needISBN = false; - } - var newIndex = Scholar.arraySearch(prefix, callNumbers); - if(newIndex && newIndex > oldIndex) { - oldIndex = newIndex; - var callNumber = this.model.data[uri][prefixDC + 'identifier'][i].substring(prefix.length+1); - } - } - if(callNumber) { - newItem.setField("callNumber", callNumber); - } - } - - this._associateRDF(prefixDummy + 'publication', "publication", uri, newItem, typeID); - this._associateRDF(prefixDummy + 'volume', "volume", uri, newItem, typeID); - this._associateRDF(prefixDummy + 'number', "number", uri, newItem, typeID); - this._associateRDF(prefixDummy + 'pages', "pages", uri, newItem, typeID); - this._associateRDF(prefixDC + 'publisher', "publisher", uri, newItem, typeID); - this._associateRDF(prefixDC + 'date', "date", uri, newItem, typeID); - this._associateRDF(prefixDC + 'hasVersion', "edition", uri, newItem, typeID); - this._associateRDF(prefixDummy + 'series', "series", uri, newItem, typeID); - this._associateRDF(prefixDummy + 'place', "place", uri, newItem, typeID); - - this.items.push(newItem); - } - } catch(ex) { - Scholar.debug('Error in Scholar.Ingester.Document._updateDatabase: '+ex); - } } \ No newline at end of file diff --git a/chrome/chromeFiles/content/scholar/xpcom/marc.js b/chrome/chromeFiles/content/scholar/xpcom/marc.js deleted file mode 100644 index 6cf46d146..000000000 --- a/chrome/chromeFiles/content/scholar/xpcom/marc.js +++ /dev/null @@ -1,532 +0,0 @@ -/* -* Scholar.Ingester.MARC_Record.js -* Stefano Bargioni, Pontificia Universitˆ della Santa Croce - Biblioteca -* Trattamento di record MARC in JavaScript -* -* Original version copyright (C) 2005 Stefano Bargioni, licensed under the LGPL -* (Available at http://www.pusc.it/bib/mel/Scholar.Ingester.MARC_Record.js) -* -* This library is free software; you can redistribute it or -* modify it under the terms of the GNU Lesser General Public -* License as published by the Free Software Foundation; either -* version 2.1 of the License, or (at your option) any later version. -* -* This library is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -* Lesser General Public License for more details. -*/ - -Scholar.Ingester.MARC_Record = function() { // new MARC record - this.VERSIONE = '2.6.6b'; - this.VERSIONE_data ='2005-05-10'; - - this.leader = { - record_length:'00000', - record_status:'n', // acdnp - type_of_record:' ', - bibliographic_level:' ', - type_of_control:' ', - character_coding_scheme:' ', - indicator_count:'2', - subfield_code_length:'2', - base_address_of_data:'00000', - encoding_level:' ', - descriptive_cataloging_form:' ', - linked_record_requirement:' ', - entry_map:'4500' - }; // 24 chars - - this.field_terminator = '\x1E'; - this.record_terminator = '\x1D'; - this.subfield_delimiter = '\x1F'; - this.directory = ''; - this.directory_terminator = this.field_terminator; - this.variable_fields = new Array(); - return this; -} - -Scholar.Ingester.MARC_Record.prototype.load = function(s,f) { // loads record s passed in format f - if (f == 'binary') { - this.leader.record_length = '00000'; - this.leader.record_status = s.substr(5,1); - this.leader.type_of_record = s.substr(6,1); - this.leader.bibliographic_level = s.substr(7,1); - this.leader.type_of_control = s.substr(8,1); - this.leader.character_coding_scheme = s.substr(9,1); - this.leader.indicator_count = '2'; - this.leader.subfield_code_length = '2'; - this.leader.base_address_of_data = '00000'; - this.leader.encoding_level = s.substr(17,1); - this.leader.descriptive_cataloging_form = s.substr(18,1); - this.leader.linked_record_requirement = s.substr(19,1); - this.leader.entry_map = '4500'; - - this.directory = ''; - this.directory_terminator = this.field_terminator; - this.variable_fields = new Array(); - - // loads fields - var campi = s.split(this.field_terminator); - var k; - for (k=1; k<-1+campi.length; k++) { // the first and the last are unuseful - // the first is the header + directory, the last is the this.record_terminator - var tag = campi[0].substr(24+(k-1)*12,3); - var ind1 = ''; var ind2 = ''; var value = campi[k]; - if (tag.substr(0,2) != '00') { - ind1 = campi[k].substr(0,1); - ind2 = campi[k].substr(1,1); - value = campi[k].substr(2); - } - this.add_field(tag,ind1,ind2,value); - } - } else if (f == 'MARC_Harvard') { - var linee = s.split('\n'); - for (var i=0; i '008' && tag < '899') { // jumps low and high tags, also H03 and similia - if (tag != '040') this.add_field(tag,ind1,ind2,value); - } - } - this.add_field_005(); - } else if (f == 'MARC_BNI') { - var linee = s.split('\n'); - for (var i=0; i '008' && tag < '899') { // jumps low and high tags - if (tag != '040') this.add_field(tag,ind1,ind2,value); - } - } - this.add_field_005(); - } else if (f == 'MARC_Loc') { // MARC copiato dal browser dal sito catalog.loc.gov - var linee = s.split('\n'); - for (var i=0; i '008' && tag < '899') { // jumps low and high tags - if (tag != '040') this.add_field(tag,ind1,ind2,value); - } - } - this.add_field_005(); - } else if (f == 'MARC_PAC') { - var linee = s.split('\n'); - for (var i=0; i '008' && tag < '899') { // jumps low and high tags - if (tag != '040') this.add_field(tag,ind1,ind2,value); - } - } - this.add_field_005(); - } - - this.update_record_length(); - this.update_base_address_of_data(); - return this; -} - -Scholar.Ingester.MARC_Record.prototype.update_base_address_of_data = function() { // updates the base_address - this.leader.base_address_of_data = this._zero_fill(24+this.variable_fields.length*12+1,5); - return this.leader.base_address_of_data; -} - -Scholar.Ingester.MARC_Record.prototype.update_displacements = function() { // rebuilds the directory - var displ = 0; - this.directory = ''; - for (var i=0; i 0) return true; - return false; -} - -Scholar.Ingester.MARC_Record.prototype.MARC_field = function(rec,tag,ind1,ind2,value) { // new MARC gield - this.tag = tag; - this.occ = rec.count_occ(tag)+1; // occurrence order no. - this.ind1 = ind1; if (this.ind1 == '') this.ind1 = ' '; - this.ind2 = ind2; if (this.ind2 == '') this.ind2 = ' '; - if (tag.substr(0,2) == '00') { - this.ind1 = ''; this.ind2 = ''; - } - this.value = value; - return this; -} - -Scholar.Ingester.MARC_Record.prototype.display = function(type) { // displays record in format type - type = type.toLowerCase(); - if (type == 'binary') return this.show_leader() + - this.directory + - this.field_terminator + - this.show_fields() + - this.record_terminator; - if (type == 'html') { - var s = ''; - var l = R.show_leader(); - s += ''; - var i; - for (i=0; i'; - s += ''; - s += ''; - var v = this.variable_fields[i].value; - if (this.variable_fields[i].tag == '008') v = v.replace(/ /g,' '); - s += ''; - s += ''; - } - s += '
000'+l+'
'+ind1+''+ind2+''+this._ddagger(v)+'
'; - return s; - } - if (type == 'xml') { - s = ''; - s += ''; - s += ''+this.show_leader()+''; - // var i; - for (i=0; i'+this.variable_fields[i].value+''; - else { - var subfields = this.variable_fields[i].value.split(this.subfield_delimiter); - // alert(this.variable_fields[i].value+' '+subfields.length); // test - if (subfields.length == 1) subfields[1] = '?'+this.variable_fields[i].value; - var sf = ''; - for (var j=1; j'+subfields[j].substr(1)+''; - } - s += '' + sf + ''; - } - } - s += ''; - return s; - } - if (type == 'xml-html') { - s = this.display('xml'); - // abbellimenti - s = s.replace(/\/,'\n '); - s = s.replace(/\ - s = s.replace(/\/g,'>'); - // colore alle keyword - s = s.replace(/(controlfield|datafield|collection|record|leader|subfield)/g,'$1'); - s = s.replace(/(tag|code|ind1|ind2)=/g,'$1='); - return s; - } - return false; -} - -Scholar.Ingester.MARC_Record.prototype.get_field = function(tag) { // returns an array of values, one for each occurrence - var v = new Array(); var i; - for (i=0; i= this.directory.length) alert('Internal error!'); - this.directory = this.directory.substr(0,i) + this.directory.substr(i+12); - // updates lengths - this.update_base_address_of_data(); - this.update_displacements(); - this.update_record_length(); - return true; -} - -Scholar.Ingester.MARC_Record.prototype._ddagger = function(s) { // display doubledagger in html code - s = s.replace(/\%1F(.)/g, "‡$1"); - s = s.replace(/\x1F(.)/g, "‡$1"); - return s; -} - -Scholar.Ingester.MARC_Record.prototype._trim = function(s) { // eliminates blanks from both sides - s = s.replace(/\s+$/,''); - return s.replace(/^\s+/,''); -} - -Scholar.Ingester.MARC_Record.prototype._zero_fill = function(s,l) { // left '0' padding of s, up to l (l<=15) - var t = '000000000000000'; - t = t+s; - return t.substr(t.length-l,l); -} - -Scholar.Ingester.MARC_Record.prototype.version = function() { // returns version and date - return 'MARC Editor Lite '+this.VERSIONE+' ('+this.VERSIONE_data+')'; -} \ No newline at end of file diff --git a/chrome/chromeFiles/content/scholar/xpcom/translate.js b/chrome/chromeFiles/content/scholar/xpcom/translate.js index 232a9fbfd..703c517b7 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/translate.js +++ b/chrome/chromeFiles/content/scholar/xpcom/translate.js @@ -11,15 +11,15 @@ * * type can be: * export - * import (NOT IMPLEMENTED) - * web (NOT IMPLEMENTED) + * import + * web * * a typical export process: * var translatorObj = new Scholar.Translate(); * var possibleTranslators = translatorObj.getTranslators(); * // do something involving nsIFilePicker; remember, each possibleTranslator * // object has properties translatorID, label, and targetID - * translatorObj.setFile(myNsILocalFile); + * translatorObj.setLocation(myNsILocalFile); * translatorObj.setTranslator(possibleTranslators[x]); // also accepts only an ID * translatorObj.setHandler("done", _translationDone); * translatorObj.translate(); @@ -27,15 +27,19 @@ * * PUBLIC PROPERTIES: * - * type - the text type of translator (set by constructor) - * numeric type - the numeric type of translator (set by constructor) - * location - the location of the target (set by setLocation) + * type - the text type of translator (set by constructor, should be read only) + * browser - the browser object to be used for web scraping (read-only; set + * with setBrowser) + * translator - the translator currently in use (read-only; set with + * setTranslator) + * location - the location of the target (read-only; set with setLocation) * for import/export - this is an instance of nsILocalFile - * for web - this is a browser object - * translator - the translator currently in use (set by setTranslator) + * for web - this is a URL + * path - the path to the target; for web, this is the same as location * * PRIVATE PROPERTIES: * + * _numericTypes - possible numeric types as a comma-delimited string * _handlers - handlers for various events (see setHandler) * _configOptions - options set by translator modifying behavior of * Scholar.Translate @@ -43,68 +47,80 @@ * _waitForCompletion - whether to wait for asynchronous completion, or return * immediately when script has finished executing * _sandbox - sandbox in which translators will be executed + * _streams - streams that need to be closed when execution is complete + * + * WEB-ONLY PRIVATE PROPERTIES: + * + * _locationIsProxied - whether the URL being scraped is going through + * an EZProxy */ Scholar.Translate = function(type) { this.type = type; - if(this.type == "import") { - this.numericType = 1; - } else if(this.type == "export") { - this.numericType = 2; - } else if(this.type == "web") { - this.numericType = 3; + // import = 001 = 1 + // export = 010 = 2 + // web = 100 = 4 + + // combination types determined by addition or bitwise AND + // i.e., import+export = 1+2 = 3 + if(type == "import") { + this._numericTypes = "1,3,5,7"; + } else if(type == "export") { + this._numericTypes = "2,3,6,7"; + } else if(type == "web") { + this._numericTypes = "4,5,6,7"; + } else { + throw("invalid import type"); } this._handlers = new Array(); + this._streams = new Array(); } /* - * gets all applicable translators - * - * for import, you should call this after setFile; otherwise, you'll just get - * a list of all import filters, not filters equipped to handle a specific file + * sets the browser to be used for web translation; also sets the location */ -Scholar.Translate.prototype.getTranslators = function() { - this._generateSandbox(); - - if(this.type == "export") { - var sql = 'SELECT translatorID, label, target FROM translators WHERE type = ?'; - var translators = Scholar.DB.query(sql, [this.numericType]); - return translators; +Scholar.Translate.prototype.setBrowser = function(browser) { + this.browser = browser; + this.setLocation(browser.contentDocument.location.href); +} + +/* + * sets the location to operate upon (file should be an nsILocalFile object or + * web address) + */ +Scholar.Translate.prototype.setLocation = function(location) { + if(this.type == "web") { + // account for proxies + this.location = Scholar.Ingester.ProxyMonitor.proxyToProper(location); + if(this.location != location) { + // figure out if this URL is being proxies + this.locationIsProxied = true; + } + this.path = this.location; + } else { + this.location = location; + this.path = location.path; } } -/* - * sets the location to operate upon (file should be an nsILocalFile object) - */ -Scholar.Translate.prototype.setLocation = function(file) { - this.location = file; -} - /* * sets the translator to be used for import/export * * accepts either the object from getTranslators() or an ID */ Scholar.Translate.prototype.setTranslator = function(translator) { - if(typeof(translator) == "object") { + if(typeof(translator) == "object") { // passed an object and not an ID translator = translator.translatorID; } - var sql = 'SELECT * FROM translators WHERE translatorID = ? AND type = ?'; - this.translator = Scholar.DB.rowQuery(sql, [translator, this.numericType]); + var sql = "SELECT * FROM translators WHERE translatorID = ? AND type IN ("+this._numericTypes+")"; + this.translator = Scholar.DB.rowQuery(sql, [translator]); if(!this.translator) { return false; } - if(this.type == "export") { - // for export, we need to execute the translator detectCode to get - // options; for other types, this has already been done - this._executeDetectCode(this.translator); - } - - Scholar.debug("got translator "+translator); return true; } @@ -130,7 +146,7 @@ Scholar.Translate.prototype.setTranslator = function(translator) { * done * valid: all * called: when all processing is finished - * passed: return value of the processing function + * passed: returns true if successful, false if an error occurred * returns: N/A */ Scholar.Translate.prototype.setHandler = function(type, handler) { @@ -140,6 +156,55 @@ Scholar.Translate.prototype.setHandler = function(type, handler) { this._handlers[type].push(handler); } +/* + * gets all applicable translators + * + * for import, you should call this after setFile; otherwise, you'll just get + * a list of all import filters, not filters equipped to handle a specific file + * + * this returns a list of translator objects, of which the following fields + * are useful: + * + * translatorID - the GUID of the translator + * label - the name of the translator + * itemType - the type of item this scraper says it will scrape + */ +Scholar.Translate.prototype.getTranslators = function() { + var sql = "SELECT translatorID, label, target, detectCode FROM translators WHERE type IN ("+this._numericTypes+") ORDER BY target IS NULL"; + var translators = Scholar.DB.query(sql); + + if(!this.location) { + return translators; // no need to see which can translate, because + // we don't have a location yet (for export or + // import dialog) + } else { + // create a new sandbox + this._generateSandbox(); + + var possibleTranslators = new Array(); + Scholar.debug("searching for translators for "+this.path); + + // see which translators can translate + for(var i in translators) { + if(this._canTranslate(translators[i])) { + Scholar.debug("found translator "+translators[i].label); + + // for some reason, and i'm not quite sure what this reason is, + // we HAVE to do this to get things to work right; we can't + // just push a normal translator object from an SQL statement + var translator = {translatorID:translators[i].translatorID, + label:translators[i].label, + target:translators[i].target, + itemType:translators[i].itemType} + + possibleTranslators.push(translator); + } + } + + return possibleTranslators; + } +} + /* * gets translator options to be displayed in a dialog * @@ -148,28 +213,57 @@ Scholar.Translate.prototype.setHandler = function(type, handler) { Scholar.Translate.prototype.displayOptions = function() { } -/* - * does the actual translation - */ -Scholar.Translate.prototype.translate = function() { - this._complete = false; - Scholar.debug("converting using "+this.translator.label); +Scholar.Translate.prototype._loadTranslator = function() { + if(!this._sandbox) { + // create a new sandbox if none exists + this._generateSandbox(); + } + + // parse detect code for the translator + this._parseDetectCode(this.translator); + + Scholar.debug("parsing code for "+this.translator.label); try { Components.utils.evalInSandbox(this.translator.code, this._sandbox); } catch(e) { Scholar.debug(e+' in parsing code for '+this.translator.label); this._translationComplete(false); + return false; + } + + return true; +} + +/* + * does the actual translation + */ +Scholar.Translate.prototype.translate = function() { + + if(!this.location) { + throw("cannot translate: no location specified"); + } + + this._complete = false; + + if(!this._loadTranslator()) { return; } - if(this.type == "export") { - var returnValue = this._export(); + var returnValue; + if(this.type == "web") { + returnValue = this._web(); + } else if(this.type == "import") { + returnValue = this._import(); + } else if(this.type == "export") { + returnValue = this._export(); } - - // If synchronous, call _translationComplete(); - if(!this._waitForCompletion) { - this._translationComplete(returnValue); + if(!returnValue) { + // failure + this._translationComplete(false); + } else if(!this._waitForCompletion) { + // if synchronous, call _translationComplete(); + this._translationComplete(true); } } @@ -177,42 +271,145 @@ Scholar.Translate.prototype.translate = function() { * generates a sandbox for scraping/scraper detection */ Scholar.Translate.prototype._generateSandbox = function() { + var me = this; + if(this.type == "web") { - this._sandbox = new Components.utils.Sandbox(url); - this._sandbox.browser = this.browser; - this._sandbox.doc = this.browser.contentDocument; - this._sandbox.url = this.sandboxURL; - this._sandbox.utilities = new Scholar.Utilities.Ingester(this.window, this.proxiedURL, this.isHidden); - this._sandbox.utilities.HTTPUtilities = new Scholar.Utilities.Ingester.HTTPUtilities(this.proxiedURL); - this._sandbox.model = this.model; + // use real URL, not proxied version, to create sandbox + this._sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href); + this._sandbox.Scholar = new Object(); + + // add ingester utilities + this._sandbox.Scholar.Utilities = new Scholar.Utilities.Ingester(this.locationIsProxied); + this._sandbox.Scholar.Utilities.HTTPUtilities = new Scholar.Utilities.Ingester.HTTPUtilities(this.locationIsProxied); + + // set up selectItems handler + this._sandbox.Scholar.selectItems = function(options) { return me._selectItems(options) }; } else { + // use null URL to create sanbox this._sandbox = new Components.utils.Sandbox(""); - this._sandbox.utilities = new Scholar.Utilities(); + this._sandbox.Scholar = new Object(); + + this._sandbox.Scholar.Utilities = new Scholar.Utilities(); + } + + if(this.type == "web" || this.type == "import") { + // add routines to add new items + this._sandbox.Scholar.Item = Scholar.Translate.ScholarItem; + // attach the function to be run when an item is + this._sandbox.Scholar.Item.prototype.complete = function() {me._itemDone(this)}; + } else if(this.type == "export") { + // add routines to retrieve items and collections + this._sandbox.Scholar.nextItem = function() { return me._exportGetItem() }; + this._sandbox.Scholar.nextCollection = function() { return me._exportGetCollection() }; } this._sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult; - this._sandbox.MARC_Record = Scholar.Ingester.MARC_Record; - this._sandbox.MARC_Record.prototype = new Scholar.Ingester.MARC_Record(); + // for asynchronous operation, use wait() + // done() is implemented after wait() is called + this._sandbox.Scholar.wait = function() { me._enableAsynchronous() }; + // for adding configuration options + this._sandbox.Scholar.configure = function(option, value) {me._configure(option, value) }; + // for adding displayed options + this._sandbox.Scholar.addOption = function(option, value) {me._addOption(option, value) }; + + // for loading other translators and accessing their methods var me = this; - this._sandbox.wait = function() {me._enableAsynchronous() }; - this._sandbox.configure = function(option, value) {me._configure(option, value) }; - this._sandbox.addOption = function(option, value) {me._addOption(option, value) }; + this._sandbox.Scholar.loadTranslator = function(type, translatorID) { + var translation = new Scholar.Translate(type); + // assign same handlers as for parent, because the done handler won't + // get called anyway, and the itemDone/selectItems handlers should be + // the same + translation._handlers = me._handlers; + // set the translator + translation.setTranslator(translatorID); + // load the translator into our sandbox + translation._loadTranslator(); + // use internal io + translation._initializeInternalIO(); + return translation._sandbox; + } } /* - * executes translator detectCode, sandboxed + * Check to see if _scraper_ can scrape this document */ -Scholar.Translate.prototype._executeDetectCode = function(translator) { +Scholar.Translate.prototype._canTranslate = function(translator) { + var canTranslate = false; + + // Test location with regular expression + // If this is slow, we could preload all scrapers and compile regular + // expressions, so each check will be faster + if(translator.target) { + if(this.type == "web") { + var regularExpression = new RegExp(translator.target, "i"); + } else { + var regularExpression = new RegExp("\."+translator.target+"$", "i"); + } + + if(regularExpression.test(this.path)) { + canTranslate = true; + } + } + + // Test with JavaScript if available and didn't have a regular expression or + // passed regular expression test + if((!translator.target || canTranslate) + && translator.detectCode) { + // parse the detect code and execute + this._parseDetectCode(translator); + + if(this.type == "import") { + try { + this._importConfigureIO(); // so it can read + } catch(e) { + Scholar.debug(e+' in opening IO for '+translator.label); + return false; + } + } + + if(this._sandbox.detect) { + var returnValue; + + try { + if(this.type == "web") { + returnValue = this._sandbox.detect(this.browser.contentDocument, this.location); + } else if(this.type == "import") { + returnValue = this._sandbox.detect(); + } + } catch(e) { + Scholar.debug(e+' in executing detectCode for '+translator.label); + return false; + } + + Scholar.debug("executed detectCode for "+translator.label); + + // detectCode returns text type + if(returnValue) { + canTranslate = true; + + if(typeof(returnValue) == "string") { + translator.itemType = returnValue; + } + } else { + canTranslate = false; + } + } + } + + return canTranslate; +} +Scholar.Translate.prototype._parseDetectCode = function(translator) { this._configOptions = new Array(); this._displayOptions = new Array(); - Scholar.debug("executing detect code"); - try { - return Components.utils.evalInSandbox(translator.detectCode, this._sandbox); - } catch(e) { - Scholar.debug(e+' in executing detectCode for '+translator.label); - return; + if(translator.detectCode) { + try { + Components.utils.evalInSandbox(translator.detectCode, this._sandbox); + } catch(e) { + Scholar.debug(e+' in parsing detectCode for '+translator.label); + return; + } } } @@ -259,8 +456,22 @@ Scholar.Translate.prototype._addOption = function(option, value) { * called as wait() in translator code */ Scholar.Translate.prototype._enableAsynchronous = function() { + me = this; this._waitForCompletion = true; - this._sandbox.done = function(returnValue) { me._translationComplete(returnValue); }; + this._sandbox.Scholar.done = function() { me._translationComplete(true) }; +} + +/* + * lets user pick which items s/he wants to put in his/her library + * + * called as selectItems() in translator code + */ +Scholar.Translate.prototype._selectItems = function(options) { + if(this._handlers.select) { + return this._runHandler("select", options); + } else { // no handler defined; assume they want all of them + return options; + } } /* @@ -278,17 +489,213 @@ Scholar.Translate.prototype._translationComplete = function(returnValue) { // call handler this._runHandler("done", returnValue); + + // close open streams + this._closeStreams(); } } +/* + * closes open file streams, if any exist + */ +Scholar.Translate.prototype._closeStreams = function() { + if(this._streams.length) { + for(var i in this._streams) { + var stream = this._streams[i]; + + // stream could be either an input stream or an output stream + try { + stream.QueryInterface(Components.interfaces.nsIFileInputStream); + } catch(e) { + stream.QueryInterface(Components.interfaces.nsIFileOutputStream); + } + + // encase close in try block, because it's possible it's already + // closed + try { + stream.close(); + } catch(e) { + } + } + } + delete this._streams; + this._streams = new Array(); +} + +/* + * executed when an item is done and ready to be loaded into the database + */ +Scholar.Translate.prototype._itemDone = function(item) { + // Get typeID, defaulting to "website" + var type = (item.itemType ? item.itemType : "website"); + + // makes looping through easier + delete item.itemType, item.complete; + item.itemType = item.complete = undefined; + + var typeID = Scholar.ItemTypes.getID(type); + var newItem = Scholar.Items.getNewItemByType(typeID); + + if(item.date && !item.year) { + // date can serve as a year + var dateID = Scholar.ItemFields.getID("date"); + var yearID = Scholar.ItemFields.getID("year"); + if(!Scholar.ItemFields.isValidForType(dateID, typeID) && Scholar.ItemFields.isValidForType(yearID, typeID)) { + // year is valid but date is not + var yearRe = /[0-9]{4}/; + var m = yearRe.exec(item.date); + if(m) { + item.year = m[0] + item.date = undefined; + } + } + } else if(!item.date && item.year) { + // the converse is also true + var dateID = Scholar.ItemFields.getID("date"); + var yearID = Scholar.ItemFields.getID("year"); + if(Scholar.ItemFields.isValidForType(dateID, typeID) && !Scholar.ItemFields.isValidForType(yearID, typeID)) { + // date is valid but year is not + item.date = item.year; + item.year = undefined; + } + } + + Scholar.debug(item); + + var fieldID, field; + for(var i in item) { + // loop through item fields + data = item[i]; + + if(data) { // if field has content + if(i == "creators") { // creators are a special case + for(j in data) { + newItem.setCreator(j, data[j].firstName, data[j].lastName, 1); + } + } else if(i == "title") { // skip checks for title + newItem.setField(i, data); + } else if(i == "tags") { // add tags + for(j in data) { + newItem.addTag(data[j]); + } + } else if(fieldID = Scholar.ItemFields.getID(i)) { + // if field is in db + if(Scholar.ItemFields.isValidForType(fieldID, typeID)) { + // if field is valid for this type + // add field + newItem.setField(i, data); + } else { + Scholar.debug("discarded field "+i+" for item: field not valid for type "+type); + } + } else { + Scholar.debug("discarded field "+i+" for item: field does not exist"); + } + } + } + + delete item; + + this._runHandler("itemDone", newItem); +} + /* * calls a handler (see setHandler above) */ Scholar.Translate.prototype._runHandler = function(type, argument) { + var returnValue; if(this._handlers[type]) { for(var i in this._handlers[type]) { Scholar.debug("running handler "+i+" for "+type); - this._handlers[type][i](this, argument); + try { + returnValue = this._handlers[type][i](this, argument); + } catch(e) { + Scholar.debug(e+' in handler '+i+' for '+type); + } + } + } + return returnValue; +} + +/* + * does the actual web translation + */ +Scholar.Translate.prototype._web = function() { + try { + this._sandbox.doWeb(this.browser.contentDocument, this.location); + } catch(e) { + Scholar.debug(e+' in executing code for '+this.translator.label); + return false; + } + + return true; +} + +/* + * does the actual import translation + */ +Scholar.Translate.prototype._import = function() { + this._importConfigureIO(); + + try { + this._sandbox.doImport(); + } catch(e) { + Scholar.debug(e+' in executing code for '+this.translator.label); + return false; + } + + return true; +} + +/* + * sets up import for IO + */ +Scholar.Translate.prototype._importConfigureIO = function() { + if(this._configOptions.dataMode == "rdf") { + var IOService = Components.classes['@mozilla.org/network/io-service;1'] + .getService(Components.interfaces.nsIIOService); + var fileHandler = IOService.getProtocolHandler("file") + .QueryInterface(Components.interfaces.nsIFileProtocolHandler); + var URL = fileHandler.getURLSpecFromFile(this.location); + delete fileHandler, IOService; + + var RDFService = Components.classes['@mozilla.org/rdf/rdf-service;1'] + .getService(Components.interfaces.nsIRDFService); + var dataSource = RDFService.GetDataSourceBlocking(URL); + + // make an instance of the RDF handler + this._sandbox.Scholar.RDF = new Scholar.Translate.RDF(dataSource); + } else { + // open file + var fStream = Components.classes["@mozilla.org/network/file-input-stream;1"] + .createInstance(Components.interfaces.nsIFileInputStream); + fStream.init(this.location, 0x01, 0664, 0); + this._streams.push(fStream); + + if(this._configOptions.dataMode == "line") { // line by line reading + var notEof = true; + var lineData = new Object(); + + fStream.QueryInterface(Components.interfaces.nsILineInputStream); + + this._sandbox.Scholar.read = function() { + if(notEof) { + notEof = fStream.readLine(lineData); + return lineData.value; + } else { + return false; + } + } + } else { // block reading + var sStream = Components.classes["@mozilla.org/scriptableinputstream;1"] + .createInstance(Components.interfaces.nsIScriptableInputStream); + sStream.init(fStream); + + this._sandbox.Scholar.read = function(amount) { + return sStream.read(amount); + } + + // attach sStream to stack of streams to close + this._streams.push(sStream); } } } @@ -300,36 +707,21 @@ Scholar.Translate.prototype._export = function() { this._exportConfigureIO(); // get items - var itemObjects = Scholar.getItems(); - var itemArrays = new Array(); - for(var i in itemObjects) { - itemArrays.push(itemObjects[i].toArray()); - } - delete itemObjects; // free memory + this._itemsLeft = Scholar.getItems(); // get collections, if requested - var collectionArrays; if(this._configOptions.getCollections) { - var collectionObjects = Scholar.getCollections(); - collectionArrays = new Array(); - for(var i in collectionObjects) { - var collection = new Object(); - collection.id = collectionObjects[i].getID(); - collection.name = collectionObjects[i].getName(); - collection.type = "collection"; - collection.children = collectionObjects[i].toArray(); - - collectionArrays.push(collection); - } - delete collectionObjects; // free memory + this._collectionsLeft = Scholar.getCollections(); } try { - return this._sandbox.translate(itemArrays, collectionArrays); + this._sandbox.doExport(); } catch(e) { Scholar.debug(e+' in executing code for '+this.translator.label); - this._translationComplete(false); + return false; } + + return true; } /* @@ -337,17 +729,13 @@ Scholar.Translate.prototype._export = function() { */ Scholar.Translate.prototype._exportConfigureIO = function() { // open file - var foStream = Components.classes["@mozilla.org/network/file-output-stream;1"] + var fStream = Components.classes["@mozilla.org/network/file-output-stream;1"] .createInstance(Components.interfaces.nsIFileOutputStream); - foStream.init(this.location, 0x02 | 0x08 | 0x20, 0664, 0); // write, create, truncate + fStream.init(this.location, 0x02 | 0x08 | 0x20, 0664, 0); // write, create, truncate + // attach to stack of streams to close at the end + this._streams.push(fStream); - if(this._configOptions.dataMode == "rdf") { - /*** INITIALIZATION ***/ - var RDFService = Components.classes['@mozilla.org/rdf/rdf-service;1'].getService(Components.interfaces.nsIRDFService); - var IOService = Components.classes['@mozilla.org/network/io-service;1'].getService(Components.interfaces.nsIIOService); - var AtomService = Components.classes["@mozilla.org/atom-service;1"].getService(Components.interfaces.nsIAtomService); - var RDFContainerUtils = Components.classes["@mozilla.org/rdf/container-utils;1"].getService(Components.interfaces.nsIRDFContainerUtils); - + if(this._configOptions.dataMode == "rdf") { // rdf io // create data source var dataSource = Components.classes["@mozilla.org/rdf/datasource;1?name=xml-datasource"]. createInstance(Components.interfaces.nsIRDFDataSource); @@ -355,91 +743,319 @@ Scholar.Translate.prototype._exportConfigureIO = function() { var serializer = Components.classes["@mozilla.org/rdf/xml-serializer;1"]. createInstance(Components.interfaces.nsIRDFXMLSerializer); serializer.init(dataSource); + serializer.QueryInterface(Components.interfaces.nsIRDFXMLSource); - /*** FUNCTIONS ***/ - this._sandbox.model = new Object(); + // make an instance of the RDF handler + this._sandbox.Scholar.RDF = new Scholar.Translate.RDF(dataSource, serializer); - // writes an RDF triple - this._sandbox.model.addStatement = function(about, relation, value, literal) { - Scholar.debug("pre: model.addStatement("+about+", "+relation+", "+value+", "+literal+")"); - - if(!(about instanceof Components.interfaces.nsIRDFResource)) { - about = RDFService.GetResource(about); - } - if(!(value instanceof Components.interfaces.nsIRDFResource)) { - if(literal) { - value = RDFService.GetLiteral(value); - } else { - value = RDFService.GetResource(value); - } - } - - Scholar.debug("post: model.addStatement("+about+", "+relation+", "+value+", "+literal+")"); - - dataSource.Assert(about, RDFService.GetResource(relation), value, true); - } - - // creates an anonymous resource - this._sandbox.model.newResource = function() { return RDFService.GetAnonymousResource() }; - - // creates a new container - this._sandbox.model.newContainer = function(type, about) { - if(!(about instanceof Components.interfaces.nsIRDFResource)) { - about = RDFService.GetResource(about); - } - - type = type.toLowerCase(); - if(type == "bag") { - return RDFContainerUtils.MakeBag(dataSource, about); - } else if(type == "seq") { - return RDFContainerUtils.MakeSeq(dataSource, about); - } else if(type == "alt") { - return RDFContainerUtils.MakeAlt(dataSource, about); - } else { - throw "Invalid container type in model.newContainer"; - } - } - - // adds a new container (index optional) - this._sandbox.model.addContainerElement = function(about, element, index) { - if(!(about instanceof Components.interfaces.nsIRDFContainer)) { - if(!(about instanceof Components.interfaces.nsIRDFResource)) { - about = RDFService.GetResource(about); - } - var container = Components.classes["@mozilla.org/rdf/container;1"]. - createInstance(Components.interfaces.nsIRDFContainer); - container.Init(dataSource, about); - } - if(!(element instanceof Components.interfaces.nsIRDFResource)) { - element = RDFService.GetResource(element); - } - - if(index) { - about.InsertElementAt(element, index, true); - } else { - about.AppendElement(element); - } - } - - // sets a namespace - this._sandbox.model.addNamespace = function(prefix, uri) { - serializer.addNameSpace(AtomService.getAtom(prefix), uri); - } - - this.setHandler("done", function() { - serializer.QueryInterface(Components.interfaces.nsIRDFXMLSource); - serializer.Serialize(foStream); - - delete dataSource, RDFService, IOService, AtomService, RDFContainerUtils; - }); - } else { - /*** FUNCTIONS ***/ - // write just writes to the file - this._sandbox.write = function(data) { foStream.write(data, data.length) }; + this.setHandler("done", function() { serializer.Serialize(fStream) }); + } else { // regular io; write just writes to file + this._sandbox.Scholar.write = function(data) { fStream.write(data, data.length) }; } +} - this.setHandler("done", function() { - foStream.close(); - delete foStream; - }); +/* + * gets the next item to process (called as Scholar.nextItem() from code) + */ +Scholar.Translate.prototype._exportGetItem = function() { + if(this._itemsLeft.length != 0) { + var returnItem = this._itemsLeft.shift(); + return returnItem.toArray(); + } + return false; +} + +/* + * gets the next item to collection (called as Scholar.nextCollection() from code) + */ +Scholar.Translate.prototype._exportGetCollection = function() { + if(!this._collectionsLeft) { + throw("getCollections configure option not set; cannot retrieve collection"); + } + + if(this._collectionsLeft.length != 0) { + var returnItem = this._collectionsLeft.shift(); + var collection = new Object(); + collection.id = returnItem.getID(); + collection.name = returnItem.getName(); + collection.type = "collection"; + collection.children = returnItem.toArray(); + + return returnItem; + } +} + +/* + * sets up internal IO in such a way that both reading and writing are possible + * (for inter-scraper communications) + */ +Scholar.Translate.prototype._initializeInternalIO = function() { + if(this.type == "import" || this.type == "export") { + if(this._configOptions.dataMode == "rdf") { + // use an in-memory data source for internal IO + var dataSource = Components.classes["@mozilla.org/rdf/datasource;1?name=in-memory-datasource"]. + createInstance(Components.interfaces.nsIRDFDataSource); + + // make an instance of the RDF handler + this._sandbox.Scholar.RDF = new Scholar.Translate.RDF(dataSource); + } else { + // create a storage stream + var storageStream = Components.classes["@mozilla.org/storagestream;1"]. + createInstance(Components.interfaces.nsIStorageStream); + storageStream.init(4096, 4294967295, null); // virtually no size limit + + // set up write() method + var fStream = storageStream.getOutputStream(0); + this._sandbox.Scholar.write = function(data) { fStream.write(data, data.length) }; + + // set up read methods + var sStream; + var me = this; + if(this._configOptions.dataMode == "line") { // line by line reading + var lastCharacter; + + this._sandbox.Scholar.read = function() { + if(!sStream) { // allocate an fStream and sStream on the fly + // otherwise with no data we get an error + sStream = Components.classes["@mozilla.org/scriptableinputstream;1"] + .createInstance(Components.interfaces.nsIScriptableInputStream); + sStream.init(fStream.newInputStream(0)); + + // attach sStream to stack of streams to close + me._streams.push(sStream); + } + + var character = sStream.read(1); + if(!character) { + return false; + } + var string = ""; + + if(lastCharacter == "\r" && character == "\n") { + // if the last read got a cr, and this first char was + // an lf, ignore the lf + character = ""; + } + + while(character != "\n" && character != "\r" && character) { + string += character; + character = sStream.read(1); + } + + lastCharacter = character; + + return string; + } + } else { // block reading + this._sandbox.Scholar.read = function(amount) { + if(!sStream) { // allocate an fStream and sStream on the fly + // otherwise with no data we get an error + sStream = Components.classes["@mozilla.org/scriptableinputstream;1"] + .createInstance(Components.interfaces.nsIScriptableInputStream); + sStream.init(fStream.newInputStream(0)); + + // attach sStream to stack of streams to close + me._streams.push(sStream); + } + + return sStream.read(amount); + } + } + + // set Scholar.eof() to close the storage stream + this._sandbox.Scholar.eof = function() { + storageStream.QueryInterface(Components.interfaces.nsIOutputStream); + storageStream.close(); + } + } + } +} + +/* Scholar.Translate.ScholarItem: a class for generating new item from + * inside scraper code + * + * (this must be part of the prototype because it must be able to access + * methods relating to a specific instance of Scholar.Translate yet be called + * as a class) + */ + +Scholar.Translate.ScholarItem = function(itemType) { + // assign item type + this.itemType = itemType; + // generate creators array + this.creators = new Array(); + // generate notes array + this.notes = new Array(); + // generate tags array + this.tags = new Array(); +} + +/* Scholar.Translate.RDF: a class for handling RDF IO + * + * If an import/export translator specifies dataMode RDF, this is the interface, + * accessible from model.x + * + * In order to simplify things, all classes take in their resource/container + * as either the Mozilla native type or a string, but all + * return resource/containers as Mozilla native types (use model.toString to + * convert) + */ + +Scholar.Translate.RDF = function(dataSource, serializer) { + this._RDFService = Components.classes['@mozilla.org/rdf/rdf-service;1'] + .getService(Components.interfaces.nsIRDFService); + this._AtomService = Components.classes["@mozilla.org/atom-service;1"] + .getService(Components.interfaces.nsIAtomService); + this._RDFContainerUtils = Components.classes["@mozilla.org/rdf/container-utils;1"] + .getService(Components.interfaces.nsIRDFContainerUtils); + + this._dataSource = dataSource; + this._serializer = serializer; +} + +// turn an nsISimpleEnumerator into an array +Scholar.Translate.RDF.prototype._deEnumerate = function(enumerator) { + if(!(enumerator instanceof Components.interfaces.nsISimpleEnumerator)) { + return false; + } + + var resources = new Array(); + + while(enumerator.hasMoreElements()) { + var resource = enumerator.getNext(); + try { + resource.QueryInterface(Components.interfaces.nsIRDFLiteral); + resources.push(resource.Value); + } catch(e) { + resource.QueryInterface(Components.interfaces.nsIRDFResource); + resources.push(resource); + } + } + + if(resources.length) { + return resources; + } else { + return false; + } +} + +// get a resource as an nsIRDFResource, instead of a string +Scholar.Translate.RDF.prototype._getResource = function(about) { + if(!(about instanceof Components.interfaces.nsIRDFResource)) { + about = this._RDFService.GetResource(about); + } + return about; +} + +// USED FOR OUTPUT + +// writes an RDF triple +Scholar.Translate.RDF.prototype.addStatement = function(about, relation, value, literal) { + about = this._getResource(about); + + if(!(value instanceof Components.interfaces.nsIRDFResource)) { + if(literal) { + value = this._RDFService.GetLiteral(value); + } else { + value = this._RDFService.GetResource(value); + } + } + + this._dataSource.Assert(about, this._RDFService.GetResource(relation), value, true); +} + +// creates an anonymous resource +Scholar.Translate.RDF.prototype.newResource = function() { + return this._RDFService.GetAnonymousResource() +}; + +// creates a new container +Scholar.Translate.RDF.prototype.newContainer = function(type, about) { + about = this._getResource(about); + + type = type.toLowerCase(); + if(type == "bag") { + return this._RDFContainerUtils.MakeBag(this._dataSource, about); + } else if(type == "seq") { + return this._RDFContainerUtils.MakeSeq(this._dataSource, about); + } else if(type == "alt") { + return this._RDFContainerUtils.MakeAlt(this._dataSource, about); + } else { + throw "Invalid container type in model.newContainer"; + } +} + +// adds a new container element (index optional) +Scholar.Translate.RDF.prototype.addContainerElement = function(about, element, index) { + if(!(about instanceof Components.interfaces.nsIRDFContainer)) { + about = this._getResource(about); + var container = Components.classes["@mozilla.org/rdf/container;1"]. + createInstance(Components.interfaces.nsIRDFContainer); + container.Init(this._dataSource, about); + } + if(!(element instanceof Components.interfaces.nsIRDFResource)) { + element = this._RDFService.GetResource(element); + } + + if(index) { + about.InsertElementAt(element, index, true); + } else { + about.AppendElement(element); + } +} + +// sets a namespace +Scholar.Translate.RDF.prototype.addNamespace = function(prefix, uri) { + if(this._serializer) { // silently fail, in case the reason the scraper + // is failing is that we're using internal IO + this._serializer.addNameSpace(this._AtomService.getAtom(prefix), uri); + } +} + +// gets a resource's URI +Scholar.Translate.RDF.prototype.getResourceURI = function(resource) { + resource.QueryInterface(Components.interfaces.nsIRDFResource); + return resource.ValueUTF8; +} + +// USED FOR INPUT + +// gets all RDF resources +Scholar.Translate.RDF.prototype.getAllResources = function() { + var resourceEnumerator = this._dataSource.GetAllResources(); + return this._deEnumerate(resourceEnumerator); +} + +// gets arcs going in +Scholar.Translate.RDF.prototype.getArcsIn = function(resource) { + resource = this._getResource(resource); + + var arcEnumerator = this._dataSource.ArcLabelsIn(resource); + return this._deEnumerate(arcEnumerator); +} + +// gets arcs going out +Scholar.Translate.RDF.prototype.getArcsOut = function(resource) { + resource = this._getResource(resource); + + var arcEnumerator = this._dataSource.ArcLabelsOut(resource); + return this._deEnumerate(arcEnumerator); +} + +// gets source resources +Scholar.Translate.RDF.prototype.getSources = function(resource, property) { + property = this._getResource(property); + resource = this._getResource(resource); + + var enumerator = this._dataSource.GetSources(resource, property, true); + return this._deEnumerate(enumerator); +} + +// gets target resources +Scholar.Translate.RDF.prototype.getTargets = function(resource, property) { + property = this._getResource(property); + resource = this._getResource(resource); + + var enumerator = this._dataSource.GetTargets(resource, property, true); + return this._deEnumerate(enumerator); } \ No newline at end of file diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js index 40df4e30c..74385f822 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/utilities.js +++ b/chrome/chromeFiles/content/scholar/xpcom/utilities.js @@ -82,19 +82,29 @@ Scholar.Utilities.prototype.dateToISO = function(jsDate) { /* * Cleans extraneous punctuation off an author name */ -Scholar.Utilities.prototype.cleanAuthor = function(author) { +Scholar.Utilities.prototype.cleanAuthor = function(author, type, useComma) { author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); author = author.replace(/[\s\,\/\[\]\:\.]+$/, ''); author = author.replace(/ +/, ' '); - // Add period for initials - if(author.substring(author.length-2, author.length-1) == " ") { - author += "."; + if(useComma) { + // Add period for initials + if(author.substr(author.length-2, 1) == " ") { + author += "."; + } + var splitNames = author.split(', '); + if(splitNames.length > 1) { + var lastName = splitNames[0]; + var firstName = splitNames[1]; + } else { + var lastName = author; + } + } else { + var spaceIndex = author.lastIndexOf(" "); + var lastName = author.substring(spaceIndex+1); + var firstName = author.substring(0, spaceIndex); } - var splitNames = author.split(', '); - if(splitNames.length > 1) { - author = splitNames[1]+' '+splitNames[0]; - } - return author; + // TODO: take type into account + return {firstName:firstName, lastName:lastName, creatorType:type}; } /* @@ -141,7 +151,7 @@ Scholar.Utilities.prototype.getVersion = function() { /* * Get a page range, given a user-entered set of pages */ -Scholar.Utilities.prototype._pageRangeRegexp = /^\s*([0-9]+)-([0-9]+)\s*$/ +Scholar.Utilities.prototype._pageRangeRegexp = /^\s*([0-9]+)-([0-9]+)\s*$/; Scholar.Utilities.prototype.getPageRange = function(pages) { var pageNumbers; var m = this._pageRangeRegexp.exec(pages); @@ -155,8 +165,21 @@ Scholar.Utilities.prototype.getPageRange = function(pages) { return pageNumbers; } +/* + * provide inArray function + */ Scholar.Utilities.prototype.inArray = Scholar.inArray; +/* + * pads a number or other string with a given string on the left + */ +Scholar.Utilities.prototype.lpad = function(string, pad, length) { + while(string.length < length) { + string = pad + string; + } + return string; +} + /* * END SCHOLAR FOR FIREFOX EXTENSIONS */ @@ -169,10 +192,8 @@ Scholar.Utilities.prototype.inArray = Scholar.inArray; // Scholar.Utilities.Ingester extends Scholar.Utilities, offering additional // classes relating to data extraction specifically from HTML documents. -Scholar.Utilities.Ingester = function(myWindow, proxiedURL, isHidden) { - this.window = myWindow; +Scholar.Utilities.Ingester = function(proxiedURL) { this.proxiedURL = proxiedURL; - this.isHidden = isHidden; } Scholar.Utilities.Ingester.prototype = new Scholar.Utilities(); @@ -240,21 +261,6 @@ Scholar.Utilities.Ingester.prototype.getNodeString = function(doc, contextNode, return returnVar; } -/* - * Allows a user to select which items to scrape - */ -Scholar.Utilities.Ingester.prototype.selectItems = function(itemList) { - if(this.isHidden != true) { - // this is kinda ugly, mozillazine made me do it! honest! - var io = { dataIn:itemList, dataOut:null } - var newDialog = this.window.openDialog("chrome://scholar/content/ingester/selectitems.xul", - "_blank","chrome,modal,centerscreen,resizable=yes", io); - return io.dataOut; - } else { - return null; - } -} - /* * Grabs items based on URLs */ @@ -300,129 +306,19 @@ Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe, return availableItems; } -// These functions are for use by importMARCRecord. They're private, because, -// while they are useful, it's also nice if as many of our scrapers as possible -// are PiggyBank compatible, and if our scrapers used functions, that would -// break compatibility -Scholar.Utilities.Ingester.prototype._MARCCleanString = function(author) { - author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); - author = author.replace(/[\s\.\,\/\[\]\:]+$/, ''); - return author.replace(/ +/, ' '); -} - -Scholar.Utilities.Ingester.prototype._MARCCleanNumber = function(author) { - author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); - author = author.replace(/[\s\.\,\/\[\]\:]+$/, ''); - var regexp = /^[^ ]*/; - var m = regexp.exec(author); - if(m) { - return m[0]; - } -} -Scholar.Utilities.Ingester.prototype._MARCPullYear = function(text) { - var pullRe = /[0-9]+/; - var m = pullRe.exec(text); - if(m) { - return m[0]; - } -} - -Scholar.Utilities.Ingester.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) { - if(!part) { - part = 'a'; - } - var field = record.get_field_subfields(fieldNo); - Scholar.debug('Found '+field.length+' matches for '+fieldNo+part); - if(field) { - for(i in field) { - var value; - for(var j=0; j= 0) { - return "multiple"; -} else { - return "book"; +REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-28 23:08:00', 4, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/|s/)', +'function detect(doc, url) { + var searchRe = new RegExp(''^http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/|s/)''); + if(searchRe.test(doc.location.href)) { + return "multiple"; + } else { + return "book"; + } } ', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; +'function scrape(doc) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; -var namespace = doc.documentElement.namespaceURI; -var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; -} : null; - -function scrape(doc) { - uri = doc.location.href; + var newItem = new Scholar.Item("book"); + newItem.source = doc.location.href; // Retrieve authors - var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a''; - var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); - for (var i = 0; i < elmts.length; i++) { - var elmt = elmts[i]; - - model.addStatement(uri, prefixDC + ''creator'', utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here - } + try { + var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a''; + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + for (var i = 0; i < elmts.length; i++) { + var elmt = elmts[i]; + var author = Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue; + + newItem.creators.push(Scholar.Utilities.cleanAuthor(author, "author")); + } + } catch(ex) {} // Retrieve data from "Product Details" box var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]/div[@class="content"]/ul/li''; - var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); for (var i = 0; i < elmts.length; i++) { - var elmt = elmts[i]; - var attribute = utilities.cleanString(utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue); - if(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) { - var value = utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue); - if(attribute == "Publisher:") { - if(value.lastIndexOf("(") != -1) { - var date = value.substring(value.lastIndexOf("(")+1, value.length-1); - jsDate = new Date(date); - if(!isNaN(jsDate.valueOf())) { - date = utilities.dateToISO(jsDate); + try { + var elmt = elmts[i]; + var attribute = Scholar.Utilities.cleanString(Scholar.Utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue); + if(Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) { + var value = Scholar.Utilities.cleanString(Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue); + if(attribute == "Publisher:") { + if(value.lastIndexOf("(") != -1) { + var date = value.substring(value.lastIndexOf("(")+1, value.length-1); + jsDate = new Date(date); + if(!isNaN(jsDate.valueOf())) { + date = Scholar.Utilities.dateToISO(jsDate); + } + newItem.date = date; + + value = value.substring(0, value.lastIndexOf("(")-1); } - - value = value.substring(0, value.lastIndexOf("(")-1); + if(value.lastIndexOf(";") != -1) { + newItem.edition = value.substring(value.lastIndexOf(";")+2, value.length); + + value = value.substring(0, value.lastIndexOf(";")); + } + newItem.publisher = value; + /*} else if(attribute == "Language:") { + .addStatement(uri, prefixDC + ''language'', value);*/ + } else if(attribute == "ISBN:") { + newItem.ISBN = value; + /*} else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") { + .addStatement(uri, prefixDummy + ''pages'', value.substring(0, value.indexOf(" "))); + .addStatement(uri, prefixDC + ''medium'', attribute.substring(0, attribute.indexOf(":")));*/ } - if(value.lastIndexOf(";") != -1) { - var edition = value.substring(value.lastIndexOf(";")+2, value.length); - value = value.substring(0, value.lastIndexOf(";")); - } - model.addStatement(uri, prefixDC + ''publisher'', value); - model.addStatement(uri, prefixDC + ''date'', date); - model.addStatement(uri, prefixDC + ''hasVersion'', edition); - } else if(attribute == "Language:") { - model.addStatement(uri, prefixDC + ''language'', value); - } else if(attribute == "ISBN:") { - model.addStatement(uri, prefixDC + ''identifier'', ''ISBN ''+value); - } else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") { - model.addStatement(uri, prefixDummy + ''pages'', value.substring(0, value.indexOf(" "))); - model.addStatement(uri, prefixDC + ''medium'', attribute.substring(0, attribute.indexOf(":"))); } - } + } catch(ex) {} } var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]''; - var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); - var title = utilities.cleanString(utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue); + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + var title = Scholar.Utilities.cleanString(Scholar.Utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue); if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) { title = title.substring(0, title.lastIndexOf("(")-1); } - model.addStatement(uri, prefixDC + ''title'', title); - model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); + newItem.title = title; + + newItem.complete(); } -var searchRe = new RegExp(''^http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/)''); -var m = searchRe.exec(doc.location.href) -if(m) { - // Why can''t amazon use the same stylesheets - var xpath; - if(m == "gp/search/") { - xpath = ''//table[@class="searchresults"]''; +function doWeb(doc, url) { + var searchRe = new RegExp(''^http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/|s/)''); + var m = searchRe.exec(doc.location.href) + if(m) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + // Why can''t amazon use the same stylesheets + var xpath; + if(m == "exec/obidos/search-handle-url/") { + xpath = ''//table[@cellpadding="3"]''; + } else { + xpath = ''//table[@class="searchresults"]''; + } + + var searchresults = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + var items = Scholar.Utilities.getItemArray(doc, searchresults, ''^http://www\.amazon\.com/(gp/product/|exec/obidos/tg/detail/)'', ''^(Buy new|Hardcover|Paperback|Digital)$''); + items = Scholar.selectItems(items); + + if(!items) { + return true; + } + + var uris = new Array(); + for(var i in items) { + uris.push(i); + } + + Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) }, + function() { Scholar.done(); }, function() {}); + + Scholar.wait(); } else { - xpath = ''//table[@cellpadding="3"]''; + scrape(doc); } - - var searchresults = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); - var items = utilities.getItemArray(doc, searchresults, ''^http://www\.amazon\.com/(gp/product/|exec/obidos/tg/detail/)'', ''^(Buy new|Hardcover|Paperback|Digital)$''); - items = utilities.selectItems(items); - - if(!items) { - return true; - } - - var uris = new Array(); - for(i in items) { - uris.push(i); - } - - utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) }, - function() { done(); }, function() {}); - - wait(); -} else { - scrape(doc); }'); -REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-26 16:01:00', 3, 'WorldCat Scraper', 'Simon Kornblith', '^http://(?:new)?firstsearch\.oclc\.org/WebZ/', -'if(doc.title == ''FirstSearch: WorldCat Detailed Record'') { - return "book"; -} else if(doc.title == ''FirstSearch: WorldCat List of Records'') { - return "multiple"; -} -return false;', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -var sessionRegexp = /(?:\?|\:)sessionid=([^?:]+)(?:\?|\:|$)/; -var numberRegexp = /(?:\?|\:)recno=([^?:]+)(?:\?|\:|$)/; -var resultsetRegexp = /(?:\?|\:)resultset=([^?:]+)(?:\?|\:|$)/; -var hostRegexp = new RegExp("http://([^/]+)/"); - -var uri = doc.location.href; - -var sMatch = sessionRegexp.exec(uri); -var sessionid = sMatch[1]; - -var hMatch = hostRegexp.exec(uri); -var host = hMatch[1]; - -var newUri, exportselect; - -if(doc.title == ''FirstSearch: WorldCat Detailed Record'') { - var publisherRegexp = /^(.*), (.*?),?$/; - - var nMatch = numberRegexp.exec(uri); - if(nMatch) { - var number = nMatch[1]; - } else { - number = 1; +REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-26 16:01:00', 4, 'WorldCat Scraper', 'Simon Kornblith', '^http://(?:new)?firstsearch\.oclc\.org/WebZ/', +'function detect(doc, url) { + if(doc.title == ''FirstSearch: WorldCat Detailed Record'') { + return "book"; + } else if(doc.title == ''FirstSearch: WorldCat List of Records'') { + return "multiple"; } - - var rMatch = resultsetRegexp.exec(uri); - if(rMatch) { - var resultset = rMatch[1]; - } else { - // It''s in an XPCNativeWrapper, so we have to do this black magic - resultset = doc.forms.namedItem(''main'').elements.namedItem(''resultset'').value; - } - - exportselect = ''record''; - newUri = ''http://''+host+''/WebZ/DirectExport?numrecs=10:smartpage=directexport:entityexportnumrecs=10:entityexportresultset='' + resultset + '':entityexportrecno='' + number + '':sessionid='' + sessionid + '':entitypagenum=35:0''; - - var uris = new Array(newUri); -} else { - var items = utilities.getItemArray(doc, doc, ''/WebZ/FSFETCH\\?fetchtype=fullrecord'', ''^(See more details for locating this item|Detailed Record)$''); - items = utilities.selectItems(items); - - if(!items) { - return true; - } - - // Set BookMark cookie - for(i in items) { // Hack to get first item - var myCookie = sessionid+":"; - var rMatch = resultsetRegexp.exec(i); - var resultset = rMatch[1]; - break; - } - var uris = new Array(); - for(i in items) { - var nMatch = numberRegexp.exec(i); - myCookie += resultset+"_"+nMatch[1]+","; - uris.push(i); - } - myCookie = myCookie.substr(0, myCookie.length-1); - doc.cookie = "BookMark="+myCookie; - - exportselect = ''marked''; - newUri = ''http://''+host+''/WebZ/DirectExport?numrecs=10:smartpage=directexport:entityexportnumrecs=10:entityexportresultset='' + resultset + '':entityexportrecno=1:sessionid='' + sessionid + '':entitypagenum=29:0''; -} - -utilities.HTTPUtilities.doPost(newUri, ''exportselect=''+exportselect+''&exporttype=plaintext'', null, function(text) { - var lineRegexp = new RegExp(); - lineRegexp.compile("^([\\w() ]+): *(.*)$"); - - var k = 0; - var uri = uris[k]; - model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); - - var lines = text.split(''\n''); - for(var i=0;i/; - var stableURL, ISSN; - - for(i in lines) { - if(lines[i].substring(0,3) == "<1>") { - haveStarted = true; - } else if(newItemRe.test(lines[i])) { - if(!stableURL) { - if(ISSN) { - stableURL = "http://www.jstor.org/browse/"+ISSN; - } else { // Just make sure it''s unique - stableURL = k; - k++; - } - } - model.addStatement(stableURL, prefixRDF + "type", prefixDummy + "journalArticle", false); - for(i in data) { - if(data[i].length) { - for(j in data[i]) { - model.addStatement(stableURL, i, data[i][j]); - } - } - } - var data = newDataObject(); - delete ISSN; - delete stableURL; - } else if(lines[i].substring(2, 5) == " : " && haveStarted) { - var fieldCode = lines[i].substring(0, 2); - var fieldContent = utilities.cleanString(lines[i].substring(5)) - - if(fieldCode == "TI") { - data[prefixDC + "title"].push(fieldContent); - } else if(fieldCode == "AU") { - var authors = fieldContent.split(";"); - for(j in authors) { - var author = authors[j]; - if(author) { - var splitNames = author.split('', ''); - if(splitNames) { - author = splitNames[1]+'' ''+splitNames[0]; - } - data[prefixDC + "creator"].push(author); - } - } - } else if(fieldCode == "SO") { - data[prefixDummy + "publication"].push(fieldContent); - } else if(fieldCode == "VO") { - data[prefixDummy + "volume"].push(fieldContent); - } else if(fieldCode == "NO") { - data[prefixDummy + "number"].push(fieldContent); - } else if(fieldCode == "SE") { - data[prefixDummy + "series"].push(fieldContent); - } else if(fieldCode == "DA") { - var date = new Date(fieldContent.replace(".", "")); - if(isNaN(date.valueOf())) { - data[prefixDC + "date"].push(fieldContent); +function doWeb(doc, url) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + var saveCitations = new Array(); + + if(doc.title == "JSTOR: Search Results") { + var availableItems = new Object(); + + // Require link to match this + var tagRegexp = new RegExp(); + tagRegexp.compile(''citationAction=''); + + var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''/html/body/div[@class="indent"]/table/tbody/tr[td/span[@class="printDownloadSaveLinks"]]'', nsResolver); + // Go through table rows + for(var i=0; i/; + + var newItem = new Scholar.Item("journalArticle"); + + for(var i in lines) { + if(lines[i].substring(0,3) == "<1>") { + haveStarted = true; + } else if(newItemRe.test(lines[i])) { + itemComplete(newItem, url); + newItem = new Scholar.Item("journalArticle"); + } else if(lines[i].substring(2, 5) == " : " && haveStarted) { + var fieldCode = lines[i].substring(0, 2); + var fieldContent = Scholar.Utilities.cleanString(lines[i].substring(5)) + + if(fieldCode == "TI") { + newItem.title = fieldContent; + } else if(fieldCode == "AU") { + var authors = fieldContent.split(";"); + for(j in authors) { + if(authors[j]) { + newItem.creators.push(Scholar.Utilities.cleanAuthor(authors[j], "author", true)); + } + } + } else if(fieldCode == "SO") { + newItem.publication = fieldContent; + } else if(fieldCode == "VO") { + newItem.volume = fieldContent; + } else if(fieldCode == "NO") { + newItem.number = fieldContent; + } else if(fieldCode == "SE") { + newItem.series = fieldContent; + } else if(fieldCode == "DA") { + var date = new Date(fieldContent.replace(".", "")); + if(isNaN(date.valueOf())) { + newItem.date = fieldContent; + } else { + newItem.date = Scholar.Utilities.dateToISO(date); + } + } else if(fieldCode == "PP") { + newItem.pages = fieldContent; + } else if(fieldCode == "EI") { + newItem.source = fieldContent; + } else if(fieldCode == "IN") { + newItem.ISSN = fieldContent; + } else if(fieldCode == "PB") { + newItem.publisher = fieldContent; + } } } - } - - done(); - }); - }, function() {}); -}); + + // last item is complete + if(haveStarted) { + itemComplete(newItem, url); + } + + Scholar.done(); + }); + }, function() {}); + }); + + Scholar.wait(); +}'); -wait();'); - -REPLACE INTO "translators" VALUES ('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-26 16:01:00', 3, 'History Cooperative Scraper', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.html$|cgi-bin/search.cgi)', -'if(doc.title == "History Cooperative: Search Results") { - return "multiple"; -} else { - return "journalArticle"; +REPLACE INTO "translators" VALUES ('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-26 16:01:00', 4, 'History Cooperative Scraper', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.html$|cgi-bin/search.cgi)', +'function detect(doc, url) { + if(doc.title == "History Cooperative: Search Results") { + return "multiple"; + } else { + return "journalArticle"; + } }', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -function associateMeta(uri, metaTags, field, rdfUri) { +'function associateMeta(newItem, metaTags, field, scholarField) { var field = metaTags.namedItem(field); if(field) { - model.addStatement(uri, rdfUri, field.getAttribute("content"), false); + newItem[scholarField] = field.getAttribute("content"); } } function scrape(doc) { - var uri = doc.location.href; + var newItem = new Scholar.Item("journalArticle"); + newItem.source = doc.location.href; + var month, year; var metaTags = doc.getElementsByTagName("meta"); - associateMeta(uri, metaTags, "Title", prefixDC + "title"); - associateMeta(uri, metaTags, "Journal", prefixDummy + "publication"); - associateMeta(uri, metaTags, "Volume", prefixDummy + "volume"); - associateMeta(uri, metaTags, "Issue", prefixDummy + "number"); + associateMeta(newItem, metaTags, "Title", "title"); + associateMeta(newItem, metaTags, "Journal", "publication"); + associateMeta(newItem, metaTags, "Volume", "volume"); + associateMeta(newItem, metaTags, "Issue", "number"); var author = metaTags.namedItem("Author"); if(author) { var authors = author.getAttribute("content").split(" and "); for(j in authors) { - model.addStatement(uri, prefixDC + "creator", authors[j], false); + newItem.creators.push(Scholar.Utilities.cleanAuthor(authors[j], "author")); } } - var month = metaTags.namedItem("PublicationMonth"); + newItem.complete(); + + // don''t actually need date info for a journal article + /*var month = metaTags.namedItem("PublicationMonth"); var year = metaTags.namedItem("PublicationYear"); if(month && year) { - model.addStatement(uri, prefixDC + "date", month.getAttribute("content")+" "+year.getAttribute("content"), false); - } - - model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false); + odel.addStatement(uri, prefixDC + "date", month.getAttribute("content")+" "+year.getAttribute("content"), false); + }*/ } -if(doc.title == "History Cooperative: Search Results") { - var items = utilities.getItemArray(doc, doc, ''^http://[^/]+/journals/.+/.+/.+\.html$''); - items = utilities.selectItems(items); - - if(!items) { - return true; +function doWeb(doc, url) { + if(doc.title == "History Cooperative: Search Results") { + var items = Scholar.Utilities.getItemArray(doc, doc, ''^http://[^/]+/journals/.+/.+/.+\.html$''); + items = Scholar.selectItems(items); + + if(!items) { + return true; + } + + var uris = new Array(); + for(var i in items) { + uris.push(i); + } + + Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) }, + function() { Scholar.done(); }, function() {}); + + Scholar.wait(); + } else { + scrape(doc); } - - var uris = new Array(); - for(i in items) { - uris.push(i); - } - - utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) }, - function() { done(); }, function() {}); - - wait(); -} else { - scrape(doc); }'); -REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-06-28 22:52:00', 3, 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)', -'// First, check to see if the URL alone reveals InnoPAC, since some sites don''t reveal the MARC button -var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$''); -if(matchRegexp.test(doc.location.href)) { - return "book"; -} -// Next, look for the MARC button -var namespace = doc.documentElement.namespaceURI; -var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; -} : null; - -var xpath = ''//a[img[@alt="MARC Display"]]''; -var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); -if(elmts.length) { - return "book"; -} -// Also, check for links to an item display page -var tags = doc.getElementsByTagName("a"); -for(i=0; i ''008'' && tag < ''899'') { // jumps low and high tags + if (tag != ''040'') record.add_field(tag,ind1,ind2,value); + } + } + + record.translate(newItem); + newItem.complete(); + + Scholar.done(); + }, function() {}); + } else { // Search results page + // Require link to match this + var tagRegexp = new RegExp(); + tagRegexp.compile(''^http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/frameset''); + + var checkboxes = new Array(); + var urls = new Array(); + var availableItems = new Array(); + + var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//table[@class="browseScreen"]//tr[td/input[@type="checkbox"]]'', nsResolver); + // Go through table rows + for(var i=0; i= 0) { - model.addStatement(uri, prefixRDF + "type", prefixDummy + "magazineArticle", false); + newItem.itemType = "magazineArticle"; } else if(value.indexOf("newspaper") >= 0) { - model.addStatement(uri, prefixRDF + "type", prefixDummy + "newspaperArticle", false); - } else { - model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); + newItem.itemType = "newspaperArticle"; + } else { // TODO: support thesis + newItem.itemType = "book"; } } } else if(field == "isbn" || field == "issn" || field == "issn/isbn") { - var value = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); + var value = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); if(value) { var type; - value = utilities.superCleanString(value.nodeValue); + value = Scholar.Utilities.superCleanString(value.nodeValue); if(value.length == 10 || value.length == 13) { - type = "ISBN"; + newItem.ISBN = value; } else if(value.length == 8) { - type = "ISSN"; - } - if(type) { - model.addStatement(uri, prefixDC + "identifier", type+" "+value, false); + newItem.ISSN = value; } } } } + + newItem.complete(); } -if(doc.title == "Results") { - var items = new Object(); +function doWeb(doc, url) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; - // Require link to match this - var tagRegexp = new RegExp(); - tagRegexp.compile(''^http://[^/]+/pqdweb\\?((?:.*&)?did=.*&Fmt=[12]|(?:.*&)Fmt=[12].*&did=)''); - - var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr[@class="rowUnMarked"]/td[3][@class="textMedium"]'', nsResolver); - // Go through table rows - for(var i=0; i]*>/gi); - model.addStatement(uri, prefixDummy + "publication", elementParts[elementParts.length-1], true); + newItem.publication = elementParts[elementParts.length-1]; var dateRegexp = /]*>(?:)?([A-Z][a-z]+)(?:<\/b>)? ([0-9]+, [0-9]{4})/; var m = dateRegexp.exec(centerElements[centerElements.length-1].innerHTML); if(m) { var jsDate = new Date(m[1]+" "+m[2]); - model.addStatement(uri, prefixDC + "date", utilities.dateToISO(jsDate), true); + newItem.date = Scholar.Utilities.dateToISO(jsDate); } else { var elementParts = centerElements[centerElements.length-1].innerHTML.split(/]*>/gi); - model.addStatement(uri, prefixDC + "date", elementParts[1], true); + newItem.date = elementParts[1]; } var cutIndex = citationDataDiv.innerHTML.indexOf("BODY:"); @@ -1326,1147 +1319,1114 @@ function scrape(doc) { citationData = citationDataDiv.innerHTML; } - citationData = utilities.cleanTags(citationData); + citationData = Scholar.Utilities.cleanTags(citationData); var headlineRegexp = /\n(?:HEADLINE|TITLE|ARTICLE): ([^\n]+)\n/; var m = headlineRegexp.exec(citationData); if(m) { - model.addStatement(uri, prefixDC + "title", utilities.cleanTags(m[1]), true); + newItem.title = Scholar.Utilities.cleanTags(m[1]); } var bylineRegexp = /\nBYLINE: *(\w[\w\- ]+)/; var m = bylineRegexp.exec(citationData); - if(m) { + if(m) { // there is a byline; use it as an author if(m[1].substring(0, 3).toLowerCase() == "by ") { m[1] = m[1].substring(3); } - model.addStatement(uri, prefixDC + "creator", m[1], true); - model.addStatement(uri, prefixRDF + "type", prefixDummy + "newspaperArticle", false); - } else { - model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false); + newItem.creators.push(Scholar.Utilities.cleanAuthor(m[1], "author")); + + newItem.itemType = "newspaperArticle"; + } else { // no byline; must be a journal + newItem.itemType = "journalArticle"; } - var authorRegexp = /\n(?:AUTHOR|NAME): ([^\n]+)\n/; + // other ways authors could be encoded + var authorRegexp = /\n(?:AUTHOR|NAME): ([^\n]+)\n/; var m = authorRegexp.exec(citationData); if(m) { var authors = m[1].split(/, (?:and )?/); - for(i in authors) { - model.addStatement(uri, prefixDC + "creator", authors[i].replace(" *", ""), true); + for(var i in authors) { + newItem.creators.push(Scholar.Utilities.cleanAuthor(authors[i].replace(" *", ""), "author")); } } + + newItem.complete(); } -var detailRe = new RegExp("^http://[^/]+/universe/document"); -if(detailRe.test(doc.location.href)) { - scrape(doc); -} else { - var items = utilities.getItemArray(doc, doc, "^http://[^/]+/universe/document"); - items = utilities.selectItems(items); - - if(!items) { - return true; +function doWeb(doc, url) { + var detailRe = new RegExp("^http://[^/]+/universe/document"); + if(detailRe.test(doc.location.href)) { + scrape(doc); + } else { + var items = Scholar.Utilities.getItemArray(doc, doc, "^http://[^/]+/universe/document"); + items = Scholar.selectItems(items); + + if(!items) { + return true; + } + + var uris = new Array(); + for(var i in items) { + uris.push(i); + } + + Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) }, + function() { Scholar.done(); }, function() {}); + + Scholar.wait(); } - - var uris = new Array(); - for(i in items) { - uris.push(i); - } - - utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) }, - function() { done(); }, function() {}); - - wait(); }'); -REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006-06-26 16:01:00', 3, 'Aleph Scraper', 'Simon Kornblith', '^http://[^/]+/F(?:/[A-Z0-9\-]+(?:\?.*)?$|\?func=find)', -'var singleRe = new RegExp("^http://[^/]+/F/[A-Z0-9\-]+\?.*func=full-set-set.*\&format=[0-9]{3}"); - -if(singleRe.test(doc.location.href)) { - return "book"; -} else { - var tags = doc.getElementsByTagName("a"); - for(var i=0; i 3) { - var ind1 = field.charAt(3); - if(field.length > 4) { - var ind2 = field.charAt(4); - } - } - record.add_field(code, ind1, ind2, value); - } - } - utilities.importMARCRecord(record, uri, model); -}, function() { done(); }, function() {}); - -wait();'); - -REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006-06-26 16:01:00', 3, 'Dynix Scraper', 'Simon Kornblith', 'ipac\.jsp\?.*(?:uri=full=[0-9]|menu=search)', -'var detailsRe = new RegExp(''ipac\.jsp\?.*uri=full=[0-9]''); -if(detailsRe.test(doc.location.href)) { - return "book"; -} else { - return "multiple"; -}', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -var uri = doc.location.href; -var detailsRe = new RegExp(''ipac\.jsp\?.*uri=full=[0-9]''); - -var uris = new Array(); -if(detailsRe.test(uri)) { - uris.push(uri+''&fullmarc=true''); -} else { - var items = utilities.getItemArray(doc, doc, "ipac\.jsp\?.*uri=full=[0-9]|^javascript:buildNewList\\(''.*uri%3Dfull%3D[0-9]"); - items = utilities.selectItems(items); - - if(!items) { - return true; - } - - var buildNewList = new RegExp("^javascript:buildNewList\\(''([^'']+)"); - - var uris = new Array(); - for(i in items) { - var m = buildNewList.exec(i); - if(m) { - uris.push(unescape(m[1]+''&fullmarc=true'')); - } else { - uris.push(i+''&fullmarc=true''); - } - } -} - -utilities.processDocuments(browser, null, uris, function(newBrowser) { - var newDoc = newBrowser.contentDocument; - var uri = newDoc.location.href; - - var namespace = newDoc.documentElement.namespaceURI; - var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; - } : null; - - var xpath = ''//form/table[@class="tableBackground"]/tbody/tr/td/table[@class="tableBackground"]/tbody/tr[td[1]/a[@class="normalBlackFont1"]]''; - var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); - var record = new MARC_Record(); - for(var i=0; i 3) { + var ind1 = field.charAt(3); + if(field.length > 4) { + var ind2 = field.charAt(4); } } + record.add_field(code, ind1, ind2, value); + } + } + + var newItem = new Scholar.Item(); + newItem.source = uri; + record.translate(newItem); + newItem.complete(); + }, function() { Scholar.done(); }, function() {}); + + Scholar.wait(); +}'); + +REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006-06-26 16:01:00', 4, 'Dynix Scraper', 'Simon Kornblith', 'ipac\.jsp\?.*(?:uri=full=[0-9]|menu=search)', +'function detect(doc, url) { + var detailsRe = new RegExp(''ipac\.jsp\?.*uri=full=[0-9]''); + if(detailsRe.test(doc.location.href)) { + return "book"; + } else { + return "multiple"; + } +}', +'function scrape(doc, url) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var uri = doc.location.href; + var detailsRe = new RegExp(''ipac\.jsp\?.*uri=full=[0-9]''); + + var uris = new Array(); + if(detailsRe.test(uri)) { + uris.push(uri+''&fullmarc=true''); + } else { + var items = Scholar.Utilities.getItemArray(doc, doc, "ipac\.jsp\?.*uri=full=[0-9]|^javascript:buildNewList\\(''.*uri%3Dfull%3D[0-9]"); + items = Scholar.selectItems(items); + + if(!items) { + return true; + } + + var buildNewList = new RegExp("^javascript:buildNewList\\(''([^'']+)"); + + var uris = new Array(); + for(var i in items) { + var m = buildNewList.exec(i); + if(m) { + uris.push(unescape(m[1]+''&fullmarc=true'')); + } else { + uris.push(i+''&fullmarc=true''); } } } - items = utilities.selectItems(items); + var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); - if(!items) { - return true; - } - - for(i in items) { - utilities.debugPrint(i.replace(/function=[A-Z]{7}/, "function=MARCSCR")); - newUris.push(i.replace(/function=[A-Z]{7}/, "function=MARCSCR")); - } -} - -utilities.processDocuments(browser, null, newUris, function(newBrowser) { - var newDoc = newBrowser.contentDocument; - var uri = newDoc.location.href - - var namespace = newDoc.documentElement.namespaceURI; - var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; - } : null; - - var xpath = ''//table[@class="outertable"]/tbody/tr[td[4]]''; - var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); - var record = new MARC_Record(); - for(var i=0; i 0) { - return "multiple"; -} else { - return "book"; +REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006-06-26 16:01:00', 4, 'VTLS Scraper', 'Simon Kornblith', '/chameleon(?:\?|$)', +'function detect(doc, url) { + var node = Scholar.Utilities.getNode(doc, doc, ''//tr[@class="intrRow"]/td/table/tbody/tr[th]'', null); + if(node) { + return "multiple"; + } + var node = Scholar.Utilities.getNode(doc, doc, ''//a[text()="marc"]'', null); + if(node) { + return "book"; + } }', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -var checkItems = false; - -if(doc.location.href.indexOf("/authority_hits") > 0) { +'function doWeb(doc, url) { var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; - checkItems = utilities.gatherElementsOnXPath(doc, doc, "/html/body//ol/li", nsResolver); -} - -if(checkItems && checkItems.length) { - var items = utilities.getItemArray(doc, checkItems, ''https?://.*/web2/tramp2\.exe/see_record''); - items = utilities.selectItems(items); + var uri = doc.location.href; + var newUris = new Array(); - if(!items) { - return true; - } + var marcs = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//a[text()="marc"]'', nsResolver); - var uris = new Array(); - for(i in items) { - uris.push(i); - } -} else { - var uris = new Array(doc.location.href); -} - -for(i in uris) { - var uri = uris[i]; - var uriRegexp = /^(https?:\/\/.*\/web2\/tramp2\.exe\/)(?:goto|see\_record|authority\_hits)(\/.*)\?(?:screen=Record\.html\&)?(.*)$/i; - var m = uriRegexp.exec(uri); - if(uri.indexOf("/authority_hits") < 0) { - var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc&"+m[3]; + if(marcs.length == 1) { + newUris.push(marcs[0].href) } else { - var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc"; - } - - // Keep track of how many requests have been completed - var j = 0; - - utilities.HTTPUtilities.doGet(newUri, null, function(text) { - var record = new MARC_Record(); - record.load(text, "binary"); - utilities.importMARCRecord(record, uris[j], model); - j++; - if(j == uris.length) { - done(); + // Require link to match this + var tagRegexp = new RegExp(); + tagRegexp.compile("/chameleon\?.*function=CARDSCR"); + + var items = new Array(); + + var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//tr[@class="intrRow"]'', nsResolver); + // Go through table rows + for(var i=0; i 0) { - return "multiple"; -} else { - return "book"; -}', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -var uri = doc.location.href; - -var uris = new Array(); - -if(uri.indexOf("/GeacQUERY") > 0) { - var items = utilities.getItemArray(doc, doc, "(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html)"); - items = utilities.selectItems(items); - - if(!items) { - return true; } + var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); + + Scholar.Utilities.processDocuments(null, newUris, function(newBrowser) { + var newDoc = newBrowser.contentDocument; + var uri = newDoc.location.href + + var namespace = newDoc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var xpath = ''//table[@class="outertable"]/tbody/tr[td[4]]''; + var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); + var record = new marc.MARC_Record(); + for(var i=0; i 0) { + return "multiple"; + } else { + return "book"; + } +}', +'function doWeb(doc, url) { + var checkItems = false; + + if(doc.location.href.indexOf("/authority_hits") > 0) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + checkItems = Scholar.Utilities.gatherElementsOnXPath(doc, doc, "/html/body//ol/li", nsResolver); + } + + if(checkItems && checkItems.length) { + var items = Scholar.Utilities.getItemArray(doc, checkItems, ''https?://.*/web2/tramp2\.exe/see_record''); + items = Scholar.selectItems(items); + + if(!items) { + return true; + } + + var uris = new Array(); + for(var i in items) { + uris.push(i); + } + } else { + var uris = new Array(doc.location.href); + } + + for(var i in uris) { + var uri = uris[i]; + var uriRegexp = /^(https?:\/\/.*\/web2\/tramp2\.exe\/)(?:goto|see\_record|authority\_hits)(\/.*)\?(?:screen=Record\.html\&)?(.*)$/i; + var m = uriRegexp.exec(uri); + if(uri.indexOf("/authority_hits") < 0) { + var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc&"+m[3]; + } else { + var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc"; + } + + // Keep track of how many requests have been completed + var j = 0; + + var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); + + Scholar.Utilities.HTTPUtilities.doGet(newUri, null, function(text) { + var record = new marc.MARC_Record(); + record.load(text, "binary"); + + var newItem = new Scholar.Item(); + newItem.source = uris[j]; + record.translate(record, newItem); + newItem.complete(); + + j++; + if(j == uris.length) { + Scholar.done(); + } + }); + } + Scholar.wait(); +}'); + + +REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 4, 'GEAC Scraper', 'Simon Kornblith', '/(?:GeacQUERY|(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))', +'function detect(doc, url) { + if(doc.location.href.indexOf("/GeacQUERY") > 0) { + return "multiple"; + } else { + return "book"; + } +}', +'function doWeb(doc, url) { + var uri = doc.location.href; + var uris = new Array(); - for(i in items) { - var newUri = i.replace(/([:&])next=html\/geacnffull.html/, "$1next=html/marc.html"); + + if(uri.indexOf("/GeacQUERY") > 0) { + var items = Scholar.Utilities.getItemArray(doc, doc, "(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html)"); + items = Scholar.selectItems(items); + + if(!items) { + return true; + } + + var uris = new Array(); + for(var i in items) { + var newUri = i.replace(/([:&])next=html\/geacnffull.html/, "$1next=html/marc.html"); + newUri = newUri.replace(/([:&])next=html\/record.html/, "$1next=html/marc.html"); + uris.push(newUri); + } + } else { + var newUri = uri.replace(/([:&])next=html\/geacnffull.html/, "$1next=html/marc.html"); newUri = newUri.replace(/([:&])next=html\/record.html/, "$1next=html/marc.html"); uris.push(newUri); } -} else { - var newUri = uri.replace(/([:&])next=html\/geacnffull.html/, "$1next=html/marc.html"); - newUri = newUri.replace(/([:&])next=html\/record.html/, "$1next=html/marc.html"); - uris.push(newUri); -} - -utilities.processDocuments(browser, null, uris, function(newBrowser) { - var newDoc = newBrowser.contentDocument; - var uri = newDoc.location.href; - var namespace = newDoc.documentElement.namespaceURI; - var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; - } : null; + var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); - var record = new MARC_Record(); - - var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, ''//pre/text()'', nsResolver); - var tag, ind1, ind2, content; - - for(var i=0; i 10) { - ind1 = line.substring(4, 5); - ind2 = line.substring(5, 6); - content = line.substring(7); - content = content.replace(/\$([a-z])(?: |$)/g, record.subfield_delimiter+"$1"); - } else { - ind1 = ""; - ind2 = ""; - content = line.substring(4); - } - - } - - utilities.importMARCRecord(record, uri, model); -}, function() { done(); }, function() {}); - -wait();'); - -REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-26 16:01:00', 3, 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', -'var namespace = doc.documentElement.namespaceURI; -var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; -} : null; - -var elmts = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p/text()[1]'', nsResolver); -for(i in elmts) { - if(utilities.superCleanString(elmts[i].nodeValue) == "Viewing record") { - return "book"; - } -} -var xpath = ''//form[@name="hitlist"]/table/tbody/tr''; -var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); -if(elmts.length) { - return "multiple"; -} -return false;', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -var namespace = doc.documentElement.namespaceURI; -var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; -} : null; - -// Cheap hack to convert HTML entities -function unescapeHTML(text) { - var div = doc.createElement("div"); - div.innerHTML = utilities.cleanTags(text); - var text = div.childNodes[0] ? div.childNodes[0].nodeValue : null; - delete div; - return text; -} - -var uri = doc.location.href; -var recNumbers = new Array(); - -var xpath = ''//form[@name="hitlist"]/table/tbody/tr''; -var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); -if(elmts.length) { // Search results page - var uriRegexp = /^http:\/\/[^\/]+/; - var m = uriRegexp.exec(uri); - var postAction = doc.forms.namedItem("hitlist").getAttribute("action"); - var newUri = m[0]+postAction.substr(0, postAction.length-1)+"40" - - var titleRe = /
\s*(.*[^\s])\s*
/i; - - var items = new Array(); - - for(var i=0; i"); - texts = texts[1].split(""); - text = unescapeHTML(texts[0]); - var documents = text.split("*** DOCUMENT BOUNDARY ***"); - - for(var j=1; j 10) { - ind1 = line.substr(6, 1); - ind2 = line.substr(7, 1); - content = line.substr(8); + ind1 = line.substring(4, 5); + ind2 = line.substring(5, 6); + content = line.substring(7); + content = content.replace(/\$([a-z])(?: |$)/g, record.subfield_delimiter+"$1"); } else { ind1 = ""; ind2 = ""; - content = line.substring(6); + content = line.substring(4); } - } - utilities.importMARCRecord(record, uri, model); - } - done(); -}); - -wait();'); - -REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-26 16:01:00', 3, 'TLC/YouSeeMore Scraper', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?(?:.*LabelDisplay.*RecordNumber=[0-9]|Search|ItemTitles)', -'var detailRe = new RegExp("TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]"); -if(detailRe.test(doc.location.href)) { - return "book"; -} else { - return "multiple"; -}', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -var namespace = doc.documentElement.namespaceURI; -var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; -} : null; - -var detailRe = new RegExp("TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]"); -var uri = doc.location.href; -var newUris = new Array(); - -if(detailRe.test(uri)) { - newUris.push(uri.replace("LabelDisplay", "MARCDisplay")); -} else { - var items = utilities.getItemArray(doc, doc, ''TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]''); - items = utilities.selectItems(items); - - if(!items) { - return true; - } - - for(i in items) { - newUris.push(i.replace("LabelDisplay", "MARCDisplay")); - } -} - -utilities.processDocuments(browser, null, newUris, function(newBrowser) { - var newDoc = newBrowser.contentDocument; - var uri = newDoc.location.href; - - var namespace = newDoc.documentElement.namespaceURI; - var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; - } : null; - - var record = new MARC_Record(); - - var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, ''/html/body/table/tbody/tr[td[4]]'', nsResolver); - var tag, ind1, ind2, content; - - for(var i=0; i 1) { - var data = newDataObject(); - for(i in lines) { - var fieldCode = lines[i].substring(0, 2); - var fieldContent = utilities.cleanString(lines[i].substring(6)) - - if(fieldCode == "T1") { - data[prefixDC + "title"].push(fieldContent); - } else if(fieldCode == "A1") { - var authors = fieldContent.split(";"); - for(j in authors) { - var author = authors[j]; - if(author) { - var splitNames = author.split('', ''); - if(splitNames) { - author = splitNames[1]+'' ''+splitNames[0]; - } - data[prefixDC + "creator"].push(author); - } - } - } else if(fieldCode == "JF") { - data[prefixDummy + "publication"].push(fieldContent); - } else if(fieldCode == "VL") { - data[prefixDummy + "volume"].push(fieldContent); - } else if(fieldCode == "IS") { - data[prefixDummy + "number"].push(fieldContent); - } else if(fieldCode == "Y1") { - data[prefixDC + "year"].push(fieldContent); - } else if(fieldCode == "PP") { - data[prefixDummy + "pages"].push(fieldContent); - } else if(fieldCode == "UR") { - stableURL = fieldContent; - } else if(fieldCode == "SN") { - data[prefixDC + "identifier"].push("ISSN "+fieldContent); - ISSN = fieldContent; - } else if(fieldCode == "PB") { - data[prefixDC + "publisher"].push(fieldContent); - } - } - model.addStatement(stableURL, prefixRDF + "type", prefixDummy + "journalArticle", false); - for(i in data) { - if(data[i].length) { - for(j in data[i]) { - model.addStatement(stableURL, i, data[i][j]); - } - } - } - } - } - done(); - }, function() {}); - }, function() {}); - - wait(); -} else { - var uri = doc.location.href; - - var elmts = utilities.gatherElementsOnXPath(doc, doc, ''//comment()'', nsResolver); - for(i in elmts) { - if(elmts[i].nodeValue.substr(0, 10) == "HeaderData") { - var headerRegexp = /HeaderData((?:.|\n)*)\#\#EndHeaders/i - var m = headerRegexp.exec(elmts[i].nodeValue); - var headerData = m[1]; - } - } - - // Use E4X rather than DOM/XPath, because the Mozilla gods have decided not to - // expose DOM/XPath to sandboxed scripts - var newDOM = new XML(headerData); - - function mapRDF(text, rdfUri) { - if(text) { - model.addStatement(uri, rdfUri, text, true); - } - } - - mapRDF(newDOM.journal.text(), prefixDummy + "publication"); - mapRDF(newDOM.volume.text(), prefixDummy + "volume"); - mapRDF(newDOM.issue.text(), prefixDummy + "number"); - mapRDF(newDOM.year.text(), prefixDummy + "year"); - mapRDF(newDOM.pubdate.text(), prefixDC + "date"); - mapRDF(newDOM.doctitle.text(), prefixDC + "title"); - - // Do ISSN - var issn = newDOM.issn.text(); - if(issn) { - model.addStatement(uri, prefixDC + "identifier", "ISSN "+issn.replace(/[^0-9]/g, ""), true); - } - - // Do pages - var fpage = newDOM.fpage.text(); - var lpage = newDOM.lpage.text(); - if(fpage != "") { - var pages = fpage; - if(lpage) { - pages += "-"+lpage; - } - model.addStatement(uri, prefixDummy + "pages", pages, true); - } - - // Do authors - var elmts = newDOM.docauthor; - for(i in elmts) { - var fname = elmts[i].fname.text(); - var surname = elmts[i].surname.text(); - model.addStatement(uri, prefixDC + "creator", fname+" "+surname, true); - } - - model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false); + Scholar.wait(); }'); -REPLACE INTO "translators" VALUES ('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-06-26 16:01:00', 3, 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)', -'if(doc.location.href.indexOf("list_uids=") >= 0) { - return "journalArticle"; -} else { - return "multiple"; -}', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -function mapRDF(uri, text, rdfUri) { - if(text != "") { - model.addStatement(uri, rdfUri, text, true); - } -} - -var uri = doc.location.href; -var ids = new Array(); -var idRegexp = /[\?\&]list_uids=([0-9\,]+)/; - -var m = idRegexp.exec(uri); -if(m) { - ids.push(m[1]); -} else { +REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-26 16:01:00', 4, 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', +'function detect(doc, url) { var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; - var items = new Array(); - var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''//div[@class="ResultSet"]/table/tbody'', nsResolver); - // Go through table rows - for(var i=0; i]*>/, "").replace(/<\?xml[^>]*\?>/, ""); - - var xml = new XML(text); - - for(var i=0; i\s*(.*[^\s])\s*
/i; + + var items = new Array(); + + for(var i=0; i"); + texts = texts[1].split(""); + text = unescapeHTML(texts[0]); + var documents = text.split("*** DOCUMENT BOUNDARY ***"); + + for(var j=1; j 10) { + ind1 = line.substr(6, 1); + ind2 = line.substr(7, 1); + content = line.substr(8); + } else { + ind1 = ""; + ind2 = ""; + content = line.substring(6); + } + } + + var newItem = new Scholar.Item(); + newItem.source = uri; + record.translate(newItem); + newItem.complete(); + } + Scholar.done(); + }); + + Scholar.wait(); +}'); -wait();'); +REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-26 16:01:00', 4, 'TLC/YouSeeMore Scraper', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?(?:.*LabelDisplay.*RecordNumber=[0-9]|Search|ItemTitles)', +'function detect(doc, url) { + var detailRe = new RegExp("TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]"); + if(detailRe.test(doc.location.href)) { + return "book"; + } else { + return "multiple"; + } +}', +'function doWeb(doc, url) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var detailRe = new RegExp("TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]"); + var uri = doc.location.href; + var newUris = new Array(); + + if(detailRe.test(uri)) { + newUris.push(uri.replace("LabelDisplay", "MARCDisplay")); + } else { + var items = Scholar.Utilities.getItemArray(doc, doc, ''TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]''); + items = Scholar.selectItems(items); + + if(!items) { + return true; + } + + for(var i in items) { + newUris.push(i.replace("LabelDisplay", "MARCDisplay")); + } + } + + var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); + + Scholar.Utilities.processDocuments(null, newUris, function(newBrowser) { + var newDoc = newBrowser.contentDocument; + var uri = newDoc.location.href; + + var namespace = newDoc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var record = new marc.MARC_Record(); + + var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, ''/html/body/table/tbody/tr[td[4]]'', nsResolver); + var tag, ind1, ind2, content; + + for(var i=0; i]*>/, "").replace(/<\?xml[^>]*\?>/, ""); + + var xml = new XML(text); + + for(var i=0; i; var modsCollection = ; - for(var i in items) { - var item = items[i]; - + var item; + while(item = Scholar.nextItem()) { var isPartialItem = false; - if(utilities.inArray(item.itemType, partialItemTypes)) { + if(Scholar.Utilities.inArray(item.itemType, partialItemTypes)) { isPartialItem = true; } @@ -2607,7 +2566,7 @@ function translate(items, collections) { } // XML tag recordInfo.recordOrigin; used to store our generator note - //mods.recordInfo.recordOrigin = "Scholar for Firefox "+utilities.getVersion(); + //mods.recordInfo.recordOrigin = "Scholar for Firefox "+Scholar.Utilities.getVersion(); /** FIELDS ON NEARLY EVERYTHING BUT NOT A PART OF THE CORE **/ @@ -2646,7 +2605,7 @@ function translate(items, collections) { // XML tag detail; object field volume if(item.volume) { - if(utilities.isInt(item.volume)) { + if(Scholar.Utilities.isInt(item.volume)) { part += {item.volume}; } else { part += {item.volume}; @@ -2655,7 +2614,7 @@ function translate(items, collections) { // XML tag detail; object field number if(item.number) { - if(utilities.isInt(item.number)) { + if(Scholar.Utilities.isInt(item.number)) { part += {item.number}; } else { part += {item.number}; @@ -2664,7 +2623,7 @@ function translate(items, collections) { // XML tag detail; object field section if(item.section) { - if(utilities.isInt(item.section)) { + if(Scholar.Utilities.isInt(item.section)) { part += {item.section}; } else { part += {item.section}; @@ -2673,7 +2632,7 @@ function translate(items, collections) { // XML tag detail; object field pages if(item.pages) { - var range = utilities.getPageRange(item.pages); + var range = Scholar.Utilities.getPageRange(item.pages); part += {range[0]}{range[1]}; } @@ -2804,33 +2763,35 @@ function translate(items, collections) { } modsCollection.rdf::RDF = rdfDoc;*/ - write(''''+"\n"); - write(modsCollection.toXMLString()); + Scholar.write(''''+"\n"); + Scholar.write(modsCollection.toXMLString()); }'); REPLACE INTO "translators" VALUES ('14763d24-8ba0-45df-8f52-b8d1108e7ac9', '2006-07-07 12:44:00', 2, 'Biblio/DC/FOAF/PRISM/VCard (RDF/XML)', 'Simon Kornblith', 'rdf', -'configure("getCollections", true); -configure("dataMode", "rdf");', +'Scholar.configure("getCollections", true); +Scholar.configure("dataMode", "rdf"); +Scholar.addOption("exportNotes", true); +Scholar.addOption("exportFileData", true);', 'function generateSeeAlso(resource, seeAlso) { for(var i in seeAlso) { - model.addStatement(resource, n.dc+"relation", itemResources[seeAlso[i]], false); + Scholar.RDF.addStatement(resource, n.dc+"relation", itemResources[seeAlso[i]], false); } } function generateCollection(collection) { var collectionResource = "#collection:"+collection.id; - model.addStatement(collectionResource, rdf+"type", n.bib+"Collection", false); + Scholar.RDF.addStatement(collectionResource, rdf+"type", n.bib+"Collection", false); for(var i in collection.children) { var child = collection.children[i]; // add child list items if(child.type == "collection") { - model.addStatement(collectionResource, n.dc+"hasPart", "#collection:"+child.id, false); + Scholar.RDF.addStatement(collectionResource, n.dc+"hasPart", "#collection:"+child.id, false); // do recursive processing of collections generateCollection(child); } else { - model.addStatement(collectionResource, n.dc+"hasPart", itemResources[child.id], false); + Scholar.RDF.addStatement(collectionResource, n.dc+"hasPart", itemResources[child.id], false); } } } @@ -2840,9 +2801,9 @@ function getContainerIfExists() { if(containerElement) { return containerElement; } else { - containerElement = model.newResource(); + containerElement = Scholar.RDF.newResource(); // attach container to section (if exists) or resource - model.addStatement((section ? section : resource), n.dcterms+"isPartOf", containerElement, false); + Scholar.RDF.addStatement((section ? section : resource), n.dcterms+"isPartOf", containerElement, false); return containerElement; } } else { @@ -2850,7 +2811,7 @@ function getContainerIfExists() { } } -function translate(items, collections) { +function doExport() { rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; n = { @@ -2864,7 +2825,7 @@ function translate(items, collections) { // add namespaces for(var i in n) { - model.addNamespace(i, n[i]); + Scholar.RDF.addNamespace(i, n[i]); } // leave as global @@ -2888,7 +2849,8 @@ function translate(items, collections) { } } - for(var i in items) { + var item; + while(item = Scholar.nextItem()) { // these items are global item = items[i]; resource = itemResources[item.itemID]; @@ -2901,7 +2863,7 @@ function translate(items, collections) { // title if(item.title) { - model.addStatement(resource, n.dc+"title", item.title, true); + Scholar.RDF.addStatement(resource, n.dc+"title", item.title, true); } // type @@ -2938,18 +2900,18 @@ function translate(items, collections) { type = "Memo"; } if(type) { - model.addStatement(resource, rdf+"type", n.bib+type, false); + Scholar.RDF.addStatement(resource, rdf+"type", n.bib+type, false); } // authors/editors/contributors var creatorContainers = new Object(); for(var j in item.creators) { - var creator = model.newResource(); - model.addStatement(creator, rdf+"type", n.foaf+"Person", false); + var creator = Scholar.RDF.newResource(); + Scholar.RDF.addStatement(creator, rdf+"type", n.foaf+"Person", false); // gee. an entire vocabulary for describing people, and these aren''t even // standardized in it. oh well. using them anyway. - model.addStatement(creator, n.foaf+"surname", item.creators[j].lastName, true); - model.addStatement(creator, n.foaf+"givenname", item.creators[j].firstName, true); + Scholar.RDF.addStatement(creator, n.foaf+"surname", item.creators[j].lastName, true); + Scholar.RDF.addStatement(creator, n.foaf+"givenname", item.creators[j].firstName, true); // in addition, these tags are not yet in Biblio, but Bruce D''Arcus // says they will be. @@ -2962,142 +2924,142 @@ function translate(items, collections) { } if(!creatorContainers[cTag]) { - var creatorResource = model.newResource(); + var creatorResource = Scholar.RDF.newResource(); // create new seq for author type - creatorContainers[cTag] = model.newContainer("seq", creatorResource); + creatorContainers[cTag] = Scholar.RDF.newContainer("seq", creatorResource); // attach container to resource - model.addStatement(resource, n.bib+cTag, creatorResource, false); + Scholar.RDF.addStatement(resource, n.bib+cTag, creatorResource, false); } - model.addContainerElement(creatorContainers[cTag], creator, true); + Scholar.RDF.addContainerElement(creatorContainers[cTag], creator, true); } /** FIELDS ON NEARLY EVERYTHING BUT NOT A PART OF THE CORE **/ // source if(item.source) { - model.addStatement(resource, n.dc+"source", item.source, true); + Scholar.RDF.addStatement(resource, n.dc+"source", item.source, true); } // accessionNumber as generic ID if(item.accessionNumber) { - model.addStatement(resource, n.dc+"identifier", item.accessionNumber, true); + Scholar.RDF.addStatement(resource, n.dc+"identifier", item.accessionNumber, true); } // rights if(item.rights) { - model.addStatement(resource, n.dc+"rights", item.rights, true); + Scholar.RDF.addStatement(resource, n.dc+"rights", item.rights, true); } /** SUPPLEMENTAL FIELDS **/ // use section to set up another container element if(item.section) { - section = model.newResource(); // leave as global + section = Scholar.RDF.newResource(); // leave as global // set section type - model.addStatement(section, rdf+"type", n.bib+"Part", false); + Scholar.RDF.addStatement(section, rdf+"type", n.bib+"Part", false); // set section title - model.addStatement(section, n.dc+"title", item.section, true); + Scholar.RDF.addStatement(section, n.dc+"title", item.section, true); // add relationship to resource - model.addStatement(resource, n.dc+"isPartOf", section, false); + Scholar.RDF.addStatement(resource, n.dc+"isPartOf", section, false); } // use ISSN to set up container element if(item.ISSN) { containerElement = "urn:issn:"+item.ISSN; // leave as global // attach container to section (if exists) or resource - model.addStatement((section ? section : resource), n.dcterms+"isPartOf", containerElement, false); + Scholar.RDF.addStatement((section ? section : resource), n.dcterms+"isPartOf", containerElement, false); } // publication gets linked to container via isPartOf if(item.publication) { - model.addStatement(getContainerIfExists(), n.dc+"title", item.publication, true); + Scholar.RDF.addStatement(getContainerIfExists(), n.dc+"title", item.publication, true); } // series also linked in if(item.series) { - var series = model.newResource(); + var series = Scholar.RDF.newResource(); // set series type - model.addStatement(series, rdf+"type", n.bib+"Series", false); + Scholar.RDF.addStatement(series, rdf+"type", n.bib+"Series", false); // set series title - model.addStatement(series, n.dc+"title", item.series, true); + Scholar.RDF.addStatement(series, n.dc+"title", item.series, true); // add relationship to resource - model.addStatement(getContainerIfExists(), n.dcterms+"isPartOf", series, false); + Scholar.RDF.addStatement(getContainerIfExists(), n.dcterms+"isPartOf", series, false); } // volume if(item.volume) { - model.addStatement(getContainerIfExists(), n.prism+"volume", item.volume, true); + Scholar.RDF.addStatement(getContainerIfExists(), n.prism+"volume", item.volume, true); } // number if(item.number) { - model.addStatement(getContainerIfExists(), n.prism+"number", item.number, true); + Scholar.RDF.addStatement(getContainerIfExists(), n.prism+"number", item.number, true); } // edition if(item.edition) { - model.addStatement(resource, n.prism+"edition", item.edition, true); + Scholar.RDF.addStatement(resource, n.prism+"edition", item.edition, true); } // publisher/distributor and place if(item.publisher || item.distributor || item.place) { - var organization = model.newResource(); + var organization = Scholar.RDF.newResource(); // set organization type - model.addStatement(organization, rdf+"type", n.foaf+"Organization", false); + Scholar.RDF.addStatement(organization, rdf+"type", n.foaf+"Organization", false); // add relationship to resource - model.addStatement(resource, n.dc+"publisher", organization, false); + Scholar.RDF.addStatement(resource, n.dc+"publisher", organization, false); // add publisher/distributor if(item.publisher) { - model.addStatement(organization, n.foaf+"name", item.publisher, true); + Scholar.RDF.addStatement(organization, n.foaf+"name", item.publisher, true); } else if(item.distributor) { - model.addStatement(organization, n.foaf+"name", item.distributor, true); + Scholar.RDF.addStatement(organization, n.foaf+"name", item.distributor, true); } // add place if(item.place) { - var address = model.newResource(); + var address = Scholar.RDF.newResource(); // set address type - model.addStatement(address, rdf+"type", n.vcard+"Address", false); + Scholar.RDF.addStatement(address, rdf+"type", n.vcard+"Address", false); // set address locality - model.addStatement(address, n.vcard+"locality", item.place, true); + Scholar.RDF.addStatement(address, n.vcard+"locality", item.place, true); // add relationship to organization - model.addStatement(organization, n.vcard+"adr", address, false); + Scholar.RDF.addStatement(organization, n.vcard+"adr", address, false); } } // date/year if(item.date) { - model.addStatement(resource, n.dc+"date", item.date, true); + Scholar.RDF.addStatement(resource, n.dc+"date", item.date, true); } else if(item.year) { - model.addStatement(resource, n.dc+"year", item.year, true); + Scholar.RDF.addStatement(resource, n.dc+"year", item.year, true); } // callNumber if(item.callNumber) { - var term = model.newResource(); + var term = Scholar.RDF.newResource(); // set term type - model.addStatement(term, rdf+"type", n.dcterms+"LCC", false); + Scholar.RDF.addStatement(term, rdf+"type", n.dcterms+"LCC", false); // set callNumber value - model.addStatement(term, rdf+"value", item.callNumber, true); + Scholar.RDF.addStatement(term, rdf+"value", item.callNumber, true); // add relationship to resource - model.addStatement(resource, n.dc+"subject", term, false); + Scholar.RDF.addStatement(resource, n.dc+"subject", term, false); } // archiveLocation if(item.archiveLocation) { - model.addStatement(resource, n.dc+"coverage", item.archiveLocation, true); + Scholar.RDF.addStatement(resource, n.dc+"coverage", item.archiveLocation, true); } // medium if(item.medium) { - model.addStatement(resource, n.dc+"medium", item.medium, true); + Scholar.RDF.addStatement(resource, n.dc+"medium", item.medium, true); } // type (not itemType) if(item.type) { - model.addStatement(resource, n.dc+"type", item.type, true); + Scholar.RDF.addStatement(resource, n.dc+"type", item.type, true); } else if(item.thesisType) { - model.addStatement(resource, n.dc+"type", item.thesisType, true); + Scholar.RDF.addStatement(resource, n.dc+"type", item.thesisType, true); } // THIS IS NOT YET IN THE BIBLIO NAMESPACE, BUT BRUCE D''ARCUS HAS SAID // IT WILL BE SOON if(item.pages) { - model.addStatement(resource, n.bib+"pages", item.pages, true); + Scholar.RDF.addStatement(resource, n.bib+"pages", item.pages, true); } /** NOTES **/ @@ -3106,25 +3068,25 @@ function translate(items, collections) { var noteResource = itemResources[item.notes[j].itemID]; // add note tag - model.addStatement(noteResource, rdf+"type", n.bib+"Memo", false); + Scholar.RDF.addStatement(noteResource, rdf+"type", n.bib+"Memo", false); // add note description (sorry, couldn''t find a better way of // representing this data in an existing ontology) - model.addStatement(noteResource, n.dc+"description", item.notes[j].note, true); + Scholar.RDF.addStatement(noteResource, n.dc+"description", item.notes[j].note, true); // add relationship between resource and note - model.addStatement(resource, n.dcterms+"isReferencedBy", noteResource, false); + Scholar.RDF.addStatement(resource, n.dcterms+"isReferencedBy", noteResource, false); // Add see also info to RDF generateSeeAlso(item.notes[j].itemID, item.notes[j].seeAlso); } if(item.note) { - model.addStatement(resource, n.dc+"description", item.note, true); + Scholar.RDF.addStatement(resource, n.dc+"description", item.note, true); } /** TAGS **/ for(var j in item.tags) { - model.addStatement(resource, n.dc+"subject", item.tags[j], true); + Scholar.RDF.addStatement(resource, n.dc+"subject", item.tags[j], true); } // Add see also info to RDF @@ -3134,31 +3096,24 @@ function translate(items, collections) { } /** RDF COLLECTION STRUCTURE **/ - for(var i in collections) { - generateCollection(collections[i]); + var collection; + while(collection = Scholar.nextCollection()) { + generateCollection(collection); } }'); REPLACE INTO "translators" VALUES ('6e372642-ed9d-4934-b5d1-c11ac758ebb7', '2006-07-05 23:40:00', 2, 'Unqualified Dublin Core (RDF/XML)', 'Simon Kornblith', 'rdf', -'configure("dataMode", "rdf");', -'function translate(items) { - var partialItemTypes = ["bookSection", "journalArticle", "magazineArticle", "newspaperArticle"]; - +'Scholar.configure("dataMode", "rdf");', +'function doExport() { var dc = "http://purl.org/dc/elements/1.1/"; - model.addNamespace("dc", dc); + Scholar.RDF.addNamespace("dc", dc); - for(var i in items) { - var item = items[i]; - + var item; + while(item = Scholar.nextItem()) { if(item.itemType == "note") { continue; } - var isPartialItem = false; - if(utilities.inArray(item.itemType, partialItemTypes)) { - isPartialItem = true; - } - var resource; if(item.ISBN) { resource = "urn:isbn:"+item.ISBN; @@ -3166,18 +3121,18 @@ REPLACE INTO "translators" VALUES ('6e372642-ed9d-4934-b5d1-c11ac758ebb7', '2006 resource = item.url; } else { // just specify a node ID - resource = model.newResource(); + resource = Scholar.RDF.newResource(); } /** CORE FIELDS **/ // title if(item.title) { - model.addStatement(resource, dc+"title", item.title, true); + Scholar.RDF.addStatement(resource, dc+"title", item.title, true); } // type - model.addStatement(resource, dc+"type", item.itemType, true); + Scholar.RDF.addStatement(resource, dc+"type", item.itemType, true); // creators for(var j in item.creators) { @@ -3188,9 +3143,9 @@ REPLACE INTO "translators" VALUES ('6e372642-ed9d-4934-b5d1-c11ac758ebb7', '2006 } if(item.creators[j].creatorType == "author") { - model.addStatement(resource, dc+"creator", creator, true); + Scholar.RDF.addStatement(resource, dc+"creator", creator, true); } else { - model.addStatement(resource, dc+"contributor", creator, true); + Scholar.RDF.addStatement(resource, dc+"contributor", creator, true); } } @@ -3198,17 +3153,17 @@ REPLACE INTO "translators" VALUES ('6e372642-ed9d-4934-b5d1-c11ac758ebb7', '2006 // source if(item.source) { - model.addStatement(resource, dc+"source", item.source, true); + Scholar.RDF.addStatement(resource, dc+"source", item.source, true); } // accessionNumber as generic ID if(item.accessionNumber) { - model.addStatement(resource, dc+"identifier", item.accessionNumber, true); + Scholar.RDF.addStatement(resource, dc+"identifier", item.accessionNumber, true); } // rights if(item.rights) { - model.addStatement(resource, dc+"rights", item.rights, true); + Scholar.RDF.addStatement(resource, dc+"rights", item.rights, true); } /** SUPPLEMENTAL FIELDS **/ @@ -3217,84 +3172,349 @@ REPLACE INTO "translators" VALUES ('6e372642-ed9d-4934-b5d1-c11ac758ebb7', '2006 // publisher/distributor if(item.publisher) { - model.addStatement(resource, dc+"publisher", item.publisher, true); + Scholar.RDF.addStatement(resource, dc+"publisher", item.publisher, true); } else if(item.distributor) { - model.addStatement(resource, dc+"publisher", item.distributor, true); + Scholar.RDF.addStatement(resource, dc+"publisher", item.distributor, true); } // date/year if(item.date) { - model.addStatement(resource, dc+"date", item.date, true); + Scholar.RDF.addStatement(resource, dc+"date", item.date, true); } else if(item.year) { - model.addStatement(resource, dc+"year", item.year, true); + Scholar.RDF.addStatement(resource, dc+"year", item.year, true); } // ISBN/ISSN if(item.ISBN) { - model.addStatement(resource, dc+"identifier", "ISBN "+item.ISBN, true); + Scholar.RDF.addStatement(resource, dc+"identifier", "ISBN "+item.ISBN, true); } else if(item.ISSN) { - model.addStatement(resource, dc+"identifier", "ISSN "+item.ISSN, true); + Scholar.RDF.addStatement(resource, dc+"identifier", "ISSN "+item.ISSN, true); } // callNumber if(item.callNumber) { - model.addStatement(resource, dc+"identifier", item.callNumber, true); + Scholar.RDF.addStatement(resource, dc+"identifier", item.callNumber, true); } // archiveLocation if(item.archiveLocation) { - model.addStatement(resource, dc+"coverage", item.archiveLocation, true); + Scholar.RDF.addStatement(resource, dc+"coverage", item.archiveLocation, true); } } }'); -REPLACE INTO "translators" VALUES ('32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7', '2006-06-30 15:36:00', 2, 'RIS', 'Simon Kornblith', 'ris', -'addOption("exportNotes", true); -addOption("exportFileData", true);', -'function addTag(tag, value) { - if(value) { - write(tag+" - "+value+"\r\n"); +REPLACE INTO "translators" VALUES ('5e3ad958-ac79-463d-812b-a86a9235c28f', '2006-07-15 17:09:00', 1, 'RDF', 'Simon Kornblith', 'rdf', +'Scholar.configure("dataMode", "rdf");', +'function getFirstResults(node, properties, onlyOneString) { + for(var i=0; i 0) return true; + return false; +} + +MARC_Record.prototype.MARC_field = function(rec,tag,ind1,ind2,value) { // new MARC field + this.tag = tag; + this.occ = rec.count_occ(tag)+1; // occurrence order no. + this.ind1 = ind1; if (this.ind1 == '''') this.ind1 = '' ''; + this.ind2 = ind2; if (this.ind2 == '''') this.ind2 = '' ''; + if (tag.substr(0,2) == ''00'') { + this.ind1 = ''''; this.ind2 = ''''; + } + this.value = value; + return this; +} + +MARC_Record.prototype.display = function(type) { // displays record in format type + type = type.toLowerCase(); + if (type == ''binary'') return this.show_leader() + + this.directory + + this.field_terminator + + this.show_fields() + + this.record_terminator; + if (type == ''xml'') { + s = ''''; + s += ''''; + s += ''''+this.show_leader()+''''; + // var i; + for (i=0; i''+this.variable_fields[i].value+''
''; + else { + var subfields = this.variable_fields[i].value.split(this.subfield_delimiter); + // alert(this.variable_fields[i].value+'' ''+subfields.length); // test + if (subfields.length == 1) subfields[1] = ''?''+this.variable_fields[i].value; + var sf = ''''; + for (var j=1; j''+subfields[j].substr(1)+''''; + } + s += '''' + sf + ''''; + } + } + s += ''''; + return s; + } + return false; +} + +MARC_Record.prototype.get_field = function(tag) { // returns an array of values, one for each occurrence + var v = new Array(); var i; + for (i=0; i= this.directory.length) alert(''Internal error!''); + this.directory = this.directory.substr(0,i) + this.directory.substr(i+12); + // updates lengths + this.update_base_address_of_data(); + this.update_displacements(); + this.update_record_length(); + return true; +} + +MARC_Record.prototype._clean = function(value) { + value = value.replace(/^[\s\.\,\/\:]+/, ''''); + value = value.replace(/[\s\.\,\/\:]+$/, ''''); + value = value.replace(/ +/g, '' ''); + + var char1 = value[1]; + var char2 = value[value.length-1]; + if((char1 == "[" && char2 == "]") || (char1 == "(" && char2 == ")")) { + // chop of extraneous characters + return value.substr(1, value.length-2); + } + + return value; +} + +MARC_Record.prototype._associateDBField = function(item, fieldNo, part, fieldName, execMe, arg1, arg2) { + if(!part) { + part = ''a''; + } + var field = this.get_field_subfields(fieldNo); + Scholar.Utilities.debugPrint(''Found ''+field.length+'' matches for ''+fieldNo+part); + if(field) { + for(var i in field) { + var value = false; + for(var j=0; j 1) { + records[0] = holdOver + records[0]; + holdOver = records.pop(); // skip last record, since it''s not done + + for(var i in records) { + var newItem = new Scholar.Item(); + newItem.source = url; + + // create new record + var record = new MARC_Record(); + record.load(records[i], "binary"); + record.translate(newItem); + + newItem.complete(); + } + } else { + holdOver += text; + } } }'); \ No newline at end of file