diff --git a/chrome/chromeFiles/content/scholar/exportOptions.js b/chrome/chromeFiles/content/scholar/exportOptions.js
index 66ad0fa69..cfac3be9f 100644
--- a/chrome/chromeFiles/content/scholar/exportOptions.js
+++ b/chrome/chromeFiles/content/scholar/exportOptions.js
@@ -52,7 +52,7 @@ var Scholar_File_Interface_Export = new function() {
 			var defValue = _options[option];
 			var element = document.getElementById(option);
 
-			if(typeof(defValue) == "bool") {
+			if(typeof(defValue) == "boolean") {
 				if(element.checked == "true") {
 					_options[option] = true;
 				} else {
diff --git a/chrome/chromeFiles/content/scholar/xpcom/translate.js b/chrome/chromeFiles/content/scholar/xpcom/translate.js
index c88ed2ce4..7904608cf 100644
--- a/chrome/chromeFiles/content/scholar/xpcom/translate.js
+++ b/chrome/chromeFiles/content/scholar/xpcom/translate.js
@@ -1,6 +1,4 @@
-// Scholar for Firefox Translate
-// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
-// This code is licensed according to the GPL
+// Scholar for Firefox Translate Engine
 
 /*
  * Scholar.Translate: a class for translation of Scholar metadata from and to
@@ -66,6 +64,7 @@
  *                returned items
  * _storageStream - the storage stream to be used, if one is configured
  * _storageStreamLength - the length of the storage stream
+ * _exportFileDirectory - the directory to which files will be exported
  *
  * WEB-ONLY PRIVATE PROPERTIES:
  *
@@ -648,7 +647,7 @@ Scholar.Translate.prototype._parseDetectCode = function(translator) {
  *
  * dataMode
  *   valid: import, export
- *   options: rdf, text
+ *   options: rdf, block, line
  *   purpose: selects whether write/read behave as standard text functions or
  *            using Mozilla's built-in support for RDF data sources
  *
@@ -669,6 +668,7 @@ Scholar.Translate.prototype._configure = function(option, value) {
  *
  * called as addOption() in detect code
  *
+ * current options are exportNotes and exportFileData
  */
 Scholar.Translate.prototype._addOption = function(option, value) {
 	this._displayOptions[option] = value;
@@ -796,6 +796,45 @@ Scholar.Translate.prototype._closeStreams = function() {
 	this._streams = new Array();
 }
 
+/*
+ * imports an attachment, linking it by URL or importing it from a file on disk
+ */
+Scholar.Translate.prototype._itemImportAttachment = function(attachment, sourceID) {
+	Scholar.debug(attachment);
+
+	if(!attachment.path) {
+		// create from URL
+		if(attachment.url) {
+			var attachmentID = Scholar.Attachments.linkFromURL(attachment.url, sourceID,
+					(attachment.mimeType ? attachment.mimeType : undefined),
+					(attachment.title ? attachment.title : undefined));
+		} else {
+			Scholar.debug("not adding attachment: no path or url specified");
+		}
+	} else {
+		if(attachment.url) {
+			Scholar.debug("not adding attachment: snapshot import not yet implemented");
+		} else {
+			// generate nsIFile
+			var IOService = Components.classes["@mozilla.org/network/io-service;1"].
+							getService(Components.interfaces.nsIIOService);
+			var uri = IOService.newURI(attachment.path, "", null);
+			var file = uri.QueryInterface(Components.interfaces.nsIFileURL).file;
+
+			// import from nsIFile
+			var attachmentID = Scholar.Attachments.importFromFile(file, sourceID);
+			// get attachment item
+			var myAttachmentItem = Scholar.Items.get(attachmentID);
+			if(attachment.title) {
+				// set title
+				myAttachmentItem.setField("title", attachment.title);
+			}
+		}
+	}
+
+	return attachmentID;
+}
+
 /*
  * executed when an item is done and ready to be loaded into the database
  */
@@ -833,6 +872,8 @@ Scholar.Translate.prototype._itemDone = function(item) {
 		var myID = Scholar.Notes.add(item.note);
 		// re-retrieve the item
 		var newItem = Scholar.Items.get(myID);
+	} else if(type == "attachment") {
+		var myID = this._itemImportAttachment(item, null);
 	} else {
 		// create new item
 		var typeID = Scholar.ItemTypes.getID(type);
@@ -911,9 +952,11 @@ Scholar.Translate.prototype._itemDone = function(item) {
 	// handle attachments
 	if(item.attachments) {
 		for each(var attachment in item.attachments) {
-			if(!attachment.url && (this.type != "web" || !attachment.document)) {
-				Scholar.debug("not adding attachment: no URL specified");
-			} else if(this.type == "web") {
+			if(this.type == "web") {
+				if(!attachment.url && !attachment.document) {
+					Scholar.debug("not adding attachment: no URL specified");
+					continue;
+				}
 				if(attachment.downloadable && this._downloadAssociatedFiles) {
 					if(attachment.document) {
 						var attachmentID = Scholar.Attachments.importFromDocument(attachment.document, myID);
@@ -925,9 +968,7 @@ Scholar.Translate.prototype._itemDone = function(item) {
 							attachmentItem.setField("title", attachment.title);
 						}
 					} else {
-						Scholar.Attachments.importFromURL(attachment.url, myID,
-								(attachment.mimeType ? attachment.mimeType : undefined),
-								(attachment.title ? attachment.title : undefined));
+						Scholar.Attachments.importFromURL(attachment.url, myID);
 					}
 				} else {
 					if(attachment.document) {
@@ -945,7 +986,7 @@ Scholar.Translate.prototype._itemDone = function(item) {
 						}
 					}
 				}
 			} else if(this.type == "import") {
-				// TODO
+				this._itemImportAttachment(attachment, myID);
 			}
 		}
 	}
@@ -1173,7 +1214,6 @@ Scholar.Translate.prototype._importConfigureIO = function() {
  * does the actual export, after code has been loaded and parsed
  */
 Scholar.Translate.prototype._export = function() {
-	this._exportConfigureIO();
 
 	// get items
 	if(this.items) {
@@ -1181,6 +1221,7 @@ Scholar.Translate.prototype._export = function() {
 	} else {
 		this._itemsLeft = Scholar.getItems();
 	}
+
 	// run handler for items available
 	this._runHandler("itemCount", this._itemsLeft.length);
 
@@ -1189,6 +1230,45 @@ Scholar.Translate.prototype._export = function() {
 		this._collectionsLeft = Scholar.getCollections();
 	}
 
+	Scholar.debug(this._displayOptions);
+
+	// export file data, if requested
+	if(this._displayOptions["exportFileData"]) {
+		// generate directory
+		var directory = Components.classes["@mozilla.org/file/local;1"].
+		                createInstance(Components.interfaces.nsILocalFile);
+		directory.initWithFile(this.location.parent);
+
+		// get name (file name with its extension stripped)
+		var name = this.location.leafName;
+		var extensionMatch = /^(.*)\.[a-zA-Z0-9]+$/;
+		var m = extensionMatch.exec(name);
+		if(m) {
+			name = m[1];
+		}
+		directory.append(name);
+
+		// create directory
+		directory.create(Components.interfaces.nsIFile.DIRECTORY_TYPE, 0700);
+
+		// generate a new location
+		var originalName = this.location.leafName;
+		this.location = Components.classes["@mozilla.org/file/local;1"].
+		                createInstance(Components.interfaces.nsILocalFile);
+		this.location.initWithFile(directory);
+		this.location.append(originalName);
+
+		// create files directory
+		this._exportFileDirectory = Components.classes["@mozilla.org/file/local;1"].
+		                            createInstance(Components.interfaces.nsILocalFile);
+		this._exportFileDirectory.initWithFile(directory);
+		this._exportFileDirectory.append("files");
+		this._exportFileDirectory.create(Components.interfaces.nsIFile.DIRECTORY_TYPE, 0700);
+	}
+
+	// configure IO
+	this._exportConfigureIO();
+
 	try {
 		this._sandbox.doExport();
 	} catch(e) {
@@ -1229,14 +1309,98 @@ Scholar.Translate.prototype._exportConfigureIO = function() {
 	}
 }
 
+/*
+ * returns export data for an attachment (copying its files if exportFileData is on), or false if it is skipped
+ */
+Scholar.Translate.prototype._exportGetAttachment = function(attachment) {
+	var attachmentArray = new Object();
+
+	var attachmentID = attachment.getID();
+	var linkMode = attachment.getAttachmentLinkMode();
+
+	// get url if one exists
+	if(linkMode == Scholar.Attachments.LINK_MODE_LINKED_URL ||
+	   linkMode == Scholar.Attachments.LINK_MODE_IMPORTED_URL) {
+		var url = attachment.getURL();
+		attachmentArray.url = url;
+	} else if(!this._displayOptions["exportFileData"]) {
+		// only export urls, not files, if exportFileData is off
+		return false;
+	}
+	// add item ID
+	attachmentArray.itemID = attachmentID;
+	// get title
+	attachmentArray.title = attachment.getField("title");
+	// get mime type
+	attachmentArray.mimeType = attachment.getAttachmentMimeType();
+
+	if(linkMode != Scholar.Attachments.LINK_MODE_LINKED_URL &&
+	   this._displayOptions["exportFileData"]) {
+		// add path and filename if not an internet link
+		attachmentArray.path = "files/"+attachmentID+"/";
+		var file = attachment.getFile();
+		attachmentArray.filename = file.leafName;
+
+		if(linkMode == Scholar.Attachments.LINK_MODE_LINKED_FILE) {
+			// create a new directory
+			var directory = Components.classes["@mozilla.org/file/local;1"].
+			                createInstance(Components.interfaces.nsILocalFile);
+			directory.initWithFile(this._exportFileDirectory);
+			directory.append(attachmentID);
+			directory.create(Components.interfaces.nsIFile.DIRECTORY_TYPE, 0700);
+			// copy file
+			file.copyTo(directory, attachmentArray.filename);
+		} else {
+			// copy imported files from the Scholar directory
+			var directory = Scholar.getStorageDirectory();
+			directory.append(attachmentID);
+			directory.copyTo(this._exportFileDirectory, attachmentID);
+		}
+	}
+
+	Scholar.debug(attachmentArray);
+
+	return attachmentArray;
+}
+
 /*
  * gets the next item to process (called as Scholar.nextItem() from code)
  */
 Scholar.Translate.prototype._exportGetItem = function() {
 	if(this._itemsLeft.length != 0) {
 		var returnItem = this._itemsLeft.shift();
+
+		// skip files if exportFileData is off, or if the file isn't standalone
+		if(returnItem.isAttachment() &&
+		   (!this._displayOptions["exportFileData"] ||
+		   returnItem.getSource())) {
+			return this._exportGetItem();
+		}
+
+		// export file data for single files
+		if(returnItem.isAttachment()) {		// an independent attachment
+			var returnItemArray = this._exportGetAttachment(returnItem);
+			returnItemArray.itemType = "attachment";
+			return returnItemArray;
+		} else {
+			var returnItemArray = returnItem.toArray();
+			// get attachments, although only urls will be passed if exportFileData
+			// is off
+			returnItemArray.attachments = new Array();
+			var attachments = returnItem.getAttachments();
+			for each(var attachmentID in attachments) {
+				var attachment = Scholar.Items.get(attachmentID);
+				var attachmentInfo = this._exportGetAttachment(attachment);
+
+				if(attachmentInfo) {
+					returnItemArray.attachments.push(attachmentInfo);
+				}
+			}
+		}
+
 		this._runHandler("itemDone", returnItem);
-		return returnItem.toArray();
+
+		return returnItemArray;
 	}
 
 	return false;
diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js
index e4f027f16..e5476a496 100644
--- a/chrome/chromeFiles/content/scholar/xpcom/utilities.js
+++ b/chrome/chromeFiles/content/scholar/xpcom/utilities.js
@@ -254,10 +254,19 @@ Scholar.Utilities.Ingester.prototype.parseContextObject = function(co, item) {
 Scholar.Utilities.Ingester.prototype.loadDocument = function(url, succeeded, failed) {
 	this.processDocuments([ url ], succeeded, null, failed);
 }
+
+Scholar.Utilities.Ingester._protocolRe = new RegExp();
+Scholar.Utilities.Ingester._protocolRe.compile("^(?:(?:http|https|ftp):|[^:]*/)", "i");
 Scholar.Utilities.Ingester.prototype.processDocuments = function(urls, processor, done, exception) {
 	if(this.translate.locationIsProxied) {
 		for(i in urls) {
-			urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]);
+			if(this.translate.locationIsProxied) {
+				urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]);
+			}
+			// check for a protocol colon
+			if(!Scholar.Utilities.Ingester._protocolRe.test(urls[i])) {
+				throw("invalid URL in processDocuments");
+			}
 		}
 	}
 
@@ -282,6 +291,9 @@ Scholar.Utilities.Ingester.HTTP.prototype.doGet = function(url, onDone) {
 	if(this.translate.locationIsProxied) {
 		url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
 	}
+	if(!Scholar.Utilities.Ingester._protocolRe.test(url)) {
+		throw("invalid URL in doGet");
+	}
 
 	var translate = this.translate;
 	Scholar.Utilities.HTTP.doGet(url, function(xmlhttp) {
@@ -298,6 +310,9 @@ Scholar.Utilities.Ingester.HTTP.prototype.doPost = function(url, body, onDone) {
 	if(this.translate.locationIsProxied) {
 		url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
 	}
+	if(!Scholar.Utilities.Ingester._protocolRe.test(url)) {
+		throw("invalid URL in doPost");
+	}
 
 	var translate = this.translate;
 	Scholar.Utilities.HTTP.doPost(url, body, function(xmlhttp) {
diff --git a/scrapers.sql b/scrapers.sql
index 100ff8df4..e450e99de 100644
--- a/scrapers.sql
+++ b/scrapers.sql
@@ -1,4 +1,4 @@
--- 50
+-- 51
 
 -- Set the following timestamp to the most recent scraper update date
 REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-15 15:42:00'));
@@ -3548,12 +3548,39 @@ function generateCollection(collection) {
 			Scholar.RDF.addStatement(collectionResource, n.dcterms+"hasPart", "#collection:"+child.id, false);
 			// do recursive processing of collections
 			generateCollection(child);
-		} else {
+		} else if(itemResources[child.id]) {
 			Scholar.RDF.addStatement(collectionResource, n.dcterms+"hasPart", itemResources[child.id], false);
 		}
 	}
 }
 
+function handleAttachment(attachmentResource, attachment) {
+	Scholar.RDF.addStatement(attachmentResource, rdf+"type", n.fs+"File", false);
+
+	if(attachment.url) {
+		// add url as identifier
+		var term = Scholar.RDF.newResource();
+		// set term type
+		Scholar.RDF.addStatement(term, rdf+"type", n.dcterms+"URI", false);
+		// set url value
+		Scholar.RDF.addStatement(term, rdf+"value", attachment.url, true);
+		// add relationship to resource
+		Scholar.RDF.addStatement(attachmentResource, n.dc+"identifier", term, false);
+	}
+
+	// add mime type
+	var term = Scholar.RDF.newResource();
+	// set term type
+	Scholar.RDF.addStatement(term, rdf+"type", n.dcterms+"IMT", false);
+	// set mime type value
+	Scholar.RDF.addStatement(term, rdf+"value", attachment.mimeType, true);
+	// add relationship to resource
+	Scholar.RDF.addStatement(attachmentResource, n.dc+"format", term, false);
+
+	// add title
+	Scholar.RDF.addStatement(attachmentResource, n.dc+"title", attachment.title, true);
+}
+
 function doExport() {
 	rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
 
@@ -3563,7 +3590,8 @@ function doExport() {
 		dcterms:"http://purl.org/dc/terms/",
 		prism:"http://prismstandard.org/namespaces/1.2/basic/",
 		foaf:"http://xmlns.com/foaf/0.1/",
-		vcard:"http://nwalsh.com/rdf/vCard"
+		vcard:"http://nwalsh.com/rdf/vCard#",
+		fs:"http://chnm.gmu.edu/firefoxscholar/rdf#"
 	};
 
 	// add namespaces
@@ -3584,7 +3612,10 @@ function doExport() {
 	while(item = Scholar.nextItem()) {
 		items.push(item);
 
-		if(item.ISBN && !usedResources["urn:isbn:"+item.ISBN]) {
+		if(item.itemType == "attachment" && item.path) {
+			// file is stored locally (paths are always unique)
+			itemResources[item.itemID] = item.path+item.filename;
+		} else if(item.ISBN && !usedResources["urn:isbn:"+item.ISBN]) {
 			itemResources[item.itemID] = "urn:isbn:"+item.ISBN;
 			usedResources[itemResources[item.itemID]] = true;
 		} else if(item.url && !usedResources[item.url]) {
@@ -3598,6 +3629,20 @@ function doExport() {
 		for(var j in item.notes) {
 			itemResources[item.notes[j].itemID] = "#item:"+item.notes[j].itemID;
 		}
+
+		for each(var attachment in item.attachments) {
+			if(attachment.path) {
+				// file is stored locally (paths are always unique)
+				itemResources[attachment.itemID] = attachment.path+attachment.filename;
+			} else if(!usedResources[attachment.url]) {
+				// file is referenced via url, and no other item has this url
+				itemResources[attachment.itemID] = attachment.url;
+				usedResources[attachment.url] = true;
+			} else {
+				// just specify a node ID
+				itemResources[attachment.itemID] = "#item:"+attachment.itemID;
+			}
+		}
 	}
 
 	for each(item in items) {
@@ -3650,6 +3695,9 @@ function doExport() {
 			if(!Scholar.getOption("exportNotes")) {
 				continue;
 			}
+		} else if(item.itemType == "attachment") {
+			handleAttachment(resource, item);
+			continue;
 		}
 		if(type) {
 			Scholar.RDF.addStatement(resource, rdf+"type", n.bib+type, false);
@@ -3692,6 +3740,18 @@ function doExport() {
 			Scholar.RDF.addStatement(resource, n.dc+"source", item.source, true);
 		}
 
+		// url
+		if(item.url) {
+			// add url as identifier
+			var term = Scholar.RDF.newResource();
+			// set term type
+			Scholar.RDF.addStatement(term, rdf+"type", n.dcterms+"URI", false);
+			// set url value
+			Scholar.RDF.addStatement(term, rdf+"value", item.url, true);
+			// add relationship to resource
+			Scholar.RDF.addStatement(resource, n.dc+"identifier", term, false);
+		}
+
 		// accessionNumber as generic ID
 		if(item.accessionNumber) {
 			Scholar.RDF.addStatement(resource, n.dc+"identifier", item.accessionNumber, true);
@@ -3745,7 +3805,7 @@ function doExport() {
 		}
 
 		// publication gets linked to container via isPartOf
-		if(item.publication) {
+		if(item.publicationTitle) {
 			Scholar.RDF.addStatement((containerElement ? containerElement : resource), n.dc+"title", item.publicationTitle, true);
 		}
 
@@ -3860,6 +3920,14 @@ function doExport() {
 			}
 		}
 
+		/** FILES **/
+
+		for each(var attachment in item.attachments) {
+			var attachmentResource = itemResources[attachment.itemID];
+			Scholar.RDF.addStatement(resource, n.dc+"relation", attachmentResource, false);
+			handleAttachment(attachmentResource, attachment);
+		}
+
 		/** TAGS **/
 
 		for(var j in item.tags) {
@@ -4048,6 +4116,54 @@ function handleCreators(newItem, creators, creatorType) {
 	}
 }
 
+// gets attachment info
+function handleAttachment(node, attachment) {
+	if(!attachment) {
+		attachment = new Array();
+	}
+
+	attachment.title = getFirstResults(node, [n.dc+"title"], true);
+
+	var identifiers = getFirstResults(node, [n.dc+"identifier"]);
+	for each(var identifier in identifiers) {
+		if(typeof(identifier) != "string") {
+			var identifierType = Scholar.RDF.getTargets(identifier, rdf+"type");
+			if(identifierType) {
+				identifierType = Scholar.RDF.getResourceURI(identifierType[0]);
+
+				if(identifierType == n.dcterms+"URI") {	// uri is url
+					attachment.url = getFirstResults(identifier, [rdf+"value"], true);
+				}
+			}
+		}
+	}
+
+	var formats = getFirstResults(node, [n.dc+"format"]);
+	for each(var format in formats) {
+		if(typeof(format) != "string") {
+			var formatType = Scholar.RDF.getTargets(format, rdf+"type");
+			if(formatType) {
+				formatType = Scholar.RDF.getResourceURI(formatType[0]);
+
+				if(formatType == n.dcterms+"IMT") {	// IMT is the MIME type
+					attachment.mimeType = getFirstResults(format, [rdf+"value"], true);
+				}
+			}
+		}
+	}
+
+	var stringNode = node;
+	if(typeof(stringNode) != "string") {
+		stringNode = Scholar.RDF.getResourceURI(stringNode);
+	}
+	if(stringNode.substr(0, 8) == "file:///") {
+		// not a protocol specifier; we have a path name
+		attachment.path = stringNode;
+	}
+
+	return attachment;
+}
+
 // processes collections recursively
 function processCollection(node, collection) {
 	if(!collection) {
@@ -4104,7 +4220,8 @@ function doImport() {
 		dcterms:"http://purl.org/dc/terms/",
 		prism:"http://prismstandard.org/namespaces/1.2/basic/",
 		foaf:"http://xmlns.com/foaf/0.1/",
-		vcard:"http://nwalsh.com/rdf/vCard"
+		vcard:"http://nwalsh.com/rdf/vCard#",
+		fs:"http://chnm.gmu.edu/firefoxscholar/rdf#"
 	};
 
 	callNumberTypes = [
@@ -4165,8 +4282,6 @@ function doImport() {
 			} else if(type == n.bib+"Memo") {
 				// check to see if this note is independent
 				var arcs = Scholar.RDF.getArcsIn(node);
-				Scholar.Utilities.debug("working on a note");
-				Scholar.Utilities.debug(arcs);
 				var skip = false;
 				for each(var arc in arcs) {
 					arc = Scholar.RDF.getResourceURI(arc);
@@ -4184,6 +4299,19 @@ function doImport() {
 				// skip collections until all the items are done
 				collections.push(node);
 				continue;
+			} else if(type == n.fs+"File") {
+				// check to see if file is independent
+				var arcs = Scholar.RDF.getArcsIn(node);
+				if(arcs.length) {
+					continue;
+				}
+
+				// process as file
+				newItem.itemType = "attachment";
+				handleAttachment(node, newItem);
+				Scholar.Utilities.debug(newItem);
+				newItem.complete();
+				continue;
 			} else {	// default to book
 				newItem.itemType = "book";
 			}
@@ -4361,13 +4489,25 @@ function doImport() {
 				}
 			}
 		}
+
+		/* ATTACHMENTS */
+		var relations = getFirstResults(node, [n.dc+"relation"]);
+		for each(var relation in relations) {
+			var type = Scholar.RDF.getTargets(relation, rdf+"type");
+			if(type) {
+				type = Scholar.RDF.getResourceURI(type[0]);
+				if(type == n.fs+"File") {
+					newItem.attachments.push(handleAttachment(relation));
+				}
+			}
+		}
 
 		newItem.complete();
 	}
 
 	/* COLLECTIONS */
 
-	for each(collection in collections) {
+	for each(var collection in collections) {
 		if(!Scholar.RDF.getArcsIn(collection)) {
 			var newCollection = new Scholar.Collection();
 			processCollection(collection, newCollection);