diff --git a/chrome/chromeFiles/content/scholar/xpcom/translate.js b/chrome/chromeFiles/content/scholar/xpcom/translate.js index 80b9cc8cb..232a9fbfd 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/translate.js +++ b/chrome/chromeFiles/content/scholar/xpcom/translate.js @@ -346,7 +346,8 @@ Scholar.Translate.prototype._exportConfigureIO = function() { var RDFService = Components.classes['@mozilla.org/rdf/rdf-service;1'].getService(Components.interfaces.nsIRDFService); var IOService = Components.classes['@mozilla.org/network/io-service;1'].getService(Components.interfaces.nsIIOService); var AtomService = Components.classes["@mozilla.org/atom-service;1"].getService(Components.interfaces.nsIAtomService); - + var RDFContainerUtils = Components.classes["@mozilla.org/rdf/container-utils;1"].getService(Components.interfaces.nsIRDFContainerUtils); + // create data source var dataSource = Components.classes["@mozilla.org/rdf/datasource;1?name=xml-datasource"]. createInstance(Components.interfaces.nsIRDFDataSource); @@ -360,16 +361,66 @@ Scholar.Translate.prototype._exportConfigureIO = function() { // writes an RDF triple this._sandbox.model.addStatement = function(about, relation, value, literal) { + Scholar.debug("pre: model.addStatement("+about+", "+relation+", "+value+", "+literal+")"); + if(!(about instanceof Components.interfaces.nsIRDFResource)) { about = RDFService.GetResource(about); } - dataSource.Assert(about, RDFService.GetResource(relation), - (literal ? RDFService.GetLiteral(value) : RDFService.GetResource(value)), true); + if(!(value instanceof Components.interfaces.nsIRDFResource)) { + if(literal) { + value = RDFService.GetLiteral(value); + } else { + value = RDFService.GetResource(value); + } + } + + Scholar.debug("post: model.addStatement("+about+", "+relation+", "+value+", "+literal+")"); + + dataSource.Assert(about, RDFService.GetResource(relation), value, true); } // creates an anonymous resource this._sandbox.model.newResource = function() { return RDFService.GetAnonymousResource() }; + // creates a new container + this._sandbox.model.newContainer = function(type, about) { + if(!(about instanceof Components.interfaces.nsIRDFResource)) { + about = RDFService.GetResource(about); + } + + type = type.toLowerCase(); + if(type == "bag") { + return RDFContainerUtils.MakeBag(dataSource, about); + } else if(type == "seq") { + return RDFContainerUtils.MakeSeq(dataSource, about); + } else if(type == "alt") { + return RDFContainerUtils.MakeAlt(dataSource, about); + } else { + throw "Invalid container type in model.newContainer"; + } + } + + // adds a new container (index optional) + this._sandbox.model.addContainerElement = function(about, element, index) { + if(!(about instanceof Components.interfaces.nsIRDFContainer)) { + if(!(about instanceof Components.interfaces.nsIRDFResource)) { + about = RDFService.GetResource(about); + } + var container = Components.classes["@mozilla.org/rdf/container;1"]. + createInstance(Components.interfaces.nsIRDFContainer); + container.Init(dataSource, about); + } + if(!(element instanceof Components.interfaces.nsIRDFResource)) { + element = RDFService.GetResource(element); + } + + if(index) { + about.InsertElementAt(element, index, true); + } else { + about.AppendElement(element); + } + } + // sets a namespace this._sandbox.model.addNamespace = function(prefix, uri) { serializer.addNameSpace(AtomService.getAtom(prefix), uri); @@ -379,7 +430,7 @@ Scholar.Translate.prototype._exportConfigureIO = function() { serializer.QueryInterface(Components.interfaces.nsIRDFXMLSource); serializer.Serialize(foStream); - delete dataSource, RDFService, IOService, AtomService; + delete dataSource, RDFService, IOService, AtomService, RDFContainerUtils; }); } else { /*** FUNCTIONS ***/ diff --git a/scrapers.sql b/scrapers.sql index 328d02313..241e6be27 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -1,7 +1,7 @@ --- 32 +-- 33 -- Set the following timestamp to the most recent scraper update date -REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-07-05 23:40:00')); +REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-07-07 12:44:00')); REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-28 23:08:00', 3, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', 'if(doc.location.href.indexOf("search") >= 0) { @@ -2465,8 +2465,7 @@ utilities.processDocuments(browser, null, newUris, function(newBrowser) { wait();'); REPLACE INTO "translators" VALUES ('0e2235e7-babf-413c-9acf-f27cce5f059c', '2006-07-05 23:40:00', 2, 'MODS (XML)', 'Simon Kornblith', 'xml', -'configure("getCollections", true); -addOption("exportNotes", true); +'addOption("exportNotes", true); addOption("exportFileData", true);', 'var partialItemTypes = ["bookSection", "journalArticle", "magazineArticle", "newspaperArticle"]; var rdf = new Namespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"); @@ -2515,7 +2514,7 @@ function generateSeeAlso(id, seeAlso, rdfDoc) { } function translate(items, collections) { - var rdfDoc = ; + //var rdfDoc = ; var modsCollection = ; for(var i in items) { @@ -2772,9 +2771,9 @@ function translate(items, collections) { mods.note += note; // Add see also info to RDF - if(item.notes[j].seeAlso) { + /*if(item.notes[j].seeAlso) { rdfDoc.Description += generateSeeAlso(item.notes[j].itemID, item.notes[j].seeAlso); - } + }*/ } if(item.note) { @@ -2793,30 +2792,359 @@ function translate(items, collections) { /** RDF STRUCTURE **/ // Add see also info to RDF - if(item.seeAlso) { + /*if(item.seeAlso) { generateSeeAlso(item.itemID, item.seeAlso, rdfDoc); - } + }*/ modsCollection.mods += mods; } - for(var i in collections) { + /*for(var i in collections) { generateCollection(collections[i], rdfDoc); } - modsCollection.rdf::RDF = rdfDoc; + modsCollection.rdf::RDF = rdfDoc;*/ write(''''+"\n"); write(modsCollection.toXMLString()); }'); -REPLACE INTO "translators" VALUES ('6e372642-ed9d-4934-b5d1-c11ac758ebb7', '2006-07-05 23:40:00', 2, 'Dublin Core (RDF/XML)', 'Simon Kornblith', 'xml', +REPLACE INTO "translators" VALUES ('14763d24-8ba0-45df-8f52-b8d1108e7ac9', '2006-07-07 12:44:00', 2, 'Biblio/DC/FOAF/PRISM/VCard (RDF/XML)', 'Simon Kornblith', 'rdf', +'configure("getCollections", true); +configure("dataMode", "rdf");', +'function generateSeeAlso(resource, seeAlso) { + for(var i in seeAlso) { + model.addStatement(resource, n.dc+"relation", itemResources[seeAlso[i]], false); + } +} + +function generateCollection(collection) { + var collectionResource = "#collection:"+collection.id; + model.addStatement(collectionResource, rdf+"type", n.bib+"Collection", false); + + for(var i in collection.children) { + var child = collection.children[i]; + + // add child list items + if(child.type == "collection") { + model.addStatement(collectionResource, n.dc+"hasPart", "#collection:"+child.id, false); + // do recursive processing of collections + generateCollection(child); + } else { + model.addStatement(collectionResource, n.dc+"hasPart", itemResources[child.id], false); + } + } +} + +function getContainerIfExists() { + if(container) { + if(containerElement) { + return containerElement; + } else { + containerElement = model.newResource(); + // attach container to section (if exists) or resource + model.addStatement((section ? section : resource), n.dcterms+"isPartOf", containerElement, false); + return containerElement; + } + } else { + return resource; + } +} + +function translate(items, collections) { + rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; + + n = { + bib:"http://purl.org/net/biblio#", + dc:"http://purl.org/dc/elements/1.1/", + dcterms:"http://purl.org/dc/terms/", + prism:"http://prismstandard.org/namespaces/1.2/basic/", + foaf:"http://xmlns.com/foaf/0.1/", + vcard:"http://nwalsh.com/rdf/vCard" + }; + + // add namespaces + for(var i in n) { + model.addNamespace(i, n[i]); + } + + // leave as global + itemResources = new Array(); + + // first, map each ID to a resource + for(var i in items) { + item = items[i]; + + if(item.ISBN) { + itemResources[item.itemID] = "urn:isbn:"+item.ISBN; + } else if(item.url) { + itemResources[item.itemID] = item.url; + } else { + // just specify a node ID + itemResources[item.itemID] = "#item:"+item.itemID; + } + + for(var j in item.notes) { + itemResources[item.notes[j].itemID] = "#item:"+item.notes[j].itemID; + } + } + + for(var i in items) { + // these items are global + item = items[i]; + resource = itemResources[item.itemID]; + + container = null; + containerElement = null; + section = null; + + /** CORE FIELDS **/ + + // title + if(item.title) { + model.addStatement(resource, n.dc+"title", item.title, true); + } + + // type + var type = null; + if(item.itemType == "book") { + type = "Book"; + } else if (item.itemType == "bookSection") { + type = "BookSection"; + container = "Book"; + } else if(item.itemType == "journalArticle") { + type = "Article"; + container = "Journal"; + } else if(item.itemType == "magazineArticle") { + type = "Article"; + container = "Periodical"; + } else if(item.itemType == "newspaperArticle") { + type = "Article"; + container = "Newspaper"; + } else if(item.itemType == "thesis") { + type = "Thesis"; + } else if(item.itemType == "letter") { + type = "Letter"; + } else if(item.itemType == "manuscript") { + type = "Manuscript"; + } else if(item.itemType == "interview") { + type = "Interview"; + } else if(item.itemType == "film") { + type = "MotionPicture"; + } else if(item.itemType == "artwork") { + type = "Illustration"; + } else if(item.itemType == "website") { + type = "Document"; + } else if(item.itemType == "note") { + type = "Memo"; + } + if(type) { + model.addStatement(resource, rdf+"type", n.bib+type, false); + } + + // authors/editors/contributors + var creatorContainers = new Object(); + for(var j in item.creators) { + var creator = model.newResource(); + model.addStatement(creator, rdf+"type", n.foaf+"Person", false); + // gee. an entire vocabulary for describing people, and these aren''t even + // standardized in it. oh well. using them anyway. + model.addStatement(creator, n.foaf+"surname", item.creators[j].lastName, true); + model.addStatement(creator, n.foaf+"givenname", item.creators[j].firstName, true); + + // in addition, these tags are not yet in Biblio, but Bruce D''Arcus + // says they will be. + if(item.creators[j].creatorType == "author") { + var cTag = "authors"; + } else if(item.creators[j].creatorType == "editor") { + var cTag = "editors"; + } else { + var cTag = "contributors"; + } + + if(!creatorContainers[cTag]) { + var creatorResource = model.newResource(); + // create new seq for author type + creatorContainers[cTag] = model.newContainer("seq", creatorResource); + // attach container to resource + model.addStatement(resource, n.bib+cTag, creatorResource, false); + } + model.addContainerElement(creatorContainers[cTag], creator, true); + } + + /** FIELDS ON NEARLY EVERYTHING BUT NOT A PART OF THE CORE **/ + + // source + if(item.source) { + model.addStatement(resource, n.dc+"source", item.source, true); + } + + // accessionNumber as generic ID + if(item.accessionNumber) { + model.addStatement(resource, n.dc+"identifier", item.accessionNumber, true); + } + + // rights + if(item.rights) { + model.addStatement(resource, n.dc+"rights", item.rights, true); + } + + /** SUPPLEMENTAL FIELDS **/ + + // use section to set up another container element + if(item.section) { + section = model.newResource(); // leave as global + // set section type + model.addStatement(section, rdf+"type", n.bib+"Part", false); + // set section title + model.addStatement(section, n.dc+"title", item.section, true); + // add relationship to resource + model.addStatement(resource, n.dc+"isPartOf", section, false); + } + // use ISSN to set up container element + if(item.ISSN) { + containerElement = "urn:issn:"+item.ISSN; // leave as global + // attach container to section (if exists) or resource + model.addStatement((section ? section : resource), n.dcterms+"isPartOf", containerElement, false); + } + + // publication gets linked to container via isPartOf + if(item.publication) { + model.addStatement(getContainerIfExists(), n.dc+"title", item.publication, true); + } + + // series also linked in + if(item.series) { + var series = model.newResource(); + // set series type + model.addStatement(series, rdf+"type", n.bib+"Series", false); + // set series title + model.addStatement(series, n.dc+"title", item.series, true); + // add relationship to resource + model.addStatement(getContainerIfExists(), n.dcterms+"isPartOf", series, false); + } + + // volume + if(item.volume) { + model.addStatement(getContainerIfExists(), n.prism+"volume", item.volume, true); + } + // number + if(item.number) { + model.addStatement(getContainerIfExists(), n.prism+"number", item.number, true); + } + // edition + if(item.edition) { + model.addStatement(resource, n.prism+"edition", item.edition, true); + } + // publisher/distributor and place + if(item.publisher || item.distributor || item.place) { + var organization = model.newResource(); + // set organization type + model.addStatement(organization, rdf+"type", n.foaf+"Organization", false); + // add relationship to resource + model.addStatement(resource, n.dc+"publisher", organization, false); + // add publisher/distributor + if(item.publisher) { + model.addStatement(organization, n.foaf+"name", item.publisher, true); + } else if(item.distributor) { + model.addStatement(organization, n.foaf+"name", item.distributor, true); + } + // add place + if(item.place) { + var address = model.newResource(); + // set address type + model.addStatement(address, rdf+"type", n.vcard+"Address", false); + // set address locality + model.addStatement(address, n.vcard+"locality", item.place, true); + // add relationship to organization + model.addStatement(organization, n.vcard+"adr", address, false); + } + } + // date/year + if(item.date) { + model.addStatement(resource, n.dc+"date", item.date, true); + } else if(item.year) { + model.addStatement(resource, n.dc+"year", item.year, true); + } + + // callNumber + if(item.callNumber) { + var term = model.newResource(); + // set term type + model.addStatement(term, rdf+"type", n.dcterms+"LCC", false); + // set callNumber value + model.addStatement(term, rdf+"value", item.callNumber, true); + // add relationship to resource + model.addStatement(resource, n.dc+"subject", term, false); + } + + // archiveLocation + if(item.archiveLocation) { + model.addStatement(resource, n.dc+"coverage", item.archiveLocation, true); + } + + // medium + if(item.medium) { + model.addStatement(resource, n.dc+"medium", item.medium, true); + } + + // type (not itemType) + if(item.type) { + model.addStatement(resource, n.dc+"type", item.type, true); + } else if(item.thesisType) { + model.addStatement(resource, n.dc+"type", item.thesisType, true); + } + + // THIS IS NOT YET IN THE BIBLIO NAMESPACE, BUT BRUCE D''ARCUS HAS SAID + // IT WILL BE SOON + if(item.pages) { + model.addStatement(resource, n.bib+"pages", item.pages, true); + } + + /** NOTES **/ + + for(var j in item.notes) { + var noteResource = itemResources[item.notes[j].itemID]; + + // add note tag + model.addStatement(noteResource, rdf+"type", n.bib+"Memo", false); + // add note description (sorry, couldn''t find a better way of + // representing this data in an existing ontology) + model.addStatement(noteResource, n.dc+"description", item.notes[j].note, true); + // add relationship between resource and note + model.addStatement(resource, n.dcterms+"isReferencedBy", noteResource, false); + + // Add see also info to RDF + generateSeeAlso(item.notes[j].itemID, item.notes[j].seeAlso); + } + + if(item.note) { + model.addStatement(resource, n.dc+"description", item.note, true); + } + + /** TAGS **/ + + for(var j in item.tags) { + model.addStatement(resource, n.dc+"subject", item.tags[j], true); + } + + // Add see also info to RDF + generateSeeAlso(item.itemID, item.seeAlso); + + // ELEMENTS AMBIGUOUSLY ENCODED: callNumber, acccessionType + } + + /** RDF COLLECTION STRUCTURE **/ + for(var i in collections) { + generateCollection(collections[i]); + } +}'); + +REPLACE INTO "translators" VALUES ('6e372642-ed9d-4934-b5d1-c11ac758ebb7', '2006-07-05 23:40:00', 2, 'Unqualified Dublin Core (RDF/XML)', 'Simon Kornblith', 'rdf', 'configure("dataMode", "rdf");', 'function translate(items) { var partialItemTypes = ["bookSection", "journalArticle", "magazineArticle", "newspaperArticle"]; - var dcterms = "http://purl.org/dc/terms/"; var dc = "http://purl.org/dc/elements/1.1/"; - model.addNamespace("dcterms", dcterms); model.addNamespace("dc", dc); for(var i in items) { @@ -2843,21 +3171,15 @@ REPLACE INTO "translators" VALUES ('6e372642-ed9d-4934-b5d1-c11ac758ebb7', '2006 /** CORE FIELDS **/ - // XML tag titleInfo; object field title - model.addStatement(resource, dc+"title", item.title, true); - - // XML tag typeOfResource/genre; object field type - var type; - if(item.itemType == "film") { - type = "MovingImage"; - } else if(item.itemType == "artwork") { - type = "StillImage"; - } else { - type = "Text"; + // title + if(item.title) { + model.addStatement(resource, dc+"title", item.title, true); } - model.addStatement(resource, dc+"type", "http://purl.org/dc/dcmitype/"+type, false); - // XML tag name; object field creators + // type + model.addStatement(resource, dc+"type", item.itemType, true); + + // creators for(var j in item.creators) { // put creators in lastName, firstName format (although DC doesn''t specify) var creator = item.creators[j].lastName; @@ -2891,19 +3213,8 @@ REPLACE INTO "translators" VALUES ('6e372642-ed9d-4934-b5d1-c11ac758ebb7', '2006 /** SUPPLEMENTAL FIELDS **/ - // publication/series -> isPartOf - if(item.publication) { - model.addStatement(resource, dcterms+"isPartOf", item.publication, true); - } else if(item.series) { - model.addStatement(resource, dcterms+"isPartOf", item.series, true); - } - // TODO - create text citation and OpenURL citation to handle volume, number, pages, issue, place - // edition - if(item.edition) { - model.addStatement(resource, dcterms+"hasVersion", item.edition, true); - } // publisher/distributor if(item.publisher) { model.addStatement(resource, dc+"publisher", item.publisher, true); @@ -2918,18 +3229,10 @@ REPLACE INTO "translators" VALUES ('6e372642-ed9d-4934-b5d1-c11ac758ebb7', '2006 } // ISBN/ISSN - var identifier = false; if(item.ISBN) { - identifier = "urn:isbn:"+item.ISBN; + model.addStatement(resource, dc+"identifier", "ISBN "+item.ISBN, true); } else if(item.ISSN) { - identifier = "urn:issn:"+item.ISSN; - } - if(identifier) { - if(isPartialItem) { - model.addStatement(resource, dc+"isPartOf", identifier, false); - } else { - model.addStatement(resource, dc+"identifier", identifier, false); - } + model.addStatement(resource, dc+"identifier", "ISSN "+item.ISSN, true); } // callNumber @@ -2944,7 +3247,6 @@ REPLACE INTO "translators" VALUES ('6e372642-ed9d-4934-b5d1-c11ac758ebb7', '2006 } }'); - REPLACE INTO "translators" VALUES ('32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7', '2006-06-30 15:36:00', 2, 'RIS', 'Simon Kornblith', 'ris', 'addOption("exportNotes", true); addOption("exportFileData", true);',