diff --git a/chrome/chromeFiles/content/scholar/fileInterface.js b/chrome/chromeFiles/content/scholar/fileInterface.js
new file mode 100644
index 000000000..e20999408
--- /dev/null
+++ b/chrome/chromeFiles/content/scholar/fileInterface.js
@@ -0,0 +1,41 @@
+/*
+ * Scholar_File_Interface: global singleton exposing exportFile(), which
+ * runs a Scholar.Translate export to a file chosen in a save dialog
+ */
+Scholar_File_Interface = new function() {
+	this.exportFile = exportFile;
+
+	/*
+	 * Creates Scholar.Translate instance and shows file picker for file export
+	 *
+	 * One file picker filter (label, target) is appended per translator
+	 * returned by getTranslators(); the picker's filterIndex is then used to
+	 * look up the translator to run. Nothing is done if the dialog is
+	 * cancelled.
+	 */
+	function exportFile() {
+		var translation = new Scholar.Translate("export");
+		var translators = translation.getTranslators();
+
+		const nsIFilePicker = Components.interfaces.nsIFilePicker;
+		var fp = Components.classes["@mozilla.org/filepicker;1"]
+				.createInstance(nsIFilePicker);
+		fp.init(window, "Export", nsIFilePicker.modeSave);
+		for(var i in translators) {
+			fp.appendFilter(translators[i].label, translators[i].target);
+		}
+		var rv = fp.show();
+		// In save mode show() returns returnReplace, not returnOK, when the
+		// user picks an existing file and confirms overwriting it; both
+		// results mean the export should go ahead
+		if (rv == nsIFilePicker.returnOK || rv == nsIFilePicker.returnReplace) {
+			translation.setLocation(fp.file);
+			translation.setTranslator(translators[fp.filterIndex]);
+			// NOTE(review): the "done" handler is taken from
+			// Scholar_Ingester_Interface, not from this object - confirm
+			// exportDone is defined there
+			translation.setHandler("done", Scholar_Ingester_Interface.exportDone);
+			translation.translate();
+		}
+	}
+}
\ No newline at end of file
diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js index 5ec9ed08e..a4be47d59 100644 --- a/chrome/chromeFiles/content/scholar/ingester/browser.js +++ b/chrome/chromeFiles/content/scholar/ingester/browser.js @@ -26,7 +26,7 @@ Scholar_Ingester_Interface.init = function() { Scholar_Ingester_Interface.browsers = new Array(); Scholar_Ingester_Interface.browserDocuments = new Object(); Scholar_Ingester_Interface.browserUris = new Array(); - Scholar_Ingester_Interface._scrapePopupShowing = new Array(); + Scholar_Ingester_Interface._scrapePopupShowing = false; Scholar.Ingester.ProxyMonitor.init(); window.addEventListener("load", Scholar_Ingester_Interface.chromeLoad, false); diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js 
b/chrome/chromeFiles/content/scholar/xpcom/ingester.js index 6195edb0e..0082f36ac 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/ingester.js +++ b/chrome/chromeFiles/content/scholar/xpcom/ingester.js @@ -41,8 +41,7 @@ Scholar.Ingester.ingestURL = function(url, complete, error, myWindow) { var succeeded = function(browser) { var myDoc = new Scholar.Ingester.Document(browser, myWindow, isHidden); - myDoc.retrieveScraper(); - if(myDoc.scraper) { + if(myDoc.retrieveTranslator()) { myDoc.scrapePage(function(myDoc) { Scholar.Ingester.deleteHiddenBrowser(browser); complete(myDoc); @@ -241,7 +240,11 @@ Scholar.Ingester.Model.prototype.detachRepository = function() {} // ////////////////////////////////////////////////////////////////////////////// -/* Public properties: +/* THIS CODE IS GOING AWAY + * eventually, all ingesting will be part of a unified API in Scholar.Translate. + * until then, Scholar.Ingester.Document reigns supreme. + * + * Public properties: * browser - browser window object of document * model - data model for semantic scrapers * scraper - best scraper to use to scrape page @@ -288,7 +291,7 @@ Scholar.Ingester.Document = function(myBrowser, myWindow, isHidden) { Scholar.Ingester.Document.prototype.retrieveScraper = function() { Scholar.debug("Retrieving scrapers for "+this.url); - var sql = 'SELECT * FROM scrapers ORDER BY scraperDetectCode IS NULL DESC'; + var sql = 'SELECT * FROM translators WHERE type = 3 ORDER BY detectCode IS NULL DESC'; var scrapers = Scholar.DB.query(sql); for(var i=0; i text + * returns: a numerically indexed array of ids, as extracted from the passed + * string + * + * itemDone + * valid: web + * called: when an item has been processed; may be called asynchronously + * passed: an item object (see Scholar.Item) + * returns: N/A + * + * done + * valid: all + * called: when all processing is finished + * passed: return value of the processing function + * returns: N/A + */ +Scholar.Translate.prototype.setHandler = 
function(type, handler) {
+	this._handlers[type] = handler;
+}
+
+/*
+ * gets translator options to be displayed in a dialog
+ *
+ * NOT IMPLEMENTED
+ */
+Scholar.Translate.prototype.getOptions = function() {
+}
+
+/*
+ * sets translator options to be displayed in a dialog
+ *
+ * NOT IMPLEMENTED
+ */
+Scholar.Translate.prototype.setOptions = function() {
+}
+
+/*
+ * does the actual translation
+ *
+ * evaluates the translator's code in the sandbox and then, for type
+ * "export", runs the export. If the translator did not call wait(),
+ * completion is signalled right away; otherwise it is deferred until the
+ * translator calls done(). A failure to evaluate the code is logged and
+ * reported to the "done" handler as false.
+ */
+Scholar.Translate.prototype.translate = function() {
+	this._complete = false;
+	Scholar.debug("converting using "+this.translator.label);
+	
+	try {
+		Components.utils.evalInSandbox(this.translator.code, this._sandbox);
+	} catch(e) {
+		Scholar.debug(e+' in parsing code for '+this.translator.label);
+		this._translationComplete(false);
+		return;
+	}
+	
+	if(this.type == "export") {
+		var returnValue = this._export();
+		
+		// If synchronous, call _translationComplete() now. An exporter is
+		// not required to return anything, so completion must not depend on
+		// returnValue being truthy - otherwise the output stream stays open
+		// and the "done" handler never fires. _translationComplete()
+		// ignores repeat calls, so a failure already reported by _export()
+		// is not reported a second time.
+		if(!this._waitForCompletion) {
+			this._translationComplete(returnValue);
+		}
+	}
+}
+
+/*
+ * generates a sandbox for scraping/scraper detection
+ *
+ * for type "web" the sandbox exposes browser, doc, url, utilities (with
+ * HTTPUtilities) and model; for every other type only a plain utilities
+ * object. All sandboxes get XPathResult, MARC_Record and wait(); export
+ * sandboxes additionally get write().
+ */
+Scholar.Translate.prototype._generateSandbox = function() {
+	if(this.type == "web") {
+		// NOTE(review): the sandbox principal is assumed to be the same URL
+		// that is exposed to the translator as "url" below - confirm
+		this._sandbox = new Components.utils.Sandbox(this.sandboxURL);
+		this._sandbox.browser = this.browser;
+		this._sandbox.doc = this.browser.contentDocument;
+		this._sandbox.url = this.sandboxURL;
+		this._sandbox.utilities = new Scholar.Utilities.Ingester(this.window, this.proxiedURL, this.isHidden);
+		this._sandbox.utilities.HTTPUtilities = new Scholar.Utilities.Ingester.HTTPUtilities(this.proxiedURL);
+		this._sandbox.model = this.model;
+	} else {
+		this._sandbox = new Components.utils.Sandbox("");
+		this._sandbox.utilities = new Scholar.Utilities();
+	}
+	
+	this._sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
+	this._sandbox.MARC_Record = Scholar.Ingester.MARC_Record;
+	this._sandbox.MARC_Record.prototype = new Scholar.Ingester.MARC_Record();
+	
+	// closures handed to the sandbox need a stable reference to this object
+	var me = this;
+	this._sandbox.wait = function() {me._enableAsynchronous() };
+	if(this.type == "export") {
+		this._sandbox.write = function(data) { me._exportWrite(data); };
+	}
+}
+
+/*
+ * makes translation API wait until done() has been called from the translator
+ * before executing _translationComplete; called as wait()
+ */
+Scholar.Translate.prototype._enableAsynchronous = function() {
+	// done() is invoked later from sandboxed code, so it has to capture this
+	// object itself; the "me" local of _generateSandbox is not in scope here
+	var me = this;
+	this._waitForCompletion = true;
+	this._sandbox.done = function(returnValue) { me._translationComplete(returnValue); };
+}
+
+/*
+ * executed on translator completion, either automatically from a synchronous
+ * scraper or as done() from an asynchronous scraper
+ *
+ * finishes things up and calls callback function(s); only the first call
+ * after translate() has any effect
+ */
+Scholar.Translate.prototype._translationComplete = function(returnValue) {
+	// to make sure this isn't called twice
+	if(!this._complete) {
+		this._complete = true;
+		
+		// the stream only exists once _export() has opened it; when the
+		// translator code failed to evaluate there is nothing to close
+		if((this.type == "export" || this.type == "import") && this.foStream) {
+			this.foStream.close();
+			// drop the reference so a closed stream is never closed again
+			this.foStream = null;
+		}
+		
+		// call handler
+		if(this._handlers.done) {
+			this._handlers.done(this, returnValue);
+		}
+	}
+}
+
+/*
+ * does the actual export, after code has been loaded and parsed
+ *
+ * opens this.location for writing and passes every item, converted with
+ * toArray(), to the translator's doExport()
+ *
+ * returns: whatever doExport() returned, or false if it threw (in which
+ *          case the failure has already been passed to _translationComplete)
+ */
+Scholar.Translate.prototype._export = function() {
+	// get items
+	var itemObjects = Scholar.getItems();
+	var itemArrays = new Array();
+	for(var i in itemObjects) {
+		itemArrays.push(itemObjects[i].toArray());
+	}
+	
+	// open file
+	this.foStream = Components.classes["@mozilla.org/network/file-output-stream;1"]
+	                         .createInstance(Components.interfaces.nsIFileOutputStream);
+	this.foStream.init(this.location, 0x02 | 0x08 | 0x20, 0664, 0); // write, create, truncate
+	
+	
+	try {
+		return this._sandbox.doExport(itemArrays);
+	} catch(e) {
+		Scholar.debug(e+' in executing code for '+this.translator.label);
+		this._translationComplete(false);
+		return false;
+	}
+}
+
+/*
+ * writes a string to the export file; exposed to export translators as
+ * write()
+ */
+// TODO - allow writing in different character sets
+Scholar.Translate.prototype._exportWrite = function(data) {
+	this.foStream.write(data, data.length);
+}
\ No newline at end of file
diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js 
b/chrome/chromeFiles/content/scholar/xpcom/utilities.js
index 9445935ff..292e6f63c 100644
--- a/chrome/chromeFiles/content/scholar/xpcom/utilities.js
+++ b/chrome/chromeFiles/content/scholar/xpcom/utilities.js
@@ -121,6 +121,34 @@ Scholar.Utilities.prototype.cleanTags = function(x) {
 	return x.replace(/<[^>]+>/g, "");
 }
 
+/*
+ * Test if a string is an integer
+ *
+ * x is parsed as a base-10 integer and loosely compared with the original
+ * value; returns true if the two are equal, false otherwise
+ */
+Scholar.Utilities.prototype.isInt = function(x) {
+	// Explicit radix: without one parseInt() picks the base from the string
+	// prefix, so "0x1A" parses as 26 and passes the comparison, and engines
+	// that apply the legacy leading-zero octal rule parse "010" as 8
+	if(parseInt(x, 10) == x) {
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Get current scholar version
+ */
+Scholar.Utilities.prototype.getVersion = function() {
+	return Scholar.version;
+}
+
+/*
+ * Same function as Scholar.inArray, made reachable through a utilities object
+ */
+Scholar.Utilities.prototype.inArray = Scholar.inArray;
+
 /*
  * END SCHOLAR FOR FIREFOX EXTENSIONS
  */
diff --git a/chrome/chromeFiles/skin/default/scholar/scholar.css b/chrome/chromeFiles/skin/default/scholar/scholar.css index 1bd86c2df..7794b7aa6 100644 --- a/chrome/chromeFiles/skin/default/scholar/scholar.css +++ b/chrome/chromeFiles/skin/default/scholar/scholar.css @@ -37,7 +37,7 @@ noteeditor .scholar-progress-description { - width: 210px; + width: 220px; } .scholar-scrape-popup-library diff --git a/components/chnmIScholarService.js b/components/chnmIScholarService.js index 743032ee0..6636cced6 100644 --- a/components/chnmIScholarService.js +++ b/components/chnmIScholarService.js @@ -41,6 +41,10 @@ Cc["@mozilla.org/moz/jssubscript-loader;1"] Cc["@mozilla.org/moz/jssubscript-loader;1"] .getService(Ci.mozIJSSubScriptLoader) .loadSubScript("chrome://scholar/content/xpcom/ingester.js"); + +Cc["@mozilla.org/moz/jssubscript-loader;1"] + .getService(Ci.mozIJSSubScriptLoader) + .loadSubScript("chrome://scholar/content/xpcom/translate.js"); Cc["@mozilla.org/moz/jssubscript-loader;1"] .getService(Ci.mozIJSSubScriptLoader) diff --git a/schema.sql b/schema.sql index 2e75a4ecf..d4c9aebf1 100644 --- a/schema.sql +++ b/schema.sql @@ -1,4 +1,4 @@ --- 26 +-- 27 DROP TABLE IF EXISTS version; CREATE TABLE version ( @@ -136,18 +136,18 @@ DROP INDEX IF EXISTS itemID; CREATE INDEX itemID 
ON collectionItems(itemID); - DROP TABLE IF EXISTS scrapers; - CREATE TABLE scrapers ( - scraperID TEXT PRIMARY KEY, + DROP TABLE IF EXISTS translators; + CREATE TABLE translators ( + translatorID TEXT PRIMARY KEY, lastUpdated DATETIME, + type TEXT, label TEXT, creator TEXT, - urlPattern TEXT, - scraperDetectCode TEXT, - scraperJavaScript TEXT + target TEXT, + detectCode TEXT, + code TEXT ); - DROP TABLE IF EXISTS transactionSets; CREATE TABLE transactionSets ( transactionSetID INTEGER PRIMARY KEY, diff --git a/scrapers.sql b/scrapers.sql index 4d148ccbc..a0d749a61 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -1,9 +1,9 @@ --- 28 +-- 29 -- Set the following timestamp to the most recent scraper update date REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-26 21:40:00')); -REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-26 16:01:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', +REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-26 16:01:00', 3, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', 'if(doc.title.indexOf("search") >= 0) { return "multiple"; } else { @@ -110,7 +110,7 @@ if(m) { scrape(doc); }'); -REPLACE INTO "scrapers" VALUES('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-26 16:01:00', 'WorldCat Scraper', 'Simon Kornblith', '^http://(?:new)?firstsearch\.oclc\.org/WebZ/', +REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-26 16:01:00', 3, 'WorldCat Scraper', 'Simon Kornblith', '^http://(?:new)?firstsearch\.oclc\.org/WebZ/', 'if(doc.title == ''FirstSearch: WorldCat Detailed Record'') { return "book"; } else if(doc.title == ''FirstSearch: WorldCat List of Records'') { @@ -270,7 +270,7 @@ utilities.HTTPUtilities.doPost(newUri, ''exportselect=''+exportselect+''&exportt }) wait();'); 
-REPLACE INTO "scrapers" VALUES('88915634-1af6-c134-0171-56fd198235ed', '2006-06-26 21:40:00', 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi', +REPLACE INTO "translators" VALUES ('88915634-1af6-c134-0171-56fd198235ed', '2006-06-26 21:40:00', 3, 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi', 'var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options; for(i in export_options) { if(export_options[i].text == ''Latin1 MARC'' @@ -397,7 +397,7 @@ utilities.HTTPUtilities.doGet(newUri+''?''+postString, null, function(text) { }) wait();'); -REPLACE INTO "scrapers" VALUES('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-26 16:01:00', 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse|search/)', +REPLACE INTO "translators" VALUES ('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-26 16:01:00', 3, 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse|search/)', 'var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? 
function(prefix) { if (prefix == ''x'') return namespace; else return null; @@ -615,7 +615,7 @@ utilities.HTTPUtilities.doGet(''http://www.jstor.org/browse?citationAction=remov wait();'); -REPLACE INTO "scrapers" VALUES('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-26 16:01:00', 'History Cooperative Scraper', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.html$|cgi-bin/search.cgi)', +REPLACE INTO "translators" VALUES ('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-26 16:01:00', 3, 'History Cooperative Scraper', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.html$|cgi-bin/search.cgi)', 'if(doc.title == "History Cooperative: Search Results") { return "multiple"; } else { @@ -680,7 +680,7 @@ if(doc.title == "History Cooperative: Search Results") { scrape(doc); }'); -REPLACE INTO "scrapers" VALUES('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-06-26 16:01:00', 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)', +REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-06-26 16:01:00', 3, 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)', '// First, check to see if the URL alone reveals InnoPAC, since some sites don''t reveal the MARC button var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$''); if(matchRegexp.test(doc.location.href)) { @@ -826,7 +826,7 @@ if(newUri) { wait();'); -REPLACE INTO "scrapers" VALUES('add7c71c-21f3-ee14-d188-caf9da12728b', '2006-06-26 16:01:00', 'SIRSI 2003+ Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', +REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006-06-26 16:01:00', 3, 'SIRSI 2003+ Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', 'var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? 
function(prefix) { if (prefix == ''x'') return namespace; else return null; @@ -977,7 +977,7 @@ if(!scrape(doc)) { } '); -REPLACE INTO "scrapers" VALUES('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-26 16:01:00', 'ProQuest Scraper', 'Simon Kornblith', '^http://proquest\.umi\.com/pqdweb\?((?:.*\&)?did=.*&Fmt=[0-9]|(?:.*\&)Fmt=[0-9].*&did=|(?:.*\&)searchInterface=)', +REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-26 16:01:00', 3, 'ProQuest Scraper', 'Simon Kornblith', '^http://proquest\.umi\.com/pqdweb\?((?:.*\&)?did=.*&Fmt=[0-9]|(?:.*\&)Fmt=[0-9].*&did=|(?:.*\&)searchInterface=)', 'if(doc.title == "Results") { return "magazineArticle"; } else { @@ -1157,7 +1157,7 @@ if(doc.title == "Results") { } }'); -REPLACE INTO "scrapers" VALUES('6773a9af-5375-3224-d148-d32793884dec', '2006-06-26 16:01:00', 'InfoTrac Scraper', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/', +REPLACE INTO "translators" VALUES ('6773a9af-5375-3224-d148-d32793884dec', '2006-06-26 16:01:00', 3, 'InfoTrac Scraper', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/', 'if(doc.title.substring(0, 8) == "Article ") { return "magazineArticle"; } else doc.title.substring(0, 10) == "Citations ") { @@ -1278,7 +1278,7 @@ if(doc.title.substring(0, 8) == "Article ") { } }'); -REPLACE INTO "scrapers" VALUES('b047a13c-fe5c-6604-c997-bef15e502b09', '2006-06-26 16:01:00', 'LexisNexis Scraper', 'Simon Kornblith', '^http://web\.lexis-nexis\.com/universe/(?:document|doclist)', +REPLACE INTO "translators" VALUES ('b047a13c-fe5c-6604-c997-bef15e502b09', '2006-06-26 16:01:00', 3, 'LexisNexis Scraper', 'Simon Kornblith', '^http://web\.lexis-nexis\.com/universe/(?:document|doclist)', 'var detailRe = new RegExp("^http://[^/]+/universe/document"); if(detailRe.test(doc.location.href)) { return "newspaperArticle"; @@ -1378,7 +1378,7 @@ if(detailRe.test(doc.location.href)) { wait(); }'); -REPLACE INTO "scrapers" 
VALUES('cf87eca8-041d-b954-795a-2d86348999d5', '2006-06-26 16:01:00', 'Aleph Scraper', 'Simon Kornblith', '^http://[^/]+/F(?:/[A-Z0-9\-]+(?:\?.*)?$|\?func=find)', +REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006-06-26 16:01:00', 3, 'Aleph Scraper', 'Simon Kornblith', '^http://[^/]+/F(?:/[A-Z0-9\-]+(?:\?.*)?$|\?func=find)', 'var singleRe = new RegExp("^http://[^/]+/F/[A-Z0-9\-]+\?.*func=full-set-set.*\&format=[0-9]{3}"); if(singleRe.test(doc.location.href)) { @@ -1465,7 +1465,7 @@ utilities.processDocuments(browser, null, newUris, function(newBrowser) { wait();'); -REPLACE INTO "scrapers" VALUES('774d7dc2-3474-2684-392c-f787789ec63d', '2006-06-26 16:01:00', 'Dynix Scraper', 'Simon Kornblith', 'ipac\.jsp\?.*(?:uri=full=[0-9]|menu=search)', +REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006-06-26 16:01:00', 3, 'Dynix Scraper', 'Simon Kornblith', 'ipac\.jsp\?.*(?:uri=full=[0-9]|menu=search)', 'var detailsRe = new RegExp(''ipac\.jsp\?.*uri=full=[0-9]''); if(detailsRe.test(doc.location.href)) { return "book"; @@ -1543,7 +1543,7 @@ utilities.processDocuments(browser, null, uris, function(newBrowser) { wait();'); -REPLACE INTO "scrapers" VALUES('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006-06-26 16:01:00', 'VTLS Scraper', 'Simon Kornblith', '/chameleon(?:\?|$)', +REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006-06-26 16:01:00', 3, 'VTLS Scraper', 'Simon Kornblith', '/chameleon(?:\?|$)', 'var node = utilities.getNode(doc, doc, ''//a[text()="marc"]'', null); if(node) { return "book"; @@ -1644,7 +1644,7 @@ utilities.processDocuments(browser, null, newUris, function(newBrowser) { wait();'); -REPLACE INTO "scrapers" VALUES('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006-06-26 16:01:00', 'DRA Scraper', 'Simon Kornblith', '/web2/tramp2\.exe/(?:see\_record/|authority\_hits/|goto/.*\?.*screen=Record\.html)', +REPLACE INTO "translators" VALUES ('fb12ae9e-f473-cab4-0546-27ab88c64101', 
'2006-06-26 16:01:00', 3, 'DRA Scraper', 'Simon Kornblith', '/web2/tramp2\.exe/(?:see\_record/|authority\_hits/|goto/.*\?.*screen=Record\.html)', 'if(doc.location.href.indexOf("/authority_hits") > 0) { return "multiple"; } else { @@ -1708,7 +1708,7 @@ for(i in uris) { wait();'); -REPLACE INTO "scrapers" VALUES('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 'GEAC Scraper', 'Simon Kornblith', '/(?:GeacQUERY|(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))', +REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 3, 'GEAC Scraper', 'Simon Kornblith', '/(?:GeacQUERY|(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))', 'if(doc.location.href.indexOf("/GeacQUERY") > 0) { return "multiple"; } else { @@ -1792,7 +1792,7 @@ utilities.processDocuments(browser, null, uris, function(newBrowser) { wait();'); -REPLACE INTO "scrapers" VALUES('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-26 16:01:00', 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', +REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-26 16:01:00', 3, 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', 'var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? 
function(prefix) { if (prefix == ''x'') return namespace; else return null; @@ -1924,7 +1924,7 @@ utilities.HTTPUtilities.doGet(newUri+''?marks=''+recNumbers.join(",")+''&shadow= wait();'); -REPLACE INTO "scrapers" VALUES('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-26 16:01:00', 'TLC/YouSeeMore Scraper', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?(?:.*LabelDisplay.*RecordNumber=[0-9]|Search|ItemTitles)', +REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-26 16:01:00', 3, 'TLC/YouSeeMore Scraper', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?(?:.*LabelDisplay.*RecordNumber=[0-9]|Search|ItemTitles)', 'var detailRe = new RegExp("TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]"); if(detailRe.test(doc.location.href)) { return "book"; @@ -2018,7 +2018,7 @@ utilities.processDocuments(browser, null, newUris, function(newBrowser) { wait();'); -REPLACE INTO "scrapers" VALUES('c54d1932-73ce-dfd4-a943-109380e06574', '2006-06-26 16:01:00', 'Project MUSE Scraper', 'Simon Kornblith', '^http://muse\.jhu\.edu/(?:journals/[^/]+/[^/]+/[^/]+\.html|search/pia.cgi)', +REPLACE INTO "translators" VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '2006-06-26 16:01:00', 3, 'Project MUSE Scraper', 'Simon Kornblith', '^http://muse\.jhu\.edu/(?:journals/[^/]+/[^/]+/[^/]+\.html|search/pia.cgi)', 'var searchRe = new RegExp("^http://[^/]+/search/pia\.cgi"); if(searchRe.test(doc.location.href)) { return "multiple"; @@ -2195,7 +2195,7 @@ if(searchRe.test(doc.location.href)) { model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false); }'); -REPLACE INTO "scrapers" VALUES('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-06-26 16:01:00', 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)', +REPLACE INTO "translators" VALUES ('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-06-26 
16:01:00', 3, 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)', 'if(doc.location.href.indexOf("list_uids=") >= 0) { return "journalArticle"; } else { @@ -2324,7 +2324,7 @@ utilities.HTTPUtilities.doGet(newUri, null, function(text) { wait();'); -REPLACE INTO "scrapers" VALUES('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006-06-26 16:41:00', 'Generic Scraper', 'Simon Kornblith', '', +REPLACE INTO "translators" VALUES ('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006-06-26 16:41:00', 3, 'Generic Scraper', 'Simon Kornblith', '', 'return "website";', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; @@ -2373,7 +2373,7 @@ if(!foundTitle) { model.addStatement(uri, prefixRDF + "type", prefixDummy + "website", false);'); -REPLACE INTO "scrapers" VALUES('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006-06-26 16:01:00', 'Google Books Scraper', 'Simon Kornblith', '^http://books\.google\.com/books\?(.*vid=.*\&id=.*|.*q=.*)', +REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006-06-26 16:01:00', 3, 'Google Books Scraper', 'Simon Kornblith', '^http://books\.google\.com/books\?(.*vid=.*\&id=.*|.*q=.*)', 'var re = new RegExp(''^http://books\\.google\\.com/books\\?vid=([^&]+).*\\&id=([^&]+)'', ''i''); if(re.test(doc.location.href)) { return "book"; @@ -2462,4 +2462,271 @@ utilities.processDocuments(browser, null, newUris, function(newBrowser) { model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); }, function() { done(); }, function() {}); -wait();'); \ No newline at end of file +wait();'); + +REPLACE INTO "translators" VALUES ('0e2235e7-babf-413c-9acf-f27cce5f059c', '2006-06-28 16:00:00', 2, 'Metadata Object Description Schema (MODS)', 'Simon Kornblith', 'xml', +'options.add("Export project structure", "checkbox", "true"); 
+options.add("Export notes", "checkbox", "true");', +'var partialItemTypes = ["bookSection", "journalArticle", "magazineArticle", "newspaperArticle"]; + +function doExport(items) { + var modsCollection = ; + + for(var i in items) { + var item = items[i]; + + var isPartialItem = false; + if(utilities.inArray(item.itemType, partialItemTypes)) { + isPartialItem = true; + } + + var mods = ; + mods.@ID = item.itemID; + + /** CORE FIELDS **/ + + // XML tag titleInfo; object field title + mods.titleInfo.title = item.title; + + // XML tag typeOfResource/genre; object field type + var modsType, marcGenre; + if(item.itemType == "book" || item.itemType == "bookSection") { + modsType = "text"; + marcGenre = "book"; + } else if(item.itemType == "journalArticle" || item.itemType == "magazineArticle") { + modsType = "text"; + marcGenre = "periodical"; + } else if(item.itemType == "newspaperArticle") { + modsType = "text"; + marcGenre = "newspaper"; + } else if(item.itemType == "thesis") { + modsType = "text"; + marcGenre = "theses"; + } else if(item.itemType == "letter") { + modsType = "text"; + marcGenre = "letter"; + } else if(item.itemType == "manuscript") { + modsType = "text"; + modsType.@manuscript = "yes"; + } else if(item.itemType == "interview") { + modsType = "text"; + modsType.@manuscript = "interview"; + } else if(item.itemType == "film") { + modsType = "moving image"; + marcGenre = "motion picture"; + } else if(item.itemType == "artwork") { + modsType = "still image"; + marcGenre = "art original"; + } else if(item.itemType == "website") { + modsType = "multimedia"; + marcGenre = "web site"; + } + mods.typeOfResource = modsType; + mods.genre += {item.itemType}; + mods.genre += {marcGenre}; + + // XML tag genre; object field thesisType, type + if(item.thesisType) { + mods.genre += {item.thesisType}; + } + if(item.type) { + mods.genre += {item.type}; + } + + // XML tag name; object field creators + for(var j in item.creators) { + var roleTerm = ""; + 
if(item.creators[j].creatorType == "author") { + roleTerm = "aut"; + } else if(item.creators[j].creatorType == "editor") { + roleTerm = "edt"; + } else if(item.creators[j].creatorType == "creator") { + roleTerm = "ctb"; + } + + // FIXME - currently all names are personal + mods.name += + {item.creators[j].lastName} + {item.creators[j].firstName} + {roleTerm} + ; + } + + // XML tag recordInfo.recordOrigin; used to store our generator note + mods.recordInfo.recordOrigin = "Scholar for Firefox "+utilities.getVersion(); + + /** FIELDS ON NEARLY EVERYTHING BUT NOT A PART OF THE CORE **/ + + // XML tag recordInfo.recordContentSource; object field source + if(item.source) { + mods.recordInfo.recordContentSource = item.source; + } + // XML tag recordInfo.recordIdentifier; object field accessionNumber + if(item.accessionNumber) { + mods.recordInfo.recordIdentifier = item.accessionNumber; + } + + // XML tag accessCondition; object field rights + if(item.rights) { + mods.accessCondition = item.rights; + } + + /** SUPPLEMENTAL FIELDS **/ + + // XML tag relatedItem.titleInfo; object field series + if(item.series) { + var series = + {item.series} + ; + + if(item.itemType == "bookSection") { + // For a book section, series info must go inside host tag + mods.relatedItem.relatedItem = series; + } else { + mods.relatedItem += series; + } + } + + // Make part its own tag so we can figure out where it goes later + var part = new XML(); + + // XML tag detail; object field volume + if(item.volume) { + if(utilities.isInt(item.volume)) { + part += {item.volume}; + } else { + part += {item.volume}; + } + } + + // XML tag detail; object field number + if(item.number) { + if(utilities.isInt(item.number)) { + part += {item.number}; + } else { + part += {item.number}; + } + } + + // XML tag detail; object field section + if(item.section) { + if(utilities.isInt(item.section)) { + part += {item.section}; + } else { + part += {item.section}; + } + } + + // XML tag detail; object field pages + 
if(item.pages) { + var start, end; + + if(typeof(item.pages) == "string" && item.pages.indexOf("-")) { + // A page range + var pageNumbers = item.pages.split("-"); + start = pageNumbers[0]; + end = pageNumbers[1]; + } else { + // Assume start and end are the same + start = item.pages; + end = item.pages; + } + part += {start}{end}; + } + + // Assign part if something was assigned + if(part.length() != 1) { + if(isPartialItem) { + // For a journal article, bookSection, etc., the part is the host + mods.relatedItem.part += {part}; + } else { + mods.part += {part}; + } + } + + // XML tag originInfo; object fields edition, place, publisher, year, date + var originInfo = new XML(); + if(item.edition) { + originInfo += {item.edition}; + } + if(item.place) { + originInfo += {item.place}; + } + if(item.publisher) { + originInfo += item.publisher; + } else if(item.distributor) { + originInfo += item.distributor; + } + if(item.year) { + // Assume year is copyright date + originInfo += {item.year}; + } + if(item.date) { + if(inArray(item.itemType, ["magazineArticle", "newspaperArticle"])) { + // Assume date is date issued + var dateType = "dateIssued"; + } else { + // Assume date is date created + var dateType = "dateCreated"; + } + originInfo += <{dateType} encoding="iso8601">{item.date}; + } + if(originInfo.length() != 1) { + if(isPartialItem) { + // For a journal article, bookSection, etc., this goes under the host + mods.relatedItem.originInfo += {originInfo}; + } else { + mods.originInfo += {originInfo}; + } + } + + // XML tag identifier; object fields ISBN, ISSN + var identifier = null; + if(item.ISBN) { + identifier = {item.ISBN}; + } else if(item.ISSN) { + identifier = {item.ISSN}; + } + if(identifier) { + if(isPartialItem) { + mods.relatedItem.identifier = identifier; + } else { + mods.identifier = identifier; + } + } + + // XML tag relatedItem.titleInfo; object field publication + if(item.publication) { + mods.relatedItem.titleInfo += {item.publication}; + } + + // 
XML tag classification; object field callNumber + if(item.callNumber) { + mods.classification = item.callNumber; + } + + // XML tag location.physicalLocation; object field archiveLocation + if(item.archiveLocation) { + mods.location.physicalLocation = item.archiveLocation; + } + + // XML tag location.url; object field archiveLocation + if(item.url) { + mods.location.url = item.url; + } + + if(mods.relatedItem.length() == 1 && isPartialItem) { + mods.relatedItem.@type = "host"; + } + + /** NOTES **/ + + for(var j in item.notes) { + mods.note += {item.notes[j].note}; + } + + modsCollection.mods += mods; + } + + write(modsCollection.toString()); +}'); \ No newline at end of file