diff --git a/chrome/chromeFiles/content/scholar/xpcom/translate.js b/chrome/chromeFiles/content/scholar/xpcom/translate.js index 24fc1f50c..c5df9fd1e 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/translate.js +++ b/chrome/chromeFiles/content/scholar/xpcom/translate.js @@ -261,21 +261,7 @@ Scholar.Translate.prototype.getTranslators = function() { Scholar.debug("searching for translators for "+this.path); // see which translators can translate - for(var i in translators) { - if(this._canTranslate(translators[i])) { - Scholar.debug("found translator "+translators[i].label); - - // for some reason, and i'm not quite sure what this reason is, - // we HAVE to do this to get things to work right; we can't - // just push a normal translator object from an SQL statement - var translator = {translatorID:translators[i].translatorID, - label:translators[i].label, - target:translators[i].target, - itemType:translators[i].itemType} - - possibleTranslators.push(translator); - } - } + var possibleTranslators = this._findTranslators(translators); return possibleTranslators; } @@ -289,6 +275,31 @@ Scholar.Translate.prototype.getTranslators = function() { Scholar.Translate.prototype.displayOptions = function() { } +Scholar.Translate.prototype._findTranslators = function(translators, ignoreExtensions) { + var possibleTranslators = new Array(); + for(var i in translators) { + if(this._canTranslate(translators[i], ignoreExtensions)) { + Scholar.debug("found translator "+translators[i].label); + + // for some reason, and i'm not quite sure what this reason is, + // we HAVE to do this to get things to work right; we can't + // just push a normal translator object from an SQL statement + var translator = {translatorID:translators[i].translatorID, + label:translators[i].label, + target:translators[i].target, + itemType:translators[i].itemType} + + possibleTranslators.push(translator); + } + } + if(!possibleTranslators.length && this.type == "import" && !ignoreExtensions) { + Scholar.debug("looking a second time"); + // try search again, ignoring file extensions + return this._findTranslators(translators, true); + } + return possibleTranslators; +} + Scholar.Translate.prototype._loadTranslator = function() { if(!this._sandbox || this.type == "search") { // create a new sandbox if none exists, or for searching (so that it's @@ -467,7 +478,7 @@ Scholar.Translate.prototype._generateSandbox = function() { /* * Check to see if _scraper_ can scrape this document */ -Scholar.Translate.prototype._canTranslate = function(translator) { +Scholar.Translate.prototype._canTranslate = function(translator, ignoreExtensions) { // Test location with regular expression // If this is slow, we could preload all scrapers and compile regular // expressions, so each check will be faster @@ -482,6 +493,12 @@ Scholar.Translate.prototype._canTranslate = function(translator) { if(regularExpression.test(this.path)) { canTranslate = true; } + + if(ignoreExtensions) { + // if we're ignoring extensions, that means we already tried + // everything without ignoring extensions and it didn't work + canTranslate = !canTranslate; + } } else { var canTranslate = true; } @@ -917,7 +934,7 @@ Scholar.Translate.prototype._import = function() { * sets up import for IO */ Scholar.Translate.prototype._importConfigureIO = function() { - if(this._configOptions.dataMode == "rdf") { + if(this._configOptions.dataMode == "rdf") { var IOService = Components.classes['@mozilla.org/network/io-service;1'] .getService(Components.interfaces.nsIIOService); var fileHandler = IOService.getProtocolHandler("file") diff --git a/scrapers.sql b/scrapers.sql index df662eed8..14b67fed2 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -1,7 +1,7 @@ --- 38 +-- 39 -- Set the following timestamp to the most recent scraper update date -REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-07 01:09:00')); +REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-07 21:55:00')); REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-28 23:08:00', 4, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/|s/)', 'function detectWeb(doc, url) { @@ -2163,7 +2163,7 @@ REPLACE INTO "translators" VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '2006 } }'); -REPLACE INTO "translators" VALUES ('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-06-26 16:01:00', 12, 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)', +REPLACE INTO "translators" VALUES ('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-08-07 21:55:00', 12, 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)', 'function detectWeb(doc, url) { if(doc.location.href.indexOf("list_uids=") >= 0) { return "journalArticle"; @@ -2738,7 +2738,15 @@ function doSearch(item) { REPLACE INTO "translators" VALUES ('0e2235e7-babf-413c-9acf-f27cce5f059c', '2006-07-05 23:40:00', 3, 'MODS (XML)', 'Simon Kornblith', 'xml', 'Scholar.addOption("exportNotes", true); -Scholar.addOption("exportFileData", true);', +Scholar.addOption("exportFileData", true); + +function detectImport() { + var read = Scholar.read(512); + var modsTagRegexp = /]+>/ + if(modsTagRegexp.test(read)) { + return true; + } +}', 'var partialItemTypes = ["bookSection", "journalArticle", "magazineArticle", "newspaperArticle"]; function doExport() { @@ -3664,7 +3672,17 @@ REPLACE INTO "translators" VALUES ('6e372642-ed9d-4934-b5d1-c11ac758ebb7', '2006 }'); REPLACE INTO "translators" VALUES ('5e3ad958-ac79-463d-812b-a86a9235c28f', '2006-07-15 17:09:00', 1, 'RDF', 'Simon Kornblith', 'rdf', -'Scholar.configure("dataMode", "rdf");', +'Scholar.configure("dataMode", "rdf"); + +function detectImport() { + // unfortunately, Mozilla will let you create a data source from any type + // of XML, so we need to make sure there are actually nodes + + var nodes = Scholar.RDF.getAllResources(); + if(nodes) { + return true; + } +}', '// gets the first result set for a property that can be encoded in multiple // ontologies function getFirstResults(node, properties, onlyOneString) { @@ -4052,7 +4070,20 @@ function doImport() { REPLACE INTO "translators" VALUES ('32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7', '2006-06-30 15:36:00', 3, 'RIS', 'Simon Kornblith', 'ris', 'Scholar.configure("dataMode", "line"); -Scholar.addOption("exportNotes", true);', +Scholar.addOption("exportNotes", true); + +function detectImport() { + var line + while(line = Scholar.read()) { + if(line.replace(/\s/g, "") != "") { + if(line.substr(0, 6) == "TY - ") { + return true; + } else { + return false; + } + } + } +}', 'var itemsWithYears = ["book", "bookSection", "thesis", "film"]; var fieldMap = { @@ -4325,7 +4356,13 @@ function doExport() { }'); REPLACE INTO "translators" VALUES ('a6ee60df-1ddc-4aae-bb25-45e0537be973', '2006-07-16 17:18:00', 1, 'MARC', 'Simon Kornblith', 'marc', -NULL, +'function detectImport() { + var marcRecordRegexp = /^[0-9]{5}[a-z ]{3}$/ + var read = Scholar.read(8); + if(marcRecordRegexp.test(read)) { + return true; + } +}', '/* * Original version of MARC record library copyright (C) 2005 Stefano Bargioni, * licensed under the LGPL @@ -4801,7 +4838,6 @@ function doImport(url) { // the URL is actually here for other translators while(text = Scholar.read(4096)) { // read in 4096 byte increments var records = text.split("\x1D"); - Scholar.Utilities.debugPrint(records); if(records.length > 1) { records[0] = holdOver + records[0];