[recognizePDF] Restructure file

This commit is contained in:
aurimasv 2012-12-22 10:47:57 -06:00
parent 74c9967b95
commit ea6a1098a6

View File

@ -293,10 +293,7 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c
intlStream.QueryInterface(Components.interfaces.nsIUnicharLineInputStream); intlStream.QueryInterface(Components.interfaces.nsIUnicharLineInputStream);
// get the lines in this sample // get the lines in this sample
var lines = [], var lines = [], str = {};
cleanedLines = [],
cleanedLineLengths = [],
str = {};
while(intlStream.readLine(str)) { while(intlStream.readLine(str)) {
var line = str.value.trim(); var line = str.value.trim();
if(line) lines.push(line); if(line) lines.push(line);
@ -305,16 +302,56 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c
inputStream.close(); inputStream.close();
cacheFile.remove(false); cacheFile.remove(false);
var me = this;
// look for DOI // look for DOI
var allText = lines.join("\n"); var allText = lines.join("\n");
Zotero.debug(allText); Zotero.debug(allText);
var m = Zotero.Utilities.cleanDOI(allText); var m = Zotero.Utilities.cleanDOI(allText);
if(m) { if(m) {
this._DOI = m; this._queryDOI(m, function() {
me._cleanLines(lines, me._queryGoogle);
});
} else {
this._cleanLines(lines, me._queryGoogle);
}
} }
/**
 * Looks up item metadata for a DOI using a search translator.
 * Every item the translator produces is passed to this._callback. If the
 * translation finishes unsuccessfully, onFail is invoked with this
 * recognizer as its `this` value.
 * @private
 * @param {String} doi DOI to search for
 * @param {Function} onFail Callback function to call if a DOI is not found
 */
Zotero_RecognizePDF.Recognizer.prototype._queryDOI = function(doi, onFail) {
	var recognizer = this;

	// Forward each retrieved item to the recognizer's callback
	function handleItemDone(translate, item) {
		recognizer._callback(item);
	}

	// Delegate item selection to the recognizer
	function handleSelect(translate, items, callback) {
		return recognizer._selectItems(translate, items, callback);
	}

	// Fall back to the caller-supplied handler only when the lookup failed
	function handleDone(translate, success) {
		if(success) return;
		onFail.call(recognizer);
	}

	var search = new Zotero.Translate.Search();
	search.setTranslator("11645bd1-0420-45c1-badb-53fb41eeb753");
	search.setSearch({"itemType":"journalArticle", "DOI":doi});
	search.setHandler("itemDone", handleItemDone);
	search.setHandler("select", handleSelect);
	search.setHandler("done", handleDone);
	search.translate(this._libraryID, false);
};
/**
* Prepares a list of lines that can be used for querying
* The lines are stored in this._goodLines
* @private
* @param {String[]} lines Array of lines
* @param {Function} callback Function to be called once cleaning is complete
*/
Zotero_RecognizePDF.Recognizer.prototype._cleanLines = function(lines, callback) {
// Use only first column from multi-column lines // Use only first column from multi-column lines
const lineRe = /^\s*([^\s]+(?: [^\s]+)+)/; const lineRe = /^\s*([^\s]+(?: [^\s]+)+)/;
var cleanedLines = [], cleanedLineLengths = [];
for(var i=0; i<lines.length && cleanedLines.length<30; i++) { for(var i=0; i<lines.length && cleanedLines.length<30; i++) {
var m = lineRe.exec(lines[i]); var m = lineRe.exec(lines[i]);
if(m && m[1].split(' ').length > 3) { if(m && m[1].split(' ').length > 3) {
@ -345,51 +382,44 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c
} }
this._nextLine = this._iteration = 0; this._nextLine = this._iteration = 0;
this._queryGoogle(); callback.call(this);
} }
} }
/**
 * Deletes the hidden browser (if one exists) and sends a failure message
 * to this._callback
 * @private
 * @param {String} msg Message to be sent to this._callback
 */
Zotero_RecognizePDF.Recognizer.prototype._deleteBrowserAndFail = function(msg) {
	try {
		if(this._hiddenBrowser) Zotero.Browser.deleteHiddenBrowser(this._hiddenBrowser);
	} catch(e) {
		// Cleanup is best-effort: record the problem instead of hiding it,
		// but still report the failure to the caller below
		Zotero.debug("recognizePDF: failed to delete hidden browser: " + e);
	}
	this._callback(false, msg);
};
/** /**
* Queries Google Scholar for metadata for this PDF * Queries Google Scholar for metadata for this PDF
* @private * @private
*/ */
Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() { Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
if(this._iteration > 3 || !this._goodLines.length) { if(this._iteration > 3 || !this._goodLines.length) {
try { this._deleteBrowserAndFail("recognizePDF.noMatches");
if(this._hiddenBrowser) Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser);
} catch(e) {}
this._callback(false, "recognizePDF.noMatches");
return; return;
} }
this._iteration++; this._iteration++;
var queryString = ""; var queryString = "";
var me = this; var me = this;
if(this._DOI) {
// use CrossRef to look for DOI
var translate = new Zotero.Translate.Search();
translate.setTranslator("11645bd1-0420-45c1-badb-53fb41eeb753");
var item = {"itemType":"journalArticle", "DOI":this._DOI};
translate.setSearch(item);
translate.setHandler("itemDone", function(translate, item) {
me._callback(item);
});
translate.setHandler("select", function(translate, items, callback) {
return me._selectItems(translate, items, callback);
});
translate.setHandler("done", function(translate, success) {
if(!success) me._queryGoogle();
});
translate.translate(this._libraryID, false);
delete this._DOI;
} else {
// take the relevant parts of some lines (exclude hyphenated word) // take the relevant parts of some lines (exclude hyphenated word)
var queryStringWords = 0; var queryStringWords = 0;
while(queryStringWords < 25) { while(queryStringWords < 25) {
/**a bit of a hack. We're relying on the same test being applied above. if(!this._goodLines.length) {
* But this way we don't have to rewrite the same error reporting code this._deleteBrowserAndFail("recognizePDF.noMatches");
*/ return;
if(!this._goodLines.length) this._queryGoogle(); }
var words = this._goodLines.splice(this._nextLine,1)[0].split(/\s+/); var words = this._goodLines.splice(this._nextLine,1)[0].split(/\s+/);
//Try to avoid picking adjacent strings so the odds of them appearing in another //Try to avoid picking adjacent strings so the odds of them appearing in another
@ -450,7 +480,6 @@ Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
this._hiddenBrowser.loadURIWithFlags(url, this._hiddenBrowser.loadURIWithFlags(url,
Components.interfaces.nsIWebNavigation.LOAD_FLAGS_BYPASS_HISTORY, null, null, null); Components.interfaces.nsIWebNavigation.LOAD_FLAGS_BYPASS_HISTORY, null, null, null);
} }
}
/** /**
* To be executed when Google Scholar is loaded * To be executed when Google Scholar is loaded