Merge pull request #217 from aurimasv/recognizePDF

Tweak recognizePDF to avoid false positives
2013-01-01 06:21:25 -08:00 · 2013-01-01 06:21:25 -08:00 · 3c6b0d99c3
commit 3c6b0d99c3
parent 11a83e5df3 ea6a1098a6
1 changed files with 130 additions and 93 deletions
--- a/chrome/content/zotero/recognizePDF.js
+++ b/chrome/content/zotero/recognizePDF.js
@ -243,7 +243,7 @@ Zotero_RecognizePDF.Recognizer = function () {}
 *	(function will be passed image as URL and must return text of CAPTCHA)
 */
 Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, callback, captchaCallback) {
-	const MAX_PAGES = 3;
+	const MAX_PAGES = 5;
 	this._libraryID = libraryID;
 	this._callback = callback;
@ -293,10 +293,7 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c
 	intlStream.QueryInterface(Components.interfaces.nsIUnicharLineInputStream);
 	// get the lines in this sample
-	var lines = [],
+	var lines = [], str = {};
 		cleanedLines = [],
 		cleanedLineLengths = [],
 		str = {};
 	while(intlStream.readLine(str)) {
 		var line = str.value.trim();
 		if(line) lines.push(line);
@ -305,19 +302,59 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c
 	inputStream.close();
 	cacheFile.remove(false);
 	var me = this;
 	// look for DOI
 	var allText = lines.join("\n");
 	Zotero.debug(allText);
 	var m = Zotero.Utilities.cleanDOI(allText);
 	if(m) {
-		this._DOI = m;
+		this._queryDOI(m, function() {
 			me._cleanLines(lines, me._queryGoogle);
 		});
 	} else {
 		this._cleanLines(lines, me._queryGoogle);
 	}
 }
 /**
 * Looks up item by DOI
 * @private
 * @param {String} doi DOI to search for
 * @param {Function} onFail Callback function to call if a DOI is not found
 */
 Zotero_RecognizePDF.Recognizer.prototype._queryDOI = function(doi, onFail) {
 	// Keep a reference to the recognizer so the handlers below can reach it
 	var me = this;
 	var translate = new Zotero.Translate.Search();
 	// Search translator used for the DOI lookup
 	// NOTE(review): presumably the CrossRef translator -- confirm against this ID
 	translate.setTranslator("11645bd1-0420-45c1-badb-53fb41eeb753");
 	// Minimal search item: only the item type and the DOI are supplied
 	var item = {"itemType":"journalArticle", "DOI":doi};
 	translate.setSearch(item);
 	// Pass each item produced by the translator to the recognizer's callback
 	translate.setHandler("itemDone", function(translate, item) {
 		me._callback(item);
 	});
 	// Delegate any item selection request to _selectItems
 	translate.setHandler("select", function(translate, items, callback) {
 		return me._selectItems(translate, items, callback);
 	});
 	// If translation did not succeed, invoke onFail with the recognizer as `this`
 	translate.setHandler("done", function(translate, success) {
 		if(!success) onFail.call(me);
 	});
 	// Start the search using the library ID stored on this recognizer
 	translate.translate(this._libraryID, false);
 }
 /**
 * Prepares a list of lines that can be used for querying
 * The lines are stored in this._goodLines
 * @private
 * @param {String[]} lines Array of lines
 * @param {Function} callback A callback function to be called on completing
 */
 Zotero_RecognizePDF.Recognizer.prototype._cleanLines = function(lines, callback) {
 	// Use only first column from multi-column lines
 	const lineRe = /^\s*([^\s]+(?: [^\s]+)+)/;
-	for(var i=0; i<lines.length; i++) {
+	var cleanedLines = [], cleanedLineLengths = [];
 	for(var i=0; i<lines.length && cleanedLines.length<30; i++) {
 		var m = lineRe.exec(lines[i]);
-		if(m) {
+		if(m && m[1].split(' ').length > 3) {
 			cleanedLines.push(m[1]);
 			cleanedLineLengths.push(m[1].length);
 		}
@ -334,8 +371,8 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c
 		// pick lines within 6 chars of the median (this is completely arbitrary)
 		this._goodLines = [];
-		var uBound = medianLength + 4;
+		var uBound = medianLength + 6;
-		var lBound = medianLength - 4;
+		var lBound = medianLength - 6;
 		for (var i=0; i<lineLengthsLength; i++) {
 			if(cleanedLineLengths[i] > lBound && cleanedLineLengths[i] < uBound) {
 				// Strip quotation marks so they don't mess up search query quoting
@ -344,49 +381,51 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c
 			}
 		}
-		this._startLine = this._iteration = 0;
+		this._nextLine = this._iteration = 0;
-		this._queryGoogle();
+		callback.call(this);
 	}
 }
 /**
 * Deletes hidden browser and sends a failure message to this._callback
 * @private
 * @param {String} msg Message to be sent to this._callback
 */
 Zotero_RecognizePDF.Recognizer.prototype._deleteBrowserAndFail = function(msg) {
 	var me = this;
 	// Best-effort cleanup: an error thrown while deleting the hidden browser is
 	// deliberately ignored so that the failure callback below still runs
 	try {
 		if(this._hiddenBrowser) Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser);
 	} catch(e) {}
 	// Report the failure: first argument is false, second is the message
 	this._callback(false, msg);
 }
 /**
 * Queries Google Scholar for metadata for this PDF
 * @private
 */
 Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
-	if(this._iteration > 3 || this._startLine >= this._goodLines.length) {
+	if(this._iteration > 3 || !this._goodLines.length) {
-		try {
+		this._deleteBrowserAndFail("recognizePDF.noMatches");
 			if(this._hiddenBrowser) Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser);
 		} catch(e) {}
 		this._callback(false, "recognizePDF.noMatches");
 		return;
 	}
 	this._iteration++;
 	var queryString = "";
 	var me = this;
-	if(this._DOI) {
+
 		// use CrossRef to look for DOI
 		var translate = new Zotero.Translate.Search();
 		translate.setTranslator("11645bd1-0420-45c1-badb-53fb41eeb753");
 		var item = {"itemType":"journalArticle", "DOI":this._DOI};
 		translate.setSearch(item);
 		translate.setHandler("itemDone", function(translate, item) {
 			me._callback(item);
 		});
 		translate.setHandler("select", function(translate, items, callback) {
 			return me._selectItems(translate, items, callback);
 		});
 		translate.setHandler("done", function(translate, success) {
 			if(!success) me._queryGoogle();
 		});
 		translate.translate(this._libraryID, false);
 		delete this._DOI;
 	} else {
 	// take the relevant parts of some lines (exclude hyphenated word)
 	var queryStringWords = 0;
-		while(queryStringWords < 25 && this._startLine < this._goodLines.length) {
+	while(queryStringWords < 25) {
-			var words = this._goodLines[this._startLine].split(/\s+/);
+		if(!this._goodLines.length) {
 			this._deleteBrowserAndFail("recognizePDF.noMatches");
 			return;
 		}
 		var words = this._goodLines.splice(this._nextLine,1)[0].split(/\s+/);
 		// Try to avoid picking adjacent strings so the odds of them appearing in another
 		// document quoting our document are low. Every 7th line is a magic value
 		this._nextLine = (this._nextLine + 7) % this._goodLines.length;
 		// get rid of first and last words
 		words.shift();
 		words.pop();
@ -403,7 +442,6 @@ Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
 			queryStringWords += words.length;
 			queryString += '"'+words.join(" ")+'" ';
 		}
 			this._startLine++;
 	}
 	Zotero.debug("RecognizePDF: Query string "+queryString);
@ -442,7 +480,6 @@ Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
 	this._hiddenBrowser.loadURIWithFlags(url,
 		Components.interfaces.nsIWebNavigation.LOAD_FLAGS_BYPASS_HISTORY, null, null, null);
 }
 }
 /**
 * To be executed when Google Scholar is loaded