From 74c9967b95aae512c17b8c4d1a8a8869138cf829 Mon Sep 17 00:00:00 2001 From: aurimasv Date: Fri, 21 Dec 2012 14:56:08 -0600 Subject: [PATCH 1/2] Tweak recognizePDF to avoid false positives: only count lines with more than 3 words (since we drop first and last later) as cleaned lines, increase number of pages read and number of lines away from median length as good lines (due to more stringent selection criteria), do not attempt to query google with a short query if we run out of lines, try to pick lines for a query that are not next to each other (to avoid finding articles that quote the article we're looking for) --- chrome/content/zotero/recognizePDF.js | 28 +++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/chrome/content/zotero/recognizePDF.js b/chrome/content/zotero/recognizePDF.js index d36d90f76..95f6fbc46 100644 --- a/chrome/content/zotero/recognizePDF.js +++ b/chrome/content/zotero/recognizePDF.js @@ -243,7 +243,7 @@ Zotero_RecognizePDF.Recognizer = function () {} * (function will be passed image as URL and must return text of CAPTCHA) */ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, callback, captchaCallback) { - const MAX_PAGES = 3; + const MAX_PAGES = 5; this._libraryID = libraryID; this._callback = callback; @@ -315,9 +315,9 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c // Use only first column from multi-column lines const lineRe = /^\s*([^\s]+(?: [^\s]+)+)/; - for(var i=0; i 3) { cleanedLines.push(m[1]); cleanedLineLengths.push(m[1].length); } @@ -334,8 +334,8 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c // pick lines within 4 chars of the median (this is completely arbitrary) this._goodLines = []; - var uBound = medianLength + 4; - var lBound = medianLength - 4; + var uBound = medianLength + 6; + var lBound = medianLength - 6; for (var i=0; i lBound && cleanedLineLengths[i] < uBound) { // Strip quotation marks so they don't mess up search query quoting @@ -344,7 +344,7 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c } } - this._startLine = this._iteration = 0; + this._nextLine = this._iteration = 0; this._queryGoogle(); } } @@ -354,7 +354,7 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c * @private */ Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() { - if(this._iteration > 3 || this._startLine >= this._goodLines.length) { + if(this._iteration > 3 || !this._goodLines.length) { try { if(this._hiddenBrowser) Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser); } catch(e) {} @@ -385,8 +385,17 @@ Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() { } else { // take the relevant parts of some lines (exclude hyphenated word) var queryStringWords = 0; - while(queryStringWords < 25 && this._startLine < this._goodLines.length) { - var words = this._goodLines[this._startLine].split(/\s+/); + while(queryStringWords < 25) { + /**a bit of a hack. We're relying on the same test being applied above. + * But this way we don't have to rewrite the same error reporting code + */ + if(!this._goodLines.length) this._queryGoogle(); + + var words = this._goodLines.splice(this._nextLine,1)[0].split(/\s+/); + //Try to avoid picking adjacent strings so the odds of them appearing in another + // document quoting our document is low. Every 7th line is a magic value + this._nextLine = (this._nextLine + 7) % this._goodLines.length; + // get rid of first and last words words.shift(); words.pop(); @@ -403,7 +412,6 @@ Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() { queryStringWords += words.length; queryString += '"'+words.join(" ")+'" '; } - this._startLine++; } Zotero.debug("RecognizePDF: Query string "+queryString); From ea6a1098a6427d55eaa4e93b03bb9ee9ceb81edd Mon Sep 17 00:00:00 2001 From: aurimasv Date: Sat, 22 Dec 2012 10:47:57 -0600 Subject: [PATCH 2/2] [recognizePDF] Restructure file --- chrome/content/zotero/recognizePDF.js | 211 +++++++++++++++----------- 1 file changed, 120 insertions(+), 91 deletions(-) diff --git a/chrome/content/zotero/recognizePDF.js b/chrome/content/zotero/recognizePDF.js index 95f6fbc46..cd533521e 100644 --- a/chrome/content/zotero/recognizePDF.js +++ b/chrome/content/zotero/recognizePDF.js @@ -293,10 +293,7 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c intlStream.QueryInterface(Components.interfaces.nsIUnicharLineInputStream); // get the lines in this sample - var lines = [], - cleanedLines = [], - cleanedLineLengths = [], - str = {}; + var lines = [], str = {}; while(intlStream.readLine(str)) { var line = str.value.trim(); if(line) lines.push(line); @@ -304,17 +301,57 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c inputStream.close(); cacheFile.remove(false); - + + var me = this; + // look for DOI var allText = lines.join("\n"); Zotero.debug(allText); var m = Zotero.Utilities.cleanDOI(allText); if(m) { - this._DOI = m; + this._queryDOI(m, function() { + me._cleanLines(lines, me._queryGoogle); + }); + } else { + this._cleanLines(lines, me._queryGoogle); } - +} + +/** + * Looks up item by DOI + * @private + * @param {String} doi DOI to search for + * @param {Function} onFail Callback function to call if a DOI is not found + */ +Zotero_RecognizePDF.Recognizer.prototype._queryDOI = function(doi, onFail) { + var me = this; + var translate = new Zotero.Translate.Search(); + translate.setTranslator("11645bd1-0420-45c1-badb-53fb41eeb753"); + var item = {"itemType":"journalArticle", "DOI":doi}; + translate.setSearch(item); + translate.setHandler("itemDone", function(translate, item) { + me._callback(item); + }); + translate.setHandler("select", function(translate, items, callback) { + return me._selectItems(translate, items, callback); + }); + translate.setHandler("done", function(translate, success) { + if(!success) onFail.call(me); + }); + translate.translate(this._libraryID, false); +} + +/** + * Prepares a list of lines that can be used for querying + * The lines are stored in this._goodLines + * @private + * @param {String[]} lines Array of lines + * @param {Function} callback A callback function to be called on completing + */ +Zotero_RecognizePDF.Recognizer.prototype._cleanLines = function(lines, callback) { // Use only first column from multi-column lines const lineRe = /^\s*([^\s]+(?: [^\s]+)+)/; + var cleanedLines = [], cleanedLineLengths = []; for(var i=0; i 3) { @@ -345,111 +382,103 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c } this._nextLine = this._iteration = 0; - this._queryGoogle(); + callback.call(this); } } +/** + * Deletes hidden browser and sends a failure message to this_callback + * @private + * @param {String} msg Message to be sent to this._callback + */ +Zotero_RecognizePDF.Recognizer.prototype._deleteBrowserAndFail = function(msg) { + var me = this; + try { + if(this._hiddenBrowser) Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser); + } catch(e) {} + this._callback(false, msg); +} + /** * Queries Google Scholar for metadata for this PDF * @private */ Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() { if(this._iteration > 3 || !this._goodLines.length) { - try { - if(this._hiddenBrowser) Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser); - } catch(e) {} - this._callback(false, "recognizePDF.noMatches"); + this._deleteBrowserAndFail("recognizePDF.noMatches"); return; } this._iteration++; var queryString = ""; var me = this; - if(this._DOI) { - // use CrossRef to look for DOI - var translate = new Zotero.Translate.Search(); - translate.setTranslator("11645bd1-0420-45c1-badb-53fb41eeb753"); - var item = {"itemType":"journalArticle", "DOI":this._DOI}; - translate.setSearch(item); - translate.setHandler("itemDone", function(translate, item) { - me._callback(item); - }); - translate.setHandler("select", function(translate, items, callback) { - return me._selectItems(translate, items, callback); - }); - translate.setHandler("done", function(translate, success) { - if(!success) me._queryGoogle(); - }); - translate.translate(this._libraryID, false); - delete this._DOI; - } else { - // take the relevant parts of some lines (exclude hyphenated word) - var queryStringWords = 0; - while(queryStringWords < 25) { - /**a bit of a hack. We're relying on the same test being applied above. - * But this way we don't have to rewrite the same error reporting code - */ - if(!this._goodLines.length) this._queryGoogle(); - var words = this._goodLines.splice(this._nextLine,1)[0].split(/\s+/); - //Try to avoid picking adjacent strings so the odds of them appearing in another - // document quoting our document is low. Every 7th line is a magic value - this._nextLine = (this._nextLine + 7) % this._goodLines.length; + // take the relevant parts of some lines (exclude hyphenated word) + var queryStringWords = 0; + while(queryStringWords < 25) { + if(!this._goodLines.length) { + this._deleteBrowserAndFail("recognizePDF.noMatches"); + return; + } - // get rid of first and last words - words.shift(); - words.pop(); - // make sure there are no long words (probably OCR mistakes) - var skipLine = false; - for(var i=0; i 20) { - skipLine = true; - break; - } - } - // add words to query - if(!skipLine && words.length) { - queryStringWords += words.length; - queryString += '"'+words.join(" ")+'" '; + var words = this._goodLines.splice(this._nextLine,1)[0].split(/\s+/); + //Try to avoid picking adjacent strings so the odds of them appearing in another + // document quoting our document is low. Every 7th line is a magic value + this._nextLine = (this._nextLine + 7) % this._goodLines.length; + + // get rid of first and last words + words.shift(); + words.pop(); + // make sure there are no long words (probably OCR mistakes) + var skipLine = false; + for(var i=0; i 20) { + skipLine = true; + break; } } - - Zotero.debug("RecognizePDF: Query string "+queryString); - - // pass query string to Google Scholar and translate - var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search"; - if(!this._hiddenBrowser) { - this._hiddenBrowser = Zotero.Browser.createHiddenBrowser(); - this._hiddenBrowser.docShell.allowImages = false; + // add words to query + if(!skipLine && words.length) { + queryStringWords += words.length; + queryString += '"'+words.join(" ")+'" '; } - - var translate = new Zotero.Translate.Web(); - var savedItem = false; - translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289"); - translate.setHandler("itemDone", function(translate, item) { - Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser); - savedItem = true; - me._callback(item); - }); - translate.setHandler("select", function(translate, items, callback) { - me._selectItems(translate, items, callback); - }); - translate.setHandler("done", function(translate, success) { - if(!success || !savedItem) me._queryGoogle(); - }); - translate.setHandler("translators", function(translate, detected) { - if(detected.length) { - translate.translate(me._libraryID, false); - } else { - me._queryGoogle(); - } - }); - - this._hiddenBrowser.addEventListener("pageshow", function() { me._scrape(translate) }, true); - - this._hiddenBrowser.loadURIWithFlags(url, - Components.interfaces.nsIWebNavigation.LOAD_FLAGS_BYPASS_HISTORY, null, null, null); } + + Zotero.debug("RecognizePDF: Query string "+queryString); + + // pass query string to Google Scholar and translate + var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search"; + if(!this._hiddenBrowser) { + this._hiddenBrowser = Zotero.Browser.createHiddenBrowser(); + this._hiddenBrowser.docShell.allowImages = false; + } + + var translate = new Zotero.Translate.Web(); + var savedItem = false; + translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289"); + translate.setHandler("itemDone", function(translate, item) { + Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser); + savedItem = true; + me._callback(item); + }); + translate.setHandler("select", function(translate, items, callback) { + me._selectItems(translate, items, callback); + }); + translate.setHandler("done", function(translate, success) { + if(!success || !savedItem) me._queryGoogle(); + }); + translate.setHandler("translators", function(translate, detected) { + if(detected.length) { + translate.translate(me._libraryID, false); + } else { + me._queryGoogle(); + } + }); + + this._hiddenBrowser.addEventListener("pageshow", function() { me._scrape(translate) }, true); + + this._hiddenBrowser.loadURIWithFlags(url, + Components.interfaces.nsIWebNavigation.LOAD_FLAGS_BYPASS_HISTORY, null, null, null); } /**