- better interface for PDF recognizer

- slightly improved PDF recognizer performance
This commit is contained in:
Simon Kornblith 2008-09-03 06:20:19 +00:00
parent 08f7156d16
commit 302de8f189
6 changed files with 296 additions and 73 deletions

View File

@ -0,0 +1,23 @@
<?xml version="1.0" ?>
<?xml-stylesheet href="chrome://global/skin/" type="text/css"?>
<!DOCTYPE window SYSTEM "chrome://zotero/locale/zotero.dtd">
<!--
Progress dialog for PDF metadata retrieval: a status label, a progress meter
with a cancel button, and a three-column tree (status icon, PDF name, item name).
The element ids below are looked up by script, so keep them stable.
-->
<window xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul"
title="&zotero.progress.title;" width="550" height="230"
id="zotero-progress">
<vbox style="padding:10px" flex="1">
<!-- status text shown above the progress meter -->
<label id="label" control="progress-indicator" value="&zotero.recognizePDF.recognizing.label;"/>
<hbox align="center">
<progressmeter id="progress-indicator" mode="determined" flex="1"/>
<button id="cancel-button" label="&zotero.recognizePDF.cancel.label;"/>
</hbox>
<tree flex="1" id="tree" hidecolumnpicker="true">
<treecols>
<!-- narrow unlabeled column for the per-item status icon -->
<treecol id="success-col" style="width:20px;"/>
<treecol label="&zotero.recognizePDF.pdfName.label;" id="pdf-col" flex="1"/>
<treecol label="&zotero.recognizePDF.itemName.label;" id="item-col" flex="2"/>
</treecols>
<!-- left empty here; rows are appended to this element at runtime -->
<treechildren id="treechildren"/>
</tree>
</vbox>
</window>

View File

@ -23,18 +23,24 @@
/** /**
* @fileOverview Tools for automatically retrieving a citation for the given PDF * @fileOverview Tools for automatically retrieving a citation for the given PDF
*/ */
const Zotero_RecognizePDF_SUCCESS_IMAGE = "chrome://zotero/skin/tick.png";
const Zotero_RecognizePDF_FAILURE_IMAGE = "chrome://zotero/skin/cross.png";
const Zotero_RecognizePDF_LOADING_IMAGE = "chrome://zotero/skin/indicator.gif";
/** /**
* Front end for recognizing PDFs * Front end for recognizing PDFs
* @namespace * @namespace
*/ */
var Zotero_RecognizePDF = new function() { var Zotero_RecognizePDF = new function() {
var _progressWindow, _progressIndicator;
/** /**
* Checks whether a given PDF could theoretically be recognized * Checks whether a given PDF could theoretically be recognized
* @returns {Boolean} True if the PDF can be recognized, false if it cannot be * @returns {Boolean} True if the PDF can be recognized, false if it cannot be
*/ */
this.canRecognize = function(/**Zotero.Item*/ item) { this.canRecognize = function(/**Zotero.Item*/ item) {
return (item.attachmentMIMEType && item.attachmentMIMEType == "application/pdf" && !item.getSource()); return (Zotero.Fulltext.pdfConverterIsRegistered && item.attachmentMIMEType &&
item.attachmentMIMEType == "application/pdf" && !item.getSource());
} }
/** /**
@ -43,43 +49,178 @@ var Zotero_RecognizePDF = new function() {
*/ */
this.recognizeSelected = function() { this.recognizeSelected = function() {
var items = ZoteroPane.getSelectedItems(); var items = ZoteroPane.getSelectedItems();
if (!items) { if (!items) return;
return; var itemRecognizer = new Zotero_RecognizePDF.ItemRecognizer();
itemRecognizer.recognizeItems(items);
} }
this.recognizeItems(items); }
/**
 * @class Drives the progress UI and bookkeeping for recognizing a batch of PDF items
 */
Zotero_RecognizePDF.ItemRecognizer = function () {
	// flipped to true by stop(); _callback checks it before applying a result
	this._stopped = false;
};
/**
 * Retrieves metadata for the PDF items passed, displaying a progress dialog during conversion
 * and placing the PDFs as children of the new items
 * @param {Zotero.Item[]} items
 */
Zotero_RecognizePDF.ItemRecognizer.prototype.recognizeItems = function(items) {
	var recognizer = this;

	// keep a private copy of the list; items are shifted off it one at a time later on
	this._items = items.slice();
	this._itemTotal = items.length;

	var dialogFeatures = "chrome,close=yes,resizable=yes,dependent,dialog,centerscreen";
	this._progressWindow = window.openDialog("chrome://zotero/content/pdfProgress.xul", "", dialogFeatures);

	// the tree is populated and recognition begins once the dialog fires "pageshow"
	this._progressWindow.addEventListener("pageshow", function() {
		recognizer._onWindowLoaded();
	}, false);
};
/**
 * Halts recognition of PDFs by raising the stopped flag; _callback checks the flag,
 * discards any result that arrives afterwards and does not start another item
 */
Zotero_RecognizePDF.ItemRecognizer.prototype.stop = function() {
	this._stopped = true;
};
/**
* Called when the progress window has been opened; adds items to the tree and begins recognizing
* @private
*/
Zotero_RecognizePDF.ItemRecognizer.prototype._onWindowLoaded = function() {
// populate progress window
var treechildren = this._progressWindow.document.getElementById("treechildren");
for(var i in this._items) {
var treeitem = this._progressWindow.document.createElement('treeitem');
var treerow = this._progressWindow.document.createElement('treerow');
var treecell = this._progressWindow.document.createElement('treecell');
treecell.setAttribute("id", "item-"+this._items[i].id+"-icon");
treerow.appendChild(treecell);
treecell = this._progressWindow.document.createElement('treecell');
treecell.setAttribute("label", this._items[i].getField("title"));
treerow.appendChild(treecell);
treecell = this._progressWindow.document.createElement('treecell');
treecell.setAttribute("id", "item-"+this._items[i].id+"-title");
treerow.appendChild(treecell);
treeitem.appendChild(treerow);
treechildren.appendChild(treeitem);
} }
/** var me = this;
* Retreives metadata for the PDF items passed, placing the PDFs as a children of the new items this._progressIndicator = this._progressWindow.document.getElementById("progress-indicator");
this._progressWindow.document.getElementById("cancel-button").addEventListener("command", function() {
me.stop();
me._progressWindow.close();
}, false);
this._progressWindow.addEventListener("close", function() { me.stop() }, false);
this._recognizeItem();
}
/**
* Shifts an item off of this._items and recognizes it, then calls itself again if there are more
* @private
*/ */
this.recognizeItems = function(/**Zotero.Item[]*/ items) { Zotero_RecognizePDF.ItemRecognizer.prototype._recognizeItem = function() {
var itemsCopy = items.slice(); if(!this._items.length) {
var item = itemsCopy.shift(); this._done();
var file = item.getFile(); return;
}
this._progressIndicator.value = (this._itemTotal-this._items.length)/this._itemTotal*100;
this._item = this._items.shift();
this._progressWindow.document.getElementById("item-"+this._item.id+"-icon").
setAttribute("src", Zotero_RecognizePDF_LOADING_IMAGE);
var file = this._item.getFile();
if(file) { if(file) {
var recognizer = new Zotero_RecognizePDF.Recognizer(); var recognizer = new Zotero_RecognizePDF.Recognizer();
recognizer.recognize(file, item.getField("title"), var me = this;
function(translate, newItem) { recognizer.recognize(file, function(newItem, error) { me._callback(newItem, error) });
} else {
this._callback(false, "recognizePDF.fileNotFound");
}
}
/**
 * Finishes a recognition run: fills the progress meter, relabels the cancel button as a
 * close button, arranges for the progress window to close two seconds after it loses
 * focus, and switches the status label to the "complete" message
 */
Zotero_RecognizePDF.ItemRecognizer.prototype._done = function() {
	var recognizer = this;

	this._progressIndicator.value = 100;

	var progressDoc = this._progressWindow.document;
	progressDoc.getElementById("cancel-button").label = Zotero.getString("recognizePDF.close.label");

	// every blur schedules a delayed close on the progress window's own timer
	function closeWindow() {
		recognizer._progressWindow.close();
	}
	function closeAfterDelay() {
		recognizer._progressWindow.setTimeout(closeWindow, 2000);
	}
	this._progressWindow.addEventListener("blur", closeAfterDelay, false);

	progressDoc.getElementById("label").value = Zotero.getString("recognizePDF.complete.label");
};
/**
* Callback function to be executed upon recognition completion
* @param {Zotero.Item|Boolean} newItem The new item created from translation, or false if
* recognition was unsuccessful
* @param {String} [error] The error name, if recognition was unsuccessful.
*/
Zotero_RecognizePDF.ItemRecognizer.prototype._callback = function(newItem, error) {
if(this._stopped) {
if(newItem) Zotero.Items.erase(newItem.id);
return;
}
if(newItem) {
// put new item in same collections as the old one // put new item in same collections as the old one
var itemCollections = item.getCollections(); var itemCollections = this._item.getCollections();
for(var j=0; j<itemCollections.length; j++) { for(var j=0; j<itemCollections.length; j++) {
var collection = Zotero.Collections.get(itemCollections[j]); var collection = Zotero.Collections.get(itemCollections[j]);
collection.addItem(newItem.id); collection.addItem(newItem.id);
} }
// put old item as a child of the new item // put old item as a child of the new item
item.setSource(newItem.id); this._item.setSource(newItem.id);
item.save(); this._item.save();
// continue recognizing
if(itemsCopy.length) Zotero_RecognizePDF.recognizeItems(itemsCopy);
});
} else {
if(itemsCopy.length) Zotero_RecognizePDF.recognizeItems(itemsCopy);
} }
// add name
this._progressWindow.document.getElementById("item-"+this._item.id+"-title").
setAttribute("label", (newItem ? newItem.getField("title") : Zotero.getString(error)));
// update icon
this._progressWindow.document.getElementById("item-"+this._item.id+"-icon").
setAttribute("src", (newItem ? Zotero_RecognizePDF_SUCCESS_IMAGE : Zotero_RecognizePDF_FAILURE_IMAGE));
if(error == "recognizePDF.limit") {
// now done, since we hit the query limit
var error = Zotero.getString(error);
for(var i in this._items) {
this._progressWindow.document.getElementById("item-"+this._items[i].id+"-title").
setAttribute("label", error);
this._progressWindow.document.getElementById("item-"+this._items[i].id+"-icon").
setAttribute("src", Zotero_RecognizePDF_FAILURE_IMAGE);
}
this._done();
} else {
// scroll to this item
this._progressWindow.document.getElementById("tree").treeBoxObject.scrollToRow(Math.max(0, this._itemTotal-this._items.length-5));
// continue recognizing
this._recognizeItem();
} }
} }
/*Zotero_RecognizePDF.ItemRecognizer.prototype._captchaCallback = function(img) {
var io = {dataIn:img};
Zotero.debug(img);
this._progressWindow.openDialog("chrome://zotero/content/pdfCaptcha.xul", "", "chrome,modal,resizable=no", io);
if(io.dataOut) return io.dataOut;
this.stop();
this._progressWindow.close();
return false;
}*/
/** /**
* @class PDF recognizer backend * @class PDF recognizer backend
*/ */
@ -89,19 +230,23 @@ Zotero_RecognizePDF.Recognizer = function () {}
* Retrieves metadata for a PDF and saves it as an item * Retrieves metadata for a PDF and saves it as an item
* *
* @param {nsIFile} file The PDF file to retrieve metadata for * @param {nsIFile} file The PDF file to retrieve metadata for
* @param {String} pdfTitle The title of the PDF
* @param {Function} callback The function to be executed when recognition is complete * @param {Function} callback The function to be executed when recognition is complete
* @param {Function} [captchaCallback] The function to be executed if a CAPTCHA is encountered
* (function will be passed image as URL and must return text of CAPTCHA)
*/ */
Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, pdfTitle, callback) { Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, callback, captchaCallback) {
const MAX_PAGES = 2; const MAX_PAGES = 2;
this._pdfTitle = pdfTitle; const lineRe = /^\s*([^\s]+(?: [^\s]+)+)/;
this._callback = callback;
const whitespaceRe = /^\s*$/; this._callback = callback;
//this._captchaCallback = captchaCallback;
var cacheFile = Zotero.getZoteroDirectory(); var cacheFile = Zotero.getZoteroDirectory();
cacheFile.append("recognizePDFcache.txt"); cacheFile.append(".zotero-recpdf-cache");
if(cacheFile.exists()) {
cacheFile.remove(false);
}
Zotero.debug('Running pdftotext -enc UTF-8 -nopgbrk ' Zotero.debug('Running pdftotext -enc UTF-8 -nopgbrk '
+ '-l ' + MAX_PAGES + ' "' + file.path + '" "' + '-l ' + MAX_PAGES + ' "' + file.path + '" "'
@ -113,10 +258,15 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, pdfTitle, ca
exec.append(Zotero.Fulltext.pdfConverterFileName); exec.append(Zotero.Fulltext.pdfConverterFileName);
proc.init(exec); proc.init(exec);
var args = ['-enc', 'UTF-8', '-nopgbrk', '-raw', '-l', MAX_PAGES]; var args = ['-enc', 'UTF-8', '-nopgbrk', '-layout', '-l', MAX_PAGES];
args.push(file.path, cacheFile.path); args.push(file.path, cacheFile.path);
proc.run(true, args, args.length); proc.run(true, args, args.length);
if(!cacheFile.exists()) {
this._callback(false, "recognizePDF.couldNotRead");
return;
}
var inputStream = Components.classes["@mozilla.org/network/file-input-stream;1"] var inputStream = Components.classes["@mozilla.org/network/file-input-stream;1"]
.createInstance(Components.interfaces.nsIFileInputStream); .createInstance(Components.interfaces.nsIFileInputStream);
inputStream.init(cacheFile, 0x01, 0664, 0); inputStream.init(cacheFile, 0x01, 0664, 0);
@ -131,31 +281,34 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, pdfTitle, ca
var lineLengths = []; var lineLengths = [];
var str = {}; var str = {};
while(intlStream.readLine(str)) { while(intlStream.readLine(str)) {
if(!whitespaceRe.test(str.value)) { var line = lineRe.exec(str.value);
lines.push(str.value); if(line) {
lineLengths.push(str.value.length); lines.push(line[1]);
lineLengths.push(line[1].length);
} }
} }
// get (not quite) median length // get (not quite) median length
var lineLengthsLength = lineLengths.length; var lineLengthsLength = lineLengths.length;
if(lineLengthsLength < 20) { if(lineLengthsLength < 20) {
this._error(); this._callback(false, "recognizePDF.noOCR");
return; return;
} }
var sortedLengths = lineLengths.sort(); var sortedLengths = lineLengths.sort();
var medianLength = sortedLengths[Math.floor(lineLengthsLength/2)]; var medianLength = sortedLengths[Math.floor(lineLengthsLength/2)];
// pick lines within 4 chars of the median // pick lines within 4 chars of the median (this is completely arbitrary)
this._goodLines = []; this._goodLines = [];
var uBound = medianLength + 4; var uBound = medianLength + 4;
var lBound = medianLength - 4; var lBound = medianLength - 4;
for (var i=0; i<lineLengthsLength; i++) { for (var i=0; i<lineLengthsLength; i++) {
if(lineLengths[i] >= lBound && lineLengths[i] <= uBound) this._goodLines.push(lines[i]); if(lineLengths[i] > lBound && lineLengths[i] < uBound) this._goodLines.push(lines[i]);
} }
this._startLine = this._iteration = 0; this._startLine = this._iteration = 0;
cacheFile.remove(false);
this._queryGoogle(); this._queryGoogle();
} }
@ -165,18 +318,32 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, pdfTitle, ca
*/ */
Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() { Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
if(this._iteration > 3 || this._startLine >= this._goodLines.length) { if(this._iteration > 3 || this._startLine >= this._goodLines.length) {
this._error(); try {
if(this._hiddenBrowser) Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser);
} catch(e) {}
this._callback(false, "recognizePDF.noMatches");
return; return;
} }
this._iteration++;
// take the relevant parts of some lines (exclude hyphenated word) // take the relevant parts of some lines (exclude hyphenated word)
var queryStringWords = 0; var queryStringWords = 0;
var queryString = ""; var queryString = "";
while(queryStringWords < 25 && this._startLine < this._goodLines.length) { while(queryStringWords < 25 && this._startLine < this._goodLines.length) {
var words = this._goodLines[this._startLine].split(/\s+/); var words = this._goodLines[this._startLine].split(/\s+/);
// get rid of first and last words
words.shift(); words.shift();
words.pop(); words.pop();
if(words.length) { // make sure there are no long words (probably OCR mistakes)
var skipLine = false;
for(var i=0; i<words.length; i++) {
if(words[i].length > 20) {
skipLine = true;
break;
}
}
// add words to query
if(!skipLine && words.length) {
queryStringWords += words.length; queryStringWords += words.length;
queryString += '"'+words.join(" ")+'" '; queryString += '"'+words.join(" ")+'" ';
} }
@ -185,27 +352,61 @@ Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
Zotero.debug("RecognizePDF: Query string "+queryString); Zotero.debug("RecognizePDF: Query string "+queryString);
// pass query string to Google Scholar and translate // pass query string to Google Scholar and translate
var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString); var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search";
this.hiddenBrowser = Zotero.Browser.createHiddenBrowser(); if(!this._hiddenBrowser) {
this._hiddenBrowser = Zotero.Browser.createHiddenBrowser();
this._hiddenBrowser.docShell.allowImages = false;
}
var me = this; var me = this;
var translate = new Zotero.Translate("web", true, false); var translate = new Zotero.Translate("web", true, false);
translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289"); translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289");
translate.setHandler("itemDone", this._callback); translate.setHandler("itemDone", function(translate, item) {
Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser);
me._callback(item);
});
translate.setHandler("select", function(translate, items) { return me._selectItems(translate, items) }); translate.setHandler("select", function(translate, items) { return me._selectItems(translate, items) });
translate.setHandler("done", function(translate, success) { if(!success) me._queryGoogle() }); translate.setHandler("done", function(translate, success) { if(!success) me._queryGoogle(); });
this.hiddenBrowser.addEventListener("pageshow", function() { me._scrape(translate) }, true); this._hiddenBrowser.addEventListener("pageshow", function() { me._scrape(translate) }, true);
this.hiddenBrowser.loadURI(url);
// to make us a little less obvious, specify a referrer
var referrer = Components.classes["@mozilla.org/network/io-service;1"]
.getService(Components.interfaces.nsIIOService)
.newURI(this._previousURL ? this._previousURL : "http://scholar.google.com/", null, null);
this._hiddenBrowser.loadURIWithFlags(url,
Components.interfaces.nsIWebNavigation.LOAD_FLAGS_BYPASS_HISTORY, referrer, null, null);
this._previousURL = url;
} }
/** /**
* Callback to be executed when Google Scholar is loaded * To be executed when Google Scholar is loaded
* @private * @private
*/ */
Zotero_RecognizePDF.Recognizer.prototype._scrape = function(/**Zotero.Translate*/ translate) { Zotero_RecognizePDF.Recognizer.prototype._scrape = function(/**Zotero.Translate*/ translate) {
this.hiddenBrowser.removeEventListener("pageshow", this._scrape.caller, true); if(this._hiddenBrowser.contentDocument.title == "403 Forbidden") {
translate.setDocument(this.hiddenBrowser.contentDocument); // hit the captcha
/*
var forms = this._hiddenBrowser.contentDocument.getElementsByTagName("form");
if(forms.length && forms[0].getAttribute("action") == "Captcha") {
var captchaImage = forms[0].getElementsByTagName("img");
var captchaBox = this._hiddenBrowser.contentDocument.getElementsByName("captcha");
if(captchaImage.length && captchaBox.length && this._captchaCallback) {
var text = this._captchaCallback(captchaImage[0].src);
if(text) {
captchaBox[0].value = text;
forms[0].submit();
return;
}
}
}*/
this._callback(false, "recognizePDF.limit");
return;
}
this._hiddenBrowser.removeEventListener("pageshow", this._scrape.caller, true);
translate.setDocument(this._hiddenBrowser.contentDocument);
translate.translate(); translate.translate();
} }
@ -221,15 +422,3 @@ Zotero_RecognizePDF.Recognizer.prototype._selectItems = function(/**Zotero.Trans
return obj; return obj;
} }
} }
/**
* Displays an error when a PDF cannot be recognized
* @private
*/
Zotero_RecognizePDF.Recognizer.prototype._error = function() {
var promptService = Components.classes["@mozilla.org/embedcomp/prompt-service;1"]
.getService(Components.interfaces.nsIPromptService);
promptService.alert(window,
Zotero.getString('recognizePDF.couldNotRecognize.title'),
Zotero.getString('recognizePDF.couldNotRecognize.message', this._pdfTitle));
}

View File

@ -164,3 +164,9 @@
<!ENTITY zotero.proxy.recognized.warning.secondary "Adding other proxies allows malicious sites to masquerade as sites you trust."> <!ENTITY zotero.proxy.recognized.warning.secondary "Adding other proxies allows malicious sites to masquerade as sites you trust.">
<!ENTITY zotero.proxy.recognized.disable.label "Do not automatically redirect requests through previously recognized proxies"> <!ENTITY zotero.proxy.recognized.disable.label "Do not automatically redirect requests through previously recognized proxies">
<!ENTITY zotero.proxy.recognized.ignore.label "Ignore"> <!ENTITY zotero.proxy.recognized.ignore.label "Ignore">
<!ENTITY zotero.recognizePDF.recognizing.label "Retrieving Metadata...">
<!ENTITY zotero.recognizePDF.cancel.label "Cancel">
<!ENTITY zotero.recognizePDF.pdfName.label "PDF Name">
<!ENTITY zotero.recognizePDF.itemName.label "Item Name">
<!ENTITY zotero.recognizePDF.captcha.label "Type the text below to continue retrieving metadata.">

View File

@ -516,5 +516,10 @@ proxies.recognized.add = Add Proxy
proxies.enableTransparentWarning.title = Warning proxies.enableTransparentWarning.title = Warning
proxies.enableTransparentWarning.description = Please ensure that the proxies listed below belong to a library, school, or other institution with which you are affiliated. A malicious proxy could pose a security risk. proxies.enableTransparentWarning.description = Please ensure that the proxies listed below belong to a library, school, or other institution with which you are affiliated. A malicious proxy could pose a security risk.
recognizePDF.couldNotRecognize.title = Could Not Retrieve Metada recognizePDF.noOCR = PDF does not contain OCRed text.
recognizePDF.couldNotRecognize.message = Zotero could not retrieve metadata for "%1$S". recognizePDF.couldNotRead = Could not read text from PDF.
recognizePDF.noMatches = No matching references found.
recognizePDF.fileNotFound = File not found.
recognizePDF.limit = Query limit reached. Try again later.
recognizePDF.complete.label = Metadata Retrieval Complete.
recognizePDF.close.label = Close

Binary file not shown.

After

Width:  |  Height:  |  Size: 655 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.5 KiB