zotero/chrome/content/zotero/xpcom/recognizePDF.js

589 lines
14 KiB
JavaScript

/*
***** BEGIN LICENSE BLOCK *****
Copyright © 2018 Center for History and New Media
George Mason University, Fairfax, Virginia, USA
http://zotero.org
This file is part of Zotero.
Zotero is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Zotero is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with Zotero. If not, see <http://www.gnu.org/licenses/>.
***** END LICENSE BLOCK *****
*/
Zotero.RecognizePDF = new function () {
// Delay, in milliseconds, before re-checking connectivity while the browser is offline
const OFFLINE_RECHECK_DELAY = 60 * 1000;
// Page count passed to extractJSON() when extracting PDF content
const MAX_PAGES = 5;
// Period, in milliseconds (24 hours), after which canUnrecognize() refuses to undo a recognition
const UNRECOGNIZE_TIMEOUT = 86400 * 1000;
// Row status codes. Statuses greater than ROW_PROCESSING are treated as "processed"
// by getProcessedTotal() and _addItem().
this.ROW_QUEUED = 1;
this.ROW_PROCESSING = 2;
this.ROW_FAILED = 3;
this.ROW_SUCCEEDED = 4;
// Recognized parent item -> the dateModified value it had when _processItem() recorded it
let _newItems = new WeakMap();
// Event name -> callback (a single callback per event name)
let _listeners = {};
// Progress rows ({id, status, fileName, message}); new rows are inserted at the front
let _rows = [];
// IDs of items waiting to be processed; new IDs are inserted at the front
let _queue = [];
// True while _processQueue() is running its loop
let _queueProcessing = false;
/**
 * Register the callback for an event, replacing any callback previously
 * registered under the same name.
 * @param {String} name Event name
 * @param {Function} callback
 */
this.addListener = function (name, callback) {
	const registry = _listeners;
	registry[name] = callback;
};
/**
 * Unregister the callback stored for an event, if any.
 * @param {String} name Event name
 */
this.removeListener = function (name) {
	const registry = _listeners;
	delete registry[name];
};
/**
 * Tells whether an item is a candidate for recognition: its content type
 * must be 'application/pdf' and it must be a top-level item.
 * @param {Zotero.Item} item
 * @return {Boolean} The result of item.isTopLevelItem() for PDFs, a falsy value otherwise
 */
this.canRecognize = function (item) {
	const contentType = item.attachmentContentType;
	// No content type at all: hand back the falsy value itself
	if (!contentType) {
		return contentType;
	}
	if (contentType !== 'application/pdf') {
		return false;
	}
	return item.isTopLevelItem();
};
/**
 * Queue every given item and kick off queue processing.
 * The promise returned by _processQueue() is not awaited here.
 * @param {Zotero.Item[]} items
 */
this.recognizeItems = function (items) {
	for (const queuedItem of items) {
		_addItem(queuedItem);
	}

	_processQueue();
};
/**
 * When the 'autoRecognizeFiles' pref is set, queue the PDF file attachments
 * among the given items and open the progress dialog in the most recent
 * browser window, if there is one.
 * @param {Zotero.Item[]} items
 */
this.autoRecognizeItems = function (items) {
	if (!Zotero.Prefs.get('autoRecognizeFiles')) {
		return;
	}

	const isPDFAttachment = (candidate) => {
		return candidate
			&& candidate.isFileAttachment()
			&& candidate.attachmentContentType == 'application/pdf';
	};
	const pdfs = items.filter(candidate => isPDFAttachment(candidate));
	if (pdfs.length === 0) {
		return;
	}

	this.recognizeItems(pdfs);

	const browserWindow = Services.wm.getMostRecentWindow("navigator:browser");
	if (browserWindow) {
		browserWindow.Zotero_RecognizePDF_Dialog.open();
	}
};
/**
 * Gives access to the progress rows (the internal array itself, not a copy).
 * @return {Array}
 */
this.getRows = function () {
	const rows = _rows;
	return rows;
};
/**
 * Counts all progress rows, whatever their status.
 * @return {Number}
 */
this.getTotal = function () {
	const total = _rows.length;
	return total;
};
/**
 * Counts the rows whose status is past ROW_PROCESSING.
 * @return {Number}
 */
this.getProcessedTotal = function () {
	let finished = 0;
	for (const row of _rows) {
		if (row.status > Zotero.RecognizePDF.ROW_PROCESSING) {
			finished++;
		}
	}
	return finished;
};
/**
 * Drop everything that is queued, clear all rows and notify the 'empty'
 * listener if one is registered.
 */
this.cancel = function () {
	_rows = [];
	_queue = [];

	if (!_listeners.empty) {
		return;
	}
	_listeners.empty();
};
/**
 * Tells whether a recognized parent item can still be reverted.
 *
 * The item must have been recorded by the recognizer within UNRECOGNIZE_TIMEOUT,
 * must not have been modified since, and must have exactly one child, which
 * must be a PDF attachment. Items that fail any check are forgotten.
 *
 * @param {Zotero.Item} item
 * @return {Boolean}
 */
this.canUnrecognize = function (item) {
	const recognizedAt = _newItems.get(item);

	// Checks run left to right and stop at the first failure
	const parentIsUntouched = recognizedAt
		&& !(Zotero.Date.sqlToDate(recognizedAt, true) < new Date() - UNRECOGNIZE_TIMEOUT)
		&& item.dateModified == recognizedAt
		&& item.numAttachments(true) == 1
		&& item.numChildren(true) == 1;

	if (parentIsUntouched) {
		// Child attachment must not be in the trash and must be a PDF
		const children = Zotero.Items.get(item.getAttachments());
		if (children.length && children[0].attachmentContentType == 'application/pdf') {
			return true;
		}
	}

	_newItems.delete(item);
	return false;
};
/**
 * Revert a recognition: within one transaction, turn the item's first
 * attachment back into a top-level item placed in the parent's collections,
 * then erase the parent.
 * @param {Zotero.Item} item Parent item
 * @return {Promise}
 */
this.unrecognize = async function (item) {
	const childPDF = Zotero.Items.get(item.getAttachments()[0]);

	const detachAndErase = async () => {
		const parentCollections = item.getCollections();
		childPDF.parentItemID = null;
		childPDF.setCollections(parentCollections);
		await childPDF.save();
		await item.erase();
	};

	return Zotero.DB.executeTransaction(detachAndErase);
};
/**
 * Send the extracted PDF data of the item's first attachment, together with
 * the item's metadata and the Zotero version, to the 'report' endpoint.
 * @param {Zotero.Item} item Parent item
 * @return {Promise} Result of the HTTP request
 * @throws {Error} If the attachment file cannot be found
 */
this.report = async function (item) {
	const pdf = Zotero.Items.get(item.getAttachments()[0]);
	const pdfPath = await pdf.getFilePath();
	const fileIsThere = pdfPath && await OS.File.exists(pdfPath);
	if (!fileIsThere) {
		throw new Error("File not found when reporting metadata");
	}

	const payload = {
		version: Zotero.version,
		json: await extractJSON(pdfPath, MAX_PAGES),
		metadata: item.toJSON()
	};

	const requestOptions = {
		successCodes: [200, 204],
		headers: {
			'Content-Type': 'application/json'
		},
		body: JSON.stringify(payload)
	};
	return Zotero.HTTP.request("POST", ZOTERO_CONFIG.RECOGNIZE_URL + 'report', requestOptions);
};
/**
 * Queue an item and create its progress row.
 *
 * An item that already has a queued or in-progress row is left alone; a row
 * that has already finished is replaced by a fresh one.
 *
 * @param {Zotero.Item} item
 * @return {null|undefined} null when the item was already pending
 */
function _addItem(item) {
	const existing = _rows.find(candidate => candidate.id === item.id);
	if (existing) {
		const isFinished = existing.status > Zotero.RecognizePDF.ROW_PROCESSING;
		if (!isFinished) {
			return null;
		}
		_deleteRow(existing.id);
	}

	const row = {
		id: item.id,
		status: Zotero.RecognizePDF.ROW_QUEUED,
		fileName: item.getField('title'),
		message: ''
	};
	// Newest entries go to the front of both lists
	_rows.unshift(row);
	_queue.unshift(item.id);

	if (_listeners.rowadded) {
		_listeners.rowadded(row);
	}
	// Signal the transition from "no rows" to "one row"
	if (_listeners.nonempty && _rows.length === 1) {
		_listeners.nonempty();
	}
}
/**
 * Set the status and message of the first row belonging to an item and
 * notify the 'rowupdated' listener. Does nothing when no row matches.
 * @param {Number} itemID
 * @param {Number} status One of the ROW_* codes
 * @param {String} [message]
 */
function _updateRow(itemID, status, message) {
	const target = _rows.find(candidate => candidate.id === itemID);
	if (!target) {
		return;
	}

	target.status = status;
	target.message = message;

	if (_listeners.rowupdated) {
		// The listener always receives a string message
		const payload = {
			id: target.id,
			status,
			message: message || ''
		};
		_listeners.rowupdated(payload);
	}
}
/**
 * Remove the first row belonging to an item and notify the 'rowdeleted'
 * listener. Does nothing when no row matches.
 * @param {Number} itemID
 */
function _deleteRow(itemID) {
	const position = _rows.findIndex(candidate => candidate.id === itemID);
	if (position === -1) {
		return;
	}

	const [removed] = _rows.splice(position, 1);
	if (_listeners.rowdeleted) {
		_listeners.rowdeleted({ id: removed.id });
	}
}
/**
 * Triggers queue processing and returns when all items in the queue are processed.
 *
 * Only one processing loop runs at a time; a call made while a loop is active
 * returns immediately. While the browser is offline and items are waiting, the
 * loop sleeps for OFFLINE_RECHECK_DELAY and checks again.
 *
 * @return {Promise}
 */
async function _processQueue() {
	await Zotero.Schema.schemaUpdatePromise;

	if (_queueProcessing) return;
	_queueProcessing = true;

	// The flag is released in `finally` so that an unexpected exception (e.g. one
	// thrown by a row listener) cannot leave the queue permanently blocked.
	try {
		while (true) {
			// Nothing left to do: stop instead of polling for connectivity
			if (!_queue.length) break;

			if (Zotero.HTTP.browserIsOffline()) {
				await Zotero.Promise.delay(OFFLINE_RECHECK_DELAY);
				continue;
			}

			let itemID = _queue.shift();
			if (!itemID) break;

			_updateRow(itemID, Zotero.RecognizePDF.ROW_PROCESSING, Zotero.getString('general.processing'));

			try {
				let newItem = await _processItem(itemID);

				if (newItem) {
					_updateRow(itemID, Zotero.RecognizePDF.ROW_SUCCEEDED, newItem.getField('title'));
				}
				else {
					_updateRow(itemID, Zotero.RecognizePDF.ROW_FAILED, Zotero.getString('recognizePDF.noMatches'));
				}
			}
			catch (e) {
				Zotero.logError(e);

				// Alert exceptions carry a user-facing message; anything else
				// is reported with the generic error string
				_updateRow(
					itemID,
					Zotero.RecognizePDF.ROW_FAILED,
					e instanceof Zotero.Exception.Alert
						? e.message
						: Zotero.getString('recognizePDF.error')
				);
			}
		}
	}
	finally {
		_queueProcessing = false;
	}
}
/**
 * Recognize a top-level attachment and, on success, make it a child of the
 * newly created item.
 *
 * The new parent is added to the attachment's collections. When the
 * 'autoRenameFiles' pref is set, the attachment file and title are renamed
 * after the parent's metadata. The parent is then remembered together with
 * its dateModified value.
 *
 * @param {Number} itemID ID of the attachment item
 * @return {Promise<Zotero.Item|null>} New parent item, or null if nothing was recognized
 */
async function _processItem(itemID) {
	const attachment = await Zotero.Items.getAsync(itemID);
	const isStandalone = attachment && !attachment.parentItemID;
	if (!isStandalone) {
		throw new Zotero.Exception.Alert('recognizePDF.error');
	}

	const parentItem = await _recognize(attachment);
	if (!parentItem) {
		return null;
	}

	// Put new item in same collections as the old one
	const collectionIDs = attachment.getCollections();
	await Zotero.DB.executeTransaction(async () => {
		if (collectionIDs.length) {
			collectionIDs.forEach((collectionID) => {
				parentItem.addToCollection(collectionID);
			});
			await parentItem.save();
		}

		// Put old item as a child of the new item
		attachment.parentID = parentItem.id;
		await attachment.save();
	});

	// Rename attachment file to match new metadata
	if (Zotero.Prefs.get('autoRenameFiles')) {
		const oldPath = attachment.getFilePath();
		const extension = Zotero.File.getExtension(oldPath);
		const baseName = Zotero.Attachments.getFileBaseNameFromItem(parentItem);
		const renamed = baseName + (extension ? '.' + extension : '');

		const renameResult = await attachment.renameAttachmentFile(renamed, false, true);
		if (renameResult !== true) {
			throw new Error("Error renaming " + oldPath);
		}

		// Rename attachment title
		attachment.setField('title', renamed);
		await attachment.saveTx();
	}

	_newItems.set(parentItem, parentItem.dateModified);
	return parentItem;
}
/**
 * Get json from a PDF
 *
 * Runs the PDF converter with '-json' on the first `pages` pages, writing to a
 * cache file in the temp directory, then reads and parses that file. The cache
 * file is removed afterwards on a best-effort basis: a cleanup failure is
 * logged but does not change the outcome.
 *
 * @param {String} filePath PDF file path
 * @param {Number} pages Number of pages to extract
 * @return {Promise<Object>} Parsed converter output
 * @throws {Zotero.Exception.Alert} 'recognizePDF.couldNotRead' if conversion, reading or parsing fails
 */
async function extractJSON(filePath, pages) {
	let cacheFile = Zotero.getTempDirectory();
	cacheFile.append("recognizePDFcache.txt");
	// Start from a clean slate in case a previous run left its output behind
	if (cacheFile.exists()) {
		cacheFile.remove(false);
	}

	let {exec, args} = Zotero.Fulltext.getPDFConverterExecAndArgs();
	args.push('-json', '-l', pages, filePath, cacheFile.path);

	Zotero.debug("RecognizePDF: Running " + exec.path + " " + args.map(arg => "'" + arg + "'").join(" "));

	try {
		await Zotero.Utilities.Internal.exec(exec, args);
		let content = await Zotero.File.getContentsAsync(cacheFile.path);
		Zotero.debug("RecognizePDF: Extracted JSON:");
		Zotero.debug(content);
		return JSON.parse(content);
	}
	catch (e) {
		Zotero.logError(e);
		throw new Zotero.Exception.Alert("recognizePDF.couldNotRead");
	}
	finally {
		// Single cleanup point for both outcomes; only remove what is actually
		// there so that a failed run does not log a second, unrelated error
		try {
			if (cacheFile.exists()) {
				cacheFile.remove(false);
			}
		}
		catch (cleanupError) {
			Zotero.logError(cleanupError);
		}
	}
}
/**
 * Run a Zotero.Translate instance and return the first item it produces.
 * If the translator asks for a selection, the first offered entry is chosen.
 * @param {Zotero.Translate} translate
 * @param {Number} libraryID Library passed to translate()
 * @return {Promise} First item returned by the translation
 * @throws {Error} 'No items found' if the translation returns no items
 */
async function _promiseTranslate(translate, libraryID) {
	const chooseFirstEntry = function (translate, items, callback) {
		for (let key in items) {
			const choice = {};
			choice[key] = items[key];
			callback(choice);
			return;
		}
	};
	translate.setHandler('select', chooseFirstEntry);

	const translateOptions = {
		libraryID,
		saveAttachments: false
	};
	const produced = await translate.translate(translateOptions);
	if (!produced.length) {
		throw new Error('No items found');
	}
	return produced[0];
}
/**
 * POST extracted PDF data to the 'recognize' endpoint and return the parsed
 * JSON response. The base URL is the 'api.url' pref if set, otherwise
 * ZOTERO_CONFIG.RECOGNIZE_URL.
 * @param {Object} json Data to send as the request body
 * @return {Promise<Object>} Parsed response body
 */
async function _query(json) {
	// TODO: Use main API URL for recognizer server
	// (i.e. fall back to ZOTERO_CONFIG.API_URL instead of RECOGNIZE_URL)
	const baseURL = Zotero.Prefs.get("api.url") || ZOTERO_CONFIG.RECOGNIZE_URL;
	const endpoint = (baseURL.endsWith('/') ? baseURL : baseURL + '/') + 'recognize';

	const requestOptions = {
		successCodes: [200],
		headers: {
			'Content-Type': 'application/json'
		},
		body: JSON.stringify(json),
		noAPIKey: true
	};
	const response = await Zotero.Sync.Runner.getAPIClient().makeRequest('POST', endpoint, requestOptions);

	return JSON.parse(response.responseText);
}
/**
 * Retrieves metadata for a PDF and saves it as an item
 *
 * The extracted PDF data is sent to the recognizer, and the response is turned
 * into an item using the first strategy that works:
 *   1. DOI lookup through the search translator
 *   2. ISBN lookup through the search translators
 *   3. An item built directly from the fields in the response
 * A failure in strategy 1 or 2 is logged and the next strategy is tried.
 *
 * The returned item has always finished saving, so its `id` is available to
 * the caller.
 *
 * @param {Zotero.Item} item PDF attachment item
 * @return {Promise<Zotero.Item|null>} The saved item, or null if nothing usable was returned
 * @throws {Zotero.Exception.Alert} 'recognizePDF.fileNotFound' if the file is missing,
 *     'recognizePDF.noOCR' if no page has any content
 */
async function _recognize(item) {
	let filePath = await item.getFilePath();

	if (!filePath || !await OS.File.exists(filePath)) {
		throw new Zotero.Exception.Alert('recognizePDF.fileNotFound');
	}

	let json = await extractJSON(filePath, MAX_PAGES);

	// A page counts as having text when its third element is non-empty
	let hasText = json.pages.some(page => page[2].length);
	if (!hasText) {
		throw new Zotero.Exception.Alert('recognizePDF.noOCR');
	}

	let libraryID = item.libraryID;

	let res = await _query(json);
	if (!res) return null;

	if (res.doi) {
		Zotero.debug('RecognizePDF: Getting metadata by DOI');
		let translateDOI = new Zotero.Translate.Search();
		translateDOI.setTranslator('11645bd1-0420-45c1-badb-53fb41eeb753');
		translateDOI.setSearch({'itemType': 'journalArticle', 'DOI': res.doi});
		try {
			let newItem = await _promiseTranslate(translateDOI, libraryID);
			// Item fields are read through getField(), as elsewhere in this file;
			// only fill in the abstract when the translator did not provide one
			if (!newItem.getField('abstractNote') && res.abstract) {
				newItem.setField('abstractNote', res.abstract);
			}
			// Wait for the save so the caller never sees a half-saved item
			await newItem.saveTx();
			return newItem;
		}
		catch (e) {
			Zotero.debug('RecognizePDF: ' + e);
		}
	}

	if (res.isbn) {
		Zotero.debug('RecognizePDF: Getting metadata by ISBN');
		let translate = new Zotero.Translate.Search();
		translate.setSearch({'itemType': 'book', 'ISBN': res.isbn});
		try {
			// libraryID: false is passed here and the first returned entry is
			// loaded into a new item with fromJSON() below
			let translatedItems = await translate.translate({
				libraryID: false,
				saveAttachments: false
			});
			Zotero.debug('RecognizePDF: Translated items:');
			Zotero.debug(translatedItems);
			if (translatedItems.length) {
				let newItem = new Zotero.Item;
				newItem.fromJSON(translatedItems[0]);
				newItem.libraryID = libraryID;
				if (!newItem.getField('abstractNote') && res.abstract) {
					newItem.setField('abstractNote', res.abstract);
				}
				// The new item has no id until the save completes
				await newItem.saveTx();
				return newItem;
			}
		}
		catch (e) {
			Zotero.debug('RecognizePDF: ' + e);
		}
	}

	if (res.title) {
		let type = res.type === 'book-chapter' ? 'bookSection' : 'journalArticle';

		let newItem = new Zotero.Item(type);
		newItem.setField('title', res.title);

		// The response may come without an author list
		let creators = (res.authors || []).map(author => ({
			firstName: author.firstName,
			lastName: author.lastName,
			creatorType: 'author'
		}));
		newItem.setCreators(creators);

		if (res.abstract) newItem.setField('abstractNote', res.abstract);
		if (res.year) newItem.setField('date', res.year);
		if (res.pages) newItem.setField('pages', res.pages);
		if (res.volume) newItem.setField('volume', res.volume);
		if (res.url) newItem.setField('url', res.url);

		if (type === 'journalArticle') {
			if (res.issue) newItem.setField('issue', res.issue);
			// Accept either spelling of the response key and write the value to
			// the 'ISSN' field (upper case, like 'DOI' and 'ISBN' above)
			let issn = res.ISSN || res.issn;
			if (issn) newItem.setField('ISSN', issn);
			if (res.container) newItem.setField('publicationTitle', res.container);
		}
		else if (type === 'bookSection') {
			if (res.container) newItem.setField('bookTitle', res.container);
			if (res.publisher) newItem.setField('publisher', res.publisher);
		}

		newItem.setField('libraryCatalog', 'Zotero');

		await newItem.saveTx();
		return newItem;
	}

	return null;
}
};