zotero/chrome/content/zotero/xpcom/fulltext.js

/*
    ***** BEGIN LICENSE BLOCK *****

    Copyright © 2009 Center for History and New Media
                     George Mason University, Fairfax, Virginia, USA
                     http://zotero.org

    This file is part of Zotero.

    Zotero is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Zotero is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with Zotero.  If not, see <http://www.gnu.org/licenses/>.

    ***** END LICENSE BLOCK *****
*/

Zotero.Fulltext = new function(){
	// Same file name as the one returned by the pdfConverterCacheFile getter below.
	// NOTE(review): not referenced in the portion of the file reviewed here -- confirm
	// it is still needed
	const CACHE_FILE = '.zotero-ft-cache';

	// Public API: expose the function declarations defined further down as methods
	// of this object. (Function declarations are hoisted, so they can be assigned
	// before their definitions appear.)
	this.init = init;
	this.registerPDFTool = registerPDFTool;
	this.pdfConverterIsRegistered = pdfConverterIsRegistered;
	this.pdfInfoIsRegistered = pdfInfoIsRegistered;
	this.isCachedMIMEType = isCachedMIMEType;
	this.indexWords = indexWords;
	this.indexDocument = indexDocument;
	this.indexString = indexString;
	this.indexFile = indexFile;
	this.indexPDF = indexPDF;
	this.indexItems = indexItems;
	this.findTextInItems = findTextInItems;
	this.clearItemWords = clearItemWords;
	this.getPages = getPages;
	this.getTotalPagesFromFile = getTotalPagesFromFile;
	this.getChars = getChars;
	this.getTotalCharsFromFile = getTotalCharsFromFile;
	this.setChars = setChars;
	this.setPages = setPages;
	this.getIndexedState = getIndexedState;
	this.getIndexStats = getIndexStats;
	this.canReindex = canReindex;
	this.rebuildIndex = rebuildIndex;
	this.clearIndex = clearIndex;
	this.clearCacheFile = clearCacheFile;
	this.clearCacheFiles = clearCacheFiles;
	//this.clearItemContent = clearItemContent;
	this.purgeUnusedWords = purgeUnusedWords;
	this.semanticSplitter = semanticSplitter;

	// Read-only properties describing the external PDF tools and the names of the
	// per-item cache files they produce
	this.__defineGetter__("pdfToolsDownloadBaseURL", function() { return 'http://www.zotero.org/download/xpdf/'; });
	this.__defineGetter__("pdfToolsName", function() { return 'Xpdf'; });
	this.__defineGetter__("pdfToolsURL", function() { return 'http://www.foolabs.com/xpdf/'; });
	this.__defineGetter__("pdfConverterName", function() { return 'pdftotext'; });
	this.__defineGetter__("pdfInfoName", function() { return 'pdfinfo'; });
	this.__defineGetter__("pdfConverterCacheFile", function () { return '.zotero-ft-cache'; });
	this.__defineGetter__("pdfInfoCacheFile", function () { return '.zotero-ft-info'; });

	// Read-only index-state constants (the values assigned by getIndexedState())
	this.__defineGetter__("INDEX_STATE_UNAVAILABLE", function () { return 0; });
	this.__defineGetter__("INDEX_STATE_UNINDEXED", function () { return 1; });
	this.__defineGetter__("INDEX_STATE_PARTIAL", function () { return 2; });
	this.__defineGetter__("INDEX_STATE_INDEXED", function () { return 3; });

	// NOTE(review): presumably the file name used by getItemProcessorCacheFile(),
	// which is defined outside the portion of the file reviewed here -- confirm
	const _processorCacheFile = '.zotero-ft-unprocessed';

	// PDF tool state, populated by init() and registerPDFTool()
	var _pdfConverterVersion = null;
	var _pdfConverterFileName = null;
	var _pdfConverter = null; // nsIFile to executable
	var _pdfInfoVersion = null;
	var _pdfInfoFileName = null;
	var _pdfInfo = null; // nsIFile to executable

	// Background content processor state
	var _idleObserverIsRegistered = false;
	// Idle time passed to nsIIdleService.addIdleObserver()/removeIdleObserver()
	var _idleObserverDelay = 5;
	// nsITimer used to process queued items one at a time; created lazily
	var _processorTimer = null;
	// True until setItemContent() has removed the 'downloadAll' setting once
	var _upgradeCheck = true;

	// Values stored in the fulltextItems.synced column
	const SYNC_STATE_UNSYNCED = 0;
	const SYNC_STATE_IN_SYNC = 1;
	const SYNC_STATE_TO_PROCESS = 2;
	const SYNC_STATE_TO_DOWNLOAD = 3;

	// Reference to this object for use inside closures
	var self = this;

	/**
	 * Initialize the full-text subsystem
	 *
	 * Computes the platform-specific PDF tool file names, exposes them (and the
	 * tool versions) as read-only properties, registers both PDF tools, applies
	 * the temporary 'synced' column migration, and starts the background content
	 * processor.
	 */
	function init() {
		// Keep a reference to the receiver for the shutdown listener below
		var fulltext = this;

		var platform = Zotero.platform.replace(' ', '-');
		_pdfConverterFileName = this.pdfConverterName + '-' + platform;
		_pdfInfoFileName = this.pdfInfoName + '-' + platform;
		if (Zotero.isWin) {
			_pdfConverterFileName += '.exe';
			_pdfInfoFileName += '.exe';
		}

		this.__defineGetter__("pdfConverterFileName", function() { return _pdfConverterFileName; });
		this.__defineGetter__("pdfConverterVersion", function() { return _pdfConverterVersion; });
		this.__defineGetter__("pdfInfoFileName", function() { return _pdfInfoFileName; });
		this.__defineGetter__("pdfInfoVersion", function() { return _pdfInfoVersion; });

		this.registerPDFTool('converter');
		this.registerPDFTool('info');

		// TEMP: Remove after 4.1 DB schema change
		var cols = Zotero.DB.getColumns('fulltextItems');
		if (cols.indexOf("synced") == -1) {
			Zotero.DB.beginTransaction();
			Zotero.DB.query("ALTER TABLE fulltextItems ADD COLUMN synced INT DEFAULT 0");
			Zotero.DB.query("REPLACE INTO settings (setting, key, value) VALUES ('fulltext', 'downloadAll', 1)");
			Zotero.DB.commitTransaction();
		}

		this.startContentProcessor();
		// stopContentProcessor() reads this.idleObserver, so it must be invoked as a
		// method; passing the bare function reference would lose the receiver when
		// the listener is called
		Zotero.addShutdownListener(function () {
			fulltext.stopContentProcessor();
		});
	}


	/*
	 * Looks for the requested PDF tool -- pdftotext-{platform}[.exe] ('converter')
	 * or pdfinfo-{platform}[.exe] ('info') -- in the Zotero data directory
	 *
	 * {platform} is Zotero.platform, with its first space replaced by a hyphen
	 *   e.g. "Win32", "Linux-i686", "MacPPC", "MacIntel", etc.
	 */
	function registerPDFTool(tool) {
		// Start from the data directory; the tool's file name is appended below
		var toolFile = Zotero.getZoteroDirectory();
		var isConverter = tool === 'converter';
		var toolName;
		var fileName;

		if (isConverter) {
			toolName = this.pdfConverterName;
			fileName = _pdfConverterFileName;
		}
		else if (tool === 'info') {
			toolName = this.pdfInfoName;
			fileName = _pdfInfoFileName;
		}
		else {
			throw ("Invalid PDF tool type '" + tool + "' in Zotero.Fulltext.registerPDFTool()");
		}

		toolFile.append(fileName);
		if (!toolFile.exists()) {
			// Only a missing converter is logged
			if (isConverter) {
				Zotero.debug(fileName + ' not found' + ' -- PDF indexing disabled');
			}
			return false;
		}

		// The version is the first whitespace-delimited token of the optional
		// "<fileName>.version" file that sits next to the executable
		var version;
		var versionFile = toolFile.parent;
		versionFile.append(fileName + '.version');
		if (versionFile.exists()) {
			version = Zotero.File.getSample(versionFile).split(/[\r\n\s]/)[0];
		}
		version = version || 'UNKNOWN';

		if (isConverter) {
			_pdfConverter = toolFile;
			_pdfConverterVersion = version;
		}
		else {
			_pdfInfo = toolFile;
			_pdfInfoVersion = version;
		}

		Zotero.debug(toolName + ' version ' + version + ' registered at ' + toolFile.path);

		return true;
	}


	/**
	 * @return {Boolean}  TRUE if a pdftotext executable has been registered
	 */
	function pdfConverterIsRegistered() {
		return _pdfConverter ? true : false;
	}


	/**
	 * @return {Boolean}  TRUE if a pdfinfo executable has been registered
	 */
	function pdfInfoIsRegistered() {
		return _pdfInfo ? true : false;
	}


	/*
	 * Returns true if MIME type is converted to text and cached before indexing
	 *   (e.g. application/pdf is run through pdftotext)
	 */
	function isCachedMIMEType(mimeType) {
		// PDF is currently the only type that is converted to a text cache file
		return mimeType === 'application/pdf';
	}


	/*
	 * Index multiple words at once
	 */
	function indexWords(itemID, words) {
		// Returns false (without touching the DB) if there is nothing to index;
		// otherwise returns nothing
		if (!words || !words.length || !itemID) {
			return false;
		}

		// Maximum number of bound parameters per statement (compiled SQLite limit)
		var maxWords = 999;
		// Known word IDs, keyed by '_' + word; the underscore avoids problems with
		// JS reserved words and built-in property names
		var wordIDsByKey = {};

		Zotero.DB.beginTransaction();

		// Look up the IDs of words that are already in fulltextWords, one chunk at a
		// time. The caller's array is only read, never modified.
		for (var offset = 0; offset < words.length; offset += maxWords) {
			var chunk = words.slice(offset, offset + maxWords);
			var placeholders = [];
			var sqlParams = [];
			for (var i = 0; i < chunk.length; i++) {
				placeholders.push('?');
				sqlParams.push({ string: chunk[i] });
			}

			var selectSQL = "SELECT word, wordID from fulltextWords WHERE word IN ("
				+ placeholders.join() + ")";
			var rows = Zotero.DB.query(selectSQL, sqlParams) || [];
			for (var j = 0; j < rows.length; j++) {
				wordIDsByKey['_' + rows[j].word] = rows[j].wordID;
			}
		}

		// Make sure the item has a fulltextItems row
		if (!Zotero.DB.valueQuery("SELECT COUNT(*) FROM fulltextItems WHERE itemID=?", itemID)) {
			Zotero.DB.query("INSERT INTO fulltextItems (itemID, version) VALUES (?,?)", [itemID, 0]);
		}

		// Handle bound parameters manually for optimal speed
		var insertWord = Zotero.DB.getStatement("INSERT INTO fulltextWords (word) VALUES (?)");
		var insertItemWord = Zotero.DB.getStatement("INSERT OR IGNORE INTO fulltextItemWords VALUES (?,?)");

		for (var k = 0; k < words.length; k++) {
			var word = words[k];
			// Skip words containing invalid characters
			if (word.match(/[\u0000-\u0008\u000b\u000c\u000e-\u001f\ud800-\udfff\ufffe\uffff]/)) {
				Zotero.debug("Skipping word '" + word + "' due to invalid characters");
				continue;
			}

			var key = '_' + word;
			var wordID = wordIDsByKey[key];
			if (!wordID) {
				insertWord.bindUTF8StringParameter(0, word);
				insertWord.execute();
				wordID = Zotero.DB.getLastInsertID();
				// Remember the new ID so that a repeated word isn't inserted twice
				wordIDsByKey[key] = wordID;
			}

			insertItemWord.bindInt32Parameter(0, wordID);
			insertItemWord.bindInt32Parameter(1, itemID);
			insertItemWord.execute();
		}

		insertWord.reset();
		insertItemWord.reset();

		Zotero.DB.commitTransaction();
	}


	/**
	 * Split a string into words, replace the item's indexed words with them, and
	 * update the item's fulltextItems row
	 *
	 * @param {String} text
	 * @param {String} charset  Passed through to semanticSplitter()
	 * @param {Integer} itemID
	 * @param {Object} [stats]  Values for any of the indexedChars, totalChars,
	 *                          indexedPages and totalPages columns; other keys
	 *                          are ignored
	 * @param {Integer} [version]  Stored in the 'version' column if truthy
	 * @param {Integer} [synced]  Sync state; defaults to SYNC_STATE_UNSYNCED
	 */
	function indexString(text, charset, itemID, stats, version, synced) {
		// Keys of `stats` become column names in the UPDATE below, so only known
		// fulltextItems columns are accepted
		var statColumns = ['indexedChars', 'totalChars', 'indexedPages', 'totalPages'];

		try {
			Zotero.UnresponsiveScriptIndicator.disable();

			var words = semanticSplitter(text, charset);

			Zotero.DB.beginTransaction();

			// Keep the cache file -- only the DB rows are being replaced
			this.clearItemWords(itemID, true);
			this.indexWords(itemID, words);

			var sql = "UPDATE fulltextItems SET synced=?";
			var params = [synced ? parseInt(synced, 10) : SYNC_STATE_UNSYNCED];
			if (stats) {
				for (let stat in stats) {
					if (statColumns.indexOf(stat) == -1) {
						Zotero.debug("Ignoring unknown full-text stat '" + stat + "'", 2);
						continue;
					}
					sql += ", " + stat + "=?";
					params.push(stats[stat] ? parseInt(stats[stat], 10) : null);
				}
			}
			if (version) {
				sql += ", version=?";
				params.push(parseInt(version, 10));
			}
			sql += " WHERE itemID=?";
			params.push(itemID);
			Zotero.DB.query(sql, params);

			Zotero.DB.commitTransaction();

			// If there's a processor cache file, delete it (whether or not we just used it)
			var cacheFile = this.getItemProcessorCacheFile(itemID);
			if (cacheFile.exists()) {
				cacheFile.remove(false);
			}

			Zotero.Notifier.trigger('refresh', 'item', itemID);
		}
		finally {
			// Always re-enable the indicator, even if indexing threw
			Zotero.UnresponsiveScriptIndicator.enable();
		}
	}


	/**
	 * Index the contents of a loaded DOM document for an item
	 *
	 * Returns false (after logging the reason) if the document isn't a text
	 * type, has no body, or has no character set; otherwise indexes up to
	 * 'fulltext.textMaxLength' characters and records the character counts.
	 *
	 * @param {Document} document
	 * @param {Integer} itemID
	 */
	function indexDocument(document, itemID){
		if (!itemID){
			throw ('Item ID not provided to indexDocument()');
		}

		Zotero.debug("Indexing document '" + document.title + "'");

		// Work out why the document can't be indexed, if it can't
		var contentType = document.contentType;
		var problem = null;
		if (!Zotero.MIME.isTextType(contentType)) {
			problem = contentType + " document is not text";
		}
		else if (!document.body) {
			problem = "Cannot index " + contentType + " file";
		}
		else if (!document.characterSet) {
			problem = "Text file didn't have charset";
		}
		if (problem !== null) {
			Zotero.debug(problem, 2);
			return false;
		}

		var limit = Zotero.Prefs.get('fulltext.textMaxLength');
		var converted = convertItemHTMLToText(itemID, document.body.innerHTML, limit);

		if (converted.totalChars > limit) {
			Zotero.debug('Only indexing first ' + limit + ' characters of item '
				+ itemID + ' in indexDocument()');
		}

		this.indexString(converted.text, document.characterSet, itemID);
		this.setChars(itemID, { indexed: converted.text.length, total: converted.totalChars });
	}


	/**
	 * @param {Boolean} [complete=FALSE]  Index the file in its entirety, ignoring maxLength
	 */
	function indexFile(file, mimeType, charset, itemID, complete, isCacheFile) {
		// Returns false if the file can't be indexed, TRUE once a text file has been
		// indexed, or the result of indexPDF() for PDFs
		if (!file.exists()){
			Zotero.debug('File not found in indexFile()', 2);
			return false;
		}

		if (!itemID) {
			throw ('Item ID not provided to indexFile()');
		}

		if (!mimeType) {
			Zotero.debug("MIME type not provided in indexFile()", 1);
			return false;
		}

		// PDFs are handed off to the external tools
		if (mimeType == 'application/pdf') {
			try {
				Zotero.UnresponsiveScriptIndicator.disable();
				return this.indexPDF(file, itemID, complete);
			}
			finally {
				Zotero.UnresponsiveScriptIndicator.enable();
			}
		}

		if (!Zotero.MIME.isTextType(mimeType)) {
			Zotero.debug('File is not text in indexFile()', 2);
			return false;
		}

		if (!charset){
			Zotero.debug("Text file didn't have charset in indexFile()", 1);
			return false;
		}

		Zotero.debug('Indexing file ' + file.path);

		var contents = Zotero.File.getContents(file, charset);
		var total = contents.length;
		// No limit when indexing the complete file
		var limit = complete ? false : Zotero.Prefs.get('fulltext.textMaxLength');

		if (mimeType == 'text/html') {
			let converted = convertItemHTMLToText(itemID, contents, limit);
			contents = converted.text;
			total = converted.totalChars;
		}
		else if (limit && contents.length > limit) {
			contents = contents.substr(0, limit);
		}

		this.indexString(contents, charset, itemID);

		// Record the number of characters indexed (unless we're indexing a (PDF) cache file,
		// in which case the stats are coming from elsewhere)
		if (!isCacheFile) {
			this.setChars(itemID, { indexed: contents.length, total: total });
		}

		return true;
	}


	/*
	 * Run PDF through pdfinfo and pdftotext to generate .zotero-ft-info
	 * and .zotero-ft-cache, and pass the text file back to indexFile()
	 *
	 * @param	 allPages	 If true, index all pages rather than pdfMaxPages
	 */
	function indexPDF(file, itemID, allPages) {
		// Returns TRUE once the converted text has been indexed; FALSE if the
		// converter is missing, PDF indexing is disabled (pdfMaxPages is 0), or
		// pdftotext fails or produces no cache file
		if (!_pdfConverter) {
			Zotero.debug("PDF tools are not installed -- skipping indexing");
			return false;
		}

		var maxPages = Zotero.Prefs.get('fulltext.pdfMaxPages');
		if (maxPages == 0) {
			return false;
		}

		// If file is stored outside of Zotero, create a directory for the item
		// in the storage directory and save the cache file there
		var item = Zotero.Items.get(itemID);
		var cacheFile = item.attachmentLinkMode == Zotero.Attachments.LINK_MODE_LINKED_FILE
			? Zotero.Attachments.createDirectoryForItem(itemID)
			: file.parent;
		cacheFile.append(this.pdfConverterCacheFile);

		// Stays undefined if pdfinfo is unavailable or fails
		var totalPages;
		if (_pdfInfo) {
			var infoFile = cacheFile.parent;
			infoFile.append(this.pdfInfoCacheFile);
			Zotero.debug('Running pdfinfo "' + file.path + '" "' + infoFile.path + '"');

			var infoProcess = Components.classes["@mozilla.org/process/util;1"]
				.createInstance(Components.interfaces.nsIProcess);
			infoProcess.init(_pdfInfo);

			var infoArgs = [file.path, infoFile.path];
			try {
				infoProcess.runw(true, infoArgs, infoArgs.length);
				totalPages = this.getTotalPagesFromFile(itemID);
			}
			catch (e) {
				// Best effort: indexing continues without a page count
				Zotero.debug("Error running pdfinfo");
			}
		}
		else {
			Zotero.debug(this.pdfInfoName + " is not available");
		}

		Zotero.debug('Running pdftotext -enc UTF-8 -nopgbrk '
			+ (allPages ? '' : '-l ' + maxPages) + ' "' + file.path + '" "'
			+ cacheFile.path + '"');

		var converterProcess = Components.classes["@mozilla.org/process/util;1"]
			.createInstance(Components.interfaces.nsIProcess);
		converterProcess.init(_pdfConverter);

		var converterArgs = ['-enc', 'UTF-8', '-nopgbrk'];
		// Stays undefined when indexing all pages without a known page count
		var pagesIndexed;
		if (!allPages) {
			converterArgs.push('-l', maxPages);
			pagesIndexed = Math.min(maxPages, totalPages);
		}
		else if (totalPages) {
			pagesIndexed = totalPages;
		}
		converterArgs.push(file.path, cacheFile.path);

		try {
			converterProcess.runw(true, converterArgs, converterArgs.length);
		}
		catch (e) {
			Zotero.debug("Error running pdftotext");
			return false;
		}

		if (!cacheFile.exists()) {
			var msg = file.leafName + " was not indexed";
			if (!file.leafName.match(/^[\u0000-\u007F]+$/)) {
				msg += " -- PDFs with filenames containing extended characters cannot currently be indexed due to a Firefox limitation";
			}
			Zotero.debug(msg, 2);
			Components.utils.reportError(msg);
			return false;
		}

		// Index the generated text file and record the page counts together
		Zotero.DB.beginTransaction();
		this.indexFile(cacheFile, 'text/plain', 'utf-8', itemID, true, true);
		this.setPages(itemID, { indexed: pagesIndexed, total: totalPages });
		Zotero.DB.commitTransaction();
		return true;
	}


	/**
	 * Index the files of one or more attachment items in a single transaction
	 *
	 * Non-attachment items and attachments without a file are skipped.
	 *
	 * @param {Integer|Integer[]} items  Item ID or array of item IDs
	 * @param {Boolean} [complete=FALSE]  Passed to indexFile(); index files in
	 *                                    their entirety
	 * @param {Boolean} [ignoreErrors=FALSE]  Log and report indexing errors
	 *                                        instead of throwing them
	 */
	function indexItems(items, complete, ignoreErrors) {
		if (!Array.isArray(items)) {
			items = [items];
		}
		var itemObjects = Zotero.Items.get(items) || [];

		Zotero.DB.beginTransaction();

		for (let i = 0; i < itemObjects.length; i++) {
			let item = itemObjects[i];
			if (!item.isAttachment()) {
				continue;
			}

			let itemID = item.id;

			let file = item.getFile();
			if (!file){
				Zotero.debug("No file to index for item " + itemID + " in Fulltext.indexItems()");
				continue;
			}

			if (ignoreErrors) {
				try {
					this.indexFile(file, item.attachmentMIMEType, item.attachmentCharset, itemID, complete);
				}
				catch (e) {
					Zotero.debug(e, 1);
					Components.utils.reportError("Error indexing " + file.path);
					Components.utils.reportError(e);
				}
			}
			else {
				this.indexFile(file, item.attachmentMIMEType, item.attachmentCharset, itemID, complete);
			}
		}

		Zotero.DB.commitTransaction();
	}


	//
	// Full-text content syncing
	//
	/**
	 * Get content and stats that haven't yet been synced
	 *
	 * @param {Integer} maxChars  Maximum total characters to include.
	 *                            The total can go over this if there's a
	 *                            single large item.
	 * @return {Array<Object>}
	 */
	this.getUnsyncedContent = function (maxChars) {
		var first = true;
		var chars = 0;
		var contentItems = [];
		// Most recently modified items first
		var sql = "SELECT itemID, indexedChars, totalChars, indexedPages, totalPages "
			+ "FROM fulltextItems JOIN items USING (itemID) WHERE synced=" + SYNC_STATE_UNSYNCED
			+ " ORDER BY clientDateModified DESC";
		var rows = Zotero.DB.query(sql) || [];
		for (let i = 0; i < rows.length; i++) {
			let row = rows[i];
			let text;
			let itemID = row.itemID;
			let item = Zotero.Items.get(itemID);
			let libraryKey = item.libraryID + "/" + item.key;
			let mimeType = item.attachmentMIMEType;
			if (isCachedMIMEType(mimeType) || Zotero.MIME.isTextType(mimeType)) {
				try {
					let cacheFile = this.getItemCacheFile(itemID);
					if (cacheFile.exists()) {
						Zotero.debug("Adding full-text content from cache "
							+ "file for item " + libraryKey);
						text = Zotero.File.getContents(cacheFile);
					}
					else {
						// Cached types (PDFs) can only be read from the cache file
						if (!Zotero.MIME.isTextType(mimeType)) {
							Zotero.debug("Full-text content cache file doesn't exist for item "
								+ libraryKey, 2);
							continue;
						}

						let file = item.getFile();
						if (!file) {
							Zotero.debug("File doesn't exist getting full-text content for item "
								+ libraryKey, 2);
							continue;
						}

						Zotero.debug("Adding full-text content from file for item " + libraryKey);
						text = Zotero.File.getContents(file, item.attachmentCharset);

						// If HTML, convert to plain text first, and cache the result
						if (item.attachmentMIMEType == 'text/html') {
							let obj = convertItemHTMLToText(
								itemID,
								text,
								// Include in the cache file only as many characters as we
								// indexed previously
								row.indexedChars
							);
							text = obj.text;
						}
						else {
							// Include only as many characters as we've indexed
							text = text.substr(0, row.indexedChars);
						}
					}
				}
				catch (e) {
					// Skip items whose content can't be read
					Zotero.debug(e, 1);
					Components.utils.reportError(e);
					continue;
				}
			}
			else {
				Zotero.debug("Skipping non-text file getting full-text content for item "
					+ libraryKey, 2);

				// Delete rows for items that weren't supposed to be indexed
				this.clearItemWords(itemID);
				continue;
			}

			// If this isn't the first item and it would put us over the limit,
			// skip it
			if (!first && maxChars && ((chars + text.length) > maxChars)) {
				continue;
			}
			chars += text.length;
			first = false;

			contentItems.push({
				libraryID: item.libraryID,
				key: item.key,
				text: text,
				indexedChars: row.indexedChars ? row.indexedChars : 0,
				totalChars: row.totalChars ? row.totalChars : 0,
				indexedPages: row.indexedPages ? row.indexedPages : 0,
				totalPages: row.totalPages ? row.totalPages : 0
			});

			// Only a single large first item can take us past the limit
			if (maxChars && chars > maxChars) {
				break;
			}
		}
		return contentItems;
	};


	/**
	 * @return {String}  PHP-formatted POST data for items not yet downloaded
	 */
	this.getUndownloadedPostData = function () {
		// On upgrade, get all content
		var upgradeSQL = "SELECT value FROM settings WHERE setting='fulltext' AND key='downloadAll'";
		if (Zotero.DB.valueQuery(upgradeSQL)) {
			return "&ftkeys=all";
		}

		var sql = "SELECT itemID FROM fulltextItems WHERE synced="
			+ SYNC_STATE_TO_DOWNLOAD;
		var itemIDs = Zotero.DB.columnQuery(sql);
		if (!itemIDs) {
			return "";
		}

		// Group item keys by library; items without a libraryID are filed under
		// Zotero.libraryID
		var undownloaded = {};
		for (let n = 0; n < itemIDs.length; n++) {
			let item = Zotero.Items.get(itemIDs[n]);
			let libraryID = item.libraryID ? item.libraryID : Zotero.libraryID;
			if (!undownloaded[libraryID]) {
				undownloaded[libraryID] = [];
			}
			undownloaded[libraryID].push(item.key);
		}

		// Produces "&ftkeys[<libraryID>][<index>]=<key>" pairs, with both the
		// parameter name and the value URL-encoded
		var data = "";
		for (let libraryID in undownloaded) {
			for (let i = 0; i < undownloaded[libraryID].length; i++) {
				data += "&" + encodeURIComponent("ftkeys[" + libraryID + "][" + i + "]")
					+ "=" + encodeURIComponent(undownloaded[libraryID][i]);
			}
		}
		return data;
	};


	/**
	 * Save full-text content and stats to a cache file
	 */
	this.setItemContent = function (libraryID, key, text, stats, version) {
		var item = Zotero.Items.getByLibraryAndKey(libraryID, key);
		if (!item) {
			let msg = "Item not found setting full-text content";
			Zotero.debug(msg, 1);
			Components.utils.reportError(msg);
			return;
		}
		var itemID = item.id;
		var synced;

		if (text === '') {
			// No content was supplied, so flag the item for download instead
			Zotero.debug("Marking full-text content for download");
			synced = SYNC_STATE_TO_DOWNLOAD;
		}
		else {
			var processorCacheFile = this.getItemProcessorCacheFile(itemID);

			// If a storage directory doesn't exist, create it
			if (!processorCacheFile.parent.exists()) {
				Zotero.Attachments.createDirectoryForItem(itemID);
			}

			// The content and its stats are stored as JSON until the background
			// processor indexes them
			var payload = {
				indexedChars: stats.indexedChars,
				totalChars: stats.totalChars,
				indexedPages: stats.indexedPages,
				totalPages: stats.totalPages,
				version: version,
				text: text
			};
			Zotero.debug("Writing full-text content and data to " + processorCacheFile.path);
			Zotero.File.putContents(processorCacheFile, JSON.stringify(payload));
			synced = SYNC_STATE_TO_PROCESS;
		}

		var rowExists = Zotero.DB.valueQuery(
			"SELECT COUNT(*) FROM fulltextItems WHERE itemID=?", itemID
		);
		if (rowExists) {
			// Mark the item as unprocessed
			Zotero.DB.query("UPDATE fulltextItems SET synced=? WHERE itemID=?", [synced, itemID]);
		}
		else {
			// If not yet indexed, add an empty row
			Zotero.DB.query(
				"REPLACE INTO fulltextItems (itemID, version, synced) VALUES (?, 0, ?)",
				[itemID, synced]
			);
		}

		// Clear the upgrade flag the first time content is set in this session
		if (_upgradeCheck) {
			Zotero.DB.query("DELETE FROM settings WHERE setting='fulltext' AND key='downloadAll'");
			_upgradeCheck = false;
		}

		this.startContentProcessor();
	}


	/**
	 * Start the idle observer for the background content processor
	 */
	this.startContentProcessor = function () {
		// Nothing to do if the observer is already in place
		if (_idleObserverIsRegistered) {
			return;
		}

		Zotero.debug("Initializing full-text content ingester idle observer");
		Components.classes["@mozilla.org/widget/idleservice;1"]
			.getService(Components.interfaces.nsIIdleService)
			.addIdleObserver(this.idleObserver, _idleObserverDelay);
		_idleObserverIsRegistered = true;
	}

	/**
	 * Stop the idle observer and a running timer, if there is one
	 */
	this.stopContentProcessor = function () {
		if (_idleObserverIsRegistered) {
			var idleService = Components.classes["@mozilla.org/widget/idleservice;1"]
				.getService(Components.interfaces.nsIIdleService);
			// Reference the observer via Zotero.Fulltext rather than `this`: init()
			// hands this function to Zotero.addShutdownListener() as a bare
			// reference, so `this` can't be relied on here
			idleService.removeIdleObserver(Zotero.Fulltext.idleObserver, _idleObserverDelay);
			_idleObserverIsRegistered = false;
		}

		if (_processorTimer) {
			_processorTimer.cancel();
			_processorTimer = null;
		}
	};

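	/**
	 * Index the unprocessed full-text content of one item, then schedule the
	 * next item (if any) on a timer. If there's nothing left to process, the
	 * content processor is stopped.
	 *
	 * @param {Array<Integer>} itemIDs  An array of itemIDs to process; if this
	 *                                  is omitted, a database query is made
	 *                                  to find unprocessed content
	 * @return {undefined}  Nothing is returned; remaining items are handled
	 *                      asynchronously
	 */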
	this.processUnprocessedContent = function (itemIDs) {
		if (!itemIDs) {
			Zotero.debug("Checking for unprocessed full-text content");
			itemIDs = Zotero.DB.columnQuery(
				"SELECT itemID FROM fulltextItems WHERE synced=" + SYNC_STATE_TO_PROCESS
			) || [];
		}
		// If there's no more unprocessed content, stop the idle observer
		if (!itemIDs.length) {
			Zotero.debug("No unprocessed full-text content found");
			this.stopContentProcessor();
			return;
		}

		// Take the next item off the front of the queue
		let currentItemID = itemIDs.shift();
		let currentItem = Zotero.Items.get(currentItemID);

		Zotero.debug("Processing full-text content for item " + currentItem.libraryKey);

		// Once this item is done, process the rest of the queue from a one-shot
		// timer, creating the timer on first use
		var scheduleRemaining = function () {
			if (!itemIDs.length) {
				return;
			}
			if (!_processorTimer) {
				_processorTimer = Components.classes["@mozilla.org/timer;1"]
					.createInstance(Components.interfaces.nsITimer);
			}
			_processorTimer.initWithCallback(
				function () {
					Zotero.Fulltext.processUnprocessedContent(itemIDs);
				},
				100,
				Components.interfaces.nsITimer.TYPE_ONE_SHOT
			);
		};

		Zotero.Fulltext.indexFromProcessorCache(currentItemID)
		.then(function () {
			scheduleRemaining();
		})
		.done();
	}

	/**
	 * Observer registered with the idle service by startContentProcessor();
	 * starts the background processor on 'idle' and cancels its timer on 'active'
	 */
	this.idleObserver = {
		observe: function (subject, topic, data) {
			// On idle, start the background processor
			if (topic == 'idle') {
				Zotero.Fulltext.processUnprocessedContent();
				return;
			}

			// When back from idle, stop the processor (but keep the idle
			// observer registered)
			if (topic == 'active' && _processorTimer) {
				Zotero.debug("Stopping full-text content processor");
				_processorTimer.cancel();
			}
		}
	};


	/**
	 * Index an item from its processor cache file
	 *
	 * Reads the JSON written by setItemContent(), writes its text to the item's
	 * regular cache file, and indexes the text with the stored stats and version.
	 *
	 * @param {Integer} itemID
	 * @return {Promise}  Resolves to FALSE if the processor cache file doesn't
	 *                    exist or an error occurs (errors are logged and reported,
	 *                    not rethrown)
	 */
	this.indexFromProcessorCache = function (itemID) {
		var fulltext = this;
		return Q.fcall(function () {
			var processorCacheFile = fulltext.getItemProcessorCacheFile(itemID);
			if (!processorCacheFile.exists())  {
				Zotero.debug("Full-text content processor cache file doesn't exist for item " + itemID);
				return false;
			}

			// Parsed contents of the processor cache file, shared between steps
			let parsed;

			return Zotero.File.getContentsAsync(processorCacheFile)
			.then(function (json) {
				parsed = JSON.parse(json);

				// Write the text content to the regular cache file
				var textCacheFile = fulltext.getItemCacheFile(itemID);

				Zotero.debug("Writing full-text content to " + textCacheFile.path);
				return Zotero.File.putContentsAsync(textCacheFile, parsed.text, "UTF-8");
			})
			.then(function () {
				var stats = {
					indexedChars: parsed.indexedChars,
					totalChars: parsed.totalChars,
					indexedPages: parsed.indexedPages,
					totalPages: parsed.totalPages
				};
				// The final argument (1) is the value stored in the 'synced' column
				Zotero.Fulltext.indexString(parsed.text, "UTF-8", itemID, stats, parsed.version, 1);
			});
		})
		.catch(function (e) {
			Components.utils.reportError(e);
			Zotero.debug(e, 1);
			return false;
		});
	}

	//
	// End full-text content syncing
	//


	/*
	 * Scan a string for another string
	 *
	 * _content_ -- text to search in
	 * _searchText_ -- text pattern to search for
	 * _mode_:
	 *    'regexp' -- regular expression (case-insensitive)
	 *    'regexpCS' -- regular expression (case-sensitive)
	 *    anything else -- case-insensitive substring search
	 *
	 * - Slashes in regex are optional
	 * - Returns up to 50 characters of the content starting at the match,
	 *   or -1 if there's no match
	 */
	this.findTextInString = function (content, searchText, mode) {
		var regexpModes = ['regexp', 'regexpCS', 'regexpBinary', 'regexpCSBinary'];

		// Any other mode: plain, case-insensitive substring search
		if (regexpModes.indexOf(mode) == -1) {
			searchText = searchText.toLowerCase();
			content = content.toLowerCase();

			var pos = content.indexOf(searchText);
			if (pos != -1) {
				Zotero.debug('Text found');
				return content.substr(pos, 50);
			}
			return -1;
		}

		// Strip optional surrounding slashes; user-supplied flags are ignored
		var parts = searchText.match(/^\/(.*)\/([^\/]*)/);
		if (parts) {
			searchText = parts[1];
		}

		// Do a multiline search by default, case-insensitive unless a 'CS' mode
		var flags = mode.indexOf('regexpCS') == -1 ? 'mi' : 'm';

		var matches;
		try {
			matches = new RegExp(searchText, flags).exec(content);
		}
		catch (e) {
			// An invalid pattern is reported and treated as "no match"
			Zotero.debug(e, 1);
			Components.utils.reportError(e);
		}

		if (matches) {
			Zotero.debug("Text found");
			return content.substr(matches.index, 50);
		}

		return -1;
	}

	/*
	 * Scan item files for a text string
	 *
	 * _items_ -- one or more attachment items to search
	 * _searchText_ -- text pattern to search for
	 * _mode_:
	 *    'phrase'
	 *    'regexp'
	 *    'regexpCS' -- case-sensitive regular expression
	 *
	 * Note:
	 *  - Slashes in regex are optional
	 *  - Add 'Binary' to the mode to search all files, not just text files
	 */
	function findTextInItems(items, searchText, mode){
		// Returns an array of { id, match } objects, one per item whose content
		// matched
		if (!searchText){
			return [];
		}

		// Accept a single item as well as an array, as indexItems() does
		if (!Array.isArray(items)) {
			items = [items];
		}
		var itemObjects = Zotero.Items.get(items) || [];
		var found = [];
		var maxLength = Zotero.Prefs.get('fulltext.textMaxLength');
		var binaryMode = mode && mode.indexOf('Binary') != -1;

		for (let i = 0; i < itemObjects.length; i++) {
			let item = itemObjects[i];
			if (!item.isAttachment()) {
				continue;
			}

			let itemID = item.id;
			let content;
			let mimeType = item.attachmentMIMEType;

			if (isCachedMIMEType(mimeType)) {
				// Cached types can only be searched via their cache file
				let file = this.getItemCacheFile(itemID);
				if (!file.exists()) {
					continue;
				}

				Zotero.debug("Searching for text '" + searchText + "' in " + file.path);
				content = Zotero.File.getContents(file, 'utf-8', maxLength);
			}
			else {
				// If not binary mode, only scan plaintext files
				if (!binaryMode) {
					if (!Zotero.MIME.isTextType(mimeType)) {
						Zotero.debug('Not scanning MIME type ' + mimeType, 4);
						continue;
					}
				}

				// Check for a cache file
				let cacheFile = this.getItemCacheFile(itemID);
				if (cacheFile.exists()) {
					Zotero.debug("Searching for text '" + searchText + "' in " + cacheFile.path);
					content = Zotero.File.getContents(cacheFile, 'utf-8', maxLength);
				}
				else {
					// If that doesn't exist, check for the actual file
					let file = item.getFile();
					if (!file) {
						continue;
					}

					Zotero.debug("Searching for text '" + searchText + "' in " + file.path);
					content = Zotero.File.getContents(file, item.attachmentCharset);

					// If HTML and not binary mode, convert to text
					if (mimeType == 'text/html' && !binaryMode) {
						// Include in the cache file only as many characters as we've indexed
						let chars = this.getChars(itemID);

						let obj = convertItemHTMLToText(
							itemID, content, chars ? chars.indexedChars : null
						);
						content = obj.text;
					}
				}
			}

			let match = this.findTextInString(content, searchText, mode);
			if (match != -1) {
				found.push({
					id: itemID,
					match: match
				});
			}
		}

		return found;
	}


	/**
	 * Remove an item's rows from the full-text index
	 *
	 * @param {Integer} itemID
	 * @param {Boolean} [skipCacheClear=FALSE]  If true, leave the item's
	 *                                          full-text cache file in place
	 */
	function clearItemWords(itemID, skipCacheClear) {
		Zotero.DB.beginTransaction();
		var hasRow = Zotero.DB.valueQuery(
			"SELECT rowid FROM fulltextItems WHERE itemID=? LIMIT 1", itemID
		);
		if (hasRow) {
			[
				"DELETE FROM fulltextItemWords WHERE itemID=?",
				"DELETE FROM fulltextItems WHERE itemID=?"
			].forEach(function (deleteSQL) {
				Zotero.DB.query(deleteSQL, itemID);
			});
		}
		Zotero.DB.commitTransaction();

		// Rows were removed, so flag the full-text data for purging
		if (hasRow) {
			Zotero.Prefs.set('purge.fulltext', true);
		}

		if (skipCacheClear) {
			return;
		}
		// Delete fulltext cache file if there is one
		this.clearCacheFile(itemID);
	}


	/**
	 * Get the stored page counts for an item
	 *
	 * @param {Integer} itemID
	 * @param {Boolean} [force]  Unused
	 * @return  Result of Zotero.DB.rowQuery() for the item's fulltextItems row,
	 *          with 'indexedPages' and 'total' columns
	 */
	function getPages(itemID, force) {
		return Zotero.DB.rowQuery(
			"SELECT indexedPages, totalPages AS total FROM fulltextItems WHERE itemID=?",
			itemID
		);
	}


	/*
	 * Gets the number of pages from the PDF info cache file
	 */
	function getTotalPagesFromFile(itemID) {
		// Returns the page count as a string of digits, or FALSE if the info file
		// is missing or has no parsable "Pages:" line
		var infoFile = Zotero.Attachments.getStorageDirectory(itemID);
		infoFile.append(this.pdfInfoCacheFile);
		if (!infoFile.exists()) {
			return false;
		}

		var output = Zotero.File.getContents(infoFile);
		try {
			// Parse pdfinfo output; indexing a failed (null) match throws and is
			// handled below
			return output.match(/Pages:[^0-9]+([0-9]+)/)[1];
		}
		catch (e) {
			Zotero.debug(e);
			return false;
		}
	}


	/**
	 * Get the stored character counts for an item
	 *
	 * @param {Integer} itemID
	 * @return  Result of Zotero.DB.rowQuery() for the item's fulltextItems row,
	 *          with 'indexedChars' and 'total' columns
	 */
	function getChars(itemID) {
		return Zotero.DB.rowQuery(
			"SELECT indexedChars, totalChars AS total FROM fulltextItems WHERE itemID=?",
			itemID
		);
	}


	/*
	 * Gets the number of characters in the PDF converter cache file (for PDFs)
	 * or in the attachment file itself (for all other types)
	 */
	function getTotalCharsFromFile(itemID) {
		// Returns the length of the file's contents, or FALSE if the file to
		// measure doesn't exist
		var item = Zotero.Items.get(itemID);
		var source;

		if (item.attachmentMIMEType === 'application/pdf') {
			// PDFs are measured by their converted text
			source = Zotero.Attachments.getStorageDirectory(itemID);
			source.append(this.pdfConverterCacheFile);
			if (!source.exists()) {
				return false;
			}
		}
		else {
			source = item.getFile();
			if (!source) {
				return false;
			}
		}

		return Zotero.File.getContents(source).length;
	}


	/**
	 * Store the indexed and total page counts for an item
	 *
	 * @param itemID - ID of the item whose fulltextItems row is updated
	 * @param obj - Object with 'indexed' and 'total' properties; a falsy value
	 *     is stored as NULL, anything else is run through parseInt()
	 */
	function setPages(itemID, obj) {
		var indexedPages = null;
		if (obj.indexed) {
			indexedPages = parseInt(obj.indexed);
		}
		
		var totalPages = null;
		if (obj.total) {
			totalPages = parseInt(obj.total);
		}
		
		Zotero.DB.query(
			"UPDATE fulltextItems SET indexedPages=?, totalPages=? WHERE itemID=?",
			[indexedPages, totalPages, itemID]
		);
	}


	/**
	 * Store the indexed and total character counts for an item
	 *
	 * @param itemID - ID of the item whose fulltextItems row is updated
	 * @param obj - Object with 'indexed' and 'total' properties; a falsy value
	 *     is stored as NULL, anything else is run through parseInt()
	 */
	function setChars(itemID, obj) {
		var indexedChars = null;
		if (obj.indexed) {
			indexedChars = parseInt(obj.indexed);
		}
		
		var totalChars = null;
		if (obj.total) {
			totalChars = parseInt(obj.total);
		}
		
		Zotero.DB.query(
			"UPDATE fulltextItems SET indexedChars=?, totalChars=? WHERE itemID=?",
			[indexedChars, totalChars, itemID]
		);
	}


	/**
	 * Determine how completely an attachment has been indexed
	 *
	 * PDFs are judged by their stored page counts (getPages()); every other
	 * MIME type is judged by stored character counts (getChars()).
	 *
	 * @param itemID - ID of an attachment item
	 * @return One of this.INDEX_STATE_UNAVAILABLE (a counts row exists but holds
	 *     neither an indexed nor a total count), this.INDEX_STATE_UNINDEXED (no
	 *     counts row, or no indexed count), this.INDEX_STATE_PARTIAL (indexed
	 *     count below the total) or this.INDEX_STATE_INDEXED
	 * @throws A string message if the item doesn't exist or isn't an attachment
	 */
	function getIndexedState(itemID) {
		var item = Zotero.Items.get(itemID);
		if (!item) {
			throw ("Invalid item " + itemID + " in Zotero.Fulltext.getIndexedState()");
		}
		if (!item.isAttachment()) {
			throw ('Item ' + itemID + ' is not an attachment in Zotero.Fulltext.getIndexedState()');
		}
		
		// Use pages for PDFs, chars for everything else
		var usePages = item.attachmentMIMEType === 'application/pdf';
		var counts = usePages ? this.getPages(itemID) : this.getChars(itemID);
		if (!counts) {
			return this.INDEX_STATE_UNINDEXED;
		}
		
		var indexedCount = usePages ? counts.indexedPages : counts.indexedChars;
		var totalCount = counts.total;
		
		if (!indexedCount) {
			return totalCount
				? this.INDEX_STATE_UNINDEXED
				: this.INDEX_STATE_UNAVAILABLE;
		}
		if (indexedCount < totalCount) {
			return this.INDEX_STATE_PARTIAL;
		}
		return this.INDEX_STATE_INDEXED;
	}


	/**
	 * Check whether an item's indexed state is INDEX_STATE_INDEXED
	 *
	 * @param itemID - ID of an attachment item (must be truthy)
	 * @return {Boolean} True if getIndexedState() reports this.INDEX_STATE_INDEXED
	 * @throws A string message if no itemID is given
	 */
	this.isFullyIndexed = function (itemID) {
		if (itemID) {
			var state = this.getIndexedState(itemID);
			return state == this.INDEX_STATE_INDEXED;
		}
		throw ("itemID not provided in Zotero.Fulltext.isFullyIndexed()");
	};


	/**
	 * Gather counts describing the state of the full-text index
	 *
	 * @return {Object} Object with these properties, each the result of a
	 *     COUNT(*) query:
	 *     indexed   - fulltextItems rows whose indexed pages or chars equal the total
	 *     partial   - fulltextItems rows whose indexed pages or chars are below the total
	 *     unindexed - itemAttachments rows with no indexed pages or chars recorded
	 *     words     - rows in fulltextWords
	 */
	function getIndexStats() {
		// Builds the fulltextItems count query for a given indexed-vs-total comparison
		function fulltextItemsCountSQL(operator) {
			return "SELECT COUNT(*) FROM fulltextItems WHERE "
				+ "(indexedPages IS NOT NULL AND indexedPages" + operator + "totalPages) OR "
				+ "(indexedChars IS NOT NULL AND indexedChars" + operator + "totalChars)";
		}
		
		var stats = {};
		stats.indexed = Zotero.DB.valueQuery(fulltextItemsCountSQL("="));
		stats.partial = Zotero.DB.valueQuery(fulltextItemsCountSQL("<"));
		stats.unindexed = Zotero.DB.valueQuery(
			"SELECT COUNT(*) FROM itemAttachments WHERE itemID NOT IN "
				+ "(SELECT itemID FROM fulltextItems WHERE "
				+ "indexedPages IS NOT NULL OR indexedChars IS NOT NULL)"
		);
		stats.words = Zotero.DB.valueQuery("SELECT COUNT(*) FROM fulltextWords");
		return stats;
	}


	/**
	 * Get the full-text cache file location for an item
	 *
	 * @param itemID - ID of the item
	 * @return The item's storage directory object with the converter cache
	 *     filename appended; existence of the file is not checked here
	 */
	this.getItemCacheFile = function (itemID) {
		var file = Zotero.Attachments.getStorageDirectory(itemID);
		file.append(self.pdfConverterCacheFile);
		return file;
	};


	/**
	 * Get the processor cache file location for an item
	 *
	 * @param itemID - ID of the item
	 * @return The item's storage directory object with the processor cache
	 *     filename appended; existence of the file is not checked here
	 */
	this.getItemProcessorCacheFile = function (itemID) {
		var file = Zotero.Attachments.getStorageDirectory(itemID);
		file.append(_processorCacheFile);
		return file;
	};


	/**
	 * Returns true if an item can be reindexed
	 *
	 * The item must exist, be an attachment and not be a linked URL. Any of the
	 * four known index states qualifies, including INDEX_STATE_INDEXED, so
	 * already-indexed attachments can be reindexed too.
	 *
	 * @param itemID - ID of the item to check
	 * @return {Boolean}
	 */
	function canReindex(itemID) {
		var item = Zotero.Items.get(itemID);
		if (!item) {
			return false;
		}
		if (!item.isAttachment()) {
			return false;
		}
		if (item.attachmentLinkMode == Zotero.Attachments.LINK_MODE_LINKED_URL) {
			return false;
		}
		
		var state = this.getIndexedState(itemID);
		// TODO: automatically reindex already-indexed attachments?
		return state === this.INDEX_STATE_UNAVAILABLE
			|| state === this.INDEX_STATE_UNINDEXED
			|| state === this.INDEX_STATE_PARTIAL
			|| state === this.INDEX_STATE_INDEXED;
	}


	/**
	 * Drop and regenerate the full-text index for attachments
	 *
	 * Linked-URL attachments are never touched. The existing fulltextItemWords
	 * and fulltextItems rows of the selected attachments are deleted and the
	 * attachments are handed to indexItems(), all within one transaction.
	 *
	 * @param unindexedOnly - If truthy, only process attachments that have no
	 *     indexed chars or pages recorded in fulltextItems
	 */
	function rebuildIndex(unindexedOnly){
		Zotero.DB.beginTransaction();
		
		// Get all attachments other than web links
		var targetSQL = "SELECT itemID FROM itemAttachments WHERE linkMode!="
			+ Zotero.Attachments.LINK_MODE_LINKED_URL;
		if (unindexedOnly) {
			targetSQL += " AND itemID NOT IN (SELECT itemID FROM fulltextItems "
				+ "WHERE indexedChars IS NOT NULL OR indexedPages IS NOT NULL)";
		}
		
		var itemIDs = Zotero.DB.columnQuery(targetSQL);
		if (itemIDs) {
			var scope = " WHERE itemID IN (" + targetSQL + ")";
			Zotero.DB.query("DELETE FROM fulltextItemWords" + scope);
			Zotero.DB.query("DELETE FROM fulltextItems" + scope);
			this.indexItems(itemIDs, false, true);
		}
		
		Zotero.DB.commitTransaction();
	}


	/**
	 * Clears the full-text word index and the full-text cache files
	 *
	 * @param skipLinkedURLs - If truthy, index rows belonging to linked-URL
	 *     attachments are kept, only words no longer referenced by any item are
	 *     removed, and the flag is forwarded to clearCacheFiles(). Otherwise
	 *     the whole index, the word table and all cache files are cleared.
	 */
	function clearIndex(skipLinkedURLs) {
		Zotero.DB.beginTransaction();
		
		// Restricts the deletions to everything except linked-URL attachments
		var scope = "";
		if (skipLinkedURLs) {
			var linkSQL = "SELECT itemID FROM itemAttachments WHERE linkMode ="
				+ Zotero.Attachments.LINK_MODE_LINKED_URL;
			scope = " WHERE itemID NOT IN (" + linkSQL + ")";
		}
		
		Zotero.DB.query("DELETE FROM fulltextItems" + scope);
		Zotero.DB.query("DELETE FROM fulltextItemWords" + scope);
		
		if (skipLinkedURLs) {
			// purgeUnusedWords() returns early unless this pref is set, which
			// would leave the words orphaned by the deletions above in place
			Zotero.Prefs.set('purge.fulltext', true);
			this.purgeUnusedWords();
		}
		else {
			Zotero.DB.query("DELETE FROM fulltextWords");
		}
		
		// Keep cache handling consistent with the index rows that were preserved
		this.clearCacheFiles(skipLinkedURLs);
		
		Zotero.DB.commitTransaction();
	}


	/**
	 * Delete the full-text cache file of an attachment, if there is one
	 *
	 * Does nothing if the item doesn't exist; non-attachment items are only
	 * reported via Zotero.debug(). Errors raised while removing the file are
	 * passed to Zotero.File.checkFileAccessError().
	 *
	 * @param itemID - ID of the attachment item
	 */
	function clearCacheFile(itemID) {
		var attachment = Zotero.Items.get(itemID);
		if (!attachment) {
			return;
		}
		
		if (!attachment.isAttachment()) {
			Zotero.debug("Item " + itemID + " is not an attachment in Zotero.Fulltext.clearCacheFile()");
			return;
		}
		
		Zotero.debug('Clearing full-text cache file for item ' + itemID);
		var target = this.getItemCacheFile(itemID);
		if (!target.exists()) {
			return;
		}
		
		try {
			target.remove(false);
		}
		catch (e) {
			Zotero.File.checkFileAccessError(e, target, 'delete');
		}
	}


	/**
	 * Clear cache files for all attachments
	 *
	 * @param skipLinkedURLs - If truthy, linked-URL attachments are left alone
	 */
	function clearCacheFiles(skipLinkedURLs) {
		var sql = "SELECT itemID FROM itemAttachments";
		if (skipLinkedURLs) {
			sql += " WHERE linkMode != " + Zotero.Attachments.LINK_MODE_LINKED_URL;
		}
		
		var itemIDs = Zotero.DB.columnQuery(sql);
		// Nothing to clear. rebuildIndex() guards its columnQuery() result the
		// same way, so don't assume an array comes back when no rows match.
		if (!itemIDs) {
			return;
		}
		
		for (var i=0; i<itemIDs.length; i++) {
			this.clearCacheFile(itemIDs[i]);
		}
	}


	/*
	function clearItemContent(itemID){
		Zotero.DB.query("DELETE FROM fulltextContent WHERE itemID=" + itemID);
	}
	*/


	/**
	 * Delete words that are no longer referenced by any indexed item
	 *
	 * Only runs when the 'purge.fulltext' pref is set, and resets that pref to
	 * false once the deletion has been issued.
	 */
	function purgeUnusedWords() {
		var purgeRequested = Zotero.Prefs.get('purge.fulltext');
		if (purgeRequested) {
			Zotero.DB.query(
				"DELETE FROM fulltextWords WHERE wordID NOT IN "
					+ "(SELECT wordID FROM fulltextItemWords)"
			);
			Zotero.Prefs.set('purge.fulltext', false);
		}
	}


	/**
	 * Convert an item's HTML to plain text and cache the result on disk
	 *
	 * The cache file is written asynchronously; a failed write is logged and
	 * reported but does not affect the return value.
	 *
	 * @param itemID - ID of the item the HTML belongs to
	 * @param {String} html - HTML to convert
	 * @param maxLength - If truthy, the returned and cached text is cut off
	 *     after this many characters
	 * @return {Object} 'text' holds the (possibly truncated) text and
	 *     'totalChars' the length of the text before truncation
	 */
	function convertItemHTMLToText(itemID, html, maxLength) {
		// Put a space after every tag so that words in adjacent elements
		// aren't concatenated once the markup is stripped
		var spacedHTML = html.replace(/>/g, '> ');
		
		var converted = HTMLToText(spacedHTML);
		var result = {
			text: maxLength ? converted.substring(0, maxLength) : converted,
			totalChars: converted.length
		};
		
		// Write the converted text to a cache file
		var target = Zotero.Fulltext.getItemCacheFile(itemID);
		Zotero.debug("Writing converted full-text HTML content to " + target.path);
		if (!target.parent.exists()) {
			Zotero.Attachments.createDirectoryForItem(itemID);
		}
		Zotero.File.putContentsAsync(target, result.text)
		.catch(function (e) {
			Zotero.debug(e, 1);
			Components.utils.reportError(e);
		});
		
		return result;
	}

	/**
	 * Convert HTML to plain text using the platform's HTML format converter
	 *
	 * @param {String} html - HTML to convert
	 * @return {String} The converted text, or the unmodified input if the
	 *     conversion throws (the error is logged via Zotero.debug())
	 */
	function HTMLToText(html) {
		var Cc = Components.classes;
		var Ci = Components.interfaces;
		
		var converter = Cc['@mozilla.org/widget/htmlformatconverter;1']
			.createInstance(Ci.nsIFormatConverter);
		var input = Cc['@mozilla.org/supports-string;1']
			.createInstance(Ci.nsISupportsString);
		input.data = html;
		
		// The converter hands its result back through this out-parameter
		var output = { value: null };
		try {
			converter.convert(
				'text/html', input, input.toString().length, 'text/unicode', output, {}
			);
			var converted = output.value.QueryInterface(Ci.nsISupportsString);
			return converted.toString();
		}
		catch(e) {
			Zotero.debug(e, 1);
			return html;
		}
	}


	/**
	 * Split text into its unique, lowercased semantic units (words)
	 *
	 * @param {String} text - Text to split
	 * @param charset - Passed to the scanner's start(); null is used if falsy
	 * @return {String[]|undefined} The unique lowercased units, or undefined
	 *     (after a debug message) if there is no text
	 */
	function semanticSplitter(text, charset){
		if (!text){
			Zotero.debug('No text to index');
			return;
		}
		
		text = _markTroubleChars(text);
		
		var serv = Components.classes["@mozilla.org/intl/semanticunitscanner;1"]
				.createInstance(Components.interfaces.nsISemanticUnitScanner);
		
		// Prototype-less map: with a plain {} the existence check would see
		// inherited members such as Object.prototype.constructor, and the
		// word "constructor" would never make it into the index
		var unique = Object.create(null);
		var begin = {}, end = {}, nextPos = 0, next;
		
		serv.start(charset ? charset : null);
		do {
			next = serv.next(text, text.length, nextPos, true, begin, end);
			var str = text.substring(begin.value, end.value);
			
			// Advance past this unit and hand the scanner fresh out-params
			nextPos = end.value;
			begin = {};
			end = {};
			
			// Skip empty units and units starting with a space or non-breaking space
			if (!str || str.charCodeAt(0)==32 || str.charCodeAt(0)==160){
				continue;
			}
			
			unique[str.toLowerCase()] = true;
		}
		while (next);
		
		var words = [];
		for (var word in unique){
			words.push(_restoreTroubleChars(word));
		}
		
		return words;
	}


	/*
	 * Insert a space text node in front of a node, each of its following
	 * siblings and, recursively, all of their descendants, since HTMLToText
	 * doesn't add spaces between elements
	 *
	 * NOTE: SLOW AND NOT USED!
	 */
	function _separateElements(node){
		var current = node;
		while (true) {
			// Descendants are handled before the node itself
			if (current.hasChildNodes()){
				_separateElements(current.firstChild);
			}
			
			var spacer = node.ownerDocument.createTextNode(' ');
			current.parentNode.insertBefore(spacer, current);
			
			current = current.nextSibling;
			if (!current) {
				break;
			}
		}
	}


	/**
	 * Replace every apostrophe with a placeholder token
	 *
	 * Counterpart of _restoreTroubleChars(). All occurrences are replaced; a
	 * string pattern passed to replace() would only affect the first one.
	 *
	 * @param {String} text
	 * @return {String} Text with each "'" replaced by "zoteroapostrophe"
	 */
	function _markTroubleChars(text){
		return text.replace(/'/g, "zoteroapostrophe");
	}
	
	
	/**
	 * Turn every placeholder token produced by _markTroubleChars() back into
	 * an apostrophe
	 *
	 * @param {String} text
	 * @return {String} Text with each "zoteroapostrophe" replaced by "'"
	 */
	function _restoreTroubleChars(text){
		return text.replace(/zoteroapostrophe/g, "'");
	}
}