Better handling of maxLength and HTML

- Don't truncate before HTML conversion
- Correctly calculate indexed chars and total chars
- Move HTML conversion code into one function
This commit is contained in:
Dan Stillman 2013-11-04 04:34:51 -05:00
parent 0abd903917
commit a89388e77a

View File

@ -54,7 +54,6 @@ Zotero.Fulltext = new function(){
this.clearCacheFiles = clearCacheFiles; this.clearCacheFiles = clearCacheFiles;
//this.clearItemContent = clearItemContent; //this.clearItemContent = clearItemContent;
this.purgeUnusedWords = purgeUnusedWords; this.purgeUnusedWords = purgeUnusedWords;
this.HTMLToText = HTMLToText;
this.semanticSplitter = semanticSplitter; this.semanticSplitter = semanticSplitter;
this.__defineGetter__("pdfToolsDownloadBaseURL", function() { return 'http://www.zotero.org/download/xpdf/'; }); this.__defineGetter__("pdfToolsDownloadBaseURL", function() { return 'http://www.zotero.org/download/xpdf/'; });
@ -358,38 +357,25 @@ Zotero.Fulltext = new function(){
return false; return false;
} }
var text = document.body.innerHTML;
var maxLength = Zotero.Prefs.get('fulltext.textMaxLength'); var maxLength = Zotero.Prefs.get('fulltext.textMaxLength');
if (text.length > maxLength) { var obj = convertItemHTMLToText(itemID, document.body.innerHTML, maxLength);
var text = obj.text;
var totalChars = obj.totalChars;
if (totalChars > maxLength) {
Zotero.debug('Only indexing first ' + maxLength + ' characters of item ' Zotero.debug('Only indexing first ' + maxLength + ' characters of item '
+ itemID + ' in indexDocument()'); + itemID + ' in indexDocument()');
text = text.substr(0, maxLength);
} }
text = text.replace(/(>)/g, '$1 ');
text = this.HTMLToText(text);
this.indexString(text, document.characterSet, itemID); this.indexString(text, document.characterSet, itemID);
var charsIndexed = Math.min(maxLength, text.length); this.setChars(itemID, { indexed: text.length, total: totalChars });
this.setChars(itemID, { indexed: charsIndexed, total: text.length });
// Write the converted text to a cache file
Q.fcall(function () {
let cacheFile = self.getItemCacheFile(itemID);
Zotero.debug("Writing converted full-text HTML content to " + cacheFile.path);
if (!cacheFile.parent.exists()) {
Zotero.Attachments.createDirectoryForItem(itemID);
}
return Zotero.File.putContentsAsync(cacheFile, text);
})
.catch(function (e) {
Zotero.debug(e, 1);
Components.utils.reportError(e);
})
} }
function indexFile(file, mimeType, charset, itemID, maxLength, isCacheFile) { /**
* @param {Boolean} [complete=FALSE] Index the file in its entirety, ignoring maxLength
*/
function indexFile(file, mimeType, charset, itemID, complete, isCacheFile) {
if (!file.exists()){ if (!file.exists()){
Zotero.debug('File not found in indexFile()', 2); Zotero.debug('File not found in indexFile()', 2);
return false; return false;
@ -402,18 +388,10 @@ Zotero.Fulltext = new function(){
return false; return false;
} }
if (maxLength == undefined || maxLength === true) {
maxLength = Zotero.Prefs.get('fulltext.textMaxLength');
}
// If maxLength is explicitly false, index everything
else if (maxLength === false || maxLength === null) {
maxLength = false;
}
if (mimeType == 'application/pdf') { if (mimeType == 'application/pdf') {
try { try {
Zotero.UnresponsiveScriptIndicator.disable(); Zotero.UnresponsiveScriptIndicator.disable();
return this.indexPDF(file, itemID, !maxLength); return this.indexPDF(file, itemID, complete);
} }
finally { finally {
Zotero.UnresponsiveScriptIndicator.enable(); Zotero.UnresponsiveScriptIndicator.enable();
@ -432,29 +410,27 @@ Zotero.Fulltext = new function(){
Zotero.debug('Indexing file ' + file.path); Zotero.debug('Indexing file ' + file.path);
var text = Zotero.File.getContents(file, charset, maxLength); var text = Zotero.File.getContents(file, charset);
// Split elements to avoid word concatenation var totalChars = text.length;
text = text.replace(/(>)/g, '$1 '); var maxLength = complete ? false : Zotero.Prefs.get('fulltext.textMaxLength');
text = this.HTMLToText(text);
if (mimeType == 'text/html') {
let obj = convertItemHTMLToText(itemID, text, maxLength);
text = obj.text;
totalChars = obj.totalChars;
}
else {
if (maxLength && text.length > maxLength) {
text = text.substr(0, maxLength);
}
}
this.indexString(text, charset, itemID); this.indexString(text, charset, itemID);
// Record number of characters indexed // Record the number of characters indexed (unless we're indexing a (PDF) cache file,
// in which case the stats are coming from elsewhere)
if (!isCacheFile) { if (!isCacheFile) {
try { this.setChars(itemID, { indexed: text.length, total: totalChars });
var totalChars = this.getTotalCharsFromFile(itemID);
}
catch (e) {
Zotero.debug(e);
Components.utils.reportError(e);
totalChars = 0;
}
if (maxLength) {
var charsIndexed = Math.min(maxLength, totalChars);
}
else {
var charsIndexed = totalChars;
}
this.setChars(itemID, { indexed: charsIndexed, total: totalChars });
} }
return true; return true;
@ -550,7 +526,7 @@ Zotero.Fulltext = new function(){
} }
Zotero.DB.beginTransaction(); Zotero.DB.beginTransaction();
this.indexFile(cacheFile, 'text/plain', 'utf-8', itemID, false, true); this.indexFile(cacheFile, 'text/plain', 'utf-8', itemID, true, true);
this.setPages(itemID, { indexed: pagesIndexed, total: totalPages }); this.setPages(itemID, { indexed: pagesIndexed, total: totalPages });
Zotero.DB.commitTransaction(); Zotero.DB.commitTransaction();
return true; return true;
@ -581,7 +557,7 @@ Zotero.Fulltext = new function(){
if (ignoreErrors) { if (ignoreErrors) {
try { try {
this.indexFile(file, item.attachmentMIMEType, item.attachmentCharset, itemID, !complete); this.indexFile(file, item.attachmentMIMEType, item.attachmentCharset, itemID, complete);
} }
catch (e) { catch (e) {
Zotero.debug(e, 1); Zotero.debug(e, 1);
@ -590,7 +566,7 @@ Zotero.Fulltext = new function(){
} }
} }
else { else {
this.indexFile(file, item.attachmentMIMEType, item.attachmentCharset, itemID, !complete); this.indexFile(file, item.attachmentMIMEType, item.attachmentCharset, itemID, complete);
} }
} }
@ -646,29 +622,18 @@ Zotero.Fulltext = new function(){
} }
Zotero.debug("Adding full-text content from file for item " + libraryKey); Zotero.debug("Adding full-text content from file for item " + libraryKey);
text = Zotero.File.getContents(file, item.attachmentCharset, maxLength); text = Zotero.File.getContents(file, item.attachmentCharset);
// If HTML, convert to plain text first, and cache the result // If HTML, convert to plain text first, and cache the result
if (item.attachmentMIMEType == 'text/html') { if (item.attachmentMIMEType == 'text/html') {
// Split elements to avoid word concatenation let obj = convertItemHTMLToText(
text = text.replace(/(>)/g, '$1 '); itemID,
text,
text = this.HTMLToText(text); // Include in the cache file only as many characters as we
// indexed previously
// Include in the cache file only as many characters as we've indexed row.indexedChars
text = text.substr(0, row.indexedChars); );
text = obj.text;
// Write the converted text to a cache file
Zotero.debug("Writing converted full-text HTML content to "
+ cacheFile.path);
if (!cacheFile.parent.exists()) {
Zotero.Attachments.createDirectoryForItem(itemID);
}
Zotero.File.putContentsAsync(cacheFile, text)
.catch(function (e) {
Zotero.debug(e, 1);
Components.utils.reportError(e);
});
} }
else { else {
// Include only as many characters as we've indexed // Include only as many characters as we've indexed
@ -1079,31 +1044,17 @@ Zotero.Fulltext = new function(){
} }
Zotero.debug("Searching for text '" + searchText + "' in " + file.path); Zotero.debug("Searching for text '" + searchText + "' in " + file.path);
content = Zotero.File.getContents(file, item.attachmentCharset, maxLength); content = Zotero.File.getContents(file, item.attachmentCharset);
// If HTML and not binary mode, convert to text // If HTML and not binary mode, convert to text
if (mimeType == 'text/html' && !binaryMode) { if (mimeType == 'text/html' && !binaryMode) {
// Split elements to avoid word concatenation
content = content.replace(/(>)/g, '$1 ');
content = this.HTMLToText(content);
// Include in the cache file only as many characters as we've indexed // Include in the cache file only as many characters as we've indexed
let chars = this.getChars(itemID); let chars = this.getChars(itemID);
if (chars && chars.indexedChars) {
content = content.substr(0, chars.indexedChars);
}
// Write the converted text to a cache file for future searches let obj = convertItemHTMLToText(
Zotero.debug("Writing converted full-text content to " + cacheFile.path); itemID, content, chars ? chars.indexedChars : null
if (!cacheFile.parent.exists()) { );
Zotero.Attachments.createDirectoryForItem(itemID); content = obj.text;
}
Zotero.File.putContentsAsync(cacheFile, content)
.catch(function (e) {
Zotero.debug(e, 1);
Components.utils.reportError(e);
})
} }
} }
} }
@ -1482,23 +1433,53 @@ Zotero.Fulltext = new function(){
} }
function HTMLToText(text){ /**
var nsIFC = * Convert HTML to text for an item and cache the result
Components.classes['@mozilla.org/widget/htmlformatconverter;1']. */
createInstance(Components.interfaces.nsIFormatConverter); function convertItemHTMLToText(itemID, html, maxLength) {
var from = Components.classes['@mozilla.org/supports-string;1']. // Split elements to avoid word concatenation
createInstance(Components.interfaces.nsISupportsString); html = html.replace(/>/g, '> ');
from.data = text;
var to = {value:null}; var text = HTMLToText(html);
var totalChars = text.length;
if (maxLength) {
text = text.substr(0, maxLength);
}
// Write the converted text to a cache file
var cacheFile = Zotero.Fulltext.getItemCacheFile(itemID);
Zotero.debug("Writing converted full-text HTML content to " + cacheFile.path);
if (!cacheFile.parent.exists()) {
Zotero.Attachments.createDirectoryForItem(itemID);
}
Zotero.File.putContentsAsync(cacheFile, text)
.catch(function (e) {
Zotero.debug(e, 1);
Components.utils.reportError(e);
});
return {
text: text,
totalChars: totalChars
};
}
function HTMLToText(html) {
var nsIFC = Components.classes['@mozilla.org/widget/htmlformatconverter;1']
.createInstance(Components.interfaces.nsIFormatConverter);
var from = Components.classes['@mozilla.org/supports-string;1']
.createInstance(Components.interfaces.nsISupportsString);
from.data = html;
var to = { value: null };
try { try {
nsIFC.convert('text/html', from, from.toString().length, nsIFC.convert('text/html', from, from.toString().length, 'text/unicode', to, {});
'text/unicode', to, {});
to = to.value.QueryInterface(Components.interfaces.nsISupportsString); to = to.value.QueryInterface(Components.interfaces.nsISupportsString);
return to.toString(); return to.toString();
} }
catch(e){ catch(e) {
Zotero.debug(e, 1); Zotero.debug(e, 1);
return text; return html;
} }
} }