Sped up fulltext indexing (of loaded documents, at least) by about 75%

- Switched to manually repeated bound parameters in indexWords()
- Switched to the innerHTML regex used elsewhere instead of a more proper but nevertheless misguided DOM traverser to split elements in indexDocument

This may invalidate the fulltext progress indicator ticket
This commit is contained in:
Dan Stillman 2006-09-23 09:05:01 +00:00
parent 3f0fb0e4e6
commit 27d1d63bfc
2 changed files with 80 additions and 24 deletions

View File

@ -12,15 +12,18 @@ Scholar.DB = new function(){
this.rowQuery = rowQuery;
this.columnQuery = columnQuery;
this.statementQuery = statementQuery;
this.getColumns = getColumns;
this.getColumnHash = getColumnHash;
this.getNextID = getNextID;
this.getNextName = getNextName;
this.getStatement = getStatement;
this.getLastInsertID = getLastInsertID;
this.getLastErrorString = getLastErrorString;
this.beginTransaction = beginTransaction;
this.commitTransaction = commitTransaction;
this.rollbackTransaction = rollbackTransaction;
this.transactionInProgress = transactionInProgress;
this.tableExists = tableExists;
this.getColumns = getColumns;
this.getColumnHash = getColumnHash;
this.getNextID = getNextID;
this.getNextName = getNextName;
/////////////////////////////////////////////////////////////////
//
@ -242,6 +245,47 @@ Scholar.DB = new function(){
}
/*
* Get a raw mozStorage statement from the DB for manual processing
*
* This should only be needed for manual parameter binding for
* large repeated queries
*/
function getStatement(sql){
var db = _getDBConnection();
try {
Scholar.debug(sql,5);
var statement = db.createStatement(sql);
}
catch (e){
var dberr = (db.lastErrorString!='not an error')
? ' [ERROR: ' + db.lastErrorString + ']' : '';
throw(e + ' [QUERY: ' + sql + ']' + dberr);
}
return statement;
}
/*
* Only for use with getStatement()
*/
function getLastInsertID(){
var db = _getDBConnection();
return db.lastInsertRowID;
}
/*
* Only for use with getStatement()
*/
function getLastErrorString(){
var db = _getDBConnection();
return db.lastErrorString;
}
function beginTransaction(){
var db = _getDBConnection();

View File

@ -10,7 +10,7 @@ Scholar.Fulltext = new function(){
this.cacheIsOutdated = cacheIsOutdated;
this.rebuildCache = rebuildCache;
this.clearItemWords = clearItemWords;
this.clearItemContent = clearItemContent;
//this.clearItemContent = clearItemContent;
this.purgeUnusedWords = purgeUnusedWords;
this.HTMLToText = HTMLToText;
this.semanticSplitter = semanticSplitter;
@ -32,7 +32,7 @@ Scholar.Fulltext = new function(){
var sql = "SELECT itemID FROM itemAttachments";
var items = Scholar.DB.columnQuery(sql);
this.indexItems(items);
indexItems(items);
Scholar.DB.commitTransaction();
}
@ -63,7 +63,7 @@ Scholar.Fulltext = new function(){
* Index multiple words at once
*/
function indexWords(itemID, words){
if (!words.length){
if (!words || !words.length){
return false;
}
@ -87,31 +87,39 @@ Scholar.Fulltext = new function(){
existing['_' + wordIDs[i]['word']] = wordIDs[i]['wordID'];
}
// TODO: use repeated bound statements once db.js supports it
// Handle bound parameters manually for optimal speed
var statement1 = Scholar.DB.getStatement("INSERT INTO fulltextWords (word) VALUES (?)");
var statement2 = Scholar.DB.getStatement("INSERT OR IGNORE INTO fulltextItems VALUES (?,?)");
statement2.bindInt32Parameter(1, itemID);
for each(var word in words){
if (existing['_' + word]){
var wordID = existing['_' + word];
}
else {
var sql = "INSERT INTO fulltextWords (word) VALUES (?)";
var wordID = Scholar.DB.query(sql, {string:word});
statement1.bindUTF8StringParameter(0, word);
statement1.execute()
var wordID = Scholar.DB.getLastInsertID();
}
var sql = "INSERT OR IGNORE INTO fulltextItems VALUES (?,?)";
Scholar.DB.query(sql, [{int:wordID}, {int:itemID}]);
statement2.bindInt32Parameter(0, wordID);
statement2.execute();
}
statement1.reset();
statement2.reset();
Scholar.DB.commitTransaction();
}
function indexString(text, charset, itemID){
var words = this.semanticSplitter(text, charset);
var words = semanticSplitter(text, charset);
Scholar.DB.beginTransaction();
this.clearItemWords(itemID);
this.indexWords(itemID, words);
clearItemWords(itemID);
indexWords(itemID, words);
/*
var sql = "REPLACE INTO fulltextContent (itemID, textContent) VALUES (?,?)";
@ -129,9 +137,9 @@ Scholar.Fulltext = new function(){
Scholar.debug("Indexing document '" + document.title + "'");
_separateElements(document.body);
var text = this.HTMLToText(document.body.innerHTML);
this.indexString(text, document.characterSet, itemID);
var text = document.body.innerHTML.replace(/(>)/g, '$1 ');
text = HTMLToText(text);
indexString(text, document.characterSet, itemID);
}
@ -154,8 +162,8 @@ Scholar.Fulltext = new function(){
var text = Scholar.File.getContents(file, charset);
// Split elements to avoid word concatentation
text = text.replace(/(>)/g, '$1 ');
text = this.HTMLToText(text);
this.indexString(text, charset, itemID);
text = HTMLToText(text);
indexString(text, charset, itemID);
}
@ -175,7 +183,7 @@ Scholar.Fulltext = new function(){
continue;
}
this.indexFile(file, i.getAttachmentMimeType(),
indexFile(file, i.getAttachmentMimeType(),
i.getAttachmentCharset(), i.getID());
}
@ -208,7 +216,7 @@ Scholar.Fulltext = new function(){
str = str.replace(/(>)/g, '$1 ');
// Parse to avoid searching on HTML
str = this.HTMLToText(str);
str = HTMLToText(str);
}
switch (mode){
@ -294,7 +302,7 @@ Scholar.Fulltext = new function(){
var charset = i.getAttachmentCharset();
var match = this.findTextInFile(file, charset, searchText, mode);
var match = findTextInFile(file, charset, searchText, mode);
if (match != -1){
found.push({id:i.getID(), match:match});
@ -310,9 +318,11 @@ Scholar.Fulltext = new function(){
}
/*
function clearItemContent(itemID){
Scholar.DB.query("DELETE FROM fulltextContent WHERE itemID=" + itemID);
}
*/
function purgeUnusedWords(){
@ -389,7 +399,9 @@ Scholar.Fulltext = new function(){
/*
* Add spaces between elements, since body.textContent doesn't
* Add spaces between elements, since HTMLToText doesn't
*
* NOTE: SLOW AND NOT USED!
*/
function _separateElements(node){
var next = node;