
Added "Save Link As Zotero Snapshot" and "Save Image As Zotero Snapshot" options to the browser content context menu where appropriate.

Other fixes:
- Implemented standalone image and plugin snapshots the right way (as opposed to the fairly broken way from yesterday)
- Only natively handled files are loaded into a hidden browser when using importFromURL() -- plugin files are now saved directly with saveURI()
- indexDocument() doesn't try to index non-text files

Notes:
- There's no feedback when saving large files, which will likely be a bit confusing for users -- one option would be to put the transfer into the downloads window, though that's a little weird.
- I suspect this will fix the reported JSTOR PDF download issue (http://forums.zotero.org/discussion/217/), though I don't currently have a way of testing it.
466 lines
11 KiB
JavaScript
466 lines
11 KiB
JavaScript
/*
|
|
***** BEGIN LICENSE BLOCK *****
|
|
|
|
Copyright (c) 2006 Center for History and New Media
|
|
George Mason University, Fairfax, Virginia, USA
|
|
http://chnm.gmu.edu
|
|
|
|
Licensed under the Educational Community License, Version 1.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.opensource.org/licenses/ecl1.php
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
|
|
***** END LICENSE BLOCK *****
|
|
*/
|
|
|
|
Zotero.Fulltext = new function(){
|
|
// Public methods: each inner function assigned here becomes a property
// of the Zotero.Fulltext object
this.indexWord = indexWord;
|
|
this.indexWords = indexWords;
|
|
this.indexDocument = indexDocument;
|
|
this.indexString = indexString;
|
|
this.indexFile = indexFile;
|
|
this.indexItems = indexItems;
|
|
this.findTextInFile = findTextInFile;
|
|
this.findTextInItems = findTextInItems;
|
|
this.cacheIsOutdated = cacheIsOutdated;
|
|
this.rebuildCache = rebuildCache;
|
|
this.clearItemWords = clearItemWords;
|
|
//this.clearItemContent = clearItemContent;
|
|
this.purgeUnusedWords = purgeUnusedWords;
|
|
this.HTMLToText = HTMLToText;
|
|
this.semanticSplitter = semanticSplitter;
|
|
|
|
// Version of the full-text index: cacheIsOutdated() compares it with the
// 'fulltext' row of the version table, and indexItems() writes it back
const FULLTEXT_VERSION = 1;
|
|
|
|
|
|
/*
 * Report whether the stored full-text index is older than FULLTEXT_VERSION
 *
 * Reads the 'fulltext' row of the version table and returns true when
 * the stored value is lower than FULLTEXT_VERSION
 */
function cacheIsOutdated(){
    var storedVersion = Zotero.DB.valueQuery(
        "SELECT version FROM version WHERE schema='fulltext'"
    );
    return storedVersion < FULLTEXT_VERSION;
}
|
|
|
|
|
|
/*
 * Rebuild the full-text index from scratch
 *
 * Empties fulltextWords and fulltextItems, then passes every itemID found
 * in itemAttachments to indexItems(), all inside one DB transaction
 */
function rebuildCache(){
    var db = Zotero.DB;
    
    db.beginTransaction();
    
    db.query("DELETE FROM fulltextWords");
    db.query("DELETE FROM fulltextItems");
    //db.query("DELETE FROM fulltextContent");
    
    var attachmentIDs = db.columnQuery("SELECT itemID FROM itemAttachments");
    indexItems(attachmentIDs);
    
    db.commitTransaction();
}
|
|
|
|
|
|
/*
 * Index a single word
 *
 * itemID -- ID of the item the word is linked to
 * word   -- the word to index
 *
 * Looks the word up in fulltextWords, inserting it if no wordID comes
 * back, then links the wordID to the item in fulltextItems
 */
function indexWord(itemID, word){
    var db = Zotero.DB;
    
    db.beginTransaction();
    
    var wordID = db.valueQuery(
        "SELECT wordID FROM fulltextWords WHERE word=?", {string:word}
    );
    
    if (!wordID){
        // The value returned by query() for the INSERT is used as the
        // wordID of the new row
        wordID = db.query(
            "INSERT INTO fulltextWords (word) VALUES (?)", {string:word}
        );
    }
    
    db.query(
        "INSERT OR IGNORE INTO fulltextItems VALUES (?,?)", [wordID, itemID]
    );
    
    db.commitTransaction();
}
|
|
|
|
|
|
/*
 * Index multiple words at once
 *
 * itemID -- ID of the item the words are linked to
 * words  -- array of word strings
 *
 * Returns false (without touching the DB) if itemID or words is missing
 * or words is empty; otherwise returns nothing.
 *
 * Existing words are looked up in fulltextWords first; words that are not
 * there yet are inserted, and every word is then linked to the item in
 * fulltextItems.
 */
function indexWords(itemID, words){
    if (!words || !words.length || !itemID){
        return false;
    }
    
    // SQLite's default limit on bound parameters per statement
    // (SQLITE_MAX_VARIABLE_NUMBER) is 999, so the lookup of existing
    // words is done in batches rather than in one huge IN (...) list
    var maxLookupParams = 999;
    
    // word -> wordID. Keys carry a ':' prefix so that no word can collide
    // with a built-in property name such as "constructor"
    var knownIDs = {};
    
    Zotero.DB.beginTransaction();
    
    for (var start = 0; start < words.length; start += maxLookupParams){
        var stop = Math.min(start + maxLookupParams, words.length);
        var placeholders = [];
        var lookupParams = [];
        
        for (var i = start; i < stop; i++){
            placeholders.push('?');
            lookupParams.push({string:words[i]});
        }
        
        var sql = "SELECT word, wordID from fulltextWords WHERE word IN ("
            + placeholders.join() + ")";
        var rows = Zotero.DB.query(sql, lookupParams);
        
        // for...in is kept from the original: it also copes with a
        // non-array result (nothing is enumerated for false)
        for (var r in rows){
            knownIDs[':' + rows[r]['word']] = rows[r]['wordID'];
        }
    }
    
    // Handle bound parameters manually for optimal speed
    var statement1 = Zotero.DB.getStatement("INSERT INTO fulltextWords (word) VALUES (?)");
    var statement2 = Zotero.DB.getStatement("INSERT OR IGNORE INTO fulltextItems VALUES (?,?)");
    
    for (var j = 0; j < words.length; j++){
        var word = words[j];
        var wordID = knownIDs[':' + word];
        
        if (!wordID){
            statement1.bindUTF8StringParameter(0, word);
            statement1.execute();
            wordID = Zotero.DB.getLastInsertID();
            
            // Remember the new ID so a word that appears twice in the
            // list is not inserted a second time
            knownIDs[':' + word] = wordID;
        }
        
        statement2.bindInt32Parameter(0, wordID);
        statement2.bindInt32Parameter(1, itemID);
        statement2.execute();
    }
    
    statement1.reset();
    statement2.reset();
    
    Zotero.DB.commitTransaction();
}
|
|
|
|
|
|
/*
 * Replace an item's indexed words with the words found in a string
 *
 * text    -- text to index
 * charset -- passed through to semanticSplitter()
 * itemID  -- ID of the item the words are linked to
 *
 * Inside one transaction the item's existing word links are cleared and
 * the words produced by semanticSplitter() are indexed
 */
function indexString(text, charset, itemID){
    var splitWords = semanticSplitter(text, charset);
    var db = Zotero.DB;
    
    db.beginTransaction();
    
    clearItemWords(itemID);
    indexWords(itemID, splitWords);
    
    // Storing the raw text is currently disabled:
    //
    // var sql = "REPLACE INTO fulltextContent (itemID, textContent) VALUES (?,?)";
    // Zotero.DB.query(sql, [itemID, {string:text}]);
    
    db.commitTransaction();
}
|
|
|
|
|
|
/*
 * Index the text of a loaded DOM document
 *
 * document -- document whose body is to be indexed
 * itemID   -- ID of the item the words are linked to
 *
 * Throws a string if no item ID is passed.
 * Returns false (after logging) if the document's content type does not
 * start with 'text/', if it has no character set, or if it has no body;
 * otherwise returns nothing.
 */
function indexDocument(document, itemID){
    if (!itemID){
        throw ('Item ID not provided to indexDocument()');
    }
    
    Zotero.debug("Indexing document '" + document.title + "'");
    
    if (document.contentType.indexOf('text/') !== 0) {
        Zotero.debug('File is not text in indexDocument()', 2);
        return false;
    }
    
    if (!document.characterSet){
        Zotero.debug("Text file didn't have charset in indexDocument()", 1);
        return false;
    }
    
    // Some text/* documents (XML, for instance) may not expose a body
    // element; bail out instead of failing on .innerHTML below
    if (!document.body){
        Zotero.debug("Document didn't have a body in indexDocument()", 2);
        return false;
    }
    
    // Add a space after every '>' so text from adjacent elements isn't
    // run together once the markup is stripped
    var text = document.body.innerHTML.replace(/(>)/g, '$1 ');
    text = HTMLToText(text);
    indexString(text, document.characterSet, itemID);
}
|
|
|
|
|
|
/*
 * Index the contents of a text file
 *
 * file     -- file object (must provide exists())
 * mimeType -- MIME type of the file; only 'text/*' files are indexed
 * charset  -- character set used to read the file
 * itemID   -- ID of the item the words are linked to
 *
 * Throws a string if itemID or mimeType is missing.
 * Returns false (after logging) if the file doesn't exist, isn't text or
 * has no charset; otherwise returns nothing.
 */
function indexFile(file, mimeType, charset, itemID){
    if (!file.exists()){
        Zotero.debug('File not found in indexFile()', 2);
        return false;
    }
    
    if (!itemID){
        throw ('Item ID not provided to indexFile()');
    }
    if (!mimeType){
        throw ('MIME type not provided to indexFile()');
    }
    
    var isText = mimeType.substr(0, 5) == 'text/';
    if (!isText){
        Zotero.debug('File is not text in indexFile()', 2);
        return false;
    }
    
    if (!charset){
        Zotero.debug("Text file didn't have charset in indexFile()", 1);
        return false;
    }
    
    // Put a space after every '>' so words from adjacent elements aren't
    // concatenated once the markup is stripped
    var spaced = Zotero.File.getContents(file, charset).replace(/(>)/g, '$1 ');
    indexString(HTMLToText(spaced), charset, itemID);
}
|
|
|
|
|
|
/*
 * Index the files of the given attachment items
 *
 * items -- passed to Zotero.Items.get() to load the items to index
 *
 * Items that are not attachments or have no file are skipped. After all
 * items are processed, the 'fulltext' row of the version table is set to
 * FULLTEXT_VERSION. Everything runs inside one transaction.
 */
function indexItems(items){
    var loaded = Zotero.Items.get(items);
    
    Zotero.DB.beginTransaction();
    
    // Standard for...in plus a lookup visits the same values the original
    // SpiderMonkey-only `for each` did, whatever the exact shape of the
    // Items.get() result is (not visible from here)
    for (var key in loaded){
        var item = loaded[key];
        
        if (!item.isAttachment()){
            continue;
        }
        
        var file = item.getFile();
        if (!file){
            continue;
        }
        
        indexFile(file, item.getAttachmentMimeType(),
            item.getAttachmentCharset(), item.getID());
    }
    
    var sql = "REPLACE INTO version (schema,version) VALUES (?,?)";
    Zotero.DB.query(sql, ['fulltext', FULLTEXT_VERSION]);
    
    Zotero.DB.commitTransaction();
}
|
|
|
|
|
|
/*
 * Scan a file for a text string
 *
 * _file_ -- file to read via Zotero.File.getContents()
 * _charset_ -- character set used to read the file
 * _searchText_ -- text pattern to search for
 * _mode_:
 *    'regexp' -- regular expression (case-insensitive)
 *    'regexpCS' -- regular expression (case-sensitive)
 *    'regexpBinary', 'regexpCSBinary' -- as above, on the raw contents
 *    anything else -- case-insensitive substring search
 *
 * Returns up to 50 characters of the searched text starting at the match,
 * or -1 if nothing was found. In substring mode the snippet is taken from
 * the lower-cased text.
 *
 * - Slashes in regex are optional
 * - Unless the mode contains 'Binary', the contents are converted from
 *   HTML to text before searching
 */
function findTextInFile(file, charset, searchText, mode){
    Zotero.debug("Searching for text '" + searchText + "' in " + file.path);
    
    var str = Zotero.File.getContents(file, charset);
    
    // If not binary mode, convert HTML to text
    if (!mode || mode.indexOf('Binary')==-1){
        // Split elements to avoid word concatenation
        str = str.replace(/(>)/g, '$1 ');
        
        // Parse to avoid searching on HTML
        str = HTMLToText(str);
    }
    
    switch (mode){
        case 'regexp':
        case 'regexpCS':
        case 'regexpBinary':
        case 'regexpCSBinary':
            // Do a multiline search by default
            var flags = 'm';
            
            // Strip optional surrounding slashes; any flags the user put
            // after the closing slash are deliberately ignored
            var parts = searchText.match(/^\/(.*)\/([^\/]*)/);
            if (parts){
                searchText = parts[1];
            }
            
            if (mode.indexOf('regexpCS')==-1){
                flags += 'i';
            }
            
            var re = new RegExp(searchText, flags);
            // exec() rather than calling the RegExp object directly,
            // which only ever worked as a SpiderMonkey extension
            var matches = re.exec(str);
            if (matches){
                Zotero.debug("Text found");
                return str.substr(matches.index, 50);
            }
            
            break;
        
        default:
            // Case-insensitive
            searchText = searchText.toLowerCase();
            str = str.toLowerCase();
            
            var pos = str.indexOf(searchText);
            if (pos!=-1){
                Zotero.debug('Text found');
                return str.substr(pos, 50);
            }
    }
    
    return -1;
}
|
|
|
|
/*
 * Scan item files for a text string
 *
 * _items_ -- one or more attachment items to search (passed to
 *            Zotero.Items.get())
 * _searchText_ -- text pattern to search for
 * _mode_:
 *    'phrase'
 *    'regexp'
 *    'regexpCS' -- case-sensitive regular expression
 *
 * Returns an array of {id, match} objects, one per item whose file
 * matched, where match is the value returned by findTextInFile().
 * Returns an empty array if searchText is empty.
 *
 * Note:
 *  - Slashes in regex are optional
 *  - Add 'Binary' to the mode to search all files, not just text files
 */
function findTextInItems(items, searchText, mode){
    if (!searchText){
        return [];
    }
    
    var loaded = Zotero.Items.get(items);
    var found = [];
    
    // If not binary mode, only scan plaintext files
    var textOnly = !mode || mode.indexOf('Binary')==-1;
    
    // Standard for...in plus a lookup visits the same values the original
    // SpiderMonkey-only `for each` did, whatever the exact shape of the
    // Items.get() result is (not visible from here)
    for (var key in loaded){
        var item = loaded[key];
        
        if (!item.isAttachment()){
            continue;
        }
        
        var file = item.getFile();
        if (!file){
            continue;
        }
        
        if (textOnly && item.getAttachmentMimeType().substr(0,5)!='text/'){
            continue;
        }
        
        var charset = item.getAttachmentCharset();
        var match = findTextInFile(file, charset, searchText, mode);
        
        // Strict comparison: findTextInFile() returns the number -1 for
        // "not found", and a snippet that happens to read "-1" is a hit
        if (match !== -1){
            found.push({id:item.getID(), match:match});
        }
    }
    
    return found;
}
|
|
|
|
|
|
/*
 * Remove all word links of an item from fulltextItems
 *
 * itemID -- ID of the item whose links are deleted
 *
 * The ID is passed as a bound parameter, like the other queries in this
 * object, rather than being concatenated into the SQL string
 */
function clearItemWords(itemID){
    Zotero.DB.query("DELETE FROM fulltextItems WHERE itemID=?", [itemID]);
}
|
|
|
|
|
|
/*
|
|
function clearItemContent(itemID){
|
|
Zotero.DB.query("DELETE FROM fulltextContent WHERE itemID=" + itemID);
|
|
}
|
|
*/
|
|
|
|
|
|
/*
 * Delete every row of fulltextWords whose wordID is not referenced
 * from fulltextItems
 */
function purgeUnusedWords(){
    Zotero.DB.query(
        "DELETE FROM fulltextWords WHERE wordID NOT IN "
            + "(SELECT wordID FROM fulltextItems)"
    );
}
|
|
|
|
|
|
/*
 * Convert an HTML string to plain text
 *
 * Runs the text through the htmlformatconverter component
 * ('text/html' -> 'text/unicode'). If the conversion throws, the error
 * is logged and the input text is returned unchanged.
 */
function HTMLToText(text){
    var classes = Components.classes;
    var interfaces = Components.interfaces;
    
    var converter = classes['@mozilla.org/widget/htmlformatconverter;1']
        .createInstance(interfaces.nsIFormatConverter);
    
    var source = classes['@mozilla.org/supports-string;1']
        .createInstance(interfaces.nsISupportsString);
    source.data = text;
    
    // Out-parameter that convert() fills in
    var result = {value:null};
    
    try {
        converter.convert('text/html', source, source.toString().length,
            'text/unicode', result, {});
        var converted = result.value.QueryInterface(interfaces.nsISupportsString);
        return converted.toString();
    }
    catch(e){
        Zotero.debug(e, 1);
        return text;
    }
}
|
|
|
|
|
|
/*
 * Split text into a list of unique, lower-cased words
 *
 * text    -- text to split
 * charset -- passed to the scanner's start() (null if not given)
 *
 * Uses the semanticunitscanner component to walk through the text.
 * Returns an array of words in order of first appearance, or nothing
 * (after logging) if text is empty.
 */
function semanticSplitter(text, charset){
    if (!text){
        Zotero.debug('No text to index');
        return;
    }
    
    text = _markTroubleChars(text);
    
    var scanner = Components.classes["@mozilla.org/intl/semanticunitscanner;1"]
        .createInstance(Components.interfaces.nsISemanticUnitScanner);
    
    var words = [];
    // Words already collected. Keys carry a ':' prefix so that a word such
    // as "constructor" is not mistaken for a property inherited from
    // Object.prototype and silently dropped
    var seen = {};
    var nextPos = 0;
    var hasMore;
    
    scanner.start(charset ? charset : null);
    do {
        // Fresh out-parameters for every call
        var begin = {};
        var end = {};
        
        hasMore = scanner.next(text, text.length, nextPos, true, begin, end);
        var str = text.substring(begin.value, end.value);
        nextPos = end.value;
        
        // Skip empty units and units starting with a space or a
        // non-breaking space
        if (!str || str.charCodeAt(0)==32 || str.charCodeAt(0)==160){
            continue;
        }
        
        var lc = str.toLowerCase();
        var key = ':' + lc;
        
        // Store only the first occurrence of each word
        if (!seen[key]){
            seen[key] = true;
            words.push(_restoreTroubleChars(lc));
        }
    }
    while (hasMore);
    
    return words;
}
|
|
|
|
|
|
/*
 * Add spaces between elements, since HTMLToText doesn't
 *
 * Walks node and its following siblings, recursing into child nodes, and
 * inserts a text node containing a single space before each node visited
 *
 * NOTE: SLOW AND NOT USED!
 */
function _separateElements(node){
    var ownerDoc = node.ownerDocument;
    
    for (var current = node; current; current = current.nextSibling){
        if (current.hasChildNodes()){
            _separateElements(current.firstChild);
        }
        
        var spacer = ownerDoc.createTextNode(' ');
        current.parentNode.insertBefore(spacer, current);
    }
}
|
|
|
|
|
|
/*
 * Replace every apostrophe in the text with the placeholder
 * "zoteroapostrophe"
 *
 * Called by semanticSplitter() before the text is scanned -- presumably
 * so that words containing apostrophes aren't split apart (TODO confirm).
 * A global regex is needed: replace() with a string pattern only touches
 * the first occurrence.
 */
function _markTroubleChars(text){
    text = text.replace(/'/g, "zoteroapostrophe");
    return text;
}

/*
 * Undo _markTroubleChars(): turn every "zoteroapostrophe" placeholder
 * back into an apostrophe
 */
function _restoreTroubleChars(text){
    text = text.replace(/zoteroapostrophe/g, "'");
    return text;
}
|
|
}
|