206 lines
6.6 KiB
Python
Executable File
206 lines
6.6 KiB
Python
Executable File
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
import hashlib
|
|
import os
|
|
import sqlite3
|
|
import time
|
|
import sys
|
|
import stat
|
|
import math
|
|
import threading, Queue
|
|
|
|
# Common functions
|
|
|
|
def removePrefix(fileName):
|
|
while fileName[0:2] == ".%":
|
|
fileName = fileName[2:]
|
|
return fileName
|
|
|
|
def removePrefixPath(path):
|
|
return '/'.join([removePrefix(component) for component in path.split('/')])
|
|
|
|
# Code for this utility
|
|
|
|
md5Jobs = Queue.Queue(4)
|
|
sha1Jobs = Queue.Queue(4)
|
|
# md5Thread is defined below
|
|
# sha1Thread is defined below
|
|
processedFilesCount = 0
|
|
updatedFilesCount = 0
|
|
skippedFilesCount = 0
|
|
processedFoldersCount = 0
|
|
|
|
class checksumThread(threading.Thread):
|
|
def __init__(self, hashlibObjectBuilder, jobsQueue):
|
|
threading.Thread.__init__(self)
|
|
self.hashlibObjectBuilder = hashlibObjectBuilder
|
|
self.hashlibObject = hashlibObjectBuilder()
|
|
self.jobsQueue = jobsQueue
|
|
self.isAlive = True
|
|
def run(self):
|
|
while self.isAlive:
|
|
chunk = self.jobsQueue.get(block = True)
|
|
if chunk is not None:
|
|
self.hashlibObject.update(chunk)
|
|
self.jobsQueue.task_done()
|
|
def stop(self):
|
|
self.isAlive = False
|
|
# Note: Injecting a string in the queue is a bad idea since it would change the checksum
|
|
self.jobsQueue.put(None)
|
|
def getSum(self):
|
|
self.jobsQueue.join() # Wait until all chunks sent until this point are processed.
|
|
sum = self.hashlibObject.hexdigest()
|
|
self.hashlibObject = self.hashlibObjectBuilder()
|
|
return sum
|
|
|
|
multithread = True
|
|
if multithread:
|
|
def checksumFile(path):
|
|
md5 = hashlib.md5()
|
|
sha1 = hashlib.sha1()
|
|
with open(path,'rb') as f:
|
|
while True:
|
|
chunk = f.read(2*md5.block_size*sha1.block_size)
|
|
if not chunk:
|
|
return {'md5':md5.hexdigest(), 'sha1':sha1.hexdigest()}
|
|
md5.update(chunk)
|
|
sha1.update(chunk)
|
|
else:
|
|
def checksumFile(path):
|
|
with open(path,'rb') as f:
|
|
while True:
|
|
chunk = f.read(1048576) # 1 Megabyte
|
|
if not chunk:
|
|
return {'md5':md5Thread.getSum(), 'sha1':sha1Thread.getSum()}
|
|
md5Jobs.put(chunk)
|
|
sha1Jobs.put(chunk)
|
|
|
|
def fileInfo(path):
|
|
st = os.lstat(path)
|
|
if not stat.S_ISREG(st.st_mode):
|
|
return None
|
|
return {'mtime':st.st_mtime, 'size':st.st_size}
|
|
|
|
def initdb(cursor):
|
|
cursor.execute("create table if not exists files(timestamp,path primary key,md5,sha1,mtime,size)")
|
|
cursor.execute("create index if not exists i_files_path_md5_sha1 on files(path,md5,sha1)")
|
|
cursor.execute("create table if not exists removedfiles(rmtimestamp,timestamp,path,md5,sha1,mtime,size)")
|
|
|
|
def cacheFileInfo(cursor, path):
|
|
cursor.execute('select mtime,size from files where path = ?', (path,))
|
|
data = cursor.fetchone()
|
|
return data and {'mtime':data[0], 'size':data[1]}
|
|
|
|
def update(connection,cursor,path):
|
|
global processedFilesCount
|
|
global processedFoldersCount
|
|
global updatedFilesCount
|
|
global skippedFilesCount
|
|
|
|
cursor.execute("create temp table newfiles(path)")
|
|
cursor.execute("create index i_newfiles_path on newfiles(path)")
|
|
timestamp = time.time()
|
|
currentTime = time.clock()
|
|
lastTime = currentTime
|
|
for d in os.walk(path):
|
|
dirpath=d[0]
|
|
processedFoldersCount += 1
|
|
for f in d[2]:
|
|
prefixPath = os.path.join(dirpath, f)
|
|
if os.path.isfile(prefixPath):
|
|
processedFilesCount += 1
|
|
fi = fileInfo(prefixPath)
|
|
if fi is None:
|
|
skippedFilesCount +=1
|
|
print "!skipping: no fileinfo: ", prefixPath
|
|
continue
|
|
fpath = removePrefixPath(prefixPath)
|
|
if fpath != prefixPath and os.path.exists(fpath):
|
|
skippedFilesCount +=1
|
|
print "!skipping: collision between '%s' and '%s'" % (prefixPath, fpath,)
|
|
continue
|
|
cfi = cacheFileInfo(cursor,fpath)
|
|
cursor.execute("insert into newfiles(path) values(?)", (fpath,))
|
|
if fi != cfi:
|
|
updatedFilesCount += 1
|
|
if fpath != prefixPath:
|
|
print " updating %s (%s)" % (prefixPath, fpath,)
|
|
else:
|
|
print " updating %s" % (fpath,)
|
|
sums = checksumFile(prefixPath)
|
|
values = (timestamp,fpath,sums['md5'],sums['sha1'],fi['mtime'],fi['size'])
|
|
cursor.execute("insert or replace into files(timestamp,path,md5,sha1,mtime,size) values(?,?,?,?,?,?)", values)
|
|
|
|
currentTime = time.clock()
|
|
if abs(lastTime-currentTime) >= 10:
|
|
lastTime = currentTime
|
|
connection.commit()
|
|
print "commit!"
|
|
connection.commit()
|
|
print "commit!"
|
|
|
|
print "cleaning up..."
|
|
likepath=((path + '') if (path[-1:] == '/') else (path + '/')).replace('%', '%%') + '%';
|
|
cursor.execute("create temp table deletedfiles(path)")
|
|
cursor.execute("create index i_deletedfiles_path on deletedfiles(path)")
|
|
cursor.execute("insert into deletedfiles(path) select path from files where path like ?", (likepath,));
|
|
|
|
nbFilesBefore = cursor.execute("select count(*) from deletedfiles").fetchone()[0];
|
|
nbFilesAfter = cursor.execute("select count(*) from newfiles").fetchone()[0];
|
|
print 'number of files before: ', nbFilesBefore
|
|
print 'number of files after: ', nbFilesAfter
|
|
|
|
cursor.execute("delete from deletedfiles where path in newfiles");
|
|
nbFilesDelete = cursor.execute("select count(*) from deletedfiles").fetchone()[0];
|
|
print 'number of files to remove from database (moved in table removedfiles): ', nbFilesDelete
|
|
|
|
if (nbFilesAfter < math.ceil(nbFilesBefore * 0.5)):
|
|
print "!!! Not deleting hashes from database: there are less than 50% files after. Did you forget to mount your harddisk?"
|
|
else:
|
|
cursor.execute("insert into removedfiles(rmtimestamp,timestamp,path,md5,sha1,mtime,size)"
|
|
+ " select ?,timestamp,path,md5,sha1,mtime,size from files where path in deletedfiles", (timestamp,))
|
|
cursor.execute("delete from files where path in deletedfiles")
|
|
|
|
connection.commit()
|
|
|
|
def walk(db,path):
|
|
connection = sqlite3.connect(db)
|
|
connection.text_factory = str # For utf-8 file names…
|
|
cursor = connection.cursor()
|
|
initdb(cursor)
|
|
update(connection, cursor, path)
|
|
cursor.close()
|
|
|
|
def help():
|
|
print 'Usage : %s database-file directory' % sys.argv[0]
|
|
sys.exit(1)
|
|
|
|
if len(sys.argv) != 3:
|
|
help()
|
|
for arg in sys.argv[1:]:
|
|
if arg == '-h' or arg == '--help':
|
|
help()
|
|
|
|
# Start threads and walk the filesystem
|
|
currentTime = time.time()
|
|
md5Thread = checksumThread(hashlib.md5(), md5Jobs);
|
|
md5Thread.start()
|
|
sha1Thread = checksumThread(hashlib.sha1(), sha1Jobs);
|
|
sha1Thread.start()
|
|
walk(sys.argv[1], sys.argv[2])
|
|
md5Thread.stop()
|
|
sha1Thread.stop()
|
|
elapsedTime = time.time()-currentTime
|
|
elapsedTime = round(elapsedTime,3)
|
|
|
|
# Statistics
|
|
print '\n== Result ================================'
|
|
if elapsedTime > 1:
|
|
print ' Total elapsed time: ', format(elapsedTime), ' seconds'
|
|
else:
|
|
print ' Total elapsed time: ', format(elapsedTime), ' second'
|
|
print ' Processed files:', format(processedFilesCount)
|
|
print ' Processed folders:', format(processedFoldersCount)
|
|
print ' Updated files:', format(updatedFilesCount)
|
|
print ' Skipped files:', format(skippedFilesCount)
|