#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Maintain an SQLite database of MD5/SHA-1 checksums for a directory tree.

Usage: <script> database-file directory

Every regular file found under the directory is recorded in the ``files``
table together with its mtime and size; a file is re-hashed only when its
mtime or size differs from the recorded values.  Records of files that are
no longer found under the directory are moved to ``removedfiles``.

The code is written so that it runs unchanged on Python 2 and Python 3
(every ``print`` call takes exactly one pre-formatted string).
"""

import hashlib
import math
import os
import sqlite3
import stat
import sys
import threading
import time

try:
    import Queue  # Python 2
except ImportError:
    import queue as Queue  # Python 3


# Common functions

def removePrefix(fileName):
    """Return fileName with every leading ".%" marker stripped."""
    while fileName.startswith(".%"):
        fileName = fileName[2:]
    return fileName


def removePrefixPath(path):
    """Apply removePrefix() to each '/'-separated component of path."""
    return '/'.join([removePrefix(component) for component in path.split('/')])


# Code for this utility

# Number of bytes read from a file per chunk (1 MiB).
CHUNK_SIZE = 1048576

# Seconds of wall-clock time between two intermediate commits.
COMMIT_INTERVAL = 10

# Bounded queues feeding the two hashing workers.
md5Jobs = Queue.Queue(4)
sha1Jobs = Queue.Queue(4)

# Worker threads; None while no worker is running
# (see startChecksumThreads / stopChecksumThreads).
md5Thread = None
sha1Thread = None

processedFilesCount = 0
updatedFilesCount = 0
skippedFilesCount = 0
processedFoldersCount = 0

# When True, main() starts one worker thread per hash algorithm and
# checksumFile() feeds them through the queues above.
multithread = True


class checksumThread(threading.Thread):
    """Worker thread hashing the chunks it receives through a queue."""

    def __init__(self, hashlibObjectBuilder, jobsQueue):
        """Create the worker.

        hashlibObjectBuilder -- zero-argument callable returning a fresh hash
                                object (e.g. ``hashlib.md5``, NOT
                                ``hashlib.md5()``).
        jobsQueue            -- queue from which the chunks are read.
        """
        threading.Thread.__init__(self)
        # Daemon thread: an unexpected error in the main thread must not
        # leave the process hanging on a worker blocked in Queue.get().
        self.daemon = True
        self.hashlibObjectBuilder = hashlibObjectBuilder
        self.hashlibObject = hashlibObjectBuilder()
        self.jobsQueue = jobsQueue
        # Private flag; deliberately not named "isAlive", which would shadow
        # threading.Thread.isAlive() on Python 2.
        self._running = True

    def run(self):
        """Hash queued chunks until stop() is called."""
        while self._running:
            chunk = self.jobsQueue.get(block=True)
            try:
                # None is the wake-up sentinel sent by stop().
                if chunk is not None:
                    self.hashlibObject.update(chunk)
            finally:
                self.jobsQueue.task_done()

    def stop(self):
        """Ask the worker to terminate."""
        self._running = False
        # Wake the worker up if it is blocked in get().
        # Note: injecting a string in the queue is a bad idea since it would
        # change the checksum, hence the None sentinel.
        self.jobsQueue.put(None)

    def getSum(self):
        """Return the hex digest of the chunks sent so far, then reset."""
        # Wait until all chunks sent until this point are processed.
        self.jobsQueue.join()
        digest = self.hashlibObject.hexdigest()
        self.hashlibObject = self.hashlibObjectBuilder()
        return digest


def startChecksumThreads():
    """Create and start the MD5 and SHA-1 worker threads."""
    global md5Thread
    global sha1Thread
    md5Thread = checksumThread(hashlib.md5, md5Jobs)
    md5Thread.start()
    sha1Thread = checksumThread(hashlib.sha1, sha1Jobs)
    sha1Thread.start()


def stopChecksumThreads():
    """Stop the worker threads, if any, and wait for them to finish."""
    global md5Thread
    global sha1Thread
    for worker in (md5Thread, sha1Thread):
        if worker is not None:
            worker.stop()
            worker.join()
    md5Thread = None
    sha1Thread = None


def _checksumFileInline(path):
    """Hash the file at path in the calling thread."""
    md5 = hashlib.md5()
    sha1 = hashlib.sha1()
    with open(path, 'rb') as f:
        while True:
            chunk = f.read(CHUNK_SIZE)
            if not chunk:
                break
            md5.update(chunk)
            sha1.update(chunk)
    return {'md5': md5.hexdigest(), 'sha1': sha1.hexdigest()}


def _checksumFileThreaded(path):
    """Hash the file at path using the two running worker threads."""
    with open(path, 'rb') as f:
        while True:
            chunk = f.read(CHUNK_SIZE)
            if not chunk:
                break
            md5Jobs.put(chunk)
            sha1Jobs.put(chunk)
    return {'md5': md5Thread.getSum(), 'sha1': sha1Thread.getSum()}


def checksumFile(path):
    """Return {'md5': hexdigest, 'sha1': hexdigest} for the file at path.

    The worker threads are used when ``multithread`` is set and they have
    been started; otherwise both digests are computed in the calling thread.
    """
    if multithread and md5Thread is not None and sha1Thread is not None:
        return _checksumFileThreaded(path)
    return _checksumFileInline(path)


def fileInfo(path):
    """Return {'mtime':..., 'size':...} for a regular file, else None.

    lstat() is used, so a symbolic link is never reported as a regular file.
    None is also returned when the file cannot be stat'ed (for instance if it
    disappeared after the directory was listed).
    """
    try:
        st = os.lstat(path)
    except OSError:
        return None
    if not stat.S_ISREG(st.st_mode):
        return None
    return {'mtime': st.st_mtime, 'size': st.st_size}


def initdb(cursor):
    """Create the tables and index used by this utility if they are missing."""
    cursor.execute("create table if not exists files(timestamp,path primary key,md5,sha1,mtime,size)")
    cursor.execute("create index if not exists i_files_path_md5_sha1 on files(path,md5,sha1)")
    cursor.execute("create table if not exists removedfiles(rmtimestamp,timestamp,path,md5,sha1,mtime,size)")


def cacheFileInfo(cursor, path):
    """Return the {'mtime':..., 'size':...} recorded for path, or None."""
    cursor.execute('select mtime,size from files where path = ?', (path,))
    data = cursor.fetchone()
    if not data:
        return None
    return {'mtime': data[0], 'size': data[1]}


def _scanTree(connection, cursor, path, timestamp):
    """Walk path, record every file in newfiles and refresh changed hashes."""
    global processedFilesCount
    global processedFoldersCount
    global updatedFilesCount
    global skippedFilesCount

    # Wall-clock time: the goal is to bound how much work is lost if the
    # run is interrupted, so CPU time would be the wrong measure.
    lastCommit = time.time()
    for dirpath, _dirnames, filenames in os.walk(path):
        processedFoldersCount += 1
        for f in filenames:
            prefixPath = os.path.join(dirpath, f)
            if not os.path.isfile(prefixPath):
                continue
            processedFilesCount += 1

            fi = fileInfo(prefixPath)
            if fi is None:
                skippedFilesCount += 1
                print("!skipping: no fileinfo:  %s" % (prefixPath,))
                continue

            # Files are stored under their name without ".%" markers; skip
            # the file if that name is already taken by another file.
            fpath = removePrefixPath(prefixPath)
            if fpath != prefixPath and os.path.exists(fpath):
                skippedFilesCount += 1
                print("!skipping: collision between '%s' and '%s'" % (prefixPath, fpath,))
                continue

            cfi = cacheFileInfo(cursor, fpath)
            cursor.execute("insert into newfiles(path) values(?)", (fpath,))
            if fi != cfi:
                updatedFilesCount += 1
                if fpath != prefixPath:
                    print(" updating %s (%s)" % (prefixPath, fpath,))
                else:
                    print(" updating %s" % (fpath,))
                sums = checksumFile(prefixPath)
                values = (timestamp, fpath, sums['md5'], sums['sha1'], fi['mtime'], fi['size'])
                cursor.execute("insert or replace into files(timestamp,path,md5,sha1,mtime,size) values(?,?,?,?,?,?)", values)

            now = time.time()
            if abs(now - lastCommit) >= COMMIT_INTERVAL:
                lastCommit = now
                connection.commit()
                print("commit!")


def _moveDeletedFiles(cursor, path, timestamp):
    """Move records of files missing from newfiles to removedfiles."""
    # Stored paths are normalised with removePrefixPath(), so the prefix
    # identifying "records below path" must be normalised the same way.
    root = removePrefixPath(path if path.endswith('/') else path + '/')
    # Every string starting with "<root>" (which ends with '/') lies in the
    # half-open range ["<root>", "<root minus '/'>0") because '0' is the
    # character following '/'.  Unlike LIKE, this comparison is
    # case-sensitive and gives no special meaning to '%' or '_'.
    upperBound = root[:-1] + '0'

    cursor.execute("create temp table deletedfiles(path)")
    cursor.execute("create index i_deletedfiles_path on deletedfiles(path)")
    cursor.execute("insert into deletedfiles(path) select path from files where path >= ? and path < ?", (root, upperBound))

    nbFilesBefore = cursor.execute("select count(*) from deletedfiles").fetchone()[0]
    nbFilesAfter = cursor.execute("select count(*) from newfiles").fetchone()[0]
    print("number of files before:  %s" % (nbFilesBefore,))
    print("number of files after:  %s" % (nbFilesAfter,))

    cursor.execute("delete from deletedfiles where path in newfiles")
    nbFilesDelete = cursor.execute("select count(*) from deletedfiles").fetchone()[0]
    print("number of files to remove from database (moved in table removedfiles):  %s" % (nbFilesDelete,))

    # Safeguard: if most of the files vanished at once, the tree is more
    # likely to be unavailable than genuinely emptied.
    if nbFilesAfter < math.ceil(nbFilesBefore * 0.5):
        print("!!! Not deleting hashes from database: there are less than 50% files after. Did you forget to mount your harddisk?")
    else:
        cursor.execute("insert into removedfiles(rmtimestamp,timestamp,path,md5,sha1,mtime,size)"
                       + " select ?,timestamp,path,md5,sha1,mtime,size from files where path in deletedfiles", (timestamp,))
        cursor.execute("delete from files where path in deletedfiles")


def update(connection, cursor, path):
    """Bring the database in sync with the directory tree rooted at path.

    connection -- open sqlite3 connection (used for the commits).
    cursor     -- cursor of that connection; the tables created by initdb()
                  must exist.
    path       -- directory to scan.

    The module-level counters are updated as a side effect.  Temporary
    tables ``newfiles`` and ``deletedfiles`` are created on the connection.
    """
    cursor.execute("create temp table newfiles(path)")
    cursor.execute("create index i_newfiles_path on newfiles(path)")
    timestamp = time.time()

    _scanTree(connection, cursor, path, timestamp)
    connection.commit()
    print("commit!")

    print("cleaning up...")
    _moveDeletedFiles(cursor, path, timestamp)
    connection.commit()


def walk(db, path):
    """Open (or create) the database file db and update it from path."""
    connection = sqlite3.connect(db)
    try:
        connection.text_factory = str  # For utf-8 file names.
        cursor = connection.cursor()
        try:
            initdb(cursor)
            update(connection, cursor, path)
        finally:
            cursor.close()
    finally:
        connection.close()


def help():
    """Print the usage message and exit with status 1."""
    print('Usage : %s database-file directory' % sys.argv[0])
    sys.exit(1)


def main():
    """Command-line entry point: check arguments, run, print statistics."""
    if len(sys.argv) != 3:
        help()
    for arg in sys.argv[1:]:
        if arg == '-h' or arg == '--help':
            help()

    # Start threads and walk the filesystem
    startTime = time.time()
    if multithread:
        startChecksumThreads()
    try:
        walk(sys.argv[1], sys.argv[2])
    finally:
        stopChecksumThreads()
    elapsedTime = round(time.time() - startTime, 3)

    # Statistics
    print('\n== Result ================================')
    if elapsedTime > 1:
        print(' Total elapsed time:  %s  seconds' % format(elapsedTime))
    else:
        print(' Total elapsed time:  %s  second' % format(elapsedTime))
    print(' Processed files: %s' % format(processedFilesCount))
    print(' Processed folders: %s' % format(processedFoldersCount))
    print(' Updated files: %s' % format(updatedFilesCount))
    print(' Skipped files: %s' % format(skippedFilesCount))


if __name__ == '__main__':
    main()