From 2aef87e97b4dcedebce34d7ce47112c8988d5379 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Georges=20Dup=C3=A9ron?= Date: Tue, 1 Oct 2013 01:09:53 +0200 Subject: [PATCH] =?UTF-8?q?Merge=20des=20modifications=20apport=C3=A9es=20?= =?UTF-8?q?par=20Yoann.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- updatehash.py | 96 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 86 insertions(+), 10 deletions(-) diff --git a/updatehash.py b/updatehash.py index 60058c2..97c7799 100755 --- a/updatehash.py +++ b/updatehash.py @@ -7,6 +7,7 @@ import time import sys import stat import math +import threading, Queue # Common functions @@ -20,16 +21,59 @@ def removePrefixPath(path): # Code for this utility -def checksumFile(path): - md5 = hashlib.md5() - sha1 = hashlib.sha1() - with open(path,'rb') as f: - while True: - chunk = f.read(2*md5.block_size*sha1.block_size) - if not chunk: - return {'md5':md5.hexdigest(), 'sha1':sha1.hexdigest()} - md5.update(chunk) - sha1.update(chunk) +md5Jobs = Queue.Queue(4) +sha1Jobs = Queue.Queue(4) +# md5Thread is defined below +# sha1Thread is defined below +processedFilesCount = 0 +updatedFilesCount = 0 +skippedFilesCount = 0 +processedFoldersCount = 0 + +class checksumThread(threading.Thread): + def __init__(self, hashlibObjectBuilder, jobsQueue): + threading.Thread.__init__(self) + self.hashlibObjectBuilder = hashlibObjectBuilder + self.hashlibObject = hashlibObjectBuilder() + self.jobsQueue = jobsQueue + self.isAlive = True + def run(self): + while self.isAlive: + chunk = self.jobsQueue.get(block = True) + if chunk is not None: + self.hashlibObject.update(chunk) + self.jobsQueue.task_done() + def stop(self): + self.isAlive = False + # Note: Injecting a string in the queue is a bad idea since it would change the checksum + self.jobsQueue.put(None) + def getSum(self): + self.jobsQueue.join() # Wait until all chunks sent until this point are processed. + sum = self.hashlibObject.hexdigest() + self.hashlibObject = self.hashlibObjectBuilder() + return sum + +multithread = True +if multithread: + def checksumFile(path): + md5 = hashlib.md5() + sha1 = hashlib.sha1() + with open(path,'rb') as f: + while True: + chunk = f.read(2*md5.block_size*sha1.block_size) + if not chunk: + return {'md5':md5.hexdigest(), 'sha1':sha1.hexdigest()} + md5.update(chunk) + sha1.update(chunk) +else: + def checksumFile(path): + with open(path,'rb') as f: + while True: + chunk = f.read(1048576) # 1 Megabyte + if not chunk: + return {'md5':md5Thread.getSum(), 'sha1':sha1Thread.getSum()} + md5Jobs.put(chunk) + sha1Jobs.put(chunk) def fileInfo(path): st = os.lstat(path) @@ -48,6 +92,11 @@ def cacheFileInfo(cursor, path): return data and {'mtime':data[0], 'size':data[1]} def update(connection,cursor,path): + global processedFilesCount + global processedFoldersCount + global updatedFilesCount + global skippedFilesCount + cursor.execute("create temp table newfiles(path)") cursor.execute("create index i_newfiles_path on newfiles(path)") timestamp = time.time() @@ -55,19 +104,25 @@ def update(connection,cursor,path): lastTime = currentTime for d in os.walk(path): dirpath=d[0] + processedFoldersCount += 1 for f in d[2]: prefixPath = os.path.join(dirpath, f) if os.path.isfile(prefixPath): + processedFilesCount += 1 fi = fileInfo(prefixPath) if fi is None: + skippedFilesCount +=1 print "!skipping: no fileinfo: ", prefixPath continue fpath = removePrefixPath(prefixPath) if fpath != prefixPath and os.path.exists(fpath): + skippedFilesCount +=1 print "!skipping: collision between '%s' and '%s'" % (prefixPath, fpath,) + continue cfi = cacheFileInfo(cursor,fpath) cursor.execute("insert into newfiles(path) values(?)", (fpath,)) if fi != cfi: + updatedFilesCount += 1 if fpath != prefixPath: print " updating %s (%s)" % (prefixPath, fpath,) else: @@ -126,4 +181,25 @@ for arg in sys.argv[1:]: if arg == '-h' or arg == '--help': help() +# Start threads and walk the filesystem +currentTime = time.time() +md5Thread = checksumThread(hashlib.md5(), md5Jobs); +md5Thread.start() +sha1Thread = checksumThread(hashlib.sha1(), sha1Jobs); +sha1Thread.start() walk(sys.argv[1], sys.argv[2]) +md5Thread.stop() +sha1Thread.stop() +elapsedTime = time.time()-currentTime +elapsedTime = round(elapsedTime,3) + +# Statistics +print '\n== Result ================================' +if elapsedTime > 1: + print ' Total elapsed time: ', format(elapsedTime), ' seconds' +else: + print ' Total elapsed time: ', format(elapsedTime), ' second' +print ' Processed files:', format(processedFilesCount) +print ' Processed folders:', format(processedFoldersCount) +print ' Updated files:', format(updatedFilesCount) +print ' Skipped files:', format(skippedFilesCount)