From ea4c1389ba7fd3f77160ea68d0e56e510e42a15b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Georges=20Dup=C3=A9ron?= Date: Thu, 22 Sep 2011 12:16:50 +0200 Subject: [PATCH] Move database entries for deleted files to a separate history table. --- updatehash.py | 11 ++++++++++- updatehash.sql | 8 ++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 updatehash.sql diff --git a/updatehash.py b/updatehash.py index d437c4b..c583b8f 100755 --- a/updatehash.py +++ b/updatehash.py @@ -28,6 +28,7 @@ def initdb(cursor): cursor.execute("create table if not exists files(tag,timestamp,path primary key,md5,sha1,mtime,size)") cursor.execute("create index if not exists i_files_tag on files(tag)") cursor.execute("create index if not exists i_files_path_md5_sha1 on files(path,md5,sha1)") + cursor.execute("create table if not exists removedfiles(rmtimestamp,tag,timestamp,path,md5,sha1,mtime,size)") def cacheFileInfo(cursor, path): cursor.execute('select mtime,size from files where path = ?', (path,)) @@ -35,6 +36,8 @@ def cacheFileInfo(cursor, path): return data and {'mtime':data[0], 'size':data[1]} def update(connection,cursor,path): + cursor.execute("create temp table newfiles(path)") + cursor.execute("create index i_newfiles_path on newfiles(path)") timestamp = time.time() currentTime = time.clock() lastTime = currentTime @@ -48,6 +51,7 @@ def update(connection,cursor,path): print "!skipping", fpath continue cfi = cacheFileInfo(cursor,fpath) + cursor.execute("insert into newfiles(path) values(?)", (fpath,)) if fi != cfi: print " updating", fpath sums = checksumFile(fpath) @@ -59,6 +63,12 @@ def update(connection,cursor,path): lastTime = currentTime connection.commit() print "commit!" + connection.commit() + print "commit!" + print "cleaning up..."
+ cursor.execute("insert into removedfiles(rmtimestamp,tag,timestamp,path,md5,sha1,mtime,size) select ?,tag,timestamp,path,md5,sha1,mtime,size from files where path not in (select path from newfiles)", (timestamp,)) + cursor.execute("delete from files where path not in (select path from newfiles)") + connection.commit() def walk(db,path): connection = sqlite3.connect(db) @@ -66,7 +76,6 @@ def walk(db,path): cursor = connection.cursor() initdb(cursor) update(connection, cursor, path) - connection.commit() cursor.close() def help(): diff --git a/updatehash.sql b/updatehash.sql new file mode 100644 index 0000000..08ffee5 --- /dev/null +++ b/updatehash.sql @@ -0,0 +1,8 @@ +-- Size of duplicates that can be removed (doesn't count the size of the copy we leave) +select round((B.tot-A.tot)/(1024.*1024.*1024.),2)||' Gb' from (select sum(size) as tot from (select distinct md5,sha1,size from files)) as A, (select sum(size) as tot from (select md5,sha1,size from files)) as B; + +-- List of duplicates (all copies) +select size,path from files where md5||'#'||sha1||'#'||size in (select md5||'#'||sha1||'#'||size from files group by md5,sha1,size having count(path) > 1) order by size; + +-- Total count of files and total weight in Gb +select round(sum(size)/(1024.*1024.*1024.),2)||' Gb '||count(size)||' files' from files;