Find_duplicates_files_and_remove.py
# https://stackoverflow.com/questions/748675/finding-duplicate-files-and-removing-them
# -*- coding: utf-8 -*-

# Version 1: naive approach - hash every file in full, then compare digests.
import sys
import os
import hashlib


def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes"""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def check_for_duplicates(paths, hash=hashlib.sha1):
    hashes = {}
    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                hashobj = hash()
                # use a context manager so the file handle is always closed
                with open(full_path, 'rb') as fobj:
                    for chunk in chunk_reader(fobj):
                        hashobj.update(chunk)
                # key on (digest, size) to make accidental collisions even less likely
                file_id = (hashobj.digest(), os.path.getsize(full_path))
                duplicate = hashes.get(file_id, None)
                if duplicate:
                    print("Duplicate found: %s and %s" % (full_path, duplicate))
                else:
                    hashes[file_id] = full_path


if sys.argv[1:]:
    check_for_duplicates(sys.argv[1:])
else:
    print("Please pass the paths to check as parameters to the script")
#!/usr/bin/env python
# Version 2: optimized approach - group files by size first, then by a hash of
# the first 1024 bytes, and only hash the full contents of remaining candidates.
import sys
import os
import hashlib


def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes"""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
    hashobj = hash()
    with open(filename, 'rb') as file_object:
        if first_chunk_only:
            hashobj.update(file_object.read(1024))
        else:
            for chunk in chunk_reader(file_object):
                hashobj.update(chunk)
    return hashobj.digest()


def check_for_duplicates(paths, hash=hashlib.sha1):
    hashes_by_size = {}
    hashes_on_1k = {}
    hashes_full = {}

    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                try:
                    # if the target is a symlink (soft one), this will
                    # dereference it - change the value to the actual target file
                    full_path = os.path.realpath(full_path)
                    file_size = os.path.getsize(full_path)
                except OSError:
                    # not accessible (permissions, etc.) - skip it
                    continue
                # create the list for this file size on first sight, then append
                hashes_by_size.setdefault(file_size, []).append(full_path)

    # For all files with the same file size, get their hash on the first 1024 bytes
    for __, files in hashes_by_size.items():
        if len(files) < 2:
            continue  # this file size is unique, no need to spend CPU cycles on it
        for filename in files:
            try:
                small_hash = get_hash(filename, first_chunk_only=True)
            except OSError:
                # the file may have become inaccessible since it was listed
                continue
            # create the list for this 1k hash on first sight, then append
            hashes_on_1k.setdefault(small_hash, []).append(filename)

    # For all files whose first 1024 bytes hash the same, get the hash of the
    # full file - collisions on the full hash are duplicates
    for __, files in hashes_on_1k.items():
        if len(files) < 2:
            continue  # this hash of the first 1k bytes is unique, no need to spend CPU cycles on it
        for filename in files:
            try:
                full_hash = get_hash(filename, first_chunk_only=False)
            except OSError:
                # the file may have become inaccessible since it was listed
                continue
            duplicate = hashes_full.get(full_hash)
            if duplicate:
                print("Duplicate found: %s and %s" % (filename, duplicate))
            else:
                hashes_full[full_hash] = filename


if sys.argv[1:]:
    check_for_duplicates(sys.argv[1:])
else:
    print("Please pass the paths to check as parameters to the script")