This repository has been archived by the owner on Mar 16, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathfindDuplicates.py
146 lines (127 loc) · 4.24 KB
/
findDuplicates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/python
##############################################################################
#
# Copyright (C) 2006-2010 Kevin Deldycke <[email protected]>
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
"""Usage : findDuplicates.py folder1 folder2 ...
** [folder1 folder2 ...]
List of folders to check for duplicate files.
Default: current folder.
** -d X, --delete-threshold=X
This option will remove all files having X duplicates or more.
** -h, --help
Print this screen.
"""
import os
import sys
import getopt
import hashlib
from commands import getstatusoutput
def getFileContent(file_path):
"""
This function get the content of a file
"""
# Verify that the file exist
if not os.path.isfile(file_path):
print "ERROR: %s doesn't exist." % file_path
return None
# Get file content
file_path = os.path.abspath(file_path)
file_object = open(file_path, 'r')
return file_object.read()
def getMD5(file_path):
    """
    Return the hexadecimal MD5 checksum of a file, or None if the file
    doesn't exist or the checksum can't be computed.
    """
    file_checksum = None
    file_content = getFileContent(file_path)
    # Compare against None explicitly: an empty file yields '' (falsy) but
    # still has a perfectly valid MD5 and must take part in duplicate
    # detection.
    if file_content is not None:
        try:
            file_checksum = hashlib.md5(file_content).hexdigest()
        except Exception:
            # Fall back to the system command line if Python's library fails,
            # as it sometimes does on big files.  Catch Exception (not a bare
            # except) so KeyboardInterrupt/SystemExit still propagate.
            # Quote the path so file names with spaces or shell
            # metacharacters don't break (or hijack) the command.
            result = getstatusoutput('md5sum "%s"' % file_path)
            if result[0] == 0:
                # md5sum output is "<checksum>  <path>"; keep the checksum.
                file_checksum = result[1].split(' ')[0]
    return file_checksum
def main(argv=None):
if argv is None:
argv = sys.argv
# Parse command line options
try:
opts, args = getopt.getopt(argv[1:], "hd:", ["help", "delete-threshold="])
except getopt.error, msg:
print msg
print "For help use --help"
return 2
# Process options
delete_threshold = None
for o, a in opts:
if o in ("-h", "--help"):
print __doc__
return 0
elif o in ("-d", "--delete-threshold"):
delete_threshold = int(a)
# Process arguments
folder_list = []
for folder in args:
folder_path = os.path.abspath(folder)
if not os.path.isdir(folder_path):
print "%s doesn't exist or is not a directory." % folder_path
return 1
folder_list.append(folder_path)
# No folder defined, use current folder
if not folder_list:
folder_list.append(os.path.abspath(os.getcwd()))
# Browse all folders and set the list of files to check
files_to_check = []
for folder_path in folder_list:
for parent, dirs, files in os.walk(folder_path):
for filename in files:
filepath = os.path.join(parent, filename)
if filepath not in files_to_check:
files_to_check.append(filepath)
# Compute checksums of all files
checksum_dict = {}
for filepath in files_to_check:
checksum = getMD5(filepath)
if not checksum:
print "Can't compute checksum of %s" % filepath
continue
if checksum not in checksum_dict:
checksum_dict[checksum] = []
checksum_dict[checksum] = checksum_dict[checksum] + [filepath]
# Show results
no_duplicates = True
for (checksum, files) in checksum_dict.items():
duplicates = len(files)
if duplicates > 1:
no_duplicates = False
files.sort()
print "%s duplicate files found:%s" % (duplicates, '\n * '.join([''] + files))
# If enough duplicate files found remove them all
if delete_threshold and duplicates >= delete_threshold:
for f in files:
print " Removing: %s" % f
os.remove(f)
print ''
if no_duplicates:
print "No duplicate files found in %r." % folder_list
return 0
# Script entry point: exit the process with main()'s return code.
if __name__ == "__main__":
    sys.exit(main())