Skip to content

Commit

Permalink
New feature - instead of actions, just create a CSV that list all the…
Browse files Browse the repository at this point in the history
… duplicates
  • Loading branch information
niradar committed Jun 11, 2024
1 parent f5942f4 commit 1a3fe6e
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 19 deletions.
12 changes: 7 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ python df_finder3.py --scan_dir <scan_folder> --reference_dir <reference_folder>
- `--min_size`: Minimum file size to include. Specify with units (B, KB, MB).
- `--max_size`: Maximum file size to include. Specify with units (B, KB, MB).
- `--full_hash`: Use full file hash for comparison. Default is partial.

- `--action`: Action to take on duplicates. One of `move_duplicates` (default) or `create_csv`.
  - `create_csv` - Create a CSV file listing the duplicates in the move_to folder. No files are moved.
  - `move_duplicates` - Move duplicates from the scan folder to the move_to folder.
### Example

#### Simple usage:
Expand All @@ -53,21 +55,21 @@ python df_finder3.py --scan_dir /path/to/scan_dir --reference_dir /path/to/refer
#### Most common usage:
Ignore differences in modification dates, copy the file to all target folders if found in multiple folders, and run without test mode:
```sh
python df_finder3.py --s /path/to/scan_dir --r /path/to/reference_dir --move_to /path/to/move_to --run --ignore_diff mdate --copy_to_all
python df_finder3.py --s /path/to/scan_dir --r /path/to/reference_dir --to /path/to/move_to --run --ignore_diff mdate --copy_to_all
```

#### Using Whitelist and Blacklist
```sh
python df_finder3.py --s /path/to/scan_dir --r /path/to/reference_dir --move_to /path/to/move_to --whitelist_ext jpg,png --run
python df_finder3.py --s /path/to/scan_dir --r /path/to/reference_dir --to /path/to/move_to --whitelist_ext jpg,png --run
```

```sh
python df_finder3.py --s /path/to/scan_dir --r /path/to/reference_dir --move_to /path/to/move_to --blacklist_ext tmp,log --run
python df_finder3.py --s /path/to/scan_dir --r /path/to/reference_dir --to /path/to/move_to --blacklist_ext tmp,log --run
```

#### Filtering by File Size
```sh
python df_finder3.py --s /path/to/scan_dir --r /path/to/reference_dir --move_to /path/to/move_to --min_size 1MB --max_size 100MB --run
python df_finder3.py --s /path/to/scan_dir --r /path/to/reference_dir --to /path/to/move_to --min_size 1MB --max_size 100MB --run
```

## Installation
Expand Down
18 changes: 12 additions & 6 deletions df_finder3.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# https://github.com/niradar/duplicate_files_in_folders

from duplicate_files_in_folders.duplicates_finder import find_duplicates_files_v3, process_duplicates, \
clean_scan_dir_duplications
clean_scan_dir_duplications, create_csv_file
from duplicate_files_in_folders.logging_config import setup_logging
from duplicate_files_in_folders.utils import parse_arguments, setup_hash_manager, setup_file_manager
from duplicate_files_in_folders.utils_io import display_initial_config, output_results, confirm_script_execution
Expand All @@ -16,13 +16,19 @@ def main(args):
hash_manager = setup_hash_manager(args)

duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, args.scan_dir, args.reference_dir)
files_moved, files_created = process_duplicates(duplicates, args)
duplicate_scan_files_moved = clean_scan_dir_duplications(args, duplicates)
deleted_scan_folders = fm.delete_empty_folders_in_tree(args.scan_dir, True) if args.delete_empty_folders else 0

if args.action == 'move_duplicates':
files_moved, files_created = process_duplicates(duplicates, args)
duplicate_scan_files_moved = clean_scan_dir_duplications(args, duplicates)
deleted_scan_folders = fm.delete_empty_folders_in_tree(args.scan_dir, True) if args.delete_empty_folders else 0

output_results(args, files_moved, files_created, deleted_scan_folders, duplicate_scan_files_moved,
scan_stats, ref_stats)
elif args.action == 'create_csv':
# Always run in run mode as it creates a file and maybe a folder.
fm.with_run_mode(create_csv_file, args, duplicates)

hash_manager.save_data()
output_results(args, files_moved, files_created, deleted_scan_folders, duplicate_scan_files_moved,
scan_stats, ref_stats)


if __name__ == "__main__":
Expand Down
28 changes: 27 additions & 1 deletion duplicate_files_in_folders/duplicates_finder.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import csv
import os
import concurrent.futures
from datetime import datetime

import tqdm
from probables import BloomFilter
Expand Down Expand Up @@ -192,10 +194,34 @@ def process_duplicates(combined: Dict, args: Namespace) -> (int, int):
return files_moved, files_created


def create_csv_file(args: Namespace, combined: Dict) -> None:
    """
    Create a CSV file with the duplicate files' information.

    The file is written to ``<args.move_to>/<basename of args.scan_dir>_dups.csv``. The move_to folder is
    created through the FileManager when it does not exist yet. The first row is the header; every following
    row describes a single file. All files that belong to the same duplicate group share the same ``key``.

    :param args: parsed arguments; ``move_to`` and ``scan_dir`` are read
    :param combined: the dictionary of duplicates returned by find_duplicates_files_v3
    :return: None
    """
    # normpath drops a trailing separator, which would otherwise make basename() return an empty string.
    scan_name = os.path.basename(os.path.normpath(args.scan_dir))
    csv_file = os.path.join(args.move_to, scan_name + "_dups.csv")
    if not os.path.exists(args.move_to):
        FileManager.get_instance().make_dirs(args.move_to)

    # Explicit UTF-8 so paths with non-ASCII characters do not fail on platforms whose default codec is narrower.
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["key", "path", "size", "modified_time"])
        # One key per duplicate group, starting at 1.
        for key, locations in enumerate(combined.values(), start=1):
            for files in locations.values():
                for file in files:
                    writer.writerow([key, file['path'], file['size'],
                                     datetime.fromtimestamp(file['modified_time'])])



def clean_scan_dir_duplications(args: Namespace, combined: Dict) -> int:
"""
Clean up the scan_dir duplications after moving files to the move_to folder.
:param args:
:param args: parsed arguments
:param combined: a dictionary which all the files under 'scan' (for all keys) are moved to the move_to folder
:return: number of files moved
"""
Expand Down
14 changes: 14 additions & 0 deletions duplicate_files_in_folders/file_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,3 +332,17 @@ def reset_file_manager(protected_dirs: List[str], allowed_dirs: List[str], run_m
for dir_path in allowed_dirs:
fm.add_allowed_dir(dir_path)
return fm

def with_run_mode(self, func, *args, **kwargs):
    """
    Run a function with run_mode forced to True, then restore run_mode to its previous state.

    The previous state is restored even when func raises, so a failing call cannot leave this object
    stuck in run mode.
    :param func: function to run
    :param args: positional arguments passed through to func
    :param kwargs: keyword arguments passed through to func
    :return: result of the function
    """
    prev_state = self.run_mode
    self.run_mode = True
    try:
        return func(*args, **kwargs)
    finally:
        # Runs on both normal return and exception.
        self.run_mode = prev_state
8 changes: 8 additions & 0 deletions duplicate_files_in_folders/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,14 @@ def parse_arguments(cust_args=None, check_folders=True):
parser.set_defaults(delete_empty_folders=True)
parser.add_argument('--clear_cache', action='store_true', help=argparse.SUPPRESS) # for testing
parser.add_argument('--extra_logging', action='store_true', help=argparse.SUPPRESS) # for testing

# --action selects what to do with the duplicates; only 'move_duplicates' and 'create_csv' are accepted
parser.add_argument('--action', type=str, choices=['move_duplicates', 'create_csv'],
help='Action to perform: move_duplicates, create_csv', default='move_duplicates')




args = parser.parse_args(cust_args if cust_args else None)

# Validate the folders given in the arguments
Expand Down
23 changes: 16 additions & 7 deletions duplicate_files_in_folders/utils_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,11 @@ def display_initial_config(args: Namespace):
if not args.delete_empty_folders:
config_items["Empty Folders"] = "Do Not Delete Empty Folders in Scan Folder"

config_items["Script Mode"] = "Run Mode" if args.run else "Test Mode"
config_items["Script Mode"] = (
"Create CSV File" if args.action == 'create_csv' else
"Run Mode" if args.run else
"Test Mode"
)

# Print header
log_and_print(blank_line)
Expand Down Expand Up @@ -142,12 +146,17 @@ def output_results(args: Namespace, files_moved: int, files_created: int, delete
def confirm_script_execution(args: Namespace):
""" Confirm the script execution if not run by pytest. """
if not detect_pytest():
if not args.run:
print("This script is currently in test mode. No files will be moved.")
print(f"In run mode, duplicate files will be moved from {args.scan_dir} to {args.move_to}.")
else:
print(f"This script will move duplicate files from {args.scan_dir}. "
f"No additional confirmation will be asked.")
if args.action == 'move_duplicates':
if not args.run:
print("This script is currently in test mode. No files will be moved.")
print(f"In run mode, duplicate files will be moved from {args.scan_dir} to {args.move_to}.")
else:
print(f"This script will move duplicate files from {args.scan_dir}. "
f"No additional confirmation will be asked.")
elif args.action == 'create_csv':
print(f"This script will create a CSV file in {args.move_to}. The folder will be created if it doesn't "
f"exist.")

print("Do you want to continue? (y/n): ")
# while loop until the user enters 'y' or 'n'
while True:
Expand Down

0 comments on commit 1a3fe6e

Please sign in to comment.