"""bloat_data.py

Split a directory of real and fake ``.jpg`` images into train and test sets.
"""
import glob
import os
import random
import shutil
from math import floor
# Path to the directory where the downloaded data is. Must be set before
# running: main() raises FileNotFoundError if this path does not exist.
DATA_DIR = ""
# Path to the directory where the new dataset should be created.
# WARNING: main() deletes this whole directory tree before (re)creating it.
TARGET_DIR = ""
# Name of the sub-directory containing real images in DATA_DIR; also used as
# the real-class folder name under train/ and test/ in TARGET_DIR.
REAL_DIR = "real"
# Name of the sub-directory containing fake images in DATA_DIR; also used as
# the fake-class folder name under train/ and test/ in TARGET_DIR.
FAKE_DIR = "df"
# Fraction of the images (0.10 == 10%) that is put in the test set; the
# remaining images go to the training set.
PROP = 0.10
def create_target_dirs():
    """Create the train/test class directories under TARGET_DIR.

    Builds ``<TARGET_DIR>/<split>/<class>`` for each split ("train", "test")
    and each class folder (REAL_DIR, FAKE_DIR), creating intermediate
    directories as needed.

    Raises:
        FileExistsError: if one of the leaf directories already exists.
    """
    # Same creation order as before: train/real, train/fake, test/real, test/fake.
    for split in ("train", "test"):
        for label in (REAL_DIR, FAKE_DIR):
            os.makedirs(os.path.join(TARGET_DIR, split, label))
def _is_real_image(img):
    """Return True if *img* lies inside a directory named REAL_DIR under DATA_DIR.

    Only the directory components *below* DATA_DIR are inspected, so neither
    the location of DATA_DIR itself nor the image's file name can cause a
    false match (a plain substring test on the full path would).
    """
    rel_dir = os.path.relpath(os.path.dirname(img), DATA_DIR)
    return REAL_DIR in rel_dir.split(os.sep)


def main():
    """Randomly split the images in DATA_DIR into train and test sets.

    Every ``*.jpg`` found recursively under DATA_DIR is copied into
    ``TARGET_DIR/<train|test>/<REAL_DIR|FAKE_DIR>``. A fraction PROP of the
    images (rounded down) is sampled at random for the test set. After each
    split the user is asked whether to try again; any answer other than
    "n"/"N" deletes TARGET_DIR and draws a new random split.

    Raises:
        FileNotFoundError: if DATA_DIR does not exist.
        NotADirectoryError: if DATA_DIR is not a directory.
    """
    if not os.path.exists(DATA_DIR):
        raise FileNotFoundError(f"{DATA_DIR} does not exist")
    if not os.path.isdir(DATA_DIR):
        raise NotADirectoryError(f"{DATA_DIR} is not a directory")

    # Start from a clean slate; a missing TARGET_DIR is not an error here.
    shutil.rmtree(TARGET_DIR, ignore_errors=True)

    path = os.path.join(DATA_DIR, "**", "*.jpg")
    imgs = glob.glob(path, recursive=True)
    num_imgs = len(imgs)
    test_size = floor(num_imgs * PROP)
    train_size = num_imgs - test_size

    # Classify each image once; the result does not change between retries.
    # Anything that is not under a REAL_DIR directory is treated as fake.
    real_imgs = {img for img in imgs if _is_real_image(img)}

    while True:
        print(f"Creating directories at {TARGET_DIR}....")
        create_target_dirs()

        # A set gives O(1) membership tests in the copy loop below.
        selected = set(random.sample(imgs, test_size))

        test_r_len = len(selected & real_imgs)
        test_f_len = test_size - test_r_len
        train_r_len = len(real_imgs) - test_r_len
        train_f_len = train_size - train_r_len

        print(f"Found {num_imgs} images in {DATA_DIR}....")
        print(f"Copying {train_size} files to {TARGET_DIR}/train/....")
        print(f"Copying {test_size} files to {TARGET_DIR}/test/....")
        print(
            f"The training set will have {train_r_len} real images and {train_f_len} fake images...."
        )
        print(
            f"The test set will have {test_r_len} real images and {test_f_len} fake images..."
        )

        # NOTE: files are copied flat into the class directory, so images
        # that share a base name in different source folders overwrite
        # each other.
        for img in imgs:
            leaf_dir = REAL_DIR if img in real_imgs else FAKE_DIR
            intr_dir = "test" if img in selected else "train"
            dst = os.path.join(TARGET_DIR, intr_dir, leaf_dir)
            shutil.copy(img, dst)

        prompt = input("Try again? [Yy/Nn] ")
        # Tolerate surrounding whitespace so "n " does not trigger a re-split.
        if prompt.strip() in ("n", "N"):
            print("Exiting....")
            break

        print("\n\nTrying again....")
        print(f"Deleting {TARGET_DIR}....\n")
        shutil.rmtree(TARGET_DIR)
# Run the split only when executed as a script, not when imported.
if __name__ == "__main__":
    main()