-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_split.py
61 lines (49 loc) · 2.37 KB
/
data_split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
import csv
import argparse
def split_csv(input_file, output_dir, chunk_size=100 * 1024 * 1024):  # 100 MB
    """
    Split a CSV file into multiple files of a specified size.

    The first row of the input is treated as the header and is repeated at the
    top of every output file. Output files are named ``chunk_0.csv``,
    ``chunk_1.csv``, ... inside ``output_dir``. Rows are streamed directly to
    the currently open output file, so memory use does not grow with
    ``chunk_size``.

    The size of a chunk is an estimate: the total number of characters in the
    field values of its data rows. The header, delimiters, quoting and line
    terminators are not counted, so a file on disk can be somewhat larger than
    ``chunk_size``. A single row whose estimate exceeds ``chunk_size`` is
    written to a chunk of its own.

    An input with no data rows (empty, or header only) produces no output
    files and is reported as 0 chunks.

    Args:
        input_file (str): Path to the input CSV file.
        output_dir (str): Path to the output directory where the split files will be saved.
            It is created if it does not exist.
        chunk_size (int): Target maximum size of each output file, compared against the
            estimate described above (default: 100 MB).
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_dir, exist_ok=True)

    chunk_count = 0          # number of chunk files opened so far
    current_chunk_size = 0   # estimated size of the rows in the open chunk
    out_file = None          # handle of the chunk being written, if any
    writer = None

    try:
        # newline='' lets the csv module handle newlines embedded in quoted fields.
        with open(input_file, 'r', newline='') as file:
            reader = csv.reader(file)
            # A default of None keeps an empty input from raising StopIteration;
            # the loop below then simply has nothing to iterate over.
            header = next(reader, None)

            for row in reader:
                row_size = sum(len(value) for value in row)

                # Roll over only when the open chunk already holds a row, so an
                # oversized first row never leaves a header-only file behind.
                if out_file is not None and current_chunk_size + row_size > chunk_size:
                    out_file.close()
                    out_file = None

                if out_file is None:
                    output_file = os.path.join(output_dir, f"chunk_{chunk_count}.csv")
                    out_file = open(output_file, 'w', newline='')
                    writer = csv.writer(out_file)
                    writer.writerow(header)
                    chunk_count += 1
                    current_chunk_size = 0

                writer.writerow(row)
                current_chunk_size += row_size
    finally:
        # Close the last (or partially written) chunk even if reading failed.
        if out_file is not None:
            out_file.close()

    print(f"CSV file split into {chunk_count} chunks.")
if __name__ == "__main__":
    # Command-line entry point: two positional paths plus an optional size
    # limit, forwarded unchanged to split_csv.
    cli = argparse.ArgumentParser(
        description="Split a CSV file into multiple files of a specified size."
    )
    cli.add_argument(
        "input_file",
        help="Path to the input CSV file.",
    )
    cli.add_argument(
        "output_dir",
        help="Path to the output directory where the split files will be saved.",
    )
    cli.add_argument(
        "--chunk_size",
        type=int,
        default=100 * 1024 * 1024,
        help="Maximum size of each output file in bytes (default: 100 MB).",
    )
    options = cli.parse_args()
    split_csv(options.input_file, options.output_dir, options.chunk_size)