-
Notifications
You must be signed in to change notification settings - Fork 0
/
read_csv.py
executable file
·93 lines (69 loc) · 3.35 KB
/
read_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python3
import sys
import pandas as pd
import rfc3339 as rfc # for date object -> date string
# Command-line contract: exactly four arguments after the script name, in the
# order <encoding> <error> <file_location> <is_debug>. Check the count up front
# so a bad invocation prints a usage line instead of an unpacking traceback.
if len(sys.argv) != 5:
    sys.exit(
        "usage: %s <encoding> <error> <file_location> <is_debug>"
        % (sys.argv[:1] or ["read_csv.py"])[0]
    )
script, encoding, error, file_location, is_debug = sys.argv
def _bad_line_kwargs():
    """Return the ``read_csv`` keyword arguments that skip bad lines with a warning.

    Newer pandas releases spell this ``on_bad_lines="warn"``; older ones only
    know ``error_bad_lines`` / ``warn_bad_lines``. The installed ``read_csv``
    signature decides which spelling is used.
    """
    import inspect

    if "on_bad_lines" in inspect.signature(pd.read_csv).parameters:
        return {"on_bad_lines": "warn"}
    return {"error_bad_lines": False, "warn_bad_lines": True}


def main(encoder, errors, file_path, debug):
    """Load the CSV at *file_path* and hand the resulting frame to ``normalizer``.

    Args:
        encoder: Text encoding passed to ``open`` and ``pd.read_csv``.
        errors: Decoding error handler passed to ``open``.
        file_path: Path of the CSV file to read.
        debug: Forwarded unchanged to ``normalizer``.

    Returns:
        None. An empty or missing input file is reported on stderr instead of
        raising; the same two exception types are also caught if they escape
        from ``normalizer``.
    """
    try:
        # The context manager closes the handle once pandas has parsed it;
        # the frame is fully loaded by then, so normalizing can happen after.
        with open(file_path, encoding=encoder, errors=errors, newline='') as input_fd:
            reader = pd.read_csv(
                input_fd,
                parse_dates=["Timestamp"],
                encoding=encoder,
                **_bad_line_kwargs()
            )
        normalizer(reader, debug, file_path)
    except pd.errors.EmptyDataError:
        sys.stderr.write('\nERROR: File name %s is empty\n' % file_path)
    except FileNotFoundError:
        sys.stderr.write('\nERROR: File name %s not found\n' % file_path)
def _print_frame(title, frame):
    """Print *title*, *frame* and its column dtypes between separator lines."""
    print("\n-----------------------------")
    print(title)
    print(frame)
    print(frame.dtypes)
    print("-----------------------------\n")


def _output_file_name(file_path):
    """Build the output CSV name from the input path.

    Every ``/`` becomes ``-``, any resulting ``.-data-`` sequence is removed,
    then dots, dashes and the literal fragments ``inputs`` and ``csv`` are
    stripped (in that order) before ``_OUTPUT.csv`` is appended. For example
    ``./data/inputs/sample.csv`` yields ``sample_OUTPUT.csv``.
    """
    stem = file_path.replace("/", "-").replace(".-data-", "")
    # No "/" can remain after the first replacement, so only "." and "-"
    # still need stripping here.
    stem = stem.replace(".", "").replace("-", "")
    stem = stem.replace("inputs", "").replace("csv", "")
    return '%s_OUTPUT.csv' % stem


def normalizer(reader, debug, file_path):
    """Normalize the columns of *reader* and write the result to a CSV file.

    The frame is modified in place. Rows whose ``Timestamp``, ``FooDuration``
    or ``BarDuration`` value is rejected by ``remove_bad_rows`` are dropped;
    timestamps are shifted forward by three hours and formatted with
    ``rfc.rfc3339``; the two durations and their sum (``TotalDuration``) are
    converted to seconds; ``ZIP`` is left-padded with zeros and cut to five
    characters; ``FullName`` is upper-cased.

    Args:
        reader: DataFrame providing the ``Timestamp``, ``FooDuration``,
            ``BarDuration``, ``ZIP`` and ``FullName`` columns.
        debug: The string ``'True'`` enables diagnostic dumps on stdout.
        file_path: Input path; the output file name is derived from it.

    Returns:
        None. Nothing is written when *reader* is empty.
    """
    if reader.empty:
        return

    verbose = debug == 'True'
    if verbose:
        _print_frame("UTF-8 with char replacement DataFrame", reader)

    # Same object as ``reader``: every step below mutates the caller's frame.
    writer = reader

    remove_bad_rows(writer, writer["Timestamp"], pd.to_datetime)
    # Timestamp column is clean
    shifted = pd.to_datetime(writer["Timestamp"]) + pd.DateOffset(hours=3)
    writer["Timestamp"] = shifted
    writer["Timestamp"] = writer["Timestamp"].apply(rfc.rfc3339)

    duration_columns = ("FooDuration", "BarDuration")
    # Drop unparseable rows for both columns first, then convert what is left.
    for column in duration_columns:
        remove_bad_rows(writer, writer[column], pd.Timedelta)
    for column in duration_columns:
        writer[column] = writer[column].apply(pd.Timedelta)

    # Total duration, then each individual duration, expressed in seconds.
    total = writer["FooDuration"] + writer["BarDuration"]
    writer["TotalDuration"] = total.dt.total_seconds()
    for column in duration_columns:
        writer[column] = writer[column].dt.total_seconds()

    padded_zip = writer["ZIP"].apply(str).str.pad(width=5, fillchar='0')
    writer["ZIP"] = padded_zip.str[:5]
    writer["FullName"] = writer["FullName"].str.upper()

    if verbose:
        _print_frame("normalized DataFrame", writer)
        # NOTE(review): treated as debug-only output; confirm it was not
        # meant to be printed on every run.
        print(writer["TotalDuration"])

    output_file_name = _output_file_name(file_path)
    writer.to_csv(output_file_name, index=False)
    print("-----------> Normalized %s" % output_file_name)
def remove_bad_rows(data_frame, column_series, method_validator):
    """Drop from *data_frame* every row whose value in *column_series* is invalid.

    A value is invalid when ``method_validator(value)`` raises ``ValueError``
    or ``TypeError``. Each such value is reported on stderr and its row is
    removed from *data_frame* in place. Columns whose dtype is not ``object``
    are skipped without validating anything.

    Args:
        data_frame: Frame to drop rows from; modified in place.
        column_series: Column to validate; its index labels select the rows
            to drop from *data_frame*.
        method_validator: Callable applied to each value of *column_series*.

    Returns:
        None.
    """
    if column_series.dtypes != object:
        return

    # Then at least one row has bad data
    bad_labels = []
    # Iterate index labels rather than positions: once an earlier call has
    # dropped rows the two no longer line up, and ``drop`` expects labels.
    for label, value in column_series.items():
        try:
            method_validator(value)
        except (ValueError, TypeError):
            sys.stderr.write(
                '\nWARNING: Bad %s dropping row containing %s\n'
                % (column_series.name, value)
            )
            bad_labels.append(label)

    if bad_labels:
        data_frame.drop(bad_labels, inplace=True)
# Start the run only when this file is executed as a script.
if __name__ == "__main__":
    main(encoding, error, file_location, is_debug)