fullprocess.py
import json
import os
import re
import subprocess
import sys
import time
from pathlib import Path

from scoring import score_model

# Load config.json and read the input and output paths
with open('config.json', 'r') as f:
    config = json.load(f)

prod_deploy_path = config['prod_deployment_path']
input_folder_path = config['input_folder_path']
output_folder_path = config['output_folder_path']
output_model_path = config['output_model_path']
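# Assumes config.json holds the four path entries used above, roughly:
# {
#     "input_folder_path": "sourcedata",
#     "output_folder_path": "ingesteddata",
#     "output_model_path": "models",
#     "prod_deployment_path": "production_deployment"
# }
# The example values are placeholders; only the keys are required.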
################## Check and read new data
# First, read ingestedfiles.txt from the production deployment directory
# and extract the names of the datasets that have already been ingested.
with open(f'{prod_deploy_path}/ingestedfiles.txt', 'r') as base_file:
    file_contents = base_file.read()
base_datasets = re.findall(r'File \d+: .*/(.+\.csv)', file_contents)

# Collect the names of the CSV files currently in the input folder.
directory = Path(input_folder_path)
check_files = [file.name for file in directory.glob('*.csv')]
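# The regex above assumes ingestedfiles.txt lists one file per line in a
# format such as:
#   File 1: /sourcedata/dataset1.csv
#   File 2: /sourcedata/dataset2.csv
# Adjust the pattern if ingestion.py records its files differently.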
# Second, determine whether the source data folder has files that aren't
# listed in ingestedfiles.txt.
new_files_found = len(set(check_files) - set(base_datasets)) > 0

################## Deciding whether to proceed, part 1
# If there is new data, ingest it and check for model drift; otherwise end
# the process here.
if new_files_found:
    print('running ingestion.py')
    # Run ingestion.py to ingest all the new data.
    os.system('python3 ingestion.py')
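    # NOTE: os.system does not raise on a non-zero exit code; if stricter
    # error handling is wanted, subprocess.run(['python3', 'ingestion.py'],
    # check=True) would stop the pipeline when ingestion fails.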
    print('getting old and new scores')

    ################## Checking for model drift
    # Read the score of the currently deployed model from latestscore.txt in
    # the deployment directory (the prod_deployment_path entry of config.json).
    with open(f'{prod_deploy_path}/latestscore.txt', 'r') as score_file:
        last_score = float(score_file.read())

    # Score the model against the most recent data produced by the ingestion
    # step, using score_model from scoring.py.
    new_score = score_model(output_folder_path, output_model_path, 'finaldata.csv')
    print('new: ', new_score)
    print('old: ', last_score)
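    # score_model(output_folder_path, output_model_path, 'finaldata.csv') is
    # assumed to load the model, predict on finaldata.csv, and return the F1
    # score as a float; check scoring.py if its signature differs.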
    ################## Deciding whether to proceed, part 2
    # Compare the new score with the one recorded in latestscore.txt using the
    # raw comparison test: if the new score is lower, model drift has occurred;
    # otherwise it has not, and the process ends here.
    if new_score < last_score:
        print('model drift has occurred')

        ################## Re-deployment
        print('running training.py')
        # Re-train the model on the newest ingested data.
        os.system('python3 training.py')
        print('running deployment.py')
        # Re-deploy the re-trained model and its artifacts to production.
        os.system('python3 deployment.py')
    else:
        print('no model drift')
        sys.exit()
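    # Reaching this point means drift was detected and a re-trained model has
    # just been deployed. The raw comparison assumes a score where higher is
    # better (e.g. F1), so a lower new score counts as drift.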
    ################## Diagnostics and reporting
    # Run apicalls.py and reporting.py against the newly deployed model.
    print('running app.py and apicalls.py')
    # app.py starts a Flask server that blocks the calling process, so launch
    # it in the background and give it a moment to start before hitting its
    # endpoints from apicalls.py.
    app_process = subprocess.Popen(['python3', 'app.py'])
    time.sleep(5)
    os.system('python3 apicalls.py')
    # Stop the background server once the API calls are done.
    app_process.terminate()
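    # The fixed 5-second wait is a simplifying assumption; a more robust
    # approach would retry the first request until the server responds.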
    print('running reporting.py')
    # Generate the reporting output for the newly deployed model.
    os.system('python3 reporting.py')
else:
    print('data not changed, so no need to retrain')
    sys.exit()