Skip to content

Commit a56f872

Browse files
authored
Merge pull request #26 from dule1322/CTX-4079
CTX-4079: Redesigned terminate condition for experiment worker process
2 parents 5ae9dbc + 5284968 commit a56f872

File tree

1 file changed

+38
-9
lines changed

1 file changed

+38
-9
lines changed

coretex/project/experiment_worker.py

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,15 @@
1919
from multiprocessing.connection import Connection
2020

2121
import time
22+
import os
23+
import logging
2224

23-
from ..coretex import MetricType
25+
import psutil
26+
27+
from ..coretex import MetricType, Experiment
2428
from ..networking import networkManager, NetworkRequestError
25-
from ..coretex.experiment import Experiment
29+
from ..logging import initializeLogger, LogSeverity
30+
from ..folder_management import FolderManager
2631
from ..coretex.experiment.metrics.metric_factory import createMetric
2732

2833

@@ -65,8 +70,10 @@ def setupGPUMetrics() -> None:
6570
createMetric("gpu_usage", "time (s)", MetricType.interval, "usage (%)", MetricType.percent, None, [0, 100]),
6671
createMetric("gpu_temperature", "time (s)", MetricType.interval, "usage (%)", MetricType.percent)
6772
])
73+
74+
logging.getLogger("coretexpylib").debug(">> [Coretex] Initialized GPU metrics")
6875
except:
69-
pass
76+
logging.getLogger("coretexpylib").debug(">> [Coretex] Failed to initialize GPU metrics")
7077

7178

7279
def _isAlive(output: Connection) -> bool:
@@ -101,29 +108,51 @@ def _uploadMetrics(experiment: Experiment) -> None:
101108
experiment.submitMetrics(metricValues)
102109

103110

104-
def experimentWorker(outputStream: Connection, refreshToken: str, experimentId: int) -> None:
111+
def experimentWorker(output: Connection, refreshToken: str, experimentId: int) -> None:
112+
workerLogPath = FolderManager.instance().logs / f"experiment_worker_{experimentId}.log"
113+
initializeLogger(LogSeverity.debug, workerLogPath)
114+
115+
currentProcess = psutil.Process(os.getpid())
116+
105117
setupGPUMetrics()
106118

107119
response = networkManager.authenticateWithRefreshToken(refreshToken)
108120
if response.hasFailed():
109-
sendFailure(outputStream, "Failed to authenticate with refresh token")
121+
sendFailure(output, "Failed to authenticate with refresh token")
110122
return
111123

112124
try:
113125
experiment: Experiment = Experiment.fetchById(experimentId)
114126
except NetworkRequestError:
115-
sendFailure(outputStream, f"Failed to fetch experiment with id \"{experimentId}\"")
127+
sendFailure(output, f"Failed to fetch experiment with id \"{experimentId}\"")
116128
return
117129

118130
try:
119131
experiment.createMetrics(METRICS)
120132
except NetworkRequestError:
121-
sendFailure(outputStream, "Failed to create metrics")
133+
sendFailure(output, "Failed to create metrics")
122134

123-
sendSuccess(outputStream, "Experiment worker succcessfully started")
135+
sendSuccess(output, "Experiment worker succcessfully started")
124136

125-
while _isAlive(outputStream):
137+
while (parent := currentProcess.parent()) is not None:
138+
logging.getLogger("coretexpylib").debug(f">> [Coretex] Worker process id {currentProcess.pid}, parent process id {parent.pid}...")
139+
140+
# If parent process ID is set to 1 then that means that the parent process has terminated
141+
# the process (this is only true for Unix-based systems), but since we run the Node
142+
# from the docker container which uses Linux as a base then it is safe to use.
143+
#
144+
# In docker container the pid of the Node process is 1, but we are safe to chech since the
145+
# node should never be a parent of this process for metric upload, only the experiment
146+
# process can be the parent.
147+
if parent.pid == 1 or not parent.is_running() or not _isAlive(output):
148+
logging.getLogger("coretexpylib").debug(">> [Coretex] Terminating worker process...")
149+
break
150+
151+
logging.getLogger("coretexpylib").debug(">> [Coretex] Heartbeat")
126152
_heartbeat(experiment)
153+
154+
logging.getLogger("coretexpylib").debug(">> [Coretex] Uploading metrics")
127155
_uploadMetrics(experiment)
128156

157+
logging.getLogger("coretexpylib").debug(">> [Coretex] Sleeping for 5s")
129158
time.sleep(5) # delay between sending generic metrics

0 commit comments

Comments
 (0)