|
19 | 19 | from multiprocessing.connection import Connection
|
20 | 20 |
|
21 | 21 | import time
|
| 22 | +import os |
| 23 | +import logging |
22 | 24 |
|
23 |
| -from ..coretex import MetricType |
| 25 | +import psutil |
| 26 | + |
| 27 | +from ..coretex import MetricType, Experiment |
24 | 28 | from ..networking import networkManager, NetworkRequestError
|
25 |
| -from ..coretex.experiment import Experiment |
| 29 | +from ..logging import initializeLogger, LogSeverity |
| 30 | +from ..folder_management import FolderManager |
26 | 31 | from ..coretex.experiment.metrics.metric_factory import createMetric
|
27 | 32 |
|
28 | 33 |
|
@@ -65,8 +70,10 @@ def setupGPUMetrics() -> None:
|
65 | 70 | createMetric("gpu_usage", "time (s)", MetricType.interval, "usage (%)", MetricType.percent, None, [0, 100]),
|
66 | 71 | createMetric("gpu_temperature", "time (s)", MetricType.interval, "usage (%)", MetricType.percent)
|
67 | 72 | ])
|
| 73 | + |
| 74 | + logging.getLogger("coretexpylib").debug(">> [Coretex] Initialized GPU metrics") |
68 | 75 | except:
|
69 |
| - pass |
| 76 | + logging.getLogger("coretexpylib").debug(">> [Coretex] Failed to initialize GPU metrics") |
70 | 77 |
|
71 | 78 |
|
72 | 79 | def _isAlive(output: Connection) -> bool:
|
@@ -101,29 +108,51 @@ def _uploadMetrics(experiment: Experiment) -> None:
|
101 | 108 | experiment.submitMetrics(metricValues)
|
102 | 109 |
|
103 | 110 |
|
104 |
| -def experimentWorker(outputStream: Connection, refreshToken: str, experimentId: int) -> None: |
| 111 | +def experimentWorker(output: Connection, refreshToken: str, experimentId: int) -> None: |
| 112 | + workerLogPath = FolderManager.instance().logs / f"experiment_worker_{experimentId}.log" |
| 113 | + initializeLogger(LogSeverity.debug, workerLogPath) |
| 114 | + |
| 115 | + currentProcess = psutil.Process(os.getpid()) |
| 116 | + |
105 | 117 | setupGPUMetrics()
|
106 | 118 |
|
107 | 119 | response = networkManager.authenticateWithRefreshToken(refreshToken)
|
108 | 120 | if response.hasFailed():
|
109 |
| - sendFailure(outputStream, "Failed to authenticate with refresh token") |
| 121 | + sendFailure(output, "Failed to authenticate with refresh token") |
110 | 122 | return
|
111 | 123 |
|
112 | 124 | try:
|
113 | 125 | experiment: Experiment = Experiment.fetchById(experimentId)
|
114 | 126 | except NetworkRequestError:
|
115 |
| - sendFailure(outputStream, f"Failed to fetch experiment with id \"{experimentId}\"") |
| 127 | + sendFailure(output, f"Failed to fetch experiment with id \"{experimentId}\"") |
116 | 128 | return
|
117 | 129 |
|
118 | 130 | try:
|
119 | 131 | experiment.createMetrics(METRICS)
|
120 | 132 | except NetworkRequestError:
|
121 |
| - sendFailure(outputStream, "Failed to create metrics") |
| 133 | + sendFailure(output, "Failed to create metrics") |
122 | 134 |
|
123 |
| - sendSuccess(outputStream, "Experiment worker succcessfully started") |
| 135 | + sendSuccess(output, "Experiment worker succcessfully started") |
124 | 136 |
|
125 |
| - while _isAlive(outputStream): |
| 137 | + while (parent := currentProcess.parent()) is not None: |
| 138 | + logging.getLogger("coretexpylib").debug(f">> [Coretex] Worker process id {currentProcess.pid}, parent process id {parent.pid}...") |
| 139 | + |
| 140 | + # If parent process ID is set to 1 then that means that the parent process has terminated |
| 141 | + # the process (this is only true for Unix-based systems), but since we run the Node |
| 142 | + # from the docker container which uses Linux as a base then it is safe to use. |
| 143 | + # |
| 144 | + # In docker container the pid of the Node process is 1, but we are safe to chech since the |
| 145 | + # node should never be a parent of this process for metric upload, only the experiment |
| 146 | + # process can be the parent. |
| 147 | + if parent.pid == 1 or not parent.is_running() or not _isAlive(output): |
| 148 | + logging.getLogger("coretexpylib").debug(">> [Coretex] Terminating worker process...") |
| 149 | + break |
| 150 | + |
| 151 | + logging.getLogger("coretexpylib").debug(">> [Coretex] Heartbeat") |
126 | 152 | _heartbeat(experiment)
|
| 153 | + |
| 154 | + logging.getLogger("coretexpylib").debug(">> [Coretex] Uploading metrics") |
127 | 155 | _uploadMetrics(experiment)
|
128 | 156 |
|
| 157 | + logging.getLogger("coretexpylib").debug(">> [Coretex] Sleeping for 5s") |
129 | 158 | time.sleep(5) # delay between sending generic metrics
|
0 commit comments