Skip to content

Commit

Permalink
Fix bug with running frames not being updated (#1346)
Browse files Browse the repository at this point in the history
* Fix bug with running frames not being updated

Comparing with the cue3 version of HostReportHandler, verifyRunningFrameInfo was supposed to update the list of frames so that only the frames that have been verified are kept for the following steps. As gRPC objects are immutable, the logic simply kept the full list of frames, including frames that may have been canceled, completed or migrated, which impacted the following calls.

* Add missing import
  • Loading branch information
DiegoTavares authored May 21, 2024
1 parent 3084958 commit 1cdec64
Showing 1 changed file with 46 additions and 35 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import com.imageworks.spcue.CommentDetail;
import com.imageworks.spcue.DispatchHost;
import com.imageworks.spcue.FrameInterface;
import com.imageworks.spcue.FrameDetail;
import com.imageworks.spcue.JobEntity;
import com.imageworks.spcue.LayerEntity;
import com.imageworks.spcue.LayerDetail;
Expand All @@ -50,13 +51,15 @@
import com.imageworks.spcue.VirtualProc;
import com.imageworks.spcue.dao.JobDao;
import com.imageworks.spcue.dao.LayerDao;
import com.imageworks.spcue.dispatcher.HostReportHandler.KillCause;
import com.imageworks.spcue.dispatcher.commands.DispatchBookHost;
import com.imageworks.spcue.dispatcher.commands.DispatchBookHostLocal;
import com.imageworks.spcue.dispatcher.commands.DispatchHandleHostReport;
import com.imageworks.spcue.dispatcher.commands.DispatchRqdKillFrame;
import com.imageworks.spcue.dispatcher.commands.DispatchRqdKillFrameMemory;
import com.imageworks.spcue.grpc.host.HardwareState;
import com.imageworks.spcue.grpc.host.LockState;
import com.imageworks.spcue.grpc.job.FrameState;
import com.imageworks.spcue.grpc.report.BootReport;
import com.imageworks.spcue.grpc.report.CoreDetail;
import com.imageworks.spcue.grpc.report.HostReport;
Expand Down Expand Up @@ -208,23 +211,23 @@ public void handleHostReport(HostReport report, boolean isBoot) {
* Verify all the frames in the report are valid.
* Frames that are not valid are removed.
*/
verifyRunningFrameInfo(report);
List<RunningFrameInfo> runningFrames = verifyRunningFrameInfo(report);

/*
* Updates memory usage for the proc, frames,
* jobs, and layers. And LLU time for the frames.
*/
updateMemoryUsageAndLluTime(report.getFramesList());
updateMemoryUsageAndLluTime(runningFrames);

/*
* kill frames that have over run.
*/
killTimedOutFrames(report);
killTimedOutFrames(runningFrames, report.getHost().getName());

/*
* Prevent OOM (Out-Of-Memory) issues on the host and manage frame reserved memory
*/
handleMemoryUsage(host, report);
handleMemoryUsage(host, report.getHost(), runningFrames);

/*
* The checks are done in order of least CPU intensive to
Expand Down Expand Up @@ -473,21 +476,21 @@ private void changeLockState(DispatchHost host, CoreDetail coreInfo) {
* - A frame is taking more than OOM_FRAME_OVERBOARD_PERCENT of what it had reserved
* For frames that are using more than they had reserved but not above the threshold, negotiate expanding
* the reservations with other frames on the same host
* @param host
*
* @param dispatchHost
* @param report
*/
private void handleMemoryUsage(final DispatchHost host, final HostReport report) {
private void handleMemoryUsage(final DispatchHost dispatchHost, RenderHost renderHost,
List<RunningFrameInfo> runningFrames) {
// Don't keep memory balances on nimby hosts
if (host.isNimby) {
if (dispatchHost.isNimby) {
return;
}

final double OOM_MAX_SAFE_USED_MEMORY_THRESHOLD =
env.getRequiredProperty("dispatcher.oom_max_safe_used_memory_threshold", Double.class);
final double OOM_FRAME_OVERBOARD_ALLOWED_THRESHOLD =
env.getRequiredProperty("dispatcher.oom_frame_overboard_allowed_threshold", Double.class);
RenderHost renderHost = report.getHost();
List<RunningFrameInfo> runningFrames = report.getFramesList();
final double OOM_MAX_SAFE_USED_MEMORY_THRESHOLD = env
.getRequiredProperty("dispatcher.oom_max_safe_used_memory_threshold", Double.class);
final double OOM_FRAME_OVERBOARD_ALLOWED_THRESHOLD = env
.getRequiredProperty("dispatcher.oom_frame_overboard_allowed_threshold", Double.class);

boolean memoryWarning = renderHost.getTotalMem() > 0 &&
((double)renderHost.getFreeMem()/renderHost.getTotalMem() <
Expand All @@ -500,7 +503,7 @@ private void handleMemoryUsage(final DispatchHost host, final HostReport report)
int killAttemptsRemaining = 10;
VirtualProc killedProc = null;
do {
killedProc = killWorstMemoryOffender(host);
killedProc = killWorstMemoryOffender(dispatchHost);
killAttemptsRemaining -= 1;
if (killedProc != null) {
memoryAvailable = memoryAvailable + killedProc.memoryUsed;
Expand All @@ -514,7 +517,7 @@ private void handleMemoryUsage(final DispatchHost host, final HostReport report)
// them accordingly
for (final RunningFrameInfo frame : runningFrames) {
if (OOM_FRAME_OVERBOARD_ALLOWED_THRESHOLD > 0 && isFrameOverboard(frame)) {
if (!killFrameOverusingMemory(frame, host.getName())) {
if (!killFrameOverusingMemory(frame, dispatchHost.getName())) {
logger.warn("Frame " + frame.getJobName() + "." + frame.getFrameName() +
" is overboard but could not be killed");
}
Expand Down Expand Up @@ -748,25 +751,26 @@ private void handleMemoryReservations(final RunningFrameInfo frame) {
*
* @param rFrames
*/
private void killTimedOutFrames(HostReport report) {
final Map<String, LayerDetail> layers = new HashMap<String, LayerDetail>(5);

for (RunningFrameInfo frame: report.getFramesList()) {
private void killTimedOutFrames(List<RunningFrameInfo> runningFrames, String hostname) {
for (RunningFrameInfo frame : runningFrames) {
String layerId = frame.getLayerId();
LayerDetail layer = layerDao.getLayerDetail(layerId);
long runtimeMinutes = ((System.currentTimeMillis() - frame.getStartTime()) / 1000l) / 60;

String hostname = report.getHost().getName();
try {
LayerDetail layer = layerDao.getLayerDetail(layerId);
long runtimeMinutes = ((System.currentTimeMillis() - frame.getStartTime()) / 1000l) / 60;

if (layer.timeout != 0 && runtimeMinutes > layer.timeout){
killFrame(frame.getFrameId(), hostname, KillCause.FrameTimedOut);
} else if (layer.timeout_llu != 0 && frame.getLluTime() != 0) {
long r = System.currentTimeMillis() / 1000;
long lastUpdate = (r - frame.getLluTime()) / 60;
if (layer.timeout != 0 && runtimeMinutes > layer.timeout) {
killFrame(frame.getFrameId(), hostname, KillCause.FrameTimedOut);
} else if (layer.timeout_llu != 0 && frame.getLluTime() != 0) {
long r = System.currentTimeMillis() / 1000;
long lastUpdate = (r - frame.getLluTime()) / 60;

if (layer.timeout_llu != 0 && lastUpdate > (layer.timeout_llu - 1)){
killFrame(frame.getFrameId(), hostname, KillCause.FrameLluTimedOut);
if (layer.timeout_llu != 0 && lastUpdate > (layer.timeout_llu - 1)) {
killFrame(frame.getFrameId(), hostname, KillCause.FrameLluTimedOut);
}
}
} catch (EmptyResultDataAccessException e) {
logger.info("Unable to get layer with id=" + layerId);
}
}
}
Expand Down Expand Up @@ -871,10 +875,8 @@ private void updateLayerMemoryUsage(List<RunningFrameInfo> frames) {
*
* @param report
*/
public void verifyRunningFrameInfo(HostReport report) {

List<RunningFrameInfo> runningFrames = new
ArrayList<RunningFrameInfo>(report.getFramesCount());
public List<RunningFrameInfo> verifyRunningFrameInfo(HostReport report) {
List<RunningFrameInfo> runningFrames = new ArrayList<RunningFrameInfo>(report.getFramesCount());

for (RunningFrameInfo runningFrame: report.getFramesList()) {

Expand Down Expand Up @@ -928,15 +930,23 @@ public void verifyRunningFrameInfo(HostReport report) {
proc = null;
}
if (proc == null) {
if (killFrame(runningFrame.getFrameId(),
// A frameCompleteReport might have been delivered before this report was
// processed
FrameDetail frameLatestVersion = jobManager.getFrameDetail(runningFrame.getFrameId());
if (frameLatestVersion.state != FrameState.RUNNING) {
logger.info("DelayedVerification, the proc " +
runningFrame.getResourceId() + " on host " +
report.getHost().getName() + " has already Completed " +
runningFrame.getJobName() + "/" + runningFrame.getFrameName());
} else if (killFrame(runningFrame.getFrameId(),
report.getHost().getName(),
KillCause.FrameVerificationFailure)) {
logger.info("FrameVerificationError, the proc " +
runningFrame.getResourceId() + " on host " +
report.getHost().getName() + " was running for " +
(runtimeSeconds / 60.0f) + " minutes " +
runningFrame.getJobName() + "/" + runningFrame.getFrameName() +
"but the DB did not " +
" but the DB did not " +
"reflect this. " +
msg);
} else {
Expand All @@ -946,6 +956,7 @@ public void verifyRunningFrameInfo(HostReport report) {
}
}
}
return runningFrames;
}

public HostManager getHostManager() {
Expand Down

0 comments on commit 1cdec64

Please sign in to comment.