Skip to content

Commit

Permalink
Partial Fix to Memory Allocation Bug
Browse files Browse the repository at this point in the history
Fix the memory allocation bug that allowed users to request any amount of memory from Slurm. Slurm may reject their request, but the filter placed on VCell was not stopping these requests to begin with. Fixed it for the simulation state machine, but SlurmProxy still requires some changes.
  • Loading branch information
AvocadoMoon committed Aug 27, 2024
1 parent b40dee9 commit 1c88e93
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 229 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ public static void setConfigProvider(VCellConfigProvider configProvider) {
public static final String htcPbsHome = record("vcell.htc.pbs.home",ValueType.GEN);
public static final String htcSgeHome = record("vcell.htc.sge.home",ValueType.GEN);
public static final String htcNodeList = record("vcell.htc.nodelist",ValueType.GEN);
public static final String htcMinMemoryMB = record("vcell.htc.memory.min.mb", ValueType.INT); // minimum memory request in MB, currently 4g
public static final String htcMaxMemoryMB = record("vcell.htc.memory.max.mb", ValueType.INT); // maximum memory request in MB
public static final String htcPowerUserMemoryFloorMB = record("vcell.htc.memory.pu.floor.mb", ValueType.INT); // MIN memory allowed if declared to be a power user, currently 50g (Previously Existing Value)
public static final String slurm_cmd_sbatch = record("vcell.slurm.cmd.sbatch",ValueType.GEN);
public static final String slurm_cmd_scancel = record("vcell.slurm.cmd.scancel",ValueType.GEN);
public static final String slurm_cmd_sacct = record("vcell.slurm.cmd.sacct",ValueType.GEN);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import java.util.Arrays;
import java.util.Date;

import cbit.vcell.resource.PropertyLoader;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.vcell.util.DataAccessException;
Expand Down Expand Up @@ -482,16 +483,13 @@ public synchronized void onDispatch(Simulation simulation, SimulationJobStatus o
}
SimulationTask simulationTask = new SimulationTask(new SimulationJob(simulation, jobIndex, fieldDataIdentifierSpecs), taskID,null,isPowerUser);

double requiredMemMB = simulationTask.getEstimatedMemorySizeMB();
//SimulationStateMachine ultimately instantiated from {vcellroot}/docker/build/Dockerfile-sched-dev by way of cbit.vcell.message.server.dispatcher.SimulationDispatcher
String vcellUserid = simulationTask.getUser().getName();
KeyValue simID = simulationTask.getSimulationInfo().getSimulationVersion().getVersionKey();
SolverDescription solverDescription = simulationTask.getSimulation().getSolverTaskDescription().getSolverDescription();
double estimatedMemMB = simulationTask.getEstimatedMemorySizeMB();
double htcMinMemoryMB = Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.htcMinMemoryMB));
double htcMaxMemoryMB = Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.htcMaxMemoryMB));
double requestedMemoryMB = Math.max(estimatedMemMB, htcMinMemoryMB);

MemLimitResults allowableMemMB = HtcProxy.getMemoryLimit(vcellUserid,simID,solverDescription, requiredMemMB, isPowerUser);

final SimulationJobStatus newSimJobStatus;
if (requiredMemMB > allowableMemMB.getMemLimit()) {
if (requestedMemoryMB > htcMaxMemoryMB) {
//
// fail the simulation
//
Expand All @@ -501,7 +499,7 @@ public synchronized void onDispatch(Simulation simulation, SimulationJobStatus o
SimulationExecutionStatus newSimExeStatus = new SimulationExecutionStatus(null, null, new Date(), null, false, null);
newSimJobStatus = new SimulationJobStatus(VCellServerID.getSystemServerID(),vcSimID,jobIndex,
oldSimulationJobStatus.getSubmitDate(),SchedulerStatus.FAILED,taskID,
SimulationMessage.jobFailed("simulation required "+requiredMemMB+"MB of memory, only "+allowableMemMB.getMemLimit()+"MB allowed from "+allowableMemMB.getMemLimitSource()),
SimulationMessage.jobFailed("simulation required "+estimatedMemMB+"MB of memory, only "+htcMaxMemoryMB+"MB allowed"),
newQueueStatus,newSimExeStatus);

simulationDatabase.updateSimulationJobStatus(newSimJobStatus);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import cbit.vcell.message.server.htc.HtcProxy;
import cbit.vcell.messaging.server.SimulationTask;
import cbit.vcell.mongodb.VCMongoMessage;
import cbit.vcell.resource.PropertyLoader;
import cbit.vcell.server.*;
import cbit.vcell.solver.*;
import cbit.vcell.solver.server.SimulationMessage;
Expand Down Expand Up @@ -458,16 +459,13 @@ public synchronized void onDispatch(Simulation simulation, SimulationJobStatus o
}
SimulationTask simulationTask = new SimulationTask(new SimulationJob(simulation, jobIndex, fieldDataIdentifierSpecs), taskID,null,isPowerUser);

double requiredMemMB = simulationTask.getEstimatedMemorySizeMB();
//SimulationStateMachine ultimately instantiated from {vcellroot}/docker/build/Dockerfile-sched-dev by way of cbit.vcell.message.server.dispatcher.SimulationDispatcher
String vcellUserid = simulationTask.getUser().getName();
KeyValue simID = simulationTask.getSimulationInfo().getSimulationVersion().getVersionKey();
SolverDescription solverDescription = simulationTask.getSimulation().getSolverTaskDescription().getSolverDescription();

HtcProxy.MemLimitResults allowableMemMB = HtcProxy.getMemoryLimit(vcellUserid,simID,solverDescription, requiredMemMB, isPowerUser);
double estimatedMemMB = simulationTask.getEstimatedMemorySizeMB();
double htcMinMemoryMB = Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.htcMinMemoryMB));
double htcMaxMemoryMB = Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.htcMaxMemoryMB));
double requestedMemoryMB = Math.max(estimatedMemMB, htcMinMemoryMB);

final SimulationJobStatus newSimJobStatus;
if (requiredMemMB > allowableMemMB.getMemLimit()) {
if (requestedMemoryMB > htcMaxMemoryMB) {
//
// fail the simulation
//
Expand All @@ -477,7 +475,7 @@ public synchronized void onDispatch(Simulation simulation, SimulationJobStatus o
SimulationExecutionStatus newSimExeStatus = new SimulationExecutionStatus(null, null, new Date(), null, false, null);
newSimJobStatus = new SimulationJobStatus(VCellServerID.getSystemServerID(),vcSimID,jobIndex,
oldSimulationJobStatus.getSubmitDate(), SimulationJobStatus.SchedulerStatus.FAILED,taskID,
SimulationMessage.jobFailed("simulation required "+requiredMemMB+"MB of memory, only "+allowableMemMB.getMemLimit()+"MB allowed from "+allowableMemMB.getMemLimitSource()),
SimulationMessage.jobFailed("simulation required "+estimatedMemMB+"MB of memory, only "+htcMaxMemoryMB+"MB allowed"),
newQueueStatus,newSimExeStatus);

simulationDatabase.updateSimulationJobStatus(newSimJobStatus);
Expand Down
225 changes: 14 additions & 211 deletions vcell-server/src/main/java/cbit/vcell/message/server/htc/HtcProxy.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.vcell.util.BeanUtils;
import org.vcell.util.document.KeyValue;
import org.vcell.util.exe.ExecutableException;

Expand Down Expand Up @@ -251,8 +250,8 @@ public static void writeUnixStyleTextFile(File file, String javaString) throws I

public abstract String getSubmissionFileExtension();
public static class MemLimitResults {
private static final long FALLBACK_MEM_LIMIT_MB=4096; // MAX memory allowed if not set in limitFile, currently 4g
private static final long POWER_USER_MEMORY_FLOOR=51200; // MIN memory allowed if declared to be a power user, currently 50g
private static final long FALLBACK_MEM_LIMIT_MB= Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.htcMinMemoryMB)); // default (minimum) memory request in MB, read from the required property vcell.htc.memory.min.mb; used as the starting value before the estimate / power-user floor are applied
private static final long POWER_USER_MEMORY_FLOOR=Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.htcPowerUserMemoryFloorMB)); // MIN memory (MB) allowed if declared to be a power user, read from the required property vcell.htc.memory.pu.floor.mb
private long memLimit;
private String memLimitSource;
public MemLimitResults(long memLimit, String memLimitSource) {
Expand All @@ -266,223 +265,27 @@ public long getMemLimit() {
public String getMemLimitSource() {
return memLimitSource;
}
private static MemLimitResults getFallbackMemLimitMB(SolverDescription solverDescription,double estimatedMemSizeMB, boolean isPowerUser) {
Long result = null;
String source = null;
try {
List<String> solverMemLimits = Files.readAllLines(Paths.get(new File("/"+System.getProperty(PropertyLoader.htcLogDirInternal)+"/slurmMinMem.txt").getAbsolutePath()));
for (Iterator<String> iterator = solverMemLimits.iterator(); iterator.hasNext();) {
String solverAndLimit = iterator.next().trim();
if(solverAndLimit.length()==0 || solverAndLimit.startsWith("//")) {
continue;
}
StringTokenizer st = new StringTokenizer(solverAndLimit,":");
String limitSolver = st.nextToken();
if(limitSolver.equalsIgnoreCase("all") && result == null) {//use all if there is not solver matching name in slurmMinMem.txt
result = Long.parseLong(st.nextToken());
source = "used slurmMinMem.txt all";
}else if(solverDescription != null && limitSolver.equals(solverDescription.name())) {//use matching solver mem limit from file
result = Long.parseLong(st.nextToken());
source = "used slurmMinMem.txt "+solverDescription.name();
break;
}
}
if(result == null) {//empty slurmMinMem.txt
result = FALLBACK_MEM_LIMIT_MB;
source = "Empty used FALLBACK_MEM_LIMIT_MB";
}
} catch (Exception e) {
LG.debug(e);
result = FALLBACK_MEM_LIMIT_MB;
source = "Exception "+e.getClass().getSimpleName()+" used FALLBACK_MEM_LIMIT_MB";
}
if(estimatedMemSizeMB > result) {//Use estimated if bigger
result = (long)estimatedMemSizeMB;
source = "used Estimated";
private static MemLimitResults getJobRequestedMemoryLimit(SolverDescription solverDescription, double estimatedMemSizeMB, boolean isPowerUser) {
long batchJobMemoryLimit = FALLBACK_MEM_LIMIT_MB;
String detailedMessage = "default memory limit";

if(estimatedMemSizeMB > batchJobMemoryLimit) {//Use estimated if bigger
batchJobMemoryLimit = (long)estimatedMemSizeMB;
detailedMessage = "used Estimated";
}
if (isPowerUser && result < POWER_USER_MEMORY_FLOOR){
result = (long)POWER_USER_MEMORY_FLOOR;
source = "poweruser's memory override";
if (isPowerUser && batchJobMemoryLimit < POWER_USER_MEMORY_FLOOR){
batchJobMemoryLimit = POWER_USER_MEMORY_FLOOR;
detailedMessage = "poweruser's memory override";
}

return new MemLimitResults(result, source);
return new MemLimitResults(batchJobMemoryLimit, detailedMessage);
}
}
public static final boolean bDebugMemLimit = false;
public static MemLimitResults getMemoryLimit(String vcellUserid, KeyValue simID, SolverDescription solverDescription ,double estimatedMemSizeMB, boolean isPowerUser) {
return MemLimitResults.getFallbackMemLimitMB(solverDescription, estimatedMemSizeMB*1.5, isPowerUser);
// boolean bUseEstimate = estimatedMemSizeMB >= MemLimitResults.getFallbackMemLimitMB(solverDescription);
// return new MemLimitResults((bUseEstimate?(long)estimatedMemSizeMB:MemLimitResults.getFallbackMemLimitMB(solverDescription)), (bUseEstimate?"used Estimated":"used FALLBACK_MEM_LIMIT"));
// //One of 5 limits are returned (ordered from highest to lowest priority):
// // MemoryMax:PerSimulation Has PropertyLoader.simPerUserMemoryLimitFile, specific user AND simID MATCHED in file (userid MemLimitMb simID)
// // MemoryMax:PerUser Has PropertyLoader.simPerUserMemoryLimitFile, specific user (but not simID) MATCHED in file (userid MemLimitMb '*')
// // MemoryMax:PerSolver Has PropertyLoader.simPerUserMemoryLimitFile, specific solverDescription (but not simID or user) MATCHED in file (solverName MemLimitMb '*')
// // MemoryMax:SimulationTask.getEstimatedMemorySizeMB() Has PropertyLoader.simPerUserMemoryLimitFile, no user or sim MATCHED in file ('defaultSimMemoryLimitMb' MemLimitMb '*')
// // estimated > MemoryMax:AllUsersMemLimit
// // MemoryMax:AllUsersMemLimit(defaultSimMemoryLimitMb) Has PropertyLoader.simPerUserMemoryLimitFile, no user or sim MATCHED in file ('defaultSimMemoryLimitMb' MemLimitMb '*')
// // estimated < MemoryMax:AllUsersMemLimit
// // MemoryMax:HtcProxy.MemLimitResults.FALLBACK_MEM_LIMIT No PropertyLoader.simPerUserMemoryLimitFile
// // estimated < FALLBACK
//
// Long defaultSimMemoryLimitMbFromFile = null;
// File memLimitFile = null;
// try {
// //${vcellroot}/docker/swarm/serverconfig-uch.sh->VCELL_SIMDATADIR_EXTERNAL=/share/apps/vcell3/users
// //${vcellroot}/docker/swarm/serverconfig-uch.sh-> VCELL_SIMDATADIR_HOST=/opt/vcelldata/users
// //${vcellroot}/docker/swarm/docker-compose.yml-> Volume map "${VCELL_SIMDATADIR_HOST}:/simdata"
// Long perUserMemMax = null;
// Long perSimMemMax = null;
// Long perSolverMax = null;
// String memLimitFileDirVal = System.getProperty(PropertyLoader.primarySimDataDirInternalProperty);
// String memLimitFileVal = System.getProperty(PropertyLoader.simPerUserMemoryLimitFile);
// if(memLimitFileDirVal != null && memLimitFileVal != null) {
// memLimitFile = new File(memLimitFileDirVal,memLimitFileVal);
// }
// if(memLimitFile != null && memLimitFile.exists()) {
// List<String> perUserLimits = Files.readAllLines(Paths.get(memLimitFile.getAbsolutePath()));
// for (Iterator<String> iterator = perUserLimits.iterator(); iterator.hasNext();) {
// String userAndLimit = iterator.next().trim();
// if(userAndLimit.length()==0 || userAndLimit.startsWith("//")) {
// if(bDebugMemLimit){LG.trace("-----skipped '"+userAndLimit+"'");}
// continue;
// }
//// LG.trace("-----"+userAndLimit);
//
// StringTokenizer st = new StringTokenizer(userAndLimit);
// String limitUserid = st.nextToken();
// if(limitUserid.equals(vcellUserid) || (solverDescription != null && limitUserid.equals(solverDescription.name()))) {//check user
// long memLimit = 0;
// try {
// memLimit = Long.parseLong(st.nextToken());
// } catch (Exception e) {
// if(bDebugMemLimit){LG.debug("-----ERROR '"+userAndLimit+"' token memlimit not parsed");}
// //bad line in limit file, continue processing other lines
// //lg.debug(e);
// continue;
// }
// if(solverDescription != null && limitUserid.equals(solverDescription.name())) {
// perSolverMax = memLimit;
// if(bDebugMemLimit){LG.debug("-----"+"MATCH Solver "+userAndLimit);}
// continue;
// }
// //get simid
// String simSpecifier = null;
// try {
// simSpecifier = st.nextToken();
// //check token is '*' or long
// if(!simSpecifier.equals("*") && Long.valueOf(simSpecifier).longValue() < 0 ) {
// throw new Exception(" token 'simSpecifier' expected to be '*' or simID");
// }
// } catch (Exception e) {
// if(bDebugMemLimit){LG.debug("-----ERROR '"+userAndLimit+"' "+e.getClass().getName()+" "+e.getMessage());}
// //bad line in limit file, continue processing other lines
// //lg.debug(e);
// continue;
// }
// // * means all sims for that user, don't set if sim specific limit is already set
// if(simSpecifier.equals("*") && perSimMemMax == null) {
// perUserMemMax = memLimit;// use this unless overriden by specific simid
// if(bDebugMemLimit){LG.debug("-----"+"MATCH USER "+userAndLimit);}
// }
// //Set sim specific limit, set even if * limit has been set
// if(simID != null && simID.toString().equals(simSpecifier)) {
// perSimMemMax = memLimit;// use sim limit
// if(bDebugMemLimit){LG.debug("-----"+"MATCH SIM "+userAndLimit);}
// }
// }else if(limitUserid.equals("defaultSimMemoryLimitMb")) {//Master sim mem limit
// try {
// defaultSimMemoryLimitMbFromFile = Long.parseLong(st.nextToken());
// if(bDebugMemLimit){LG.debug("-----"+"MATCH DEFAULT "+userAndLimit);}
// } catch (Exception e) {
// if(bDebugMemLimit){LG.debug("-----ERROR '"+userAndLimit+"' "+e.getClass().getName()+" "+e.getMessage());}
// //bad line in limit file, continue processing other lines
// //LG.debug(e);
// continue;
// }
// }else {
// if(bDebugMemLimit){LG.debug("-----"+"NO MATCH "+userAndLimit);}
// }
// }
// if(perUserMemMax != null || perSimMemMax != null) {
// long finalMax = (perSimMemMax!=null?perSimMemMax:perUserMemMax);
// if(bDebugMemLimit){LG.debug("Set memory limit for user '"+vcellUserid+"' to "+finalMax + (perSimMemMax!=null?" for simID="+simID:""));}
// return new MemLimitResults(finalMax,
// (perSimMemMax!=null?
// "MemoryMax(FILE PerSimulation):"+simID+",User='"+vcellUserid+"' from "+memLimitFile.getAbsolutePath():
// "MemoryMax(FILE PerUser):'"+vcellUserid+"' from "+memLimitFile.getAbsolutePath()));
// }else if(perSolverMax != null) {
// if(perSolverMax == 0) {//Use estimated size always if solver had 0 for memory limit
// return new MemLimitResults(
// Math.max((long)Math.ceil(estimatedMemSizeMB*1.5),
// (defaultSimMemoryLimitMbFromFile!=null?defaultSimMemoryLimitMbFromFile:MemLimitResults.FALLBACK_MEM_LIMIT_MB)),
// "MemoryMax(FILE PerSolver ESTIMATED):'"+solverDescription.name()+"' from "+memLimitFile.getAbsolutePath());
// }else {
// return new MemLimitResults(perSolverMax, "MemoryMax(FILE PerSolver):'"+solverDescription.name()+"' from "+memLimitFile.getAbsolutePath());
// }
// }
// }else {
// if(bDebugMemLimit){LG.debug("-----MemLimitFile "+(memLimitFile==null?"not defined":memLimitFile.getAbsolutePath()+" not exist"));}
// }
// } catch (Exception e) {
// //ignore, try defaults
// LG.error(e);
// }
//// long estimatedMemSizeMBL = (long)Math.ceil(estimatedMemSizeMB*1.5);
// boolean bHasMemLimitFile = defaultSimMemoryLimitMbFromFile!=null;
// long maxAllowedMem = (bHasMemLimitFile?defaultSimMemoryLimitMbFromFile:MemLimitResults.FALLBACK_MEM_LIMIT_MB);
//// boolean bUseEstimated = (estimatedMemSizeMBL <= maxAllowedMem);
//// return new MemLimitResults(maxAllowedMem,
//// (bUseEstimated?
//// "MemoryMax(ESTIMATED):SimulationTask.getEstimatedMemorySizeMB()="+estimatedMemSizeMBL:
//// (bHasMemLimitFile?
//// "MemoryMax(FILE AllUsers):AllUsersMemLimit(defaultSimMemoryLimitMb) from "+memLimitFile.getAbsolutePath():
//// "MemoryMax(HARDCODE):HtcProxy.MemLimitResults.FALLBACK_MEM_LIMIT_MB")));
// return new MemLimitResults(maxAllowedMem,
// (bHasMemLimitFile?
// "MemoryMax(FILE AllUsers):AllUsersMemLimit(defaultSimMemoryLimitMb) from "+memLimitFile.getAbsolutePath():
// "MemoryMax(HARDCODE):HtcProxy.MemLimitResults.FALLBACK_MEM_LIMIT_MB"));
return MemLimitResults.getJobRequestedMemoryLimit(solverDescription, estimatedMemSizeMB*1.5, isPowerUser);
}

// public static boolean isStochMultiTrial(SimulationTask simTask) {
// return simTask.getSimulationJob().getSimulation().getSolverTaskDescription().getSolverDescription() == SolverDescription.StochGibson &&
// simTask.getSimulationJob().getSimulation().getSolverTaskDescription().getStochOpt() != null &&
// !simTask.getSimulationJob().getSimulation().getSolverTaskDescription().getStochOpt().isHistogram() &&
// simTask.getSimulationJob().getSimulation().getSolverTaskDescription().getStochOpt().getNumOfTrials() > 1;
//
// }
}



































Loading

0 comments on commit 1c88e93

Please sign in to comment.