Skip to content

Commit 4ce043a

Browse files
bors[bot] and jiahe23 authored
Merge #891
891: Revert "make GC deterministic in distributed" r=szy21 a=szy21. This reverts #821, which may be causing failures in MPI jobs. Co-authored-by: jiahe23 <[email protected]>
2 parents 9066111 + 50497db commit 4ce043a

File tree

3 files changed

+1
-28
lines changed

3 files changed

+1
-28
lines changed

.buildkite/scaling/pipeline.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ FT="Float32"
88
resolutions=("low" "mid" "high")
99
max_procs_per_node=16 # limit this artificially for profiling
1010
profiling=enable
11-
exclusive=true
11+
exclusive=false
1212
mpi_impl="openmpi"
1313

1414
# set up environment and agents

examples/hybrid/callbacks.jl

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -56,18 +56,10 @@ function get_callbacks(parsed_args, simulation, model_spec, params)
5656
else
5757
call_every_dt(save_restart_func, dt_save_restart)
5858
end
59-
60-
gc_callback = if simulation.is_distributed
61-
call_every_n_steps(gc_func, 1000)
62-
else
63-
nothing
64-
end
65-
6659
return ODE.CallbackSet(
6760
dss_cb,
6861
save_to_disk_callback,
6962
save_restart_callback,
70-
gc_callback,
7163
additional_callbacks...,
7264
)
7365
end
@@ -403,19 +395,3 @@ function save_restart_func(integrator)
403395
Base.close(hdfwriter)
404396
return nothing
405397
end
406-
407-
function gc_func(integrator)
408-
free_mem = Sys.free_memory()
409-
total_mem = Sys.total_memory()
410-
p_free_mem = free_mem / total_mem
411-
min_p_free_mem =
412-
ClimaCommsMPI.MPI.Allreduce(p_free_mem, min, comms_ctx.mpicomm)
413-
do_gc = min_p_free_mem < 0.2
414-
@debug "GC check" "free mem (MB)" = free_mem / 2^20 "total mem (MB)" =
415-
total_mem / 2^20 "Minimum free memory (%)" = min_p_free_mem * 100 "Calling GC" =
416-
do_gc
417-
if do_gc
418-
GC.gc()
419-
end
420-
return nothing
421-
end

examples/hybrid/driver.jl

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -245,8 +245,6 @@ end
245245
@info "Running job:`$(simulation.job_id)`"
246246
if simulation.is_distributed
247247
OrdinaryDiffEq.step!(integrator)
248-
GC.enable(false)
249-
GC.gc()
250248
ClimaComms.barrier(comms_ctx)
251249
if ClimaComms.iamroot(comms_ctx)
252250
@timev begin
@@ -256,7 +254,6 @@ if simulation.is_distributed
256254
walltime = @elapsed sol = OrdinaryDiffEq.solve!(integrator)
257255
end
258256
ClimaComms.barrier(comms_ctx)
259-
GC.enable(true)
260257
else
261258
sol = @timev OrdinaryDiffEq.solve!(integrator)
262259
end

0 commit comments

Comments
 (0)