Merge pull request #267 from Pyomo/PR265

bknueven · web-flow · commit 0c272122fa24 · 2022-09-02T14:28:58.000-06:00
Pr265
diff --git a/.github/workflows/pull_push_regression.yml b/.github/workflows/pull_push_regression.yml
@@ -25,7 +25,7 @@ jobs:
       - name: Install dependencies
         run: |
           conda install mpi4py pandas setuptools
-          pip install pyomo sphinx sphinx_rtd_theme cplex
+          pip install pyomo sphinx sphinx_rtd_theme dill gridx-egret cplex
           pip install xpress
 
       - name: setup the program
@@ -41,6 +41,11 @@ jobs:
           cd examples
           python afew.py xpress_persistent 
 
+      - name: Test run_all nouc 
+        run: |
+          cd examples
+          python run_all.py xpress_persistent "" nouc
+
       - name: Test docs
         run: |
           cd ./doc/src/
diff --git a/doc/src/spokes.rst b/doc/src/spokes.rst
@@ -121,25 +121,3 @@ cross scenario
 ^^^^^^^^^^^^^^
 
 Passes cross scenario cuts.
-
-
-spoke_sleep_time
-----------------
-
-This is an advanced topic and rarely encountered.
-In some settings, particularly with small sub-problems, it is possible for
-ranks within spokes to become of of sync.  The most common manifestation of this
-is that some ranks do not see the kill signal and sit in a busy-wait I/O loop
-until something external kills them; but it can also be the case that Lagrangian
-bound spokes start operating on data from different hub iterations; they should notice
-this an emit a message if it happens.
-
-This problem is normally avoided by default actions in lower level code (in `spcommunicator.py`)
-that insert a short sleep. To compute the sleep duration, it uses a heuristic based on the
-number of non-anticipative variables. It is also possible to explicitly set this sleep time.
-At the lowest levels, this is done by setting a value for "spoke_sleep_time" in the options
-dictionary passed to the ``SPCommunicator`` constructor. At a higher level, it is possible
-to pass a `spoke_sleep_time` keyword argument to the vanilla hub and spoke constructors. This
-is illustrated in `hydro_cylinders.py` example (in the `hyrdo` example directory). You
-should probably pass the same value to all constructors. The importance of setting
-the spoke sleep time is going down as we improve the code for setting this value automatically.
diff --git a/examples/farmer/farmer_mmw.bash b/examples/farmer/farmer_mmw.bash
@@ -10,4 +10,4 @@ mpiexec -np 3 python -m mpi4py farmer_cylinders.py  --num-scens 3 --lagrangian -
 
 echo "starting mmw"
 
-python -m mpisppy.confidence_intervals.mmw_conf farmer --xhatpath farmer_cyl_nonants.npy --solver-name ${SOLVERNAME} --MMW-num-batches 5 --MMW-batch-size 10 --confidence-level 0.9 --start-scen 10
+python -m mpisppy.confidence_intervals.mmw_conf farmer --xhatpath farmer_cyl_nonants.npy --EF-solver-name ${SOLVERNAME} --MMW-num-batches 5 --MMW-batch-size 10 --confidence-level 0.9 --start-scen 10
diff --git a/examples/hydro/hydro_cylinders.py b/examples/hydro/hydro_cylinders.py
@@ -13,14 +13,6 @@
 
 import mpisppy.cylinders as cylinders
 
-# For this problem, the subproblems are
-# small and take no time to solve. The
-# default SPOKE_SLEEP_TIME of 0.01 *causes*
-# synchronization issues in this case, so
-# we reduce it so as not to dominate the
-# time spent for cylinder synchronization
-SPOKE_SLEEP_TIME = 0.0001
-
 write_solution = True
 
 def _parse_args():
@@ -74,15 +66,15 @@ def main():
                               ph_extensions=None,
                               rho_setter = rho_setter,
                               all_nodenames = all_nodenames,
-                              spoke_sleep_time = SPOKE_SLEEP_TIME)
+                             )
 
     # Standard Lagrangian bound spoke
     if lagrangian:
         lagrangian_spoke = vanilla.lagrangian_spoke(*beans,
                                                     scenario_creator_kwargs=scenario_creator_kwargs,
                                                     rho_setter = rho_setter,
                                                     all_nodenames = all_nodenames,
-                                                    spoke_sleep_time = SPOKE_SLEEP_TIME)
+                                                   )
 
 
     # xhat looper bound spoke
@@ -91,7 +83,7 @@ def main():
         xhatshuffle_spoke = vanilla.xhatshuffle_spoke(*beans,
                                                       all_nodenames=all_nodenames,
                                                       scenario_creator_kwargs=scenario_creator_kwargs,
-                                                      spoke_sleep_time = SPOKE_SLEEP_TIME)
+                                                     )
 
     list_of_spoke_dict = list()
     if lagrangian:
diff --git a/examples/run_all.py b/examples/run_all.py
@@ -139,8 +139,8 @@ def do_one_mmw(dirname, runefstring, npyfile, mmwargstring):
             badguys[dirname].append(runefstring)
     # run mmw, remove .npy file
     else:
-        runstring = "python -m mpisppy.confidence_intervals.mmw_conf {} --xhatpath {} --solver-name {} {}".\
-                    format(dirname, npyfile, solver_name, mmwargstring)
+        runstring = "python -m mpisppy.confidence_intervals.mmw_conf {} --xhatpath {} {}".\
+                    format(dirname, npyfile, mmwargstring)
         code = os.system("echo {} && {}".format(runstring, runstring))
         if code != 0:
             if dirname not in badguys:
@@ -151,7 +151,6 @@ def do_one_mmw(dirname, runefstring, npyfile, mmwargstring):
         os.remove(npyfile)
     os.chdir("..")
 
-
 do_one("farmer", "farmer_ef.py", 1,
        "1 3 {}".format(solver_name))
 # for farmer_cylinders, the first arg is num_scens and is required
@@ -261,6 +260,7 @@ def do_one_mmw(dirname, runefstring, npyfile, mmwargstring):
        "--instance-name=sslp_15_45_10 --bundles-per-rank=2 "
        "--max-iterations=5 --default-rho=1 "
        "--lagrangian --xhatshuffle --fwph "
+       "--linearize-proximal-terms "
        "--solver-name={} --fwph-stop-check-tol 0.01".format(solver_name))
 
 do_one("hydro", "hydro_cylinders.py", 3,
@@ -306,7 +306,7 @@ def do_one_mmw(dirname, runefstring, npyfile, mmwargstring):
 
 #=========MMW TESTS==========
 # do_one_mmw is special
-do_one_mmw("farmer", f"python farmer_ef.py 3 3 {solver_name}", "farmer_cyl_nonants.npy", "--MMW-num-batches=5 --confidence-level 0.95 --MMW-batch-size=10 --objective-gap --start-scen 4 --EF-solver-name={solver_name}")
+do_one_mmw("farmer", f"python farmer_ef.py 3 3 {solver_name}", "farmer_cyl_nonants.npy", f"--MMW-num-batches=5 --confidence-level 0.95 --MMW-batch-size=10 --objective-gap --start-scen 4 --EF-solver-name={solver_name}")
 
 
 #============================
@@ -386,5 +386,6 @@ def do_one_mmw(dirname, runefstring, npyfile, mmwargstring):
         print("Directory={}".format(i))
         for c in v:
             print("    {}".format(c))
+    sys.exit(1)
 else:
     print("\nAll OK.")
diff --git a/examples/uc/uc_ama.py b/examples/uc/uc_ama.py
@@ -11,21 +11,23 @@
 import mpisppy.utils.amalgamator as amalgamator
 from uc_funcs import id_fix_list_fct
 from mpisppy.utils import config
+import pyomo.common.config as pyofig
 
 def main():
     solution_files = {"first_stage_solution":"uc_first_stage.csv",
                       #"tree_solution":"uc_ama_full_solution" 
                       #It takes too long to write the full solution
                       }
-    config.add_and_assign("id_fix_list_fct", "fct used by fixer extension", 
+    cfg = config.Config()
+    cfg.add_and_assign("id_fix_list_fct", "fct used by fixer extension", 
                                         domain=None, default=None,
                                         value = id_fix_list_fct)
-    ama_options = {"2stage": True,   # 2stage vs. mstage
-                   "cylinders": ['ph','xhatshuffle','lagranger'],
-                   "extensions": ['fixer'],
-                   "write_solution": solution_files
-                   }
-    ama = amalgamator.from_module("uc_funcs", ama_options)
+    cfg.add_and_assign("2stage", description="2stage vsus mstage", domain=bool, default=None, value=True)
+    cfg.add_and_assign("cylinders", description="list of cylinders", domain=pyofig.ListOf(str), default=None, value=['ph','xhatshuffle','lagranger'])
+    cfg.add_and_assign("extensions", description="list of extensions", domain=pyofig.ListOf(str), default=None, value= ['fixer'])
+    cfg.add_and_assign("write_solution", description="list of extensions", domain=None, default=None, value=solution_files)
+
+    ama = amalgamator.from_module("uc_funcs", cfg)
     ama.run()
     if ama.on_hub:
         print("first_stage_solution=", ama.first_stage_solution)
diff --git a/examples/uc/uc_funcs.py b/examples/uc/uc_funcs.py
@@ -276,10 +276,10 @@ def scenario_names_creator(scnt,start=0):
     return [F"Scenario{i+1}" for i in range(start,scnt+start)]
 
 #=========
-def inparser_adder():
+def inparser_adder(cfg):
     # (only for Amalgamator): add command options unique to uc
-    config.num_scens_required()
-    config.add_to_config("UC_count_for_path",
+    cfg.num_scens_required()
+    cfg.add_to_config("UC_count_for_path",
                          description="Mainly for confidence intervals to give a prefix for the directory providing the scenario data but will be overridden if scen_count is greater (default 0)",
                           domain=int,
                           default=0)
diff --git a/mpisppy/confidence_intervals/multi_seqsampling.py b/mpisppy/confidence_intervals/multi_seqsampling.py
@@ -79,7 +79,7 @@ def run(self, maxit=200):
         xhat_scenario_names = refmodel.scenario_names_creator(mk)
 
         xgo = self.xhat_gen_kwargs.copy()
-        xgo["solvername"] = self.cfg.solvername
+        xgo["solvername"] = self.solvername
         xgo.pop("solver_options", None)  # it will be given explicitly
         xgo.pop("scenario_names", None)  # it will be given explicitly
         xgo["branching_factors"] = xhat_branching_factors
diff --git a/mpisppy/cylinders/cross_scen_spoke.py b/mpisppy/cylinders/cross_scen_spoke.py
@@ -40,8 +40,7 @@ def _got_kill_signal(self):
         ''' returns True if a kill signal was received,
             and refreshes the array and _locals'''
         self._new_locals = self.spoke_from_hub(self._locals)
-        kill = (self._locals[-1] == -1)
-        return kill
+        return self.remote_write_id == -1 
 
     def prep_cs_cuts(self):
         # create a map scenario -> index, this index is used for various lists containing scenario dependent info.
diff --git a/mpisppy/cylinders/hub.py b/mpisppy/cylinders/hub.py
@@ -346,6 +346,8 @@ def hub_to_spoke(self, values, spoke_strata_rank):
                 f"Attempting to put array of length {len(values)} "
                 f"into local buffer of length {expected_length}"
             )
+        # this is so the spoke ranks all get the same write_id at approximately the same time
+        self.cylinder_comm.Barrier()
         self.local_write_ids[spoke_strata_rank - 1] += 1
         values[-1] = self.local_write_ids[spoke_strata_rank - 1]
         window = self.windows[spoke_strata_rank - 1]
@@ -366,13 +368,26 @@ def hub_from_spoke(self, values, spoke_num):
                 f"Hub trying to get buffer of length {expected_length} "
                 f"from spoke, but provided buffer has length {len(values)}."
             )
+        # so the window in each rank gets read at approximately the same time,
+        # and so has the same write_id
+        self.cylinder_comm.Barrier()
         window = self.windows[spoke_num - 1]
         window.Lock(spoke_num)
         window.Get((values, len(values), MPI.DOUBLE), spoke_num)
         window.Unlock(spoke_num)
 
-        if values[-1] > self.remote_write_ids[spoke_num - 1]:
-            self.remote_write_ids[spoke_num - 1] = values[-1]
+        new_id = int(values[-1])
+        local_val = np.array((new_id,), 'i')
+        sum_ids = np.zeros(1, 'i')
+        self.cylinder_comm.Allreduce((local_val, MPI.INT),
+                                     (sum_ids, MPI.INT),
+                                     op=MPI.SUM)
+
+        if new_id != sum_ids[0] / self.cylinder_comm.size:
+            return False
+
+        if (new_id > self.remote_write_ids[spoke_num - 1]) or (new_id < 0):
+            self.remote_write_ids[spoke_num - 1] = new_id
             return True
         return False
 
diff --git a/mpisppy/cylinders/lagrangian_bounder.py b/mpisppy/cylinders/lagrangian_bounder.py
@@ -49,9 +49,10 @@ def lagrangian(self):
         if total == serial_number_sum:
             return bound
         elif self.cylinder_rank == 0:
-            print("WARNING: Lagrangian spokes out of snyc, consider changing the spoke_sleep_time option; "
-                  "see the documentation for more information. "
-                  f"(The current value is {self.spoke_sleep_time})")
+            # TODO: this whole check can probably be removed as it's done
+            #       within `got_kill_signal`. Leaving it for now as an
+            #       additional check.
+            raise RuntimeError("Lagrangian spokes unexpectly out of snyc")
         return None
 
     def _set_weights_and_solve(self):
diff --git a/mpisppy/cylinders/spcommunicator.py b/mpisppy/cylinders/spcommunicator.py
@@ -22,9 +22,6 @@ class SPCommunicator:
     """ Notes: TODO
     """
 
-    # magic constant for spoke_sleep_time calculation below
-    _SLEEP_TIME_MUTLIPLIER = 1e-5
-
     def __init__(self, spbase_object, fullcomm, strata_comm, cylinder_comm, options=None):
         # flag for if the windows have been constructed
         self._windows_constructed = False
@@ -42,11 +39,6 @@ def __init__(self, spbase_object, fullcomm, strata_comm, cylinder_comm, options=
         else:
             self.options = options
 
-        self.spoke_sleep_time = self.options.get('spoke_sleep_time')
-        # the user could set None
-        if self.spoke_sleep_time is None:
-                self.spoke_sleep_time = self._SLEEP_TIME_MUTLIPLIER * spbase_object.nonant_length
-
         # attach the SPCommunicator to
         # the SPBase object
         self.opt.spcomm = self
@@ -77,13 +69,8 @@ def finalize(self):
     def hub_finalize(self):
         """ Every hub may have another finalize function,
             which collects any results from finalize
-
-            Spokes use the implementation below, which just
-            puts a small sleep in so windows are not freed
-            too soon.
         """
-        ## give the hub the chance to catch new values
-        time.sleep(self.spoke_sleep_time)
+        pass
 
     def allreduce_or(self, val):
         local_val = np.array([val], dtype='int8')
diff --git a/mpisppy/cylinders/spoke.py b/mpisppy/cylinders/spoke.py
@@ -73,6 +73,7 @@ def spoke_to_hub(self, values):
                 f"Attempting to put array of length {len(values)} "
                 f"into local buffer of length {expected_length}"
             )
+        self.cylinder_comm.Barrier()
         self.local_write_id += 1
         values[-1] = self.local_write_id
         window = self.windows[self.strata_rank - 1]
@@ -89,27 +90,34 @@ def spoke_from_hub(self, values):
                 f"Spoke trying to get buffer of length {expected_length} "
                 f"from hub, but provided buffer has length {len(values)}."
             )
+        self.cylinder_comm.Barrier()
         window = self.windows[self.strata_rank - 1]
         window.Lock(0)
         window.Get((values, len(values), MPI.DOUBLE), 0)
         window.Unlock(0)
 
-        if values[-1] > self.remote_write_id:
-            self.remote_write_id = values[-1]
+        new_id = int(values[-1])
+        local_val = np.array((new_id,), 'i')
+        sum_ids = np.zeros(1, 'i')
+        self.cylinder_comm.Allreduce((local_val, MPI.INT),
+                                     (sum_ids, MPI.INT),
+                                     op=MPI.SUM)
+
+        # NOTE: we only proceed if all the ranks agree
+        #       on the ID
+        if new_id != sum_ids[0] / self.cylinder_comm.size:
+            return False
+
+        if (new_id > self.remote_write_id) or (new_id < 0):
+            self.remote_write_id = new_id
             return True
         return False
 
     def got_kill_signal(self):
         """ Spoke should call this method at least every iteration
             to see if the Hub terminated
         """
-        # Spokes can sometimes call this frequently in a tight loop,
-        # causing the Allreduces to become out of sync
-        diff = time.time() - self.last_call_to_got_kill_signal
-        if diff < self.spoke_sleep_time:
-            time.sleep(self.spoke_sleep_time - diff)
-        self.last_call_to_got_kill_signal = time.time()
-        return self._got_kill_signal()
+        return self._got_kill_signal() 
 
     @abc.abstractmethod
     def main(self):
@@ -121,9 +129,8 @@ def main(self):
         """
         pass
 
-    @abc.abstractmethod
     def get_serial_number(self):
-        pass
+        return self.remote_write_id
 
     @abc.abstractmethod
     def _got_kill_signal(self):
@@ -173,14 +180,10 @@ def bound(self, value):
         self._bound[0] = value
         self.spoke_to_hub(self._bound)
 
-    def get_serial_number(self):
-        return int(self._kill_sig[-1])
-
     def _got_kill_signal(self):
         """Looks for the kill signal and returns True if sent"""
         self.spoke_from_hub(self._kill_sig)
-        kill = self._kill_sig[-1] == -1
-        return kill
+        return self.remote_write_id == -1
 
     def _append_trace(self, value):
         if self.cylinder_rank != 0 or self.trace_filen is None:
@@ -217,15 +220,11 @@ def make_windows(self):
         self._bound = np.zeros(1 + 1)
         self._new_locals = False
 
-    def get_serial_number(self):
-        return int(self._locals[-1])
-
     def _got_kill_signal(self):
         """ returns True if a kill signal was received, 
             and refreshes the array and _locals"""
         self._new_locals = self.spoke_from_hub(self._locals)
-        kill = self._locals[-1] == -1
-        return kill
+        return self.remote_write_id == -1
 
 
 class InnerBoundSpoke(_BoundSpoke):
diff --git a/mpisppy/cylinders/xhatshufflelooper_bounder.py b/mpisppy/cylinders/xhatshufflelooper_bounder.py
@@ -142,7 +142,6 @@ def _vb(msg):
         while not self.got_kill_signal():
             # When there is no iter0, the serial number must be checked.
             # (unrelated: uncomment the next line to see the source of delay getting an xhat)
-            # print(f"in loop {self.get_serial_number() =}, {self.spoke_sleep_time =}")
             if self.get_serial_number() == 0:
                 continue
 
diff --git a/mpisppy/spin_the_wheel.py b/mpisppy/spin_the_wheel.py
diff --git a/mpisppy/utils/amalgamator.py b/mpisppy/utils/amalgamator.py
diff --git a/mpisppy/utils/cfg_vanilla.py b/mpisppy/utils/cfg_vanilla.py
diff --git a/mpisppy/utils/config.py b/mpisppy/utils/config.py
diff --git a/mpisppy/utils/vanilla.py b/mpisppy/utils/vanilla.py

Original file line number	Diff line number	Diff line change
`@@ -10,4 +10,4 @@ mpiexec -np 3 python -m mpi4py farmer_cylinders.py --num-scens 3 --lagrangian -`
`10`	`10`
`11`	`11`	`echo "starting mmw"`
`12`	`12`
`13`		`-python -m mpisppy.confidence_intervals.mmw_conf farmer --xhatpath farmer_cyl_nonants.npy --solver-name ${SOLVERNAME} --MMW-num-batches 5 --MMW-batch-size 10 --confidence-level 0.9 --start-scen 10`
	`13`	`+python -m mpisppy.confidence_intervals.mmw_conf farmer --xhatpath farmer_cyl_nonants.npy --EF-solver-name ${SOLVERNAME} --MMW-num-batches 5 --MMW-batch-size 10 --confidence-level 0.9 --start-scen 10`