Changes to add unit test and Internal Testing #114

Merged · 56 commits · Jul 30, 2024
Commits
413fc25
changes to add unit test
hariharan-devarajan Feb 29, 2024
caf44f8
changes to support DLIO with DYAD
hariharan-devarajan Mar 4, 2024
21a39f8
Adds HDF5 API to pydyad
ilumsden Mar 4, 2024
2fdc2ce
Changes pydyad.hdf.File to pydyad.hdf.DyadFile to avoid name conflicts
ilumsden Mar 4, 2024
dc796ec
changes to support DLIO with DYAD
hariharan-devarajan Mar 5, 2024
549d49f
Merge remote-tracking branch 'origin/feature/unittests' into feature/…
hariharan-devarajan Mar 5, 2024
f30904c
changes to support DLIO with MuMMI and DYAD
hariharan-devarajan Mar 5, 2024
9c6bb66
changes to support DLIO with MuMMI and DYAD
hariharan-devarajan Mar 6, 2024
e5f5119
Does initial changes for DataSpaces baseline plus reworks the data pl…
ilumsden Mar 18, 2024
6e588c0
Updates DataSpaces data plane tests to account for better understandi…
ilumsden Mar 19, 2024
780f95e
Updates CMake to build DataSpaces test
ilumsden Mar 19, 2024
0b4e978
Update unit_test.cpp
hariharan-devarajan Mar 19, 2024
9483d21
add test cases for reproducability
hariharan-devarajan Mar 20, 2024
b309e3d
fix compile issue
hariharan-devarajan Mar 20, 2024
1d5cccb
Adds freopen to DataSpaces tests so that we can get the internal brea…
ilumsden Mar 20, 2024
9745f7f
fixed tests
hariharan-devarajan Mar 20, 2024
214bd7c
Merge branch 'feature/unittests' of github.com:flux-framework/dyad in…
hariharan-devarajan Mar 20, 2024
c8117ef
add test cases for reproducability
hariharan-devarajan Mar 20, 2024
b7fba32
add test cases for reproducability
hariharan-devarajan Mar 20, 2024
2188a0a
fix merge
hariharan-devarajan Mar 20, 2024
c02892a
add test cases for reproducability
hariharan-devarajan Mar 20, 2024
922504b
Merge remote-tracking branch 'origin/feature/unittests' into feature/…
hariharan-devarajan Mar 20, 2024
3e092bd
add test cases for reproducability
hariharan-devarajan Mar 20, 2024
0038eaf
add test cases for reproducability
hariharan-devarajan Mar 20, 2024
5c2e17f
fine tune parameters for reproducability
hariharan-devarajan Mar 20, 2024
77bf89b
add test cases for reproducability
hariharan-devarajan Mar 20, 2024
0035170
fixed tests
hariharan-devarajan Mar 20, 2024
c59ff79
add test cases for reproducability
hariharan-devarajan Mar 20, 2024
339797d
Merge remote-tracking branch 'origin/feature/unittests' into feature/…
hariharan-devarajan Mar 20, 2024
29a8ba1
add test cases for reproducability
hariharan-devarajan Mar 20, 2024
212b56a
fixed mdm
hariharan-devarajan Mar 20, 2024
12ae958
Changes to make Catch MPI friendly
hariharan-devarajan Mar 21, 2024
e8b78e5
Updates DataSpaces test
ilumsden Mar 21, 2024
0508d24
Adds scripts for running the DataSpaces tests for the SC paper
ilumsden Mar 21, 2024
4628e8b
Added scripts for debugging mpi.
hariharan-devarajan Mar 23, 2024
e17b467
Merge branch 'feature/unittests' of github.com:flux-framework/dyad in…
hariharan-devarajan Mar 23, 2024
fb7bf62
add compound test
hariharan-devarajan Mar 23, 2024
362464a
fixed tests
hariharan-devarajan Mar 24, 2024
12352d2
fixed printing
hariharan-devarajan Mar 24, 2024
aae77fd
fixed writing to log
hariharan-devarajan Mar 24, 2024
9a485ff
Fix Clang Formatting
hariharan-devarajan Mar 25, 2024
77bc3eb
Current state of DataSpaces test
ilumsden Mar 25, 2024
885209b
Adds data preloader and PyTorch data loader for DataSpaces
ilumsden Mar 27, 2024
7edc5a6
Updates the setup-env and run scripts for DLIO and DataSpaces
ilumsden Mar 27, 2024
5faf27c
Adds a utility Python application to make the Python bindings of Data…
ilumsden Mar 27, 2024
6d9a3e5
Adds cleanup of dataspaces.conf and conf.ds
ilumsden Mar 27, 2024
93d6516
Adds a script to run DLIO w/ DataSpaces through flux batch
ilumsden Mar 27, 2024
7524c08
Bug fix through Unit Test
hariharan-devarajan Mar 28, 2024
2f163ce
added unit tests for dyad_core gen_path_key
hariharan-devarajan Mar 28, 2024
3c9d69a
Merge branch 'feature/unittests' of github.com:flux-framework/dyad in…
hariharan-devarajan Mar 28, 2024
1dae96e
Updates scripts for DataSpaces testing
ilumsden Apr 3, 2024
4cef77b
Final version of DataSpaces baseline code
ilumsden Apr 3, 2024
91ff208
Final version of DataSpaces stuff for DLIO
ilumsden Apr 3, 2024
fdf571e
Merge branch 'feature/unittests' of github.com:flux-framework/dyad in…
ilumsden Apr 3, 2024
3780bed
remove error file.
hariharan-devarajan Jul 30, 2024
f83e792
Merge branch 'main' into feature/unittests
hariharan-devarajan Jul 30, 2024
3 changes: 2 additions & 1 deletion .clang-format
@@ -22,4 +22,5 @@ SpaceBeforeParens: Always
TabWidth: '4'
UseTab: Never
AlwaysBreakAfterReturnType: None
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterDefinitionReturnType: None
ContinuationIndentWidth: Always
17 changes: 14 additions & 3 deletions .gitignore
@@ -97,9 +97,10 @@ compile_flags.txt

# local editor config dirs
.vscode
.idea
.idea*
.clangd

.cache
cmake-build-debug
# ignore installable version of dyadrun
dyadrun
flux_barrier
@@ -112,7 +113,17 @@ flux_barrier

# Python stuff
**/__pycache__/
**/build
**/build*
**/*.egg-info
/install/
/dyad-env/
env
hydra_log
docs/demos/ecp_feb_2023/c_cons
docs/demos/ecp_feb_2023/cpp_cons
docs/demos/ecp_feb_2023/c_prod
docs/demos/ecp_feb_2023/cpp_prod
tests/integration/dlio_benchmark/logs
scripts/checkpoints
scripts/logs
tests/integration/dlio_benchmark/perf_analysis/.ipynb_checkpoints
12 changes: 10 additions & 2 deletions CMakeLists.txt
@@ -13,12 +13,12 @@ set(DYAD_PACKAGE_VERSION_MAJOR "${DYAD_VERSION_MAJOR}.${DYAD_VERSION_MINOR}")
set(DYAD_PACKAGE_VERSION_MINOR "${DYAD_VERSION_PATCH}")
set(DYAD_PACKAGE_STRING "${DYAD_PACKAGE_NAME} ${DYAD_PACKAGE_VERSION}")
set(DYAD_PACKAGE_TARNAME "${DYAD_PACKAGE}")

project(dyad LANGUAGES C CXX)

# Convenience defines
string(TOUPPER "${PROJECT_NAME}" UPPER_PROJECT_NAME)
string(TOLOWER "${PROJECT_NAME}" LOWER_PROJECT_NAME)
set(DYAD_PROJECT_DIR ${CMAKE_CURRENT_SOURCE_DIR})

#------------------------------------------------------------------------------
# Internal Includes for header and libraries
@@ -110,7 +110,7 @@ set(DYAD_LOGGER "NONE" CACHE STRING "Logger to use for DYAD")
set_property(CACHE DYAD_LOGGER PROPERTY STRINGS FLUX CPP_LOGGER NONE)
set(DYAD_LOGGER_LEVEL "NONE" CACHE STRING "Logging level to use for DYAD")
set_property(CACHE DYAD_LOGGER_LEVEL PROPERTY STRINGS DEBUG INFO WARN ERROR NONE)

option(DYAD_ENABLE_TESTS "Enable dyad tests" OFF)

#------------------------------------------------------------------------------
# Compiler setup
@@ -459,6 +459,8 @@ string(APPEND _str
" DYAD_ENABLE_UCX_DATA: ${DYAD_ENABLE_UCX_DATA}\n")
string(APPEND _str
" DYAD_ENABLE_UCX_DATA_RMA: ${DYAD_ENABLE_UCX_DATA_RMA}\n")
string(APPEND _str
" DYAD_ENABLE_TESTS: ${DYAD_ENABLE_TESTS}\n")
string(APPEND _str
" DYAD_PROFILER: ${DYAD_PROFILER}\n")
string(APPEND _str
@@ -513,3 +515,9 @@ install(FILES "${CMAKE_BINARY_DIR}/dyad_module.lua.install"
RENAME "${DYAD_MODULEFILE_NAME}"
DESTINATION
"${DYAD_INSTALL_SYSCONFDIR}")


if (DYAD_ENABLE_TESTS)
enable_testing()
add_subdirectory(tests)
endif ()
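
The new DYAD_ENABLE_TESTS option wires the tests/ subdirectory into CTest. A minimal configure-and-run sketch, assuming an out-of-source build and CMake 3.20+ for ctest --test-dir (the build directory name is illustrative):

# Configure with the test suite enabled; DYAD_LOGGER and DYAD_LOGGER_LEVEL
# keep their NONE defaults here.
cmake -B build -S . -DDYAD_ENABLE_TESTS=ON
cmake --build build -j
# enable_testing() plus add_subdirectory(tests) registers the suite with CTest.
ctest --test-dir build --output-on-failure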
11 changes: 7 additions & 4 deletions pydyad/pydyad/bindings.py
@@ -82,9 +82,12 @@ def __del__(self):
self.dyad_bindings_obj = None


class DTLMode(enum.IntEnum):
DYAD_DTL_UCX = 0
DYAD_DTL_FLUX_RPC = 1
class DTLMode(enum.Enum):
DYAD_DTL_UCX = "UCX"
DYAD_DTL_FLUX_RPC = "FLUX_RPC"

def __str__(self):
return self.value

class DTLCommMode(enum.IntEnum):
DYAD_COMM_NONE = 0
@@ -252,7 +255,7 @@ def init(
prod_managed_path.encode() if prod_managed_path is not None else None,
cons_managed_path.encode() if cons_managed_path is not None else None,
ctypes.c_bool(relative_to_managed_path),
dtl_mode.encode() if dtl_mode is not None else None,
str(dtl_mode).encode() if dtl_mode is not None else None,
ctypes.c_int(dtl_comm_mode),
ctypes.c_void_p(flux_handle)
)
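
With the switch from IntEnum to Enum, the DTL mode now carries the string the underlying C API expects, and init() stringifies it before encoding. A minimal sketch of the caller-visible change (the values come straight from the diff above):

from pydyad.bindings import DTLMode

# Members now stringify to the names the C layer parses:
assert str(DTLMode.DYAD_DTL_UCX) == "UCX"
assert str(DTLMode.DYAD_DTL_FLUX_RPC) == "FLUX_RPC"

# init() now sends str(dtl_mode).encode() across ctypes, i.e. b"UCX",
# rather than the old integer value:
mode_arg = str(DTLMode.DYAD_DTL_UCX).encode()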
37 changes: 37 additions & 0 deletions pydyad/pydyad/hdf.py
@@ -0,0 +1,37 @@
from pydyad.bindings import Dyad

from pathlib import Path

import h5py


class DyadFile(h5py.File):

def __init__(self, fname, mode, file=None, dyad_ctx=None, metadata_wrapper=None):
# According to H5PY, the first positional argument to File.__init__ is fname
self.fname = fname
if not isinstance(self.fname, Path):
self.fname = Path(fname)
self.fname = self.fname.expanduser().resolve()
self.m = mode
if dyad_ctx is None:
raise NameError("'dyad_ctx' argument not provided to pydyad.hdf.File constructor")
self.dyad_ctx = dyad_ctx
if self.m in ("r"):
if (self.dyad_ctx.cons_path is not None and
self.dyad_ctx.cons_path in self.fname.parents):
if metadata_wrapper:
self.dyad_ctx.consume_w_metadata(str(self.fname), metadata_wrapper)
else:
dyad_ctx.consume(str(self.fname))
if file:
super().__init__(file, mode)
else:
super().__init__(fname, mode)

def close(self):
super().close()
if self.m in ("w", "r+"):
if (self.dyad_ctx.prod_path is not None and
self.dyad_ctx.prod_path in self.fname.parents):
self.dyad_ctx.produce(str(self.fname))
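
A usage sketch for the new wrapper, assuming a Dyad context already initialized with managed producer/consumer paths (the paths below are hypothetical):

from pydyad.bindings import Dyad
from pydyad.hdf import DyadFile

ctx = Dyad()
# hypothetical setup: ctx.init(...) must register the managed paths first

# Producer side: close() triggers ctx.produce() for files under ctx.prod_path.
f = DyadFile("/managed/prod/sample.h5", "w", dyad_ctx=ctx)
# ... create datasets through the inherited h5py.File API ...
f.close()

# Consumer side: the constructor calls ctx.consume() for files under
# ctx.cons_path before h5py opens the file.
g = DyadFile("/managed/cons/sample.h5", "r", dyad_ctx=ctx)

Note that dyad_ctx is mandatory (the constructor raises NameError without it) and that produce() only fires on close() for modes "w" and "r+".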
4 changes: 2 additions & 2 deletions pydyad/setup.cfg
@@ -8,5 +8,5 @@ classifier =
python_requires = >=3.7
install_requires =
numpy
# dlio_profiler_py @ git+https://github.com/hariharan-devarajan/dlio-profiler.git
pydftracer==1.0.2
h5py
pydftracer==1.0.2
1 change: 1 addition & 0 deletions scripts/Testing/Temporary/CTestCostData.txt
@@ -0,0 +1 @@
---
8 changes: 8 additions & 0 deletions scripts/corona.sh
@@ -0,0 +1,8 @@
#!/bin/bash
test_case=$1
NUM_NODES=$2
PPN=$3
source ./setup-env.sh ${test_case} $NUM_NODES $PPN
rm *.core flux.log
rm -rf logs/* profiler/*
flux alloc -q $QUEUE -t $TIME -N $NUM_NODES -o per-resource.count=${BROKERS_PER_NODE} --exclusive --broker-opts=--setattr=log-filename=./logs/flux.log ./run.sh $test_case $NUM_NODES $PPN
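
A hypothetical invocation matching the three positional parameters (the test-case name is illustrative and must be one that setup-env.sh understands):

./corona.sh dlio 4 16   # test_case=dlio, NUM_NODES=4, PPN=16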
63 changes: 63 additions & 0 deletions scripts/dspaces/aggregate.py
@@ -0,0 +1,63 @@
from pathlib import Path
import re
import argparse
import pandas as pd


def process_single_run_csvs(dir_path):
dirname = dir_path.name
match_obj = re.match(r"(?P<test_name>[a-zA-Z]+)_(?P<num_nodes>[0-9]+)_(?P<ppn>[0-9]+)", dirname)
if match_obj is None:
raise RuntimeError("Cannot parse directory name")
num_nodes = int(match_obj.group("num_nodes"))
ppn = int(match_obj.group("ppn"))
csv_files = list(dir_path.glob("*.csv"))
df = pd.concat(map(pd.read_csv, csv_files), ignore_index=True)
num_ops = len(df)
df = df.drop(columns=["var_name", "version"])
df = df.groupby("rank").agg("sum")
return {
"test_name": match_obj.group("test_name"),
"num_nodes": num_nodes,
"ppn": ppn,
"num_mdata_ops": num_ops,
"data_size": df["data_size"].sum(),
"mdata_time_ns": df["mdata_time_ns"].max(),
"data_time_ns": df["data_time_ns"].max(),
}


def build_result_dataframe(testdir):
top_level_rundir_name = testdir.parent.name
test_dir_name = testdir.name
output_df_name = "{}_{}.csv".format(top_level_rundir_name, test_dir_name)
print("Building", output_df_name)
df_rows = []
for subdir in testdir.iterdir():
if subdir.is_dir():
print("Getting data for", str(subdir))
df_row = process_single_run_csvs(subdir)
df_rows.append(df_row)
output_df = pd.DataFrame(data=df_rows)
return output_df_name, output_df


def main():
parser = argparse.ArgumentParser("Aggregate data for test")
parser.add_argument("testdir", type=Path,
help="Path to the test directory to collect data for")
parser.add_argument("--dump_dir", "-d", type=Path,
help="Directory to dump the resulting CSV into")
args = parser.parse_args()
csv_name, df = build_result_dataframe(args.testdir.expanduser().resolve())
dump_dir = args.dump_dir.expanduser().resolve()
if not dump_dir.is_dir():
print("Creating non-existant dump directory {}".format(str(dump_dir)))
dump_dir.mkdir(parents=True)
full_csv_name = dump_dir / csv_name
df.to_csv(str(full_csv_name))
print("Wrote data to {}".format(str(full_csv_name)))


if __name__ == "__main__":
main()
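
An illustrative invocation, assuming run subdirectories named <test_name>_<num_nodes>_<ppn> as the regex requires (paths are hypothetical; --dump_dir is effectively required, since the script dereferences it unconditionally):

python aggregate.py ./run1/read_test -d ./aggregated
# writes ./aggregated/run1_read_test.csv with one row per run subdirectory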
75 changes: 75 additions & 0 deletions scripts/dspaces/collect.py
@@ -0,0 +1,75 @@
from pathlib import Path
import re
import argparse
import json


def validate_log(out_file):
with open(str(out_file), "r") as f:
for line in f:
if line.startswith("[DSPACES_TEST]"):
return line
return None


def validate_dir(path):
dirname = path.name
match_obj = re.match(r"(?P<test_name>[a-zA-Z]+)_(?P<num_nodes>[0-9]+)_(?P<ppn>[0-9]+)", dirname)
if match_obj is None:
raise RuntimeError("Cannot parse directory name")
test_name = match_obj.group("test_name")
num_nodes = int(match_obj.group("num_nodes"))
ppn = int(match_obj.group("ppn"))
# num_tasks = num_nodes * ppn
out_file = path / "run.out"
if not out_file.is_file():
raise RuntimeError("Could not find run.out for {}".format(path))
perf_line = validate_log(out_file)
if perf_line is None:
raise RuntimeError("Run for {} failed because we don't have perf numbers".format(path))
return {
"test_name": test_name,
"num_nodes": num_nodes,
"ppn": ppn,
"perf": perf_line,
}


def validate_rundir(td):
print("Validating tests in {}:".format(td.name))
subdirs = [sd for sd in td.iterdir() if sd.is_dir()]
perf_entries = []
for sd in subdirs:
print(" * Validating {}:".format(sd.name), end=" ")
try:
new_perf = validate_dir(sd)
perf_entries.append(new_perf)
print("GOOD")
except RuntimeError as e:
print("BAD")
raise e
return perf_entries


def main():
parser = argparse.ArgumentParser("Validate runs")
parser.add_argument("testdir", type=Path,
help="Top-level directory representing the results of a single iteration of the testing")
parser.add_argument("--dump_file", "-d", type=Path, default=None,
help="Path to JSON file where we want to dump performance results")
args = parser.parse_args()
perf_entries = validate_rundir(args.testdir.expanduser().resolve())
if args.dump_file is not None:
dump_file = args.dump_file.expanduser().resolve()
if not dump_file.name.endswith(".json"):
raise ValueError("Invalid file suffix for JSON file")
if not dump_file.parent.is_dir():
dump_file.parent.mkdir(parents=True)
with open(str(dump_file), "w") as f:
json.dump(perf_entries, f, indent=4, sort_keys=True)
else:
print(json.dumps(perf_entries, sort_keys=True, indent=4))


if __name__ == "__main__":
main()
33 changes: 33 additions & 0 deletions scripts/dspaces/corona.sh
@@ -0,0 +1,33 @@
#!/bin/bash

test_case=$1
num_nodes=$2
ppn=$3
timing_root_dir=$4
use_alloc=$5

num_iters=16
num_files=16
request_size=65536
hg_conn_str="ofi+verbs"

extra_flux_flags="--setattr=system.bank=ice4hpc"

source ./setup-env.sh

timing_dir=$timing_root_dir/${test_case}_${num_nodes}_${ppn}

if [ -d $timing_dir ]; then
echo "Dump directory $timing_dir already exists"
exit 1
fi

mkdir -p $timing_dir

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

if $use_alloc; then
flux alloc -q $QUEUE -t $TIME -N $num_nodes --exclusive $extra_flux_flags ./run.sh $test_case $num_nodes $ppn $num_iters $num_files $request_size $hg_conn_str $timing_dir $SCRIPT_DIR
else
flux batch -q $QUEUE -t $TIME -N $num_nodes --output=$timing_dir/run.out --error=$timing_dir/run.err --exclusive $extra_flux_flags ./run.sh $test_case $num_nodes $ppn $num_iters $num_files $request_size $hg_conn_str $timing_dir $SCRIPT_DIR
fi
20 changes: 20 additions & 0 deletions scripts/dspaces/dspaces_start.sh
@@ -0,0 +1,20 @@
#!/bin/bash

echo "## Config file for DataSpaces server
ndim = 1
dims = $1
max_versions = $2
num_apps = 1" > dataspaces.conf

# Use provided number of nodes instead of auto-obtained number
# dspaces_num_nodes=$(flux resource info | grep -oP "\d+ Nodes" | grep -oP "^\d+")

flux submit -N $3 --cores=$(( $3*1 )) --tasks-per-node=$4 dspaces_server $5

# Wait for DataSpaces configuration file to be created.
# If we don't do this, the DataSpaces clients will either crash or hang
sleep 1s
while [ ! -f conf.ds ]; do
sleep 1s
done
sleep 3s
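
A hypothetical invocation, mapping the five positional parameters to dims, max_versions, server node count, tasks per node, and the argument forwarded to dspaces_server (the connection string mirrors hg_conn_str in corona.sh; all values are illustrative):

./dspaces_start.sh $(( 1024*1024*1024 )) 16 2 1 "ofi+verbs"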
5 changes: 5 additions & 0 deletions scripts/dspaces/dspaces_stop.sh
@@ -0,0 +1,5 @@
#!/bin/bash

flux run --ntasks=1 terminator

# rm conf.ds dataspaces.conf