hpcgroup · lithomas1 · Apr 18, 2025 · Apr 17, 2025 · Apr 18, 2025 · Apr 18, 2025
diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml
@@ -12,10 +12,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest]
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
-        include:
-        - python-version: "3.6"
-          os: ubuntu-20.04
+        python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11"]
 
     steps:
       - uses: actions/checkout@v2

diff --git a/pipit/readers/nsight_sqlite_reader.py b/pipit/readers/nsight_sqlite_reader.py
@@ -22,6 +22,9 @@ class NSightSQLiteReader:
             NVTX_EVENTS as ne
         LEFT JOIN StringIds
             ON StringIds.id = ne.textId
+        WHERE
+            -- Filter to only include range start/end and push/pop events
+            ne.eventType in (59, 60)
         """
         ],
         "cuda_api": [

diff --git a/pipit/tests/trace.py b/pipit/tests/trace.py
@@ -6,6 +6,8 @@
 import numpy as np
 from pipit import Trace
 
+from numpy.testing import assert_allclose
+
 
 def test_comm_matrix(data_dir, ping_pong_otf2_trace):
     # bytes sent between pairs of processes
@@ -160,13 +162,13 @@ def test_time_profile(data_dir, ping_pong_otf2_trace):
     exp_bin_size = exp_duration / 62
     bin_sizes = time_profile["bin_end"] - time_profile["bin_start"]
 
-    assert np.isclose(bin_sizes, exp_bin_size).all()
+    assert_allclose(bin_sizes, exp_bin_size)
 
     # check that sum of function contributions per bin equals bin duration
     exp_bin_total_duration = exp_bin_size * 2
     time_profile.drop(columns=["bin_start", "bin_end"], inplace=True)
 
-    assert np.isclose(time_profile.sum(axis=1), exp_bin_total_duration).all()
+    assert_allclose(time_profile.sum(axis=1), exp_bin_total_duration)
 
     # check for each function that sum of exc time per bin equals total exc time
     total_exc_times = trace.events.groupby("Name")["time.exc"].sum()
@@ -175,7 +177,7 @@ def test_time_profile(data_dir, ping_pong_otf2_trace):
         if column == "idle_time":
             continue
 
-        assert np.isclose(time_profile[column].sum(), total_exc_times[column])
+        assert_allclose(time_profile[column].sum(), total_exc_times[column])
 
     # check normalization
     norm = trace.time_profile(num_bins=62, normalized=True)
@@ -185,34 +187,34 @@ def test_time_profile(data_dir, ping_pong_otf2_trace):
 
     # check against ground truth
     # generated using Vampir's Function Summary chart (step size=16)
-    assert np.isclose(norm.loc[0]["int main(int, char**)"], 0.00299437)
-    assert np.isclose(norm.loc[0]["MPI_Init"], 0.93999815)
-    assert np.isclose(norm.loc[0]["MPI_Comm_size"], 0.0)
-    assert np.isclose(norm.loc[0]["MPI_Comm_rank"], 0.0)
-    assert np.isclose(norm.loc[0]["MPI_Send"], 0.0)
-    assert np.isclose(norm.loc[0]["MPI_Recv"], 0.0)
-    assert np.isclose(norm.loc[0]["MPI_Finalize"], 0.0)
-
-    assert np.isclose(norm.loc[1:59]["int main(int, char**)"], 0.0).all()
-    assert np.isclose(norm.loc[1:59]["MPI_Init"], 1.0).all()
-    assert np.isclose(norm.loc[1:59]["MPI_Comm_size"], 0.0).all()
-    assert np.isclose(norm.loc[1:59]["MPI_Comm_rank"], 0.0).all()
-    assert np.isclose(norm.loc[1:59]["MPI_Send"], 0.0).all()
-    assert np.isclose(norm.loc[1:59]["MPI_Recv"], 0.0).all()
-    assert np.isclose(norm.loc[1:59]["MPI_Finalize"], 0.0).all()
-
-    assert np.isclose(norm.loc[60]["int main(int, char**)"], 0.39464799)
-    assert np.isclose(norm.loc[60]["MPI_Init"], 0.14843661)
-    assert np.isclose(norm.loc[60]["MPI_Send"], 0.24594134)
-    assert np.isclose(norm.loc[60]["MPI_Recv"], 0.21017099)
-    assert np.isclose(norm.loc[60]["MPI_Comm_size"], 0.00046047)
-    assert np.isclose(norm.loc[60]["MPI_Comm_rank"], 0.00034261)
-    assert np.isclose(norm.loc[60]["MPI_Finalize"], 0.0)
-
-    assert np.isclose(norm.loc[61]["int main(int, char**)"], 0.43560727)
-    assert np.isclose(norm.loc[61]["MPI_Init"], 0.0)
-    assert np.isclose(norm.loc[61]["MPI_Send"], 0.29640222)
-    assert np.isclose(norm.loc[61]["MPI_Recv"], 0.24300865)
-    assert np.isclose(norm.loc[61]["MPI_Comm_size"], 0.0)
-    assert np.isclose(norm.loc[61]["MPI_Comm_rank"], 0.0)
-    assert np.isclose(norm.loc[61]["MPI_Finalize"], 0.01614835)
+    assert_allclose(norm.loc[0]["int main(int, char**)"], 0.00299437, rtol=1e-05)
+    assert_allclose(norm.loc[0]["MPI_Init"], 0.93999815)
+    assert_allclose(norm.loc[0]["MPI_Comm_size"], 0.0)
+    assert_allclose(norm.loc[0]["MPI_Comm_rank"], 0.0)
+    assert_allclose(norm.loc[0]["MPI_Send"], 0.0)
+    assert_allclose(norm.loc[0]["MPI_Recv"], 0.0)
+    assert_allclose(norm.loc[0]["MPI_Finalize"], 0.0)
+
+    assert_allclose(norm.loc[1:59]["int main(int, char**)"], 0.0)
+    assert_allclose(norm.loc[1:59]["MPI_Init"], 1.0)
+    assert_allclose(norm.loc[1:59]["MPI_Comm_size"], 0.0)
+    assert_allclose(norm.loc[1:59]["MPI_Comm_rank"], 0.0)
+    assert_allclose(norm.loc[1:59]["MPI_Send"], 0.0)
+    assert_allclose(norm.loc[1:59]["MPI_Recv"], 0.0)
+    assert_allclose(norm.loc[1:59]["MPI_Finalize"], 0.0)
+
+    assert_allclose(norm.loc[60]["int main(int, char**)"], 0.39464799)
+    assert_allclose(norm.loc[60]["MPI_Init"], 0.14843661)
+    assert_allclose(norm.loc[60]["MPI_Send"], 0.24594134)
+    assert_allclose(norm.loc[60]["MPI_Recv"], 0.21017099)
+    assert_allclose(norm.loc[60]["MPI_Comm_size"], 0.00046047, rtol=1e-05)
+    assert_allclose(norm.loc[60]["MPI_Comm_rank"], 0.00034261, rtol=1e-05)
+    assert_allclose(norm.loc[60]["MPI_Finalize"], 0.0)
+
+    assert_allclose(norm.loc[61]["int main(int, char**)"], 0.43560727)
+    assert_allclose(norm.loc[61]["MPI_Init"], 0.0)
+    assert_allclose(norm.loc[61]["MPI_Send"], 0.29640222)
+    assert_allclose(norm.loc[61]["MPI_Recv"], 0.24300865)
+    assert_allclose(norm.loc[61]["MPI_Comm_size"], 0.0)
+    assert_allclose(norm.loc[61]["MPI_Comm_rank"], 0.0)
+    assert_allclose(norm.loc[61]["MPI_Finalize"], 0.01614835, rtol=1e-05)
diff --git a/pipit/trace.py b/pipit/trace.py
@@ -213,11 +213,6 @@ def _match_caller_callee(self):
         """
 
         if "_children" not in self.events.columns:
-            children = [None] * len(self.events)
-            depth, parent = [float("nan")] * len(self.events), [float("nan")] * len(
-                self.events
-            )
-
             # match events so we can
             # ignore unmatched ones
             self._match_events()
@@ -252,18 +247,21 @@ def _match_caller_callee(self):
                         (enter_leave_df["Process"] == curr_loc)
                     ]
 
+                # one slot per row of filtered_df; the loop below indexes by position
+                children = np.array([None] * len(filtered_df))
+                depth = [float("nan")] * len(filtered_df)
+                parent = [float("nan")] * len(filtered_df)
+
                 # Depth is the level in the
                 # Call Tree starting from 0
                 curr_depth = 0
 
                 stack = []
-                df_indices, event_types = list(filtered_df.index), list(
-                    filtered_df["Event Type"]
-                )
+                event_types = list(filtered_df["Event Type"])
 
                 # loop through the events of the filtered dataframe
                 for i in range(len(filtered_df)):
-                    curr_df_index, evt_type = df_indices[i], event_types[i]
+                    evt_type = event_types[i]
 
                     if evt_type == "Enter":
                         if curr_depth > 0:  # if event is a child of some other event
@@ -273,17 +271,17 @@ def _match_caller_callee(self):
                                 # create a new list of children for the
                                 # parent if the current event is the first
                                 # child being added
-                                children[parent_df_index] = [curr_df_index]
+                                children[parent_df_index] = [filtered_df.index[i]]
                             else:
-                                children[parent_df_index].append(curr_df_index)
+                                children[parent_df_index].append(filtered_df.index[i])
 
-                            parent[curr_df_index] = parent_df_index
+                            parent[i] = filtered_df.index[parent_df_index]
 
-                        depth[curr_df_index] = curr_depth
+                        depth[i] = curr_depth
                         curr_depth += 1
 
                         # add enter dataframe index to stack
-                        stack.append(curr_df_index)
+                        stack.append(i)
                     else:
                         # pop event off stack once matching leave found
                         # Note: parent, and children for a leave row
@@ -293,11 +291,26 @@ def _match_caller_callee(self):
 
                         curr_depth -= 1
 
-            self.events["_depth"], self.events["_parent"], self.events["_children"] = (
-                depth,
-                parent,
-                children,
-            )
+                # Write this location's depth/parent/children back into the
+                # full events table, restricted to the rows selected by mask.
+                curr_process = curr_loc
+                # True is the neutral operand for "&" when has_thread is false
+                thread_mask = True
+                if has_thread:
+                    curr_process, curr_thread = curr_loc
+                    thread_mask = self.events["Thread"] == curr_thread
+                mask = (
+                    self.events["Event Type"].isin(["Enter", "Leave"])
+                    & (self.events["_matching_event"].notnull())
+                    & (self.events["Process"] == curr_process)
+                    & thread_mask
+                )
+                # The three buffers hold one entry per row of filtered_df.
+                # NOTE(review): assumes mask selects exactly filtered_df's rows,
+                # in the same order -- confirm against how filtered_df is built.
+                self.events.loc[mask, "_depth"] = depth
+                self.events.loc[mask, "_parent"] = parent
+                self.events.loc[mask, "_children"] = children
 
             self.events = self.events.astype({"_depth": "Int32", "_parent": "Int32"})
             self.events = self.events.astype(