Skip to content

Commit b1be8a7

Browse files
authored
Merge pull request #403 from huggingface/ray_fix
ray nits
2 parents 8bbda9c + f17215b commit b1be8a7

File tree

3 files changed

+77
-7
lines changed

3 files changed

+77
-7
lines changed

src/datatrove/executor/ray.py

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from typing import TYPE_CHECKING, Callable, Optional, Sequence
1111

1212
from datatrove.executor.base import DistributedEnvVars, PipelineExecutor
13-
from datatrove.io import DataFolderLike, get_datafolder
13+
from datatrove.io import DataFolderLike, file_is_local, get_datafolder
1414
from datatrove.pipeline.base import PipelineStep
1515
from datatrove.utils._import_utils import check_required_dependencies
1616
from datatrove.utils.logging import add_task_logger, close_task_logger, log_pipeline, logger
@@ -431,18 +431,26 @@ def __init__(
431431
tasks_per_job: int = 1,
432432
time: Optional[int] = None,
433433
):
434+
# Check if the logging_dir is local fs and if so issue a warning that for synchronization it has to be a shared filesystem
435+
if logging_dir and file_is_local(logging_dir):
436+
logger.warning(
437+
"Logging directory points to a local filesystem. For correct synchronization to work this "
438+
"filesystem needs to be shared across the submitting node as well as the workers and needs "
439+
"to be persistent across node restarts."
440+
)
441+
434442
super().__init__(pipeline, logging_dir, skip_completed, randomize_start_duration)
435443
self.tasks = tasks
436444
self.workers = workers if workers != -1 else tasks
437445
self.depends = depends
438-
# track whether run() has been called
439446
self.cpus_per_task = cpus_per_task
440447
self.gpus_per_task = gpus_per_task
441448
self.mem_per_cpu_gb = mem_per_cpu_gb
442449
self.ray_remote_kwargs = ray_remote_kwargs
443450
self.tasks_per_job = tasks_per_job
444451
self.log_first = log_first
445452
self.time = time
453+
self._launched = False
446454
self.nodes_per_task = nodes_per_task
447455

448456
def get_distributed_env(self, node_rank: int = -1) -> DistributedEnvVars:
@@ -472,12 +480,22 @@ def run(self):
472480
check_required_dependencies("ray", ["ray"])
473481
import ray
474482

475-
# 1) If there is a depends=, ensure it has run and is finished
483+
assert not self.depends or (isinstance(self.depends, RayPipelineExecutor)), (
484+
"depends= must be a RayPipelineExecutor"
485+
)
476486
if self.depends:
477-
logger.info(f'Launching dependency job "{self.depends}"')
478-
self.depends.run()
479-
480-
# 2) Check if all tasks are already completed
487+
# take care of launching any unlaunched dependencies
488+
if not self.depends._launched:
489+
logger.info(f'Launching dependency job "{self.depends}"')
490+
self.depends.run()
491+
while (
492+
incomplete := len(self.depends.get_incomplete_ranks(skip_completed=True))
493+
) > 0: # set skip_completed=True to get *real* incomplete task count
494+
logger.info(f"Dependency job still has {incomplete}/{self.depends.world_size} tasks. Waiting...")
495+
time.sleep(2 * 60)
496+
497+
self._launched = True
498+
# 3) Check if all tasks are already completed
481499
incomplete_ranks = self.get_incomplete_ranks(range(self.world_size))
482500
if not incomplete_ranks:
483501
logger.info(f"All {self.world_size} tasks appear to be completed already. Nothing to run.")

src/datatrove/io.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,10 @@ def get_fs_with_filepath(data: DataFileLike) -> tuple[AbstractFileSystem, str]:
293293
# (str path, initialized fs object)
294294
if isinstance(data, tuple) and isinstance(data[0], str) and isinstance(data[1], AbstractFileSystem):
295295
return (data[1], data[0]) # yeah yeah this is a bit weird I agree
296+
297+
if isinstance(data, DataFolder):
298+
return (data.fs, data.path)
299+
296300
raise ValueError(
297301
"You must pass a DataFileLike instance, a str path, a (str path, fs_init_kwargs) or (str path, fs object)"
298302
)

tests/executor/test_ray.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,54 @@ def run(self, data, rank=None, world_size=None):
6565
f"Expected file {file} was not found in {log_dir}",
6666
)
6767

68+
def test_dependencies(self):
69+
"""Test that multiple executors can depend on the same parent executor and the parent only runs once."""
70+
71+
parent_log_dir = get_datafolder(f"{self.tmp_dir}/parent")
72+
73+
class ParentSimpleStep(PipelineStep):
74+
def run(self, data, rank=None, world_size=None):
75+
with open(parent_log_dir.resolve_paths("parent.txt"), "a") as f:
76+
f.write(f"called {rank}\n")
77+
78+
class ChildSimpleStep(PipelineStep):
79+
def run(self, data, rank=None, world_size=None):
80+
pass
81+
82+
# Create parent executor
83+
parent_executor = RayPipelineExecutor(
84+
pipeline=[ParentSimpleStep()],
85+
tasks=2,
86+
workers=2,
87+
logging_dir=parent_log_dir,
88+
)
89+
90+
# Create two child executors that depend on the same parent
91+
child1_log_dir = get_datafolder(f"{self.tmp_dir}/child1")
92+
child1_executor = RayPipelineExecutor(
93+
pipeline=[ChildSimpleStep()],
94+
tasks=2,
95+
workers=2,
96+
logging_dir=child1_log_dir,
97+
depends=parent_executor,
98+
)
99+
100+
child2_log_dir = get_datafolder(f"{self.tmp_dir}/child2")
101+
child2_executor = RayPipelineExecutor(
102+
pipeline=[ChildSimpleStep()],
103+
tasks=2,
104+
workers=2,
105+
logging_dir=child2_log_dir,
106+
depends=parent_executor,
107+
)
108+
109+
# Run child1 - this should launch the parent first
110+
child1_executor.run()
111+
child2_executor.run()
112+
with open(parent_log_dir.resolve_paths("parent.txt"), "r") as f:
113+
# Two calls because of two tasks
114+
self.assertEqual(sorted(f.read().strip().splitlines()), ["called 0", "called 1"])
115+
68116
def test_placement_group_creation(self):
69117
"""Test that placement groups are created when nodes_per_task > 1"""
70118
from datatrove.executor.ray import RayTaskManager

0 commit comments

Comments
 (0)