diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 36740e458b7..c34a6db07ee 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -55,6 +55,7 @@
 )
 
 import fsspec
+import multiprocess
 import numpy as np
 import pandas as pd
 import pyarrow as pa
@@ -69,7 +70,6 @@
     HfApi,
 )
 from huggingface_hub.hf_api import HfHubHTTPError, RepoFile, RepositoryNotFoundError
-from multiprocess import Pool
 from requests import HTTPError
 from tqdm.contrib.concurrent import thread_map
 
@@ -150,6 +150,13 @@
 
 logger = logging.get_logger(__name__)
 
+# Prefer a spawn-based Pool: forking a process that already holds threads or locks can deadlock.
+# Stay on the dill-backed `multiprocess` package so lambdas and closures remain serializable.
+try:
+    Pool = multiprocess.get_context("spawn").Pool
+except ValueError:  # the "spawn" start method is not available on this platform
+    Pool = multiprocess.Pool
+
 PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED = (
     "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.parquet"
 )