From 299d76072b3f4d9d885a6ca9b96b6765cc842539 Mon Sep 17 00:00:00 2001 From: plaguss Date: Mon, 1 Jul 2024 09:56:19 +0200 Subject: [PATCH] Inform of the new pipeline script uploaded to the repository in the README --- src/distilabel/distiset.py | 37 ++++++++++++++++--- .../utils/card/distilabel_template.md | 11 ++++++ 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/src/distilabel/distiset.py b/src/distilabel/distiset.py index 3e00e3480..3027aff1f 100644 --- a/src/distilabel/distiset.py +++ b/src/distilabel/distiset.py @@ -114,9 +114,6 @@ def push_to_hub( **kwargs, ) - if generate_card: - self._generate_card(repo_id, token) - if include_script and script_path.exists(): upload_file( path_or_fileobj=script_path, @@ -127,8 +124,17 @@ def push_to_hub( commit_message="Include pipeline script.", ) + if generate_card: + self._generate_card( + repo_id, token, include_script=include_script, filename_py=filename_py + ) + def _get_card( - self, repo_id: str, token: Optional[str] = None + self, + repo_id: str, + token: Optional[str] = None, + include_script: bool = False, + filename_py: Optional[str] = None, ) -> DistilabelDatasetCard: """Generates the dataset card for the `Distiset`. @@ -141,6 +147,9 @@ def _get_card( token: The token to authenticate with the Hugging Face Hub. We assume that if it's provided, the dataset will be in the Hugging Face Hub, so the README metadata will be extracted from there. + include_script: Whether to upload the script to the hugging face repository. + filename_py: The name of the script. If `include_script` is True, the script will + be uploaded to the repository using this name, otherwise it won't be used. Returns: The dataset card for the `Distiset`. 
@@ -167,6 +176,8 @@ def _get_card( card_data=DatasetCardData(**metadata), repo_id=repo_id, sample_records=sample_records, + include_script=include_script, + filename_py=filename_py, ) return card @@ -194,7 +205,13 @@ def _extract_readme_metadata( metadata = yaml.safe_load(metadata) return metadata - def _generate_card(self, repo_id: str, token: str) -> None: + def _generate_card( + self, + repo_id: str, + token: str, + include_script: bool = False, + filename_py: Optional[str] = None, + ) -> None: """Generates a dataset card and pushes it to the Hugging Face Hub, and if the `pipeline.yaml` path is available in the `Distiset`, uploads that to the same repository. @@ -202,8 +219,16 @@ def _generate_card(self, repo_id: str, token: str) -> None: Args: repo_id: The ID of the repository to push to, from the `push_to_hub` method. token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method. + include_script: Whether to upload the script to the hugging face repository. + filename_py: The name of the script. If `include_script` is True, the script will + be uploaded to the repository using this name, otherwise it won't be used. """ - card = self._get_card(repo_id=repo_id, token=token) + card = self._get_card( + repo_id=repo_id, + token=token, + include_script=include_script, + filename_py=filename_py, + ) card.push_to_hub( repo_id, diff --git a/src/distilabel/utils/card/distilabel_template.md b/src/distilabel/utils/card/distilabel_template.md index 94c6dab37..675a55ee4 100644 --- a/src/distilabel/utils/card/distilabel_template.md +++ b/src/distilabel/utils/card/distilabel_template.md @@ -14,6 +14,17 @@ This dataset has been created with [distilabel](https://distilabel.argilla.io/). +{% if include_script%} +The pipeline script was uploaded to easily reproduce the dataset: +[{{ filename_py }}](https://huggingface.co/datasets/{{ repo_id }}/raw/main/{{ filename_py }}). 
+ +You can download the file and run it as: + +```console +python {{ filename_py }} +``` +{% endif %} + ## Dataset Summary This dataset contains a `pipeline.yaml` which can be used to reproduce the pipeline that generated it in distilabel using the `distilabel` CLI: