Skip to content

Commit

Permalink
Inform of the new pipeline script uploaded to the repository in the RE…
Browse files Browse the repository at this point in the history
…ADME
  • Loading branch information
plaguss committed Jul 1, 2024
1 parent 7fbdd5a commit 299d760
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 6 deletions.
37 changes: 31 additions & 6 deletions src/distilabel/distiset.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,6 @@ def push_to_hub(
**kwargs,
)

if generate_card:
self._generate_card(repo_id, token)

if include_script and script_path.exists():
upload_file(
path_or_fileobj=script_path,
Expand All @@ -127,8 +124,17 @@ def push_to_hub(
commit_message="Include pipeline script.",
)

if generate_card:
self._generate_card(
repo_id, token, include_script=include_script, filename_py=filename_py
)

def _get_card(
self, repo_id: str, token: Optional[str] = None
self,
repo_id: str,
token: Optional[str] = None,
include_script: bool = False,
filename_py: Optional[str] = None,
) -> DistilabelDatasetCard:
"""Generates the dataset card for the `Distiset`.
Expand All @@ -141,6 +147,9 @@ def _get_card(
token: The token to authenticate with the Hugging Face Hub.
We assume that if it's provided, the dataset will be in the Hugging Face Hub,
so the README metadata will be extracted from there.
include_script: Whether the pipeline script was uploaded to the Hugging Face repository.
If True, the dataset card includes a section linking to the script.
filename_py: The name of the script as uploaded to the repository. Only used to build
the link in the dataset card when `include_script` is True, otherwise it won't be used.
Returns:
The dataset card for the `Distiset`.
Expand All @@ -167,6 +176,8 @@ def _get_card(
card_data=DatasetCardData(**metadata),
repo_id=repo_id,
sample_records=sample_records,
include_script=include_script,
filename_py=filename_py,
)

return card
Expand Down Expand Up @@ -194,16 +205,30 @@ def _extract_readme_metadata(
metadata = yaml.safe_load(metadata)
return metadata

def _generate_card(self, repo_id: str, token: str) -> None:
def _generate_card(
self,
repo_id: str,
token: str,
include_script: bool = False,
filename_py: Optional[str] = None,
) -> None:
"""Generates a dataset card and pushes it to the Hugging Face Hub, and
if the `pipeline.yaml` path is available in the `Distiset`, uploads that
to the same repository.
Args:
repo_id: The ID of the repository to push to, from the `push_to_hub` method.
token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method.
include_script: Whether the pipeline script was uploaded to the Hugging Face repository.
If True, the dataset card includes a section linking to the script.
filename_py: The name of the script as uploaded to the repository. Only used to build
the link in the dataset card when `include_script` is True, otherwise it won't be used.
"""
card = self._get_card(repo_id=repo_id, token=token)
card = self._get_card(
repo_id=repo_id,
token=token,
include_script=include_script,
filename_py=filename_py,
)

card.push_to_hub(
repo_id,
Expand Down
11 changes: 11 additions & 0 deletions src/distilabel/utils/card/distilabel_template.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,17 @@

This dataset has been created with [distilabel](https://distilabel.argilla.io/).

{% if include_script %}
The pipeline script was uploaded to easily reproduce the dataset:
[{{ filename_py }}](https://huggingface.co/datasets/{{ repo_id }}/raw/main/{{ filename_py }}).

You can download the file and run it as:

```console
python {{ filename_py }}
```
{% endif %}

## Dataset Summary

This dataset contains a `pipeline.yaml` which can be used to reproduce the pipeline that generated it in distilabel using the `distilabel` CLI:
Expand Down

0 comments on commit 299d760

Please sign in to comment.