Skip to content

Commit

Permalink
Add option to include the pipeline script as another artifact when pushing a distiset to the hub
Browse files Browse the repository at this point in the history
  • Loading branch information
plaguss committed Jun 28, 2024
1 parent e040dec commit e5b28ad
Showing 1 changed file with 24 additions and 1 deletion.
25 changes: 24 additions & 1 deletion src/distilabel/distiset.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import logging
import os.path as posixpath
import re
import sys
from os import PathLike
from pathlib import Path
from typing import Any, Dict, Final, Optional, Union
Expand All @@ -23,7 +24,7 @@
import yaml
from datasets import Dataset, load_dataset, load_from_disk
from datasets.filesystems import is_remote_filesystem
from huggingface_hub import DatasetCardData, HfApi
from huggingface_hub import DatasetCardData, HfApi, upload_file
from huggingface_hub.file_download import hf_hub_download
from pyarrow.lib import ArrowInvalid
from typing_extensions import Self
Expand Down Expand Up @@ -61,6 +62,7 @@ def push_to_hub(
private: bool = False,
token: Optional[str] = None,
generate_card: bool = True,
include_script: bool = True,
**kwargs: Any,
) -> None:
"""Pushes the `Distiset` to the Hugging Face Hub, each dataset will be pushed as a different configuration
Expand All @@ -80,12 +82,23 @@ def push_to_hub(
if no token is passed and the user is not logged-in.
generate_card:
Whether to generate a dataset card or not. Defaults to True.
include_script:
Whether to push the pipeline script to the Hugging Face Hub to share it.
Defaults to True.
**kwargs:
Additional keyword arguments to pass to the `push_to_hub` method of the `datasets.Dataset` object.
Raises:
ValueError: If no token is provided and couldn't be retrieved automatically.
"""
script_filename = sys.argv[0]
filename_py = (
script_filename.split("/")[-1]
if "/" in script_filename
else script_filename
)
script_path = Path.cwd() / script_filename

if token is None:
token = get_hf_token(self.__class__.__name__, "token")

Expand All @@ -101,6 +114,16 @@ def push_to_hub(
if generate_card:
self._generate_card(repo_id, token)

if include_script and script_path.exists():
upload_file(
path_or_fileobj=script_path,
path_in_repo=filename_py,
repo_id=repo_id,
repo_type="dataset",
token=token,
commit_message="Include pipeline script.",
)

def _get_card(
self, repo_id: str, token: Optional[str] = None
) -> DistilabelDatasetCard:
Expand Down

0 comments on commit e5b28ad

Please sign in to comment.