File tree Expand file tree Collapse file tree 14 files changed +246
-2
lines changed Expand file tree Collapse file tree 14 files changed +246
-2
lines changed Original file line number Diff line number Diff line change 30
30
dockerfile : cmd/training-operator.v2alpha1/Dockerfile
31
31
platforms : linux/amd64,linux/arm64,linux/ppc64le
32
32
tag-prefix : v2alpha1
33
+ - component-name : model-initializer-v2
34
+ dockerfile : cmd/initializer_v2/model/Dockerfile
35
+ platforms : linux/amd64,linux/arm64
36
+ tag-prefix : v2
37
+ - component-name : dataset-initializer-v2
38
+ dockerfile : cmd/initializer_v2/dataset/Dockerfile
39
+ platforms : linux/amd64,linux/arm64
40
+ tag-prefix : v2
33
41
- component-name : kubectl-delivery
34
42
dockerfile : build/images/kubectl-delivery/Dockerfile
35
43
platforms : linux/amd64,linux/arm64,linux/ppc64le
Original file line number Diff line number Diff line change @@ -11,8 +11,8 @@ cover.out
11
11
.vscode /
12
12
__debug_bin
13
13
14
- # Compiled python files.
15
- * .pyc
14
+ # Python cache files
15
+ __pycache__ /
16
16
17
17
# Emacs temporary files
18
18
* ~
Original file line number Diff line number Diff line change
1
FROM python:3.11-alpine

WORKDIR /workspace

# Install the needed packages before copying the sources, so this layer is
# rebuilt only when requirements.txt changes. --no-cache-dir keeps pip's
# download cache out of the image.
COPY cmd/initializer_v2/dataset/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the required Python modules.
COPY sdk/python/kubeflow sdk/python/kubeflow
COPY pkg/initializer_v2 pkg/initializer_v2

ENTRYPOINT ["python", "-m", "pkg.initializer_v2.dataset"]
Original file line number Diff line number Diff line change
1
+ huggingface_hub == 0.23.4
Original file line number Diff line number Diff line change
1
FROM python:3.11-alpine

WORKDIR /workspace

# Install the needed packages before copying the sources, so this layer is
# rebuilt only when requirements.txt changes. --no-cache-dir keeps pip's
# download cache out of the image.
COPY cmd/initializer_v2/model/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the required Python modules.
COPY sdk/python/kubeflow sdk/python/kubeflow
COPY pkg/initializer_v2 pkg/initializer_v2

ENTRYPOINT ["python", "-m", "pkg.initializer_v2.model"]
Original file line number Diff line number Diff line change
1
+ huggingface_hub == 0.23.4
Original file line number Diff line number Diff line change
1
+ import logging
2
+ import os
3
+ from urllib .parse import urlparse
4
+
5
+ import pkg .initializer_v2 .utils .utils as utils
6
+ from pkg .initializer_v2 .dataset .huggingface import HuggingFace
7
+
8
+ logging .basicConfig (
9
+ format = "%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s" ,
10
+ datefmt = "%Y-%m-%dT%H:%M:%SZ" ,
11
+ level = logging .INFO ,
12
+ )
13
+
14
+ if __name__ == "__main__" :
15
+ logging .info ("Starting dataset initialization" )
16
+
17
+ try :
18
+ storage_uri = os .environ [utils .STORAGE_URI_ENV ]
19
+ except Exception as e :
20
+ logging .error ("STORAGE_URI env variable must be set." )
21
+ raise e
22
+
23
+ match urlparse (storage_uri ).scheme :
24
+ # TODO (andreyvelich): Implement more dataset providers.
25
+ case utils .HF_SCHEME :
26
+ hf = HuggingFace ()
27
+ hf .load_config ()
28
+ hf .download_dataset ()
29
+ case _:
30
+ logging .error ("STORAGE_URI must have the valid dataset provider" )
31
+ raise Exception
Original file line number Diff line number Diff line change
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+
5
+ # TODO (andreyvelich): This should be moved under Training V2 SDK.
6
@dataclass
class HuggingFaceDatasetConfig:
    """Configuration values for the HuggingFace dataset initializer."""

    # URI of the dataset to download; required.
    storage_uri: str
    # Optional HuggingFace access token; defaults to None when not provided.
    access_token: Optional[str] = None
Original file line number Diff line number Diff line change
1
+ import logging
2
+ from urllib .parse import urlparse
3
+
4
+ import huggingface_hub
5
+
6
+ import pkg .initializer_v2 .utils .utils as utils
7
+
8
+ # TODO (andreyvelich): This should be moved to SDK V2 constants.
9
+ import sdk .python .kubeflow .storage_initializer .constants as constants
10
+ from pkg .initializer_v2 .dataset .config import HuggingFaceDatasetConfig
11
+
12
# Root logger setup: INFO level, records carry a timestamp, the padded level
# name, the emitting file and line number, and the message.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%dT%H:%M:%SZ",
    format="%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
)
17
+
18
+
19
class HuggingFace(utils.DatasetProvider):
    """Dataset provider that downloads a dataset snapshot via huggingface_hub."""

    def load_config(self):
        """Populate ``self.config`` from ``utils.get_config_from_env``.

        The returned mapping is expanded into ``HuggingFaceDatasetConfig``, so
        its keys must match that dataclass's field names.
        """
        config_dict = utils.get_config_from_env(HuggingFaceDatasetConfig)

        # The access token is a credential: mask it so it never reaches the
        # logs, while still showing whether one was supplied.
        loggable_config = {
            key: ("***" if key == "access_token" and value else value)
            for key, value in config_dict.items()
        }
        logging.info(f"Config for HuggingFace dataset initializer: {loggable_config}")

        self.config = HuggingFaceDatasetConfig(**config_dict)

    def download_dataset(self):
        """Download the configured dataset into ``constants.VOLUME_PATH_DATASET``.

        ``load_config`` must be called first, because this method reads
        ``self.config``.
        """
        storage_uri_parsed = urlparse(self.config.storage_uri)
        # The repo id is everything after the scheme: for
        # "<scheme>://org/name" this yields "org/name".
        dataset_uri = storage_uri_parsed.netloc + storage_uri_parsed.path

        logging.info(f"Downloading dataset: {dataset_uri}")
        logging.info("-" * 40)

        # Authenticate only when a token was provided.
        if self.config.access_token:
            huggingface_hub.login(self.config.access_token)

        huggingface_hub.snapshot_download(
            repo_id=dataset_uri,
            repo_type="dataset",
            local_dir=constants.VOLUME_PATH_DATASET,
        )

        logging.info("Dataset has been downloaded")
Original file line number Diff line number Diff line change
1
+ import logging
2
+ import os
3
+ from urllib .parse import urlparse
4
+
5
+ import pkg .initializer_v2 .utils .utils as utils
6
+ from pkg .initializer_v2 .model .huggingface import HuggingFace
7
+
8
+ logging .basicConfig (
9
+ format = "%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s" ,
10
+ datefmt = "%Y-%m-%dT%H:%M:%SZ" ,
11
+ level = logging .INFO ,
12
+ )
13
+
14
+ if __name__ == "__main__" :
15
+ logging .info ("Starting pre-trained model initialization" )
16
+
17
+ try :
18
+ storage_uri = os .environ [utils .STORAGE_URI_ENV ]
19
+ except Exception as e :
20
+ logging .error ("STORAGE_URI env variable must be set." )
21
+ raise e
22
+
23
+ match urlparse (storage_uri ).scheme :
24
+ # TODO (andreyvelich): Implement more model providers.
25
+ case utils .HF_SCHEME :
26
+ hf = HuggingFace ()
27
+ hf .load_config ()
28
+ hf .download_model ()
29
+ case _:
30
+ logging .error (
31
+ f"STORAGE_URI must have the valid model provider. STORAGE_URI: { storage_uri } "
32
+ )
33
+ raise Exception
You can’t perform that action at this time.
0 commit comments