From 2689413fb0aff01a833257140460a35e2eeedb96 Mon Sep 17 00:00:00 2001
From: BeachWang <1400012807@pku.edu.cn>
Date: Tue, 27 Aug 2024 11:48:28 +0800
Subject: [PATCH] update spacy to deal conflict with ms-swift (#397)

* update_spacy

* fix model version

* keep model 3.5.0

* update spacy to 3.7.0 & support native tar.gz package

* update docker version

* update librosa version

* update nltk version

---------

Co-authored-by: gece.gc
---
 .github/workflows/docker/docker-compose.yml |  4 +--
 data_juicer/utils/model_utils.py            | 37 +++++++++++++++++----
 environments/minimal_requires.txt           |  4 +--
 environments/science_requires.txt           |  4 +--
 4 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/docker/docker-compose.yml b/.github/workflows/docker/docker-compose.yml
index eeba32206..92a5c76c2 100644
--- a/.github/workflows/docker/docker-compose.yml
+++ b/.github/workflows/docker/docker-compose.yml
@@ -1,7 +1,7 @@
 version: '3'
 services:
   ray-head:
-    image: data-juicer-unittest:0.2.1
+    image: data-juicer-unittest:0.2.2
     pull_policy: never
     command: ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block
     environment:
@@ -30,7 +30,7 @@ services:
               capabilities: [gpu]
 
   ray-worker:
-    image: data-juicer-unittest:0.2.1
+    image: data-juicer-unittest:0.2.2
     pull_policy: never
     command: ray start --address=ray-head:6379 --block
     environment:
diff --git a/data_juicer/utils/model_utils.py b/data_juicer/utils/model_utils.py
index fe716333d..f145e4a76 100644
--- a/data_juicer/utils/model_utils.py
+++ b/data_juicer/utils/model_utils.py
@@ -72,7 +72,7 @@ def check_model(model_name, force=False):
             )
         else:
             logger.info(
-                f'Model [{cached_model_path}] not found . Downloading...')
+                f'Model [{cached_model_path}] not found. Downloading...')
 
         try:
             model_link = os.path.join(MODEL_LINKS, model_name)
@@ -406,7 +406,7 @@ def prepare_huggingface_model(pretrained_model_name_or_path,
     return (model, processor) if return_model else processor
 
 
-def prepare_spacy_model(lang, name_pattern='{}_core_web_md-3.5.0'):
+def prepare_spacy_model(lang, name_pattern='{}_core_web_md-3.7.0'):
     """
     Prepare spacy model for specific language.
 
@@ -419,17 +419,40 @@ def prepare_spacy_model(lang, name_pattern='{}_core_web_md-3.5.0'):
     assert lang in ['zh', 'en'], 'Diversity only support zh and en'
     model_name = name_pattern.format(lang)
     logger.info(f'Loading spacy model [{model_name}]...')
-    compressed_model = '{}.zip'.format(model_name)
+    compressed_model = '{}.tar.gz'.format(model_name)
 
     # decompress the compressed model if it's not decompressed
     def decompress_model(compressed_model_path):
-        decompressed_model_path = compressed_model_path.replace('.zip', '')
+        if not compressed_model_path.endswith('.tar.gz'):
+            raise ValueError('Only .tar.gz files are supported')
+
+        decompressed_model_path = compressed_model_path.replace('.tar.gz', '')
         if os.path.exists(decompressed_model_path) \
                 and os.path.isdir(decompressed_model_path):
             return decompressed_model_path
-        import zipfile
-        with zipfile.ZipFile(compressed_model_path) as zf:
-            zf.extractall(DJMC)
+
+        ver_name = os.path.basename(decompressed_model_path)
+        unver_name = ver_name.rsplit('-', maxsplit=1)[0]
+        target_dir_in_archive = f'{ver_name}/{unver_name}/{ver_name}/'
+
+        import tarfile
+        with tarfile.open(compressed_model_path, 'r:gz') as tar:
+            for member in tar.getmembers():
+                if member.name.startswith(target_dir_in_archive):
+                    # relative path without unnecessary directory levels
+                    relative_path = os.path.relpath(
+                        member.name, start=target_dir_in_archive)
+                    target_path = os.path.join(decompressed_model_path,
+                                               relative_path)
+
+                    if member.isfile():
+                        # ensure the directory exists
+                        target_directory = os.path.dirname(target_path)
+                        os.makedirs(target_directory, exist_ok=True)
+                        # for files, extract to the specific location
+                        with tar.extractfile(member) as source:
+                            with open(target_path, 'wb') as target:
+                                target.write(source.read())
         return decompressed_model_path
 
     try:
diff --git a/environments/minimal_requires.txt b/environments/minimal_requires.txt
index c162fb21d..bd55d2008 100644
--- a/environments/minimal_requires.txt
+++ b/environments/minimal_requires.txt
@@ -4,7 +4,7 @@ pandas==2.0.3
 datasets==2.18.0
 av
 soundfile
-librosa
+librosa>=0.10
 loguru
 tabulate
 tqdm
@@ -21,7 +21,7 @@ pdfplumber
 plotly
 python-docx
 streamlit
-spacy==3.5.0
+spacy==3.7.0
 multiprocess==0.70.12
 dill==0.3.4
 psutil
diff --git a/environments/science_requires.txt b/environments/science_requires.txt
index e1ab796cd..c1350368b 100644
--- a/environments/science_requires.txt
+++ b/environments/science_requires.txt
@@ -10,7 +10,7 @@ simhash-pybind
 selectolax
 nlpaug
 nlpcda
-nltk
+nltk<3.9
 transformers>=4.37
 transformers_stream_generator
 einops
@@ -18,7 +18,7 @@ accelerate
 tiktoken
 opencc==1.1.6
 imagededup
-spacy-pkuseg==0.0.32
+spacy-pkuseg
 diffusers
 simple-aesthetics-predictor
 scenedetect[opencv]