From a465fb6c7d02a2a0b6289d79eb0434f6d6253a2f Mon Sep 17 00:00:00 2001 From: Thomas Hansen Date: Mon, 27 Apr 2026 09:54:37 -0500 Subject: [PATCH 1/2] Hash-truncate long cache path segments --- inference/core/cache/air_gapped.py | 5 +- inference/core/cache/model_artifacts.py | 5 +- inference/core/models/roboflow.py | 3 +- inference/core/registries/roboflow.py | 6 +-- inference/core/utils/file_system.py | 34 ++++++++++++++ inference/models/easy_ocr/easy_ocr.py | 5 +- .../models/grounding_dino/grounding_dino.py | 5 +- inference/models/transformers/transformers.py | 2 +- .../core/cache/test_model_artifacts.py | 17 +++++++ .../core/registries/test_roboflow.py | 33 +++++++++++++ .../unit_tests/core/utils/test_file_system.py | 46 +++++++++++++++++++ 11 files changed, 148 insertions(+), 13 deletions(-) diff --git a/inference/core/cache/air_gapped.py b/inference/core/cache/air_gapped.py index 4ddb280996..3abca8a6ed 100644 --- a/inference/core/cache/air_gapped.py +++ b/inference/core/cache/air_gapped.py @@ -13,6 +13,7 @@ from inference.core.env import MODEL_CACHE_DIR, USE_INFERENCE_MODELS from inference.core.roboflow_api import MODEL_TYPE_KEY, PROJECT_TASK_TYPE_KEY +from inference.core.utils.file_system import safe_path_join logger = logging.getLogger(__name__) @@ -65,14 +66,14 @@ def is_model_cached(model_id: str) -> bool: """ if not USE_INFERENCE_MODELS: # Only check the traditional layout when inference-models is disabled. - traditional_path = os.path.join(MODEL_CACHE_DIR, model_id) + traditional_path = safe_path_join(MODEL_CACHE_DIR, model_id) return os.path.isdir(traditional_path) and _has_non_hidden_children( traditional_path ) # When inference-models is enabled, check both layouts — models cached # before the migration still sit in the traditional tree. - traditional_path = os.path.join(MODEL_CACHE_DIR, model_id) + traditional_path = safe_path_join(MODEL_CACHE_DIR, model_id) if os.path.isdir(traditional_path) and _has_non_hidden_children(traditional_path): return True diff --git a/inference/core/cache/model_artifacts.py b/inference/core/cache/model_artifacts.py index ebf06f3ce9..7fb8a8adc7 100644 --- a/inference/core/cache/model_artifacts.py +++ b/inference/core/cache/model_artifacts.py @@ -20,6 +20,7 @@ dump_text_lines_atomic, read_json, read_text_file, + safe_path_join, ) @@ -138,7 +139,7 @@ def save_text_lines_in_cache( def get_cache_file_path(file: str, model_id: Optional[str] = None) -> str: cache_dir = get_cache_dir(model_id=model_id) - return os.path.join(cache_dir, file) + return safe_path_join(cache_dir, file) def _rmtree_onerror(func, path, exc_info): @@ -234,5 +235,5 @@ def clear_cache(model_id: Optional[str] = None, delete_from_disk: bool = True) - def get_cache_dir(model_id: Optional[str] = None) -> str: if model_id is not None: - return os.path.join(MODEL_CACHE_DIR, model_id) + return safe_path_join(MODEL_CACHE_DIR, model_id) return MODEL_CACHE_DIR diff --git a/inference/core/models/roboflow.py b/inference/core/models/roboflow.py index f3f23ea972..798998587b 100644 --- a/inference/core/models/roboflow.py +++ b/inference/core/models/roboflow.py @@ -73,6 +73,7 @@ get_roboflow_model_data, ) from inference.core.telemetry import set_span_attribute, start_span +from inference.core.utils.file_system import safe_path_join from inference.core.utils.image_utils import load_image from inference.core.utils.onnx import get_onnxruntime_execution_providers from inference.core.utils.preprocess import letterbox_image, prepare @@ -138,7 +139,7 @@ def __init__( self.dataset_id, self.version_id = get_model_id_chunks(model_id=model_id) self.endpoint = model_id self.device_id = GLOBAL_DEVICE_ID - self.cache_dir = os.path.join(cache_dir_root, self.endpoint) + self.cache_dir = safe_path_join(cache_dir_root, self.endpoint) self.keypoints_metadata: Optional[dict] = None initialise_cache(model_id=self.endpoint) diff --git a/inference/core/registries/roboflow.py b/inference/core/registries/roboflow.py index 7c03ca6bc0..913aa7aba9 100644 --- a/inference/core/registries/roboflow.py +++ b/inference/core/registries/roboflow.py @@ -42,7 +42,7 @@ get_roboflow_model_data, get_roboflow_workspace, ) -from inference.core.utils.file_system import dump_json, read_json +from inference.core.utils.file_system import dump_json, read_json, safe_path_join from inference.core.utils.roboflow import get_model_id_chunks from inference.models.aliases import resolve_roboflow_model_alias @@ -381,7 +381,7 @@ def _save_model_metadata_in_cache( def construct_model_type_cache_path( dataset_id: Union[DatasetID, ModelID], version_id: Optional[VersionID] ) -> str: - cache_dir = os.path.join( + cache_dir = safe_path_join( MODEL_CACHE_DIR, dataset_id, version_id if version_id else "" ) - return os.path.join(cache_dir, "model_type.json") + return safe_path_join(cache_dir, "model_type.json") diff --git a/inference/core/utils/file_system.py b/inference/core/utils/file_system.py index 1288cb3bde..56214ab9fb 100644 --- a/inference/core/utils/file_system.py +++ b/inference/core/utils/file_system.py @@ -3,10 +3,15 @@ import os.path import re import tempfile +from hashlib import sha1 from typing import List, Optional, Union _pattern = re.compile(r"[^A-Za-z0-9_-]") +MAX_PATH_SEGMENT_BYTES = 255 +TRUNCATED_PATH_SEGMENT_PREFIX_BYTES = 200 +TRUNCATED_PATH_SEGMENT_HASH_CHARS = 12 + class AtomicPath: """Context manager for atomic file writes. @@ -173,3 +178,32 @@ def ensure_write_is_allowed(path: str, allow_override: bool) -> None: def sanitize_path_segment(path_segment: str) -> str: # Keep only letters, numbers, underscores and dashes return _pattern.sub("_", path_segment) + + +def hash_truncate_path_segment(path_segment: str) -> str: + if len(os.fsencode(path_segment)) <= MAX_PATH_SEGMENT_BYTES: + return path_segment + encoded_segment = os.fsencode(path_segment) + digest = sha1(encoded_segment).hexdigest()[:TRUNCATED_PATH_SEGMENT_HASH_CHARS] + suffix = f"_{digest}" + prefix_length = min( + TRUNCATED_PATH_SEGMENT_PREFIX_BYTES, + MAX_PATH_SEGMENT_BYTES - len(os.fsencode(suffix)), + ) + prefix = encoded_segment[:prefix_length].decode("utf-8", errors="ignore") + return f"{prefix}{suffix}" + + +def truncate_path_segments(path: str) -> str: + drive, path_without_drive = os.path.splitdrive(path) + if os.altsep is not None: + path_without_drive = path_without_drive.replace(os.altsep, os.sep) + safe_segments = [ + hash_truncate_path_segment(segment) if segment else segment + for segment in path_without_drive.split(os.sep) + ] + return drive + os.sep.join(safe_segments) + + +def safe_path_join(*path_segments: str) -> str: + return truncate_path_segments(os.path.join(*path_segments)) diff --git a/inference/models/easy_ocr/easy_ocr.py b/inference/models/easy_ocr/easy_ocr.py index cfb45fcd9c..ad9c4bb9a5 100644 --- a/inference/models/easy_ocr/easy_ocr.py +++ b/inference/models/easy_ocr/easy_ocr.py @@ -10,6 +10,7 @@ import torch from PIL import Image +from inference.core.cache.model_artifacts import get_cache_file_path from inference.core.entities.requests.easy_ocr import EasyOCRInferenceRequest from inference.core.entities.responses.inference import ( InferenceResponse, @@ -53,8 +54,8 @@ def __init__( self.recognizer = model_id.split("/")[1] shutil.copyfile( - f"{MODEL_CACHE_DIR}/{model_id}/weights.pt", - f"{MODEL_CACHE_DIR}/{model_id}/{self.recognizer}.pth", + get_cache_file_path(file="weights.pt", model_id=model_id), + get_cache_file_path(file=f"{self.recognizer}.pth", model_id=model_id), ) def predict(self, image_in: np.ndarray, prompt="", history=None, **kwargs): diff --git a/inference/models/grounding_dino/grounding_dino.py b/inference/models/grounding_dino/grounding_dino.py index 88bbcc84aa..3b041cc4f8 100644 --- a/inference/models/grounding_dino/grounding_dino.py +++ b/inference/models/grounding_dino/grounding_dino.py @@ -47,6 +47,7 @@ def _patched_get_extended_attention_mask( from groundingdino.util.inference import Model +from inference.core.cache.model_artifacts import get_cache_dir from inference.core.entities.requests.groundingdino import GroundingDINOInferenceRequest from inference.core.entities.requests.inference import InferenceRequestImage from inference.core.entities.responses.inference import ( @@ -54,7 +55,7 @@ def _patched_get_extended_attention_mask( ObjectDetectionInferenceResponse, ObjectDetectionPrediction, ) -from inference.core.env import CLASS_AGNOSTIC_NMS, MODEL_CACHE_DIR +from inference.core.env import CLASS_AGNOSTIC_NMS from inference.core.models.roboflow import RoboflowCoreModel from inference.core.utils.image_utils import load_image_bgr, xyxy_to_xywh @@ -78,7 +79,7 @@ def __init__( super().__init__(*args, model_id=model_id, **kwargs) - GROUNDING_DINO_CACHE_DIR = os.path.join(MODEL_CACHE_DIR, model_id) + GROUNDING_DINO_CACHE_DIR = get_cache_dir(model_id=model_id) import groundingdino.config as _gd_config diff --git a/inference/models/transformers/transformers.py b/inference/models/transformers/transformers.py index ae2b8aa841..ada2d40f40 100644 --- a/inference/models/transformers/transformers.py +++ b/inference/models/transformers/transformers.py @@ -85,7 +85,7 @@ def __init__( self.cache_model_artefacts(**kwargs) - self.cache_dir = os.path.join(MODEL_CACHE_DIR, self.endpoint + "/") + self.cache_dir = get_cache_dir(model_id=self.endpoint) self.initialize_model(**kwargs) diff --git a/tests/inference/unit_tests/core/cache/test_model_artifacts.py b/tests/inference/unit_tests/core/cache/test_model_artifacts.py index 8b1ef40b47..771805d542 100644 --- a/tests/inference/unit_tests/core/cache/test_model_artifacts.py +++ b/tests/inference/unit_tests/core/cache/test_model_artifacts.py @@ -20,6 +20,7 @@ save_json_in_cache, save_text_lines_in_cache, ) +from inference.core.utils.file_system import hash_truncate_path_segment from tests.inference.unit_tests.core.utils.test_file_system import ( assert_bytes_file_content_correct, assert_text_file_content_correct, @@ -272,6 +273,22 @@ def test_get_cache_dir_when_model_id_given() -> None: assert result == "/some/cache/yolo/3" +@mock.patch.object(model_artifacts, "MODEL_CACHE_DIR", "/some/cache") +def test_get_cache_dir_when_model_id_has_long_segment() -> None: + # given + long_model_slug = "find-" + ("class-" * 60) + "instant-1" + + # when + result = get_cache_dir(model_id=f"workspace/{long_model_slug}") + + # then + assert result == os.path.join( + "/some/cache", + "workspace", + hash_truncate_path_segment(path_segment=long_model_slug), + ) + + @mock.patch.object(model_artifacts, "MODEL_CACHE_DIR", "/some/cache") def test_get_cache_dir_when_model_id_not_given() -> None: # when diff --git a/tests/inference/unit_tests/core/registries/test_roboflow.py b/tests/inference/unit_tests/core/registries/test_roboflow.py index 1c8991e8c3..ed984d5733 100644 --- a/tests/inference/unit_tests/core/registries/test_roboflow.py +++ b/tests/inference/unit_tests/core/registries/test_roboflow.py @@ -209,6 +209,39 @@ def test_save_model_metadata_in_cache( ) +def test_save_and_load_model_metadata_in_cache_when_instant_model_slug_is_long( + empty_local_dir: str, +) -> None: + # given + long_model_slug = "find-" + ("class-" * 60) + "instant-1" + dataset_id = f"huizen/{long_model_slug}" + + # when + with mock.patch.object( + roboflow, "MODEL_CACHE_DIR", empty_local_dir + ), mock.patch.object(roboflow, "LAMBDA", True): + save_model_metadata_in_cache( + dataset_id=dataset_id, + version_id=None, + project_task_type="object-detection", + model_type="yolov8n", + ) + _in_process_metadata_cache.cache.clear() + result = get_model_metadata_from_cache(dataset_id=dataset_id, version_id=None) + cache_path = roboflow.construct_model_type_cache_path( + dataset_id=dataset_id, version_id=None + ) + + # then + assert result == ("object-detection", "yolov8n") + assert os.path.isfile(cache_path) + assert all( + len(os.fsencode(path_segment)) <= 255 + for path_segment in cache_path.split(os.sep) + if path_segment + ) + + @mock.patch.object(roboflow, "construct_model_type_cache_path") def test_get_model_type_when_cache_is_utilised( construct_model_type_cache_path_mock: MagicMock, diff --git a/tests/inference/unit_tests/core/utils/test_file_system.py b/tests/inference/unit_tests/core/utils/test_file_system.py index 7792a300e3..3c53e4171a 100644 --- a/tests/inference/unit_tests/core/utils/test_file_system.py +++ b/tests/inference/unit_tests/core/utils/test_file_system.py @@ -2,11 +2,13 @@ import os import os.path import tempfile +from hashlib import sha1 import pytest from humanfriendly.testing import touch from inference.core.utils.file_system import ( + MAX_PATH_SEGMENT_BYTES, AtomicPath, dump_bytes, dump_bytes_atomic, @@ -16,8 +18,10 @@ dump_text_lines_atomic, ensure_parent_dir_exists, ensure_write_is_allowed, + hash_truncate_path_segment, read_json, read_text_file, + safe_path_join, ) @@ -341,6 +345,48 @@ def test_ensure_parent_dir_exists_when_dir_does_not_exist(empty_local_dir: str) assert os.listdir(empty_local_dir) == ["some"] +def test_hash_truncate_path_segment_when_segment_fits_os_limit() -> None: + # given + segment = "workspace" + + # when + result = hash_truncate_path_segment(path_segment=segment) + + # then + assert result == segment + + +def test_hash_truncate_path_segment_when_segment_exceeds_os_limit() -> None: + # given + segment = "a" * 300 + + # when + result = hash_truncate_path_segment(path_segment=segment) + + # then + assert result == f"{'a' * 200}_{sha1(segment.encode()).hexdigest()[:12]}" + assert len(os.fsencode(result)) <= MAX_PATH_SEGMENT_BYTES + + +def test_safe_path_join_truncates_long_path_segments(empty_local_dir: str) -> None: + # given + segment = "find-" + ("class-" * 60) + "instant-1" + expected_segment = hash_truncate_path_segment(path_segment=segment) + + # when + result = safe_path_join(empty_local_dir, "huizen", segment, "model_type.json") + + # then + assert result == os.path.join( + empty_local_dir, "huizen", expected_segment, "model_type.json" + ) + assert all( + len(os.fsencode(path_segment)) <= MAX_PATH_SEGMENT_BYTES + for path_segment in result.split(os.sep) + if path_segment + ) + + @pytest.mark.parametrize("allow_override", [True, False]) def test_ensure_write_is_allowed_when_file_does_not_exist( allow_override: bool, From ac8d9d9db1a169b0ce3e4f7138a6f2afd95bf8a0 Mon Sep 17 00:00:00 2001 From: Thomas Hansen Date: Thu, 30 Apr 2026 09:19:22 -0500 Subject: [PATCH 2/2] Address cache path review feedback --- inference/core/cache/air_gapped.py | 22 ++++---- inference/core/cache/model_artifacts.py | 52 +++++++++++++++++-- inference/core/models/roboflow.py | 5 +- inference/core/registries/roboflow.py | 11 ++-- inference/core/utils/file_system.py | 36 ++++--------- .../core/cache/test_model_artifacts.py | 40 ++++++++++++-- .../core/registries/test_roboflow.py | 3 +- .../unit_tests/core/utils/test_file_system.py | 43 +++++++-------- tests/unit/core/cache/test_air_gapped.py | 1 - 9 files changed, 129 insertions(+), 84 deletions(-) diff --git a/inference/core/cache/air_gapped.py b/inference/core/cache/air_gapped.py index 3abca8a6ed..e535630af8 100644 --- a/inference/core/cache/air_gapped.py +++ b/inference/core/cache/air_gapped.py @@ -4,16 +4,17 @@ offline workflow construction. """ -import hashlib import json import logging import os -import re from typing import Any, Dict, List, Optional +from inference.core.cache.model_artifacts import ( + get_cache_dir, + slugify_model_id_to_cache_key, +) from inference.core.env import MODEL_CACHE_DIR, USE_INFERENCE_MODELS from inference.core.roboflow_api import MODEL_TYPE_KEY, PROJECT_TASK_TYPE_KEY -from inference.core.utils.file_system import safe_path_join logger = logging.getLogger(__name__) @@ -27,14 +28,7 @@ def _slugify_model_id(model_id: str) -> str: Must stay in sync with ``inference_models.models.auto_loaders.core.slugify_model_id_to_os_safe_format``. """ - slug = re.sub(r"[^A-Za-z0-9_-]+", "-", model_id) - slug = re.sub(r"[_-]{2,}", "-", slug) - if not slug: - slug = "special-char-only-model-id" - if len(slug) > 48: - slug = slug[:48] - digest = hashlib.blake2s(model_id.encode("utf-8"), digest_size=4).hexdigest() - return f"{slug}-{digest}" + return slugify_model_id_to_cache_key(model_id=model_id) def _has_non_hidden_children(path: str) -> bool: @@ -66,14 +60,16 @@ def is_model_cached(model_id: str) -> bool: """ if not USE_INFERENCE_MODELS: # Only check the traditional layout when inference-models is disabled. - traditional_path = safe_path_join(MODEL_CACHE_DIR, model_id) + traditional_path = get_cache_dir( + model_id=model_id, cache_dir_root=MODEL_CACHE_DIR + ) return os.path.isdir(traditional_path) and _has_non_hidden_children( traditional_path ) # When inference-models is enabled, check both layouts — models cached # before the migration still sit in the traditional tree. - traditional_path = safe_path_join(MODEL_CACHE_DIR, model_id) + traditional_path = get_cache_dir(model_id=model_id, cache_dir_root=MODEL_CACHE_DIR) if os.path.isdir(traditional_path) and _has_non_hidden_children(traditional_path): return True diff --git a/inference/core/cache/model_artifacts.py b/inference/core/cache/model_artifacts.py index 7fb8a8adc7..621f11ee37 100644 --- a/inference/core/cache/model_artifacts.py +++ b/inference/core/cache/model_artifacts.py @@ -1,4 +1,5 @@ import errno +import hashlib import json import os.path import re @@ -18,11 +19,15 @@ dump_json_atomic, dump_text_lines, dump_text_lines_atomic, + path_fits_os_limits, read_json, read_text_file, - safe_path_join, ) +MODEL_ID_CACHE_SLUG_PREFIX_LENGTH = 48 +MODEL_ID_CACHE_SLUG_HASH_BYTES = 4 +SPECIAL_CHAR_ONLY_MODEL_ID_SLUG = "special-char-only-model-id" + def initialise_cache(model_id: Optional[str] = None) -> None: cache_dir = get_cache_dir(model_id=model_id) @@ -139,7 +144,7 @@ def save_text_lines_in_cache( def get_cache_file_path(file: str, model_id: Optional[str] = None) -> str: cache_dir = get_cache_dir(model_id=model_id) - return safe_path_join(cache_dir, file) + return os.path.join(cache_dir, file) def _rmtree_onerror(func, path, exc_info): @@ -233,7 +238,44 @@ def clear_cache(model_id: Optional[str] = None, delete_from_disk: bool = True) - ) -def get_cache_dir(model_id: Optional[str] = None) -> str: +def get_cache_dir( + model_id: Optional[str] = None, cache_dir_root: Optional[str] = None +) -> str: + cache_dir_root = cache_dir_root if cache_dir_root is not None else MODEL_CACHE_DIR if model_id is not None: - return safe_path_join(MODEL_CACHE_DIR, model_id) - return MODEL_CACHE_DIR + model_cache_path = get_model_id_cache_path( + model_id=model_id, cache_dir_root=cache_dir_root + ) + return os.path.join(cache_dir_root, model_cache_path) + return cache_dir_root + + +def get_model_id_cache_path(model_id: str, cache_dir_root: str) -> str: + legacy_cache_path = os.path.join(cache_dir_root, model_id) + if cache_path_is_within_root( + path=legacy_cache_path, cache_dir_root=cache_dir_root + ) and path_fits_os_limits(path=legacy_cache_path): + return model_id + return slugify_model_id_to_cache_key(model_id=model_id) + + +def cache_path_is_within_root(path: str, cache_dir_root: str) -> bool: + try: + root = os.path.abspath(cache_dir_root) + candidate = os.path.abspath(path) + return os.path.commonpath([root, candidate]) == root + except ValueError: + return False + + +def slugify_model_id_to_cache_key(model_id: str) -> str: + model_id_slug = re.sub(r"[^A-Za-z0-9_-]+", "-", model_id) + model_id_slug = re.sub(r"[_-]{2,}", "-", model_id_slug) + if not model_id_slug: + model_id_slug = SPECIAL_CHAR_ONLY_MODEL_ID_SLUG + if len(model_id_slug) > MODEL_ID_CACHE_SLUG_PREFIX_LENGTH: + model_id_slug = model_id_slug[:MODEL_ID_CACHE_SLUG_PREFIX_LENGTH] + digest = hashlib.blake2s( + model_id.encode("utf-8"), digest_size=MODEL_ID_CACHE_SLUG_HASH_BYTES + ).hexdigest() + return f"{model_id_slug}-{digest}" diff --git a/inference/core/models/roboflow.py b/inference/core/models/roboflow.py index 798998587b..a6f865f0c7 100644 --- a/inference/core/models/roboflow.py +++ b/inference/core/models/roboflow.py @@ -73,7 +73,6 @@ get_roboflow_model_data, ) from inference.core.telemetry import set_span_attribute, start_span -from inference.core.utils.file_system import safe_path_join from inference.core.utils.image_utils import load_image from inference.core.utils.onnx import get_onnxruntime_execution_providers from inference.core.utils.preprocess import letterbox_image, prepare @@ -139,7 +138,9 @@ def __init__( self.dataset_id, self.version_id = get_model_id_chunks(model_id=model_id) self.endpoint = model_id self.device_id = GLOBAL_DEVICE_ID - self.cache_dir = safe_path_join(cache_dir_root, self.endpoint) + self.cache_dir = get_cache_dir( + model_id=self.endpoint, cache_dir_root=cache_dir_root + ) self.keypoints_metadata: Optional[dict] = None initialise_cache(model_id=self.endpoint) diff --git a/inference/core/registries/roboflow.py b/inference/core/registries/roboflow.py index 913aa7aba9..2742a7d306 100644 --- a/inference/core/registries/roboflow.py +++ b/inference/core/registries/roboflow.py @@ -5,6 +5,7 @@ from inference.core.cache import cache from inference.core.cache.lru_cache import LRUCache +from inference.core.cache.model_artifacts import get_cache_dir from inference.core.devices.utils import GLOBAL_DEVICE_ID from inference.core.entities.types import ( DatasetID, @@ -16,7 +17,6 @@ from inference.core.env import ( CACHE_METADATA_LOCK_TIMEOUT, LAMBDA, - MODEL_CACHE_DIR, MODELS_CACHE_AUTH_CACHE_MAX_SIZE, MODELS_CACHE_AUTH_CACHE_TTL, MODELS_CACHE_AUTH_ENABLED, @@ -42,7 +42,7 @@ get_roboflow_model_data, get_roboflow_workspace, ) -from inference.core.utils.file_system import dump_json, read_json, safe_path_join +from inference.core.utils.file_system import dump_json, read_json from inference.core.utils.roboflow import get_model_id_chunks from inference.models.aliases import resolve_roboflow_model_alias @@ -381,7 +381,6 @@ def _save_model_metadata_in_cache( def construct_model_type_cache_path( dataset_id: Union[DatasetID, ModelID], version_id: Optional[VersionID] ) -> str: - cache_dir = safe_path_join( - MODEL_CACHE_DIR, dataset_id, version_id if version_id else "" - ) - return safe_path_join(cache_dir, "model_type.json") + model_id = dataset_id if version_id is None else f"{dataset_id}/{version_id}" + cache_dir = get_cache_dir(model_id=model_id) + return os.path.join(cache_dir, "model_type.json") diff --git a/inference/core/utils/file_system.py b/inference/core/utils/file_system.py index 56214ab9fb..f1754f9a38 100644 --- a/inference/core/utils/file_system.py +++ b/inference/core/utils/file_system.py @@ -3,14 +3,12 @@ import os.path import re import tempfile -from hashlib import sha1 from typing import List, Optional, Union _pattern = re.compile(r"[^A-Za-z0-9_-]") +MAX_PATH_BYTES = 4096 MAX_PATH_SEGMENT_BYTES = 255 -TRUNCATED_PATH_SEGMENT_PREFIX_BYTES = 200 -TRUNCATED_PATH_SEGMENT_HASH_CHARS = 12 class AtomicPath: @@ -180,30 +178,14 @@ def sanitize_path_segment(path_segment: str) -> str: return _pattern.sub("_", path_segment) -def hash_truncate_path_segment(path_segment: str) -> str: - if len(os.fsencode(path_segment)) <= MAX_PATH_SEGMENT_BYTES: - return path_segment - encoded_segment = os.fsencode(path_segment) - digest = sha1(encoded_segment).hexdigest()[:TRUNCATED_PATH_SEGMENT_HASH_CHARS] - suffix = f"_{digest}" - prefix_length = min( - TRUNCATED_PATH_SEGMENT_PREFIX_BYTES, - MAX_PATH_SEGMENT_BYTES - len(os.fsencode(suffix)), - ) - prefix = encoded_segment[:prefix_length].decode("utf-8", errors="ignore") - return f"{prefix}{suffix}" - - -def truncate_path_segments(path: str) -> str: +def path_fits_os_limits(path: str) -> bool: + if len(os.fsencode(os.path.abspath(path))) >= MAX_PATH_BYTES: + return False drive, path_without_drive = os.path.splitdrive(path) if os.altsep is not None: path_without_drive = path_without_drive.replace(os.altsep, os.sep) - safe_segments = [ - hash_truncate_path_segment(segment) if segment else segment - for segment in path_without_drive.split(os.sep) - ] - return drive + os.sep.join(safe_segments) - - -def safe_path_join(*path_segments: str) -> str: - return truncate_path_segments(os.path.join(*path_segments)) + return all( + len(os.fsencode(path_segment)) <= MAX_PATH_SEGMENT_BYTES + for path_segment in path_without_drive.split(os.sep) + if path_segment + ) diff --git a/tests/inference/unit_tests/core/cache/test_model_artifacts.py b/tests/inference/unit_tests/core/cache/test_model_artifacts.py index 771805d542..73c2ce3064 100644 --- a/tests/inference/unit_tests/core/cache/test_model_artifacts.py +++ b/tests/inference/unit_tests/core/cache/test_model_artifacts.py @@ -19,8 +19,9 @@ save_bytes_in_cache, save_json_in_cache, save_text_lines_in_cache, + slugify_model_id_to_cache_key, ) -from inference.core.utils.file_system import hash_truncate_path_segment +from inference.core.utils.file_system import MAX_PATH_SEGMENT_BYTES from tests.inference.unit_tests.core.utils.test_file_system import ( assert_bytes_file_content_correct, assert_text_file_content_correct, @@ -277,16 +278,45 @@ def test_get_cache_dir_when_model_id_given() -> None: def test_get_cache_dir_when_model_id_has_long_segment() -> None: # given long_model_slug = "find-" + ("class-" * 60) + "instant-1" + model_id = f"workspace/{long_model_slug}" # when - result = get_cache_dir(model_id=f"workspace/{long_model_slug}") + result = get_cache_dir(model_id=model_id) # then assert result == os.path.join( - "/some/cache", - "workspace", - hash_truncate_path_segment(path_segment=long_model_slug), + "/some/cache", slugify_model_id_to_cache_key(model_id=model_id) ) + assert len(os.fsencode(os.path.basename(result))) <= MAX_PATH_SEGMENT_BYTES + + +@mock.patch.object(model_artifacts, "MODEL_CACHE_DIR", "/some/cache") +def test_get_cache_dir_when_model_id_has_too_many_segments() -> None: + # given + model_id = "/".join(["segment"] * 700) + + # when + result = get_cache_dir(model_id=model_id) + + # then + assert result == os.path.join( + "/some/cache", slugify_model_id_to_cache_key(model_id=model_id) + ) + + +@mock.patch.object(model_artifacts, "MODEL_CACHE_DIR", "/some/cache") +def test_get_cache_dir_when_model_id_points_outside_cache_root() -> None: + # given + model_id = "../outside" + + # when + result = get_cache_dir(model_id=model_id) + + # then + assert result == os.path.join( + "/some/cache", slugify_model_id_to_cache_key(model_id=model_id) + ) + assert os.path.commonpath(["/some/cache", result]) == "/some/cache" @mock.patch.object(model_artifacts, "MODEL_CACHE_DIR", "/some/cache") diff --git a/tests/inference/unit_tests/core/registries/test_roboflow.py b/tests/inference/unit_tests/core/registries/test_roboflow.py index ed984d5733..b5f055362a 100644 --- a/tests/inference/unit_tests/core/registries/test_roboflow.py +++ b/tests/inference/unit_tests/core/registries/test_roboflow.py @@ -6,6 +6,7 @@ import pytest +from inference.core.cache import model_artifacts from inference.core.devices.utils import GLOBAL_DEVICE_ID from inference.core.entities.types import ModelType, TaskType from inference.core.exceptions import MissingApiKeyError, ModelNotRecognisedError @@ -218,7 +219,7 @@ def test_save_and_load_model_metadata_in_cache_when_instant_model_slug_is_long( # when with mock.patch.object( - roboflow, "MODEL_CACHE_DIR", empty_local_dir + model_artifacts, "MODEL_CACHE_DIR", empty_local_dir ), mock.patch.object(roboflow, "LAMBDA", True): save_model_metadata_in_cache( dataset_id=dataset_id, diff --git a/tests/inference/unit_tests/core/utils/test_file_system.py b/tests/inference/unit_tests/core/utils/test_file_system.py index 3c53e4171a..2fb4965baf 100644 --- a/tests/inference/unit_tests/core/utils/test_file_system.py +++ b/tests/inference/unit_tests/core/utils/test_file_system.py @@ -2,12 +2,12 @@ import os import os.path import tempfile -from hashlib import sha1 import pytest from humanfriendly.testing import touch from inference.core.utils.file_system import ( + MAX_PATH_BYTES, MAX_PATH_SEGMENT_BYTES, AtomicPath, dump_bytes, @@ -18,10 +18,9 @@ dump_text_lines_atomic, ensure_parent_dir_exists, ensure_write_is_allowed, - hash_truncate_path_segment, + path_fits_os_limits, read_json, read_text_file, - safe_path_join, ) @@ -345,46 +344,42 @@ def test_ensure_parent_dir_exists_when_dir_does_not_exist(empty_local_dir: str) assert os.listdir(empty_local_dir) == ["some"] -def test_hash_truncate_path_segment_when_segment_fits_os_limit() -> None: +def test_path_fits_os_limits_when_path_is_within_limits(empty_local_dir: str) -> None: # given - segment = "workspace" + path = os.path.join(empty_local_dir, "workspace", "model_type.json") # when - result = hash_truncate_path_segment(path_segment=segment) + result = path_fits_os_limits(path=path) # then - assert result == segment + assert result is True -def test_hash_truncate_path_segment_when_segment_exceeds_os_limit() -> None: +def test_path_fits_os_limits_when_segment_exceeds_os_limit( + empty_local_dir: str, +) -> None: # given - segment = "a" * 300 + path = os.path.join(empty_local_dir, "a" * (MAX_PATH_SEGMENT_BYTES + 1)) # when - result = hash_truncate_path_segment(path_segment=segment) + result = path_fits_os_limits(path=path) # then - assert result == f"{'a' * 200}_{sha1(segment.encode()).hexdigest()[:12]}" - assert len(os.fsencode(result)) <= MAX_PATH_SEGMENT_BYTES + assert result is False -def test_safe_path_join_truncates_long_path_segments(empty_local_dir: str) -> None: +def test_path_fits_os_limits_when_total_path_exceeds_os_limit( + empty_local_dir: str, +) -> None: # given - segment = "find-" + ("class-" * 60) + "instant-1" - expected_segment = hash_truncate_path_segment(path_segment=segment) + segments_count = (MAX_PATH_BYTES // len("segment/")) + 1 + long_path = os.path.join(empty_local_dir, *["segment"] * segments_count) # when - result = safe_path_join(empty_local_dir, "huizen", segment, "model_type.json") + result = path_fits_os_limits(path=long_path) # then - assert result == os.path.join( - empty_local_dir, "huizen", expected_segment, "model_type.json" - ) - assert all( - len(os.fsencode(path_segment)) <= MAX_PATH_SEGMENT_BYTES - for path_segment in result.split(os.sep) - if path_segment - ) + assert result is False @pytest.mark.parametrize("allow_override", [True, False]) diff --git a/tests/unit/core/cache/test_air_gapped.py b/tests/unit/core/cache/test_air_gapped.py index 0c0e136f1c..8921914fef 100644 --- a/tests/unit/core/cache/test_air_gapped.py +++ b/tests/unit/core/cache/test_air_gapped.py @@ -8,7 +8,6 @@ import pytest - # --------------------------------------------------------------------------- # Helpers # ---------------------------------------------------------------------------