diff --git a/src/test/pull_hf_model_test.cpp b/src/test/pull_hf_model_test.cpp index efb8c8e934..9fd178ae8c 100644 --- a/src/test/pull_hf_model_test.cpp +++ b/src/test/pull_hf_model_test.cpp @@ -499,6 +499,29 @@ class HfPull : public TestWithTempDir { ::SetUpServerForDownloadAndStart(this->t, this->server, sourceModel, downloadPath, task, timeoutSeconds); } + /* Builds an ovms --pull command line from sourceModel, modelRepositoryPath and pullTask, passes it to server.start() and returns the exit code that call produced. The shutdown request is cleared before the start and set to 1 then back to 0 afterwards. */ static int RunPullHfModelAndGetCode(const std::string& sourceModel, const std::string& modelRepositoryPath, const std::string& pullTask) { + server.setShutdownRequest(0); + std::vector args = { + "ovms", + "--pull", + "--source_model", + sourceModel, + "--model_repository_path", + modelRepositoryPath, + "--task", + pullTask, + }; + std::vector argv; + argv.reserve(args.size()); + /* argv stores pointers obtained from a.data(), i.e. into the strings owned by args, so args has to stay alive while server.start() runs. */ for (auto& a : args) { + argv.push_back(a.data()); + } + const int exitCode = server.start(static_cast(argv.size()), argv.data()); + server.setShutdownRequest(1); + server.setShutdownRequest(0); + return exitCode; + } + void TearDown() { server.setShutdownRequest(1); if (t) @@ -512,6 +535,9 @@ class HfPull : public TestWithTempDir { class HfPullCache : public HfPull { protected: + /* Upper bound on pull attempts and the pause in milliseconds between attempts; both are read by the retry loop that initializes the shared cache. */ static constexpr int CACHE_PULL_MAX_ATTEMPTS = 3; + static constexpr int CACHE_PULL_RETRY_DELAY_MS = 1000; + static std::once_flag cacheInitFlag; static std::unique_ptr cacheDir; static std::string cachedRepositoryPath; @@ -531,18 +557,110 @@ class HfPullCache : public HfPull { std::string cacheDownloadPath = ovms::FileSystem::joinPath({cacheDir->dir.string(), "repository"}); std::string pullTask = this->task; - this->ServerPullHfModel(sourceModelName, cacheDownloadPath, pullTask); - server.setShutdownRequest(1); - if (t) - t->join(); - server.setShutdownRequest(0); + /* Joins repositoryRoot with MODEL_NAMESPACE and MODEL_ID to form the model directory path. */ auto buildModelBasePath = [&](const std::string& repositoryRoot) { + return ovms::FileSystem::joinPath({repositoryRoot, MODEL_NAMESPACE, MODEL_ID}); + }; + /* Returns true only when graph.pbtxt exists, the four model and tokenizer files exist with their expected full sizes, and neither path returned by getLfsWipMarkerPath (for the repository root or the model directory) exists. Any filesystem error reported through ec yields false. */ auto hasCompleteCache = [&](const std::string& repositoryRoot) { + const std::string modelBase = buildModelBasePath(repositoryRoot); + 
std::error_code ec; + const bool hasModel = std::filesystem::exists(ovms::FileSystem::appendSlash(modelBase) + "openvino_model.bin", ec); + if (ec) + return false; + const bool hasDetok = std::filesystem::exists(ovms::FileSystem::appendSlash(modelBase) + "openvino_detokenizer.bin", ec); + if (ec) + return false; + const bool hasTok = std::filesystem::exists(ovms::FileSystem::appendSlash(modelBase) + "openvino_tokenizer.bin", ec); + if (ec) + return false; + const bool hasTokModel = std::filesystem::exists(ovms::FileSystem::appendSlash(modelBase) + "tokenizer.model", ec); + if (ec) + return false; + const bool hasGraph = std::filesystem::exists(ovms::FileSystem::appendSlash(modelBase) + "graph.pbtxt", ec); + if (ec) + return false; + + // Validate expected known sizes for deterministic cache integrity checks. + const bool modelSizeOk = hasModel && (std::filesystem::file_size(ovms::FileSystem::appendSlash(modelBase) + "openvino_model.bin", ec) == OPENVINO_MODEL_BIN_FULL_SIZE_BYTES); + if (ec) + return false; + const bool detokSizeOk = hasDetok && (std::filesystem::file_size(ovms::FileSystem::appendSlash(modelBase) + "openvino_detokenizer.bin", ec) == OPENVINO_DETOKENIZER_BIN_FULL_SIZE_BYTES); + if (ec) + return false; + const bool tokSizeOk = hasTok && (std::filesystem::file_size(ovms::FileSystem::appendSlash(modelBase) + "openvino_tokenizer.bin", ec) == OPENVINO_TOKENIZER_BIN_FULL_SIZE_BYTES); + if (ec) + return false; + const bool tokModelSizeOk = hasTokModel && (std::filesystem::file_size(ovms::FileSystem::appendSlash(modelBase) + "tokenizer.model", ec) == TOKENIZER_MODEL_FULL_SIZE_BYTES); + if (ec) + return false; + + const bool hasRepoMarker = std::filesystem::exists(ovms::libgit2::getLfsWipMarkerPath(repositoryRoot), ec); + if (ec) + return false; + const bool hasModelMarker = std::filesystem::exists(ovms::libgit2::getLfsWipMarkerPath(modelBase), ec); + if (ec) + return false; + + return hasGraph && modelSizeOk && detokSizeOk && tokSizeOk && tokModelSizeOk && 
!hasRepoMarker && !hasModelMarker; + }; + /* Heuristic used by the retry loop: returns true when the model directory exists and either a getLfsWipMarkerPath path exists, openvino_model.bin is missing, or its size is non-zero but below OPENVINO_MODEL_BIN_FULL_SIZE_BYTES. Any filesystem error reported through ec yields false. */ auto looksLikeRecoverableNetworkFailure = [&](const std::string& repositoryRoot) { + const std::string modelBase = buildModelBasePath(repositoryRoot); + std::error_code ec; + const bool modelBaseExists = std::filesystem::exists(modelBase, ec); + if (ec || !modelBaseExists) + return false; + + // Interrupted network/LFS transfers usually leave partial files or marker traces. + const bool hasRepoMarker = std::filesystem::exists(ovms::libgit2::getLfsWipMarkerPath(repositoryRoot), ec); + if (ec) + return false; + const bool hasModelMarker = std::filesystem::exists(ovms::libgit2::getLfsWipMarkerPath(modelBase), ec); + if (ec) + return false; + if (hasRepoMarker || hasModelMarker) + return true; + + const std::string modelBin = ovms::FileSystem::appendSlash(modelBase) + "openvino_model.bin"; + const bool modelExists = std::filesystem::exists(modelBin, ec); + if (ec) + return false; + if (!modelExists) + return true; + + const std::uintmax_t modelSize = std::filesystem::file_size(modelBin, ec); + if (ec) + return false; + return (modelSize > 0) && (modelSize < OPENVINO_MODEL_BIN_FULL_SIZE_BYTES); + }; + + int lastExitCode = EXIT_FAILURE; + for (int attempt = 1; attempt <= CACHE_PULL_MAX_ATTEMPTS; ++attempt) { + lastExitCode = this->RunPullHfModelAndGetCode(sourceModelName, cacheDownloadPath, pullTask); + const bool cacheComplete = hasCompleteCache(cacheDownloadPath); + if ((lastExitCode == EXIT_SUCCESS) && cacheComplete) { + cachedRepositoryPath = cacheDownloadPath; + ASSERT_TRUE(std::filesystem::exists(cachedRepositoryPath)); + return; + } - cachedRepositoryPath = cacheDownloadPath; - ASSERT_TRUE(std::filesystem::exists(cachedRepositoryPath)); + /* NOTE(review): every path on which looksLikeRecoverableNetworkFailure returns true also makes hasCompleteCache return false, so this expression appears to reduce to !cacheComplete. Confirm whether the heuristic is meant to gate retries on its own. */ const bool recoverable = looksLikeRecoverableNetworkFailure(cacheDownloadPath) || !cacheComplete; + if (!recoverable || (attempt == CACHE_PULL_MAX_ATTEMPTS)) { + FAIL() << "Failed to initialize shared HF cache after " << attempt + << " attempt(s). Last exit code: " << lastExitCode + << ". 
Cache path: " << cacheDownloadPath; + } + SPDLOG_WARN("Shared HF cache initialization attempt {} failed with exit code {}. Retrying pull.", attempt, lastExitCode); + std::this_thread::sleep_for(std::chrono::milliseconds(CACHE_PULL_RETRY_DELAY_MS)); + } }); } void seedCurrentTestRepository() { + ASSERT_FALSE(cachedRepositoryPath.empty()) + << "Shared HF cache was never successfully initialized (call_once completed with a failure). " + "All HfPullCache tests in this process will be unable to seed their working directory."; + ASSERT_TRUE(std::filesystem::exists(cachedRepositoryPath)) + << "Shared HF cache path does not exist on disk: " << cachedRepositoryPath + << ". Cache initialization completed but left no usable directory."; std::error_code ec; std::filesystem::copy(cachedRepositoryPath, testRepositoryPath,