Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 125 additions & 7 deletions src/test/pull_hf_model_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -499,6 +499,29 @@ class HfPull : public TestWithTempDir {
::SetUpServerForDownloadAndStart(this->t, this->server, sourceModel, downloadPath, task, timeoutSeconds);
}

// Builds an `ovms --pull` command line for the given source model, repository path and task,
// passes it to server.start() directly from this function, and returns the value start() produced.
// The shutdown request flag is reset to 0 both before the call and before returning.
int RunPullHfModelAndGetCode(const std::string& sourceModel, const std::string& modelRepositoryPath, const std::string& pullTask) {
    server.setShutdownRequest(0);

    // Owning storage for the command-line tokens; rawArgv below only points into these strings,
    // so cliTokens must stay alive (and unmodified) until server.start() has returned.
    std::vector<std::string> cliTokens{
        "ovms",
        "--pull",
        "--source_model",
        sourceModel,
        "--model_repository_path",
        modelRepositoryPath,
        "--task",
        pullTask,
    };

    std::vector<char*> rawArgv;
    rawArgv.reserve(cliTokens.size());
    for (std::string& token : cliTokens) {
        rawArgv.emplace_back(token.data());
    }

    const int argCount = static_cast<int>(rawArgv.size());
    const int pullResult = server.start(argCount, rawArgv.data());

    // NOTE(review): the 1 -> 0 toggle presumably lets the server observe a shutdown request and
    // then re-arms it for the next start(); confirm against the server implementation.
    server.setShutdownRequest(1);
    server.setShutdownRequest(0);
    return pullResult;
}

void TearDown() {
server.setShutdownRequest(1);
if (t)
Expand All @@ -512,6 +535,9 @@ class HfPull : public TestWithTempDir {

class HfPullCache : public HfPull {
protected:
static constexpr int CACHE_PULL_MAX_ATTEMPTS = 3;
static constexpr int CACHE_PULL_RETRY_DELAY_MS = 1000;

static std::once_flag cacheInitFlag;
static std::unique_ptr<TempDir> cacheDir;
static std::string cachedRepositoryPath;
Expand All @@ -531,18 +557,110 @@ class HfPullCache : public HfPull {
std::string cacheDownloadPath = ovms::FileSystem::joinPath({cacheDir->dir.string(), "repository"});
std::string pullTask = this->task;

this->ServerPullHfModel(sourceModelName, cacheDownloadPath, pullTask);
server.setShutdownRequest(1);
if (t)
t->join();
server.setShutdownRequest(0);
// Maps a repository root to the directory of the cached model:
// <repoRoot>/<MODEL_NAMESPACE>/<MODEL_ID>.
auto buildModelBasePath = [&](const std::string& repoRoot) {
    auto modelDirectory = ovms::FileSystem::joinPath({repoRoot, MODEL_NAMESPACE, MODEL_ID});
    return modelDirectory;
};
// Reports whether the model directory under repositoryRoot is a complete, usable cache:
//  - openvino_model.bin, openvino_detokenizer.bin, openvino_tokenizer.bin and tokenizer.model
//    all exist and match their known full sizes,
//  - graph.pbtxt exists,
//  - no LFS work-in-progress marker is present for the repository root or the model directory.
// Any filesystem error while probing is treated as "not complete".
auto hasCompleteCache = [&](const std::string& repositoryRoot) {
    const std::string modelBase = buildModelBasePath(repositoryRoot);
    const std::string modelDirPrefix = ovms::FileSystem::appendSlash(modelBase);
    std::error_code fsError;

    // True when the named file exists inside the model directory with exactly expectedBytes bytes.
    const auto hasFileOfSize = [&](const char* fileName, const std::uintmax_t expectedBytes) {
        const std::string filePath = modelDirPrefix + fileName;
        const bool present = std::filesystem::exists(filePath, fsError);
        if (fsError || !present)
            return false;
        const std::uintmax_t actualBytes = std::filesystem::file_size(filePath, fsError);
        return !fsError && (actualBytes == expectedBytes);
    };

    // Validate expected known sizes for deterministic cache integrity checks.
    const bool binariesIntact =
        hasFileOfSize("openvino_model.bin", OPENVINO_MODEL_BIN_FULL_SIZE_BYTES) &&
        hasFileOfSize("openvino_detokenizer.bin", OPENVINO_DETOKENIZER_BIN_FULL_SIZE_BYTES) &&
        hasFileOfSize("openvino_tokenizer.bin", OPENVINO_TOKENIZER_BIN_FULL_SIZE_BYTES) &&
        hasFileOfSize("tokenizer.model", TOKENIZER_MODEL_FULL_SIZE_BYTES);
    if (!binariesIntact)
        return false;

    // graph.pbtxt is only checked for presence, not size.
    const bool graphPresent = std::filesystem::exists(modelDirPrefix + "graph.pbtxt", fsError);
    if (fsError || !graphPresent)
        return false;

    // A leftover WIP marker at either level means the cache is not complete.
    for (const std::string* markerRoot : {&repositoryRoot, &modelBase}) {
        const bool markerPresent = std::filesystem::exists(ovms::libgit2::getLfsWipMarkerPath(*markerRoot), fsError);
        if (fsError || markerPresent)
            return false;
    }
    return true;
};
// Heuristic: does the on-disk state under repositoryRoot look like a pull that was interrupted
// part-way? Returns true when the model directory exists and either an LFS work-in-progress
// marker is present, openvino_model.bin is missing, or openvino_model.bin is non-empty but
// smaller than its expected full size. Returns false when the model directory is absent or
// when any filesystem probe reports an error.
auto looksLikeRecoverableNetworkFailure = [&](const std::string& repositoryRoot) {
    std::error_code fsError;
    const std::string modelDir = buildModelBasePath(repositoryRoot);
    if (!std::filesystem::exists(modelDir, fsError) || fsError)
        return false;

    // Interrupted network/LFS transfers usually leave partial files or marker traces.
    // Both marker locations are probed before deciding, so an error on either one wins.
    bool wipMarkerFound = false;
    for (const std::string* markerRoot : {&repositoryRoot, &modelDir}) {
        const bool present = std::filesystem::exists(ovms::libgit2::getLfsWipMarkerPath(*markerRoot), fsError);
        if (fsError)
            return false;
        wipMarkerFound = wipMarkerFound || present;
    }
    if (wipMarkerFound)
        return true;

    const std::string weightsPath = ovms::FileSystem::appendSlash(modelDir) + "openvino_model.bin";
    const bool weightsPresent = std::filesystem::exists(weightsPath, fsError);
    if (fsError)
        return false;
    if (!weightsPresent)
        return true;

    const std::uintmax_t weightsBytes = std::filesystem::file_size(weightsPath, fsError);
    if (fsError)
        return false;
    const bool truncated = (weightsBytes > 0) && (weightsBytes < OPENVINO_MODEL_BIN_FULL_SIZE_BYTES);
    return truncated;
};

// Pull the model into the shared cache, retrying up to CACHE_PULL_MAX_ATTEMPTS times with a
// CACHE_PULL_RETRY_DELAY_MS pause between attempts. An attempt counts as successful only when
// the pull returned EXIT_SUCCESS AND the on-disk cache passes hasCompleteCache().
int lastExitCode = EXIT_FAILURE;
for (int attempt = 1; attempt <= CACHE_PULL_MAX_ATTEMPTS; ++attempt) {
    lastExitCode = this->RunPullHfModelAndGetCode(sourceModelName, cacheDownloadPath, pullTask);
    const bool cacheComplete = hasCompleteCache(cacheDownloadPath);
    if ((lastExitCode == EXIT_SUCCESS) && cacheComplete) {
        // Publish the cache path only once the cache has been verified, so that an empty
        // cachedRepositoryPath reliably means "initialization never succeeded".
        cachedRepositoryPath = cacheDownloadPath;
        ASSERT_TRUE(std::filesystem::exists(cachedRepositoryPath));
        return;
    }

    // The failure path deliberately neither assigns cachedRepositoryPath nor asserts that the
    // directory exists: a pull that failed before creating anything on disk must still be
    // retried, and an unverified cache must not be handed to the tests that seed from it.
    //
    // Reaching this point with a complete cache implies a non-zero exit code; that is the only
    // combination that can be classified as non-recoverable, because an incomplete cache is
    // always treated as retryable.
    const bool recoverable = looksLikeRecoverableNetworkFailure(cacheDownloadPath) || !cacheComplete;
    if (!recoverable || (attempt == CACHE_PULL_MAX_ATTEMPTS)) {
        // FAIL() records a fatal failure and returns from the enclosing function.
        FAIL() << "Failed to initialize shared HF cache after " << attempt
               << " attempt(s). Last exit code: " << lastExitCode
               << ". Cache path: " << cacheDownloadPath;
    }
    SPDLOG_WARN("Shared HF cache initialization attempt {} failed with exit code {}. Retrying pull.", attempt, lastExitCode);
    std::this_thread::sleep_for(std::chrono::milliseconds(CACHE_PULL_RETRY_DELAY_MS));
}
});
}

void seedCurrentTestRepository() {
ASSERT_FALSE(cachedRepositoryPath.empty())
<< "Shared HF cache was never successfully initialized (call_once completed with a failure). "
"All HfPullCache tests in this process will be unable to seed their working directory.";
ASSERT_TRUE(std::filesystem::exists(cachedRepositoryPath))
<< "Shared HF cache path does not exist on disk: " << cachedRepositoryPath
<< ". Cache initialization completed but left no usable directory.";
std::error_code ec;
std::filesystem::copy(cachedRepositoryPath,
testRepositoryPath,
Expand Down