[TESTS] Use FP32 inference precision, FP16 KV cache precision for pipelines (#1485)

OpenVINO plugins enable different kinds of optimizations by default, such as KV cache compression to int8 and fp16 inference precision. In the GenAI tests, however, we want to test the pipelines and compare them against HF / optimum without these extra optimizations:


https://github.com/openvinotoolkit/openvino.genai/blob/4db67aecac78885c6d1e302f348c9489e2154388/tests/python_tests/common.py#L318-L325

Hopefully, we can then merge int8 KV cache by default for CB (#1206), because the tests will still compare against an FP16 KV cache, while the official validation remains responsible for validating against references via WWB metrics.
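
For context, here is a minimal sketch of what the `get_default_properties()` helper linked above plausibly returns, inferred from the commit title; the authoritative definition lives at the common.py permalink:

```python
import openvino as ov
import openvino.properties.hints as hints

def get_default_properties():
    # Sketch of the test helper (assumed, see common.py#L318-L325 for the
    # real one): pin FP32 inference precision and an FP16 KV cache so GenAI
    # pipelines match HF / optimum references, bypassing plugin defaults
    # such as int8 KV cache compression.
    return {
        hints.inference_precision: ov.Type.f32,
        hints.kv_cache_precision: ov.Type.f16,
    }
```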
ilya-lavrenov authored Jan 6, 2025
1 parent b04b28b commit 48dfd16
Showing 9 changed files with 21 additions and 22 deletions.
2 changes: 1 addition & 1 deletion samples/export-requirements.txt
@@ -2,7 +2,7 @@
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
 openvino-tokenizers~=2025.0.0.0.dev
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
 numpy<2.0.0; sys_platform == 'darwin'
 einops==0.8.0 # For Qwen
 transformers_stream_generator==0.0.5 # For Qwen
2 changes: 1 addition & 1 deletion src/python/openvino_genai/py_openvino_genai.pyi
@@ -364,7 +364,7 @@ class ContinuousBatchingPipeline:
     def __init__(self, models_path: os.PathLike, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}, tokenizer_properties: dict[str, typing.Any] = {}) -> None:
         ...
     @typing.overload
-    def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}) -> None:
+    def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, **kwargs) -> None:
         ...
     @typing.overload
     def add_request(self, request_id: int, input_ids: openvino._pyopenvino.Tensor, generation_config: GenerationConfig) -> GenerationHandle:
7 changes: 3 additions & 4 deletions src/python/py_continuous_batching_pipeline.cpp
@@ -223,15 +223,14 @@ void init_continuous_batching_pipeline(py::module_& m) {
         py::arg("properties") = ov::AnyMap({}),
         py::arg("tokenizer_properties") = ov::AnyMap({}))

-        .def(py::init([](const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const std::map<std::string, py::object>& plugin_config) {
+        .def(py::init([](const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const py::kwargs& kwargs) {
             ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
-            return std::make_unique<ContinuousBatchingPipeline>(models_path, tokenizer, scheduler_config, device, pyutils::properties_to_any_map(plugin_config));
+            return std::make_unique<ContinuousBatchingPipeline>(models_path, tokenizer, scheduler_config, device, pyutils::kwargs_to_any_map(kwargs));
         }),
         py::arg("models_path"),
         py::arg("tokenizer"),
         py::arg("scheduler_config"),
-        py::arg("device"),
-        py::arg("properties") = ov::AnyMap({}))
+        py::arg("device"))

         .def("get_tokenizer", &ContinuousBatchingPipeline::get_tokenizer)
         .def("get_config", &ContinuousBatchingPipeline::get_config)
2 changes: 1 addition & 1 deletion tests/python_tests/common.py
@@ -306,7 +306,7 @@ def run_continuous_batching(
     if type(generation_configs) is not list:
         generation_configs = [generation_configs] * len(prompts)

-    cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU')
+    cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU', tokenizer_properties={}, properties=get_default_properties())
     output = cb_pipe.generate(prompts, generation_configs)

     del cb_pipe
12 changes: 6 additions & 6 deletions tests/python_tests/ov_genai_test_utils.py
@@ -14,7 +14,7 @@
 import json

 import openvino_genai as ov_genai
-
+from common import get_default_properties

 def get_models_list():
     precommit_models = [
@@ -92,7 +92,7 @@ def read_model(params, **tokenizer_kwargs):

     if (models_path / "openvino_model.xml").exists():
         opt_model = OVModelForCausalLM.from_pretrained(models_path, trust_remote_code=True,
-                                                       compile=False, device='CPU')
+                                                       compile=False, device='CPU', ov_config=get_default_properties())
     else:
         ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer,
                                                                              with_detokenizer=True,
@@ -104,7 +104,7 @@ def read_model(params, **tokenizer_kwargs):
         hf_tokenizer.save_pretrained(models_path)

         opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True,
-                                                       compile=False, device='CPU', load_in_8bit=False)
+                                                       compile=False, device='CPU', load_in_8bit=False, ov_config=get_default_properties())
         opt_model.generation_config.save_pretrained(models_path)
         opt_model.config.save_pretrained(models_path)
         opt_model.save_pretrained(models_path)
@@ -114,7 +114,7 @@ def read_model(params, **tokenizer_kwargs):
         models_path,
         hf_tokenizer,
         opt_model,
-        ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False),
+        ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False, **get_default_properties()),
     )


@@ -178,7 +178,7 @@ def load_genai_pipe_with_configs(configs: List[Tuple], temp_path):
         with (temp_path / config_name).open('w') as f:
             json.dump(config_json, f)

-    ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU')
+    ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU', **get_default_properties())

     for _, config_name in configs:
         os.remove(temp_path / config_name)
@@ -188,4 +188,4 @@ def load_genai_pipe_with_configs(configs: List[Tuple], temp_path):

 @functools.lru_cache(1)
 def get_continuous_batching(path):
-    return ov_genai.LLMPipeline(path, 'CPU', scheduler_config=ov_genai.SchedulerConfig())
+    return ov_genai.LLMPipeline(path, 'CPU', scheduler_config=ov_genai.SchedulerConfig(), **get_default_properties())
2 changes: 1 addition & 1 deletion tests/python_tests/requirements.txt
@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 diffusers==0.32.1
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
 numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64"
 onnx==1.17.0
 pytest
4 changes: 2 additions & 2 deletions tests/python_tests/test_continuous_batching.py
@@ -9,7 +9,7 @@
 from pathlib import Path
 from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer

-from common import get_hugging_face_models, convert_models, generate_and_compare_with_reference_text, \
+from common import get_default_properties, get_hugging_face_models, convert_models, generate_and_compare_with_reference_text, \
     get_scheduler_config, get_greedy, run_cb_pipeline_with_ref, get_beam_search, get_greedy, \
     get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \
     get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p
@@ -155,7 +155,7 @@ def test_post_oom_health(tmp_path, sampling_config):
     models_path : Path = tmp_path / model_id
     convert_models(opt_model, hf_tokenizer, models_path)

-    cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU")
+    cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU", **get_default_properties())

     # First run should return incomplete response
     output = cb_pipe.generate(["What is OpenVINO?"], [generation_config])
8 changes: 4 additions & 4 deletions tests/python_tests/test_kv_cache_eviction.py
@@ -15,7 +15,7 @@
 from openvino import serialize
 from transformers import AutoTokenizer

-from common import TESTS_ROOT, run_cb_pipeline_with_ref
+from common import TESTS_ROOT, run_cb_pipeline_with_ref, get_default_properties


 def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]:
@@ -42,7 +42,7 @@ class ConvertedModel:
 @pytest.fixture(scope='module')
 def converted_model(tmp_path_factory):
     model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-    model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=False, compile=False)
+    model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=False, compile=False, ov_config=get_default_properties())
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     models_path = tmp_path_factory.mktemp("cacheopt_test_models") / model_id
     model.save_pretrained(models_path)
@@ -112,8 +112,8 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t
     scheduler_config_opt.enable_prefix_caching = enable_prefix_caching

     models_path = converted_model.models_path
-    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU")
-    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU")
+    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU", {}, get_default_properties())
+    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU", {}, get_default_properties())

     tokenizer = converted_model.tokenizer
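Note that this test passes `{}` and the properties positionally; per the stub signature shown in py_openvino_genai.pyi above, these map to `tokenizer_properties` and `properties`. A keyword-equivalent sketch for clarity:

```python
# Equivalent to the positional call above (argument names per the .pyi stub):
model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU",
                                            tokenizer_properties={},
                                            properties=get_default_properties())
```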
4 changes: 2 additions & 2 deletions tests/python_tests/test_vlm_pipeline.py
@@ -7,7 +7,7 @@
 import transformers
 from optimum.intel.openvino import OVModelForVisualCausalLM
 from openvino_genai import VLMPipeline, GenerationConfig
-from common import get_image_by_link, get_beam_search, get_multinomial_all_parameters
+from common import get_image_by_link, get_beam_search, get_multinomial_all_parameters, get_default_properties

 def get_ov_model(cache):
     model_dir = cache.mkdir("tiny-random-minicpmv-2_6")
@@ -19,7 +19,7 @@ def get_ov_model(cache):
     ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(processor.tokenizer, with_detokenizer=True)
     openvino.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml")
     openvino.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml")
-    model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, load_in_8bit=False, trust_remote_code=True)
+    model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, load_in_8bit=False, trust_remote_code=True, ov_config=get_default_properties())
     processor.save_pretrained(model_dir)
     model.save_pretrained(model_dir)
     return model_dir
