Commit fab35c0

feat(cache): add caching support for topic safety and content safety output checks
Extends the LLM caching system to support topic safety input checks and content safety output checks. Both actions now cache their results along with LLM stats and metadata to improve performance on repeated queries.

Changes:
- Added caching support to topic_safety_check_input() with cache hit/miss logic
- Added caching support to content_safety_check_output() with cache hit/miss logic
- Both actions now extract and store LLM metadata alongside stats in cache entries
- Added model_caches parameter to both actions for optional cache injection
- Comprehensive test coverage for both new caching implementations
- Tests verify cache hits, stats restoration, and metadata handling
- pre-commits
1 parent 8e0f7a0 commit fab35c0
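Both actions follow the same hit/miss flow: build a normalized key from the rendered prompt (content safety) or message list (topic safety), return a cached result early on a hit, and otherwise cache the fresh result together with LLM stats and metadata. Below is a minimal sketch of that flow, distilled from the diffs in this commit; the helper names come from nemoguardrails.llm.cache.utils as used here, while run_llm_check is a placeholder standing in for the real llm_call plus output parsing.

from typing import Dict, Optional

from nemoguardrails.llm.cache import CacheInterface
from nemoguardrails.llm.cache.utils import (
    CacheEntry,
    create_normalized_cache_key,
    extract_llm_metadata_for_cache,
    extract_llm_stats_for_cache,
    get_from_cache_and_restore_stats,
)


async def run_llm_check(prompt: str) -> dict:
    # Placeholder for the real LLM call and result parsing inside each action.
    return {"allowed": True, "policy_violations": []}


async def cached_safety_check(
    prompt: str,
    model_name: str,
    model_caches: Optional[Dict[str, CacheInterface]] = None,
) -> dict:
    # Caching is opt-in: only active when a cache was injected for this model.
    cache = model_caches.get(model_name) if model_caches else None

    if cache:
        cache_key = create_normalized_cache_key(prompt)
        cached_result = get_from_cache_and_restore_stats(cache, cache_key)
        if cached_result is not None:
            # Cache hit: the helper also restores the stored LLM stats.
            return cached_result

    final_result = await run_llm_check(prompt)

    if cache:
        # Cache miss: store the result together with LLM stats and metadata.
        cache_entry: CacheEntry = {
            "result": final_result,
            "llm_stats": extract_llm_stats_for_cache(),
            "llm_metadata": extract_llm_metadata_for_cache(),
        }
        cache.put(create_normalized_cache_key(prompt), cache_entry)

    return final_result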

File tree: 4 files changed, +246 -4 lines changed

- nemoguardrails/library/content_safety/actions.py
- nemoguardrails/library/topic_safety/actions.py
- tests/test_content_safety_cache.py
- tests/test_topic_safety_cache.py


nemoguardrails/library/content_safety/actions.py

Lines changed: 27 additions & 2 deletions
@@ -25,6 +25,7 @@
 from nemoguardrails.llm.cache.utils import (
     CacheEntry,
     create_normalized_cache_key,
+    extract_llm_metadata_for_cache,
     extract_llm_stats_for_cache,
     get_from_cache_and_restore_stats,
 )
@@ -110,6 +111,7 @@ async def content_safety_check_input(
         cache_entry: CacheEntry = {
             "result": final_result,
             "llm_stats": extract_llm_stats_for_cache(),
+            "llm_metadata": extract_llm_metadata_for_cache(),
         }
         cache.put(cache_key, cache_entry)
         log.debug(f"Content safety result cached for model '{model_name}'")
@@ -139,6 +141,7 @@ async def content_safety_check_output(
     llm_task_manager: LLMTaskManager,
     model_name: Optional[str] = None,
     context: Optional[dict] = None,
+    model_caches: Optional[Dict[str, CacheInterface]] = None,
     **kwargs,
 ) -> dict:
     _MAX_TOKENS = 3
@@ -176,12 +179,22 @@ async def content_safety_check_output(
             "bot_response": bot_response,
         },
     )
+
     stop = llm_task_manager.get_stop_tokens(task=task)
     max_tokens = llm_task_manager.get_max_tokens(task=task)

+    llm_call_info_var.set(LLMCallInfo(task=task))
+
     max_tokens = max_tokens or _MAX_TOKENS

-    llm_call_info_var.set(LLMCallInfo(task=task))
+    cache = model_caches.get(model_name) if model_caches else None
+
+    if cache:
+        cache_key = create_normalized_cache_key(check_output_prompt)
+        cached_result = get_from_cache_and_restore_stats(cache, cache_key)
+        if cached_result is not None:
+            log.debug(f"Content safety output cache hit for model '{model_name}'")
+            return cached_result

     result = await llm_call(
         llm,
@@ -194,4 +207,16 @@ async def content_safety_check_output(

     is_safe, *violated_policies = result

-    return {"allowed": is_safe, "policy_violations": violated_policies}
+    final_result = {"allowed": is_safe, "policy_violations": violated_policies}
+
+    if cache:
+        cache_key = create_normalized_cache_key(check_output_prompt)
+        cache_entry: CacheEntry = {
+            "result": final_result,
+            "llm_stats": extract_llm_stats_for_cache(),
+            "llm_metadata": extract_llm_metadata_for_cache(),
+        }
+        cache.put(cache_key, cache_entry)
+        log.debug(f"Content safety output result cached for model '{model_name}'")
+
+    return final_result

nemoguardrails/library/topic_safety/actions.py

Lines changed: 31 additions & 1 deletion
@@ -21,6 +21,14 @@
 from nemoguardrails.actions.actions import action
 from nemoguardrails.actions.llm.utils import llm_call
 from nemoguardrails.context import llm_call_info_var
+from nemoguardrails.llm.cache import CacheInterface
+from nemoguardrails.llm.cache.utils import (
+    CacheEntry,
+    create_normalized_cache_key,
+    extract_llm_metadata_for_cache,
+    extract_llm_stats_for_cache,
+    get_from_cache_and_restore_stats,
+)
 from nemoguardrails.llm.filters import to_chat_messages
 from nemoguardrails.llm.taskmanager import LLMTaskManager
 from nemoguardrails.logging.explain import LLMCallInfo
@@ -35,6 +43,7 @@ async def topic_safety_check_input(
     model_name: Optional[str] = None,
     context: Optional[dict] = None,
     events: Optional[List[dict]] = None,
+    model_caches: Optional[Dict[str, CacheInterface]] = None,
     **kwargs,
 ) -> dict:
     _MAX_TOKENS = 10
@@ -102,11 +111,32 @@ async def topic_safety_check_input(
     messages.extend(conversation_history)
     messages.append({"type": "user", "content": user_input})

+    cache = model_caches.get(model_name) if model_caches else None
+
+    if cache:
+        cache_key = create_normalized_cache_key(messages)
+        cached_result = get_from_cache_and_restore_stats(cache, cache_key)
+        if cached_result is not None:
+            log.debug(f"Topic safety cache hit for model '{model_name}'")
+            return cached_result
+
     result = await llm_call(llm, messages, stop=stop, llm_params={"temperature": 0.01})

     if result.lower().strip() == "off-topic":
         on_topic = False
     else:
         on_topic = True

-    return {"on_topic": on_topic}
+    final_result = {"on_topic": on_topic}
+
+    if cache:
+        cache_key = create_normalized_cache_key(messages)
+        cache_entry: CacheEntry = {
+            "result": final_result,
+            "llm_stats": extract_llm_stats_for_cache(),
+            "llm_metadata": extract_llm_metadata_for_cache(),
+        }
+        cache.put(cache_key, cache_entry)
+        log.debug(f"Topic safety result cached for model '{model_name}'")
+
+    return final_result

tests/test_content_safety_cache.py

Lines changed: 67 additions & 1 deletion
@@ -18,7 +18,10 @@
 import pytest

 from nemoguardrails.context import llm_call_info_var, llm_stats_var
-from nemoguardrails.library.content_safety.actions import content_safety_check_input
+from nemoguardrails.library.content_safety.actions import (
+    content_safety_check_input,
+    content_safety_check_output,
+)
 from nemoguardrails.llm.cache.lfu import LFUCache
 from nemoguardrails.llm.cache.utils import create_normalized_cache_key
 from nemoguardrails.logging.explain import LLMCallInfo
@@ -95,6 +98,7 @@ async def test_content_safety_cache_retrieves_result_and_restores_stats(
             "prompt_tokens": 80,
             "completion_tokens": 20,
         },
+        "llm_metadata": None,
     }
     cache_key = create_normalized_cache_key("test prompt")
     cache.put(cache_key, cache_entry)
@@ -138,6 +142,7 @@ async def test_content_safety_cache_duration_reflects_cache_read_time(
             "prompt_tokens": 40,
             "completion_tokens": 10,
         },
+        "llm_metadata": None,
     }
     cache_key = create_normalized_cache_key("test prompt")
     cache.put(cache_key, cache_entry)
@@ -191,6 +196,7 @@ async def test_content_safety_cache_handles_missing_stats_gracefully(
     cache_entry = {
         "result": {"allowed": True, "policy_violations": []},
         "llm_stats": None,
+        "llm_metadata": None,
     }
     cache_key = create_normalized_cache_key("test_key")
     cache.put(cache_key, cache_entry)
@@ -213,3 +219,63 @@ async def test_content_safety_cache_handles_missing_stats_gracefully(

     assert result["allowed"] is True
     assert llm_stats.get_stat("total_calls") == 0
+
+
+@pytest.mark.asyncio
+async def test_content_safety_check_output_cache_stores_result(
+    fake_llm_with_stats, mock_task_manager
+):
+    cache = LFUCache(maxsize=10)
+    mock_task_manager.parse_task_output.return_value = [True, "policy2"]
+
+    result = await content_safety_check_output(
+        llms=fake_llm_with_stats,
+        llm_task_manager=mock_task_manager,
+        model_name="test_model",
+        context={"user_message": "test user input", "bot_message": "test bot response"},
+        model_caches={"test_model": cache},
+    )
+
+    assert result["allowed"] is True
+    assert result["policy_violations"] == ["policy2"]
+    assert cache.size() == 1
+
+
+@pytest.mark.asyncio
+async def test_content_safety_check_output_cache_hit(
+    fake_llm_with_stats, mock_task_manager
+):
+    cache = LFUCache(maxsize=10)
+
+    cache_entry = {
+        "result": {"allowed": False, "policy_violations": ["unsafe_output"]},
+        "llm_stats": {
+            "total_tokens": 75,
+            "prompt_tokens": 60,
+            "completion_tokens": 15,
+        },
+        "llm_metadata": None,
+    }
+    cache_key = create_normalized_cache_key("test output prompt")
+    cache.put(cache_key, cache_entry)
+
+    mock_task_manager.render_task_prompt.return_value = "test output prompt"
+
+    llm_stats = LLMStats()
+    llm_stats_var.set(llm_stats)
+
+    result = await content_safety_check_output(
+        llms=fake_llm_with_stats,
+        llm_task_manager=mock_task_manager,
+        model_name="test_model",
+        context={"user_message": "user", "bot_message": "bot"},
+        model_caches={"test_model": cache},
+    )
+
+    assert result["allowed"] is False
+    assert result["policy_violations"] == ["unsafe_output"]
+    assert llm_stats.get_stat("total_calls") == 1
+    assert llm_stats.get_stat("total_tokens") == 75
+
+    llm_call_info = llm_call_info_var.get()
+    assert llm_call_info.from_cache is True

tests/test_topic_safety_cache.py

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from nemoguardrails.context import llm_call_info_var, llm_stats_var
+from nemoguardrails.library.topic_safety.actions import topic_safety_check_input
+from nemoguardrails.llm.cache.lfu import LFUCache
+from nemoguardrails.llm.cache.utils import create_normalized_cache_key
+from nemoguardrails.logging.explain import LLMCallInfo
+from nemoguardrails.logging.stats import LLMStats
+from tests.utils import FakeLLM
+
+
+@pytest.fixture
+def mock_task_manager():
+    tm = MagicMock()
+    tm.render_task_prompt.return_value = "Is this on topic?"
+    tm.get_stop_tokens.return_value = []
+    tm.get_max_tokens.return_value = 10
+    return tm
+
+
+@pytest.fixture
+def fake_llm_topic():
+    llm = FakeLLM(responses=["on-topic"])
+    return {"test_model": llm}
+
+
+@pytest.mark.asyncio
+async def test_topic_safety_cache_stores_result(fake_llm_topic, mock_task_manager):
+    cache = LFUCache(maxsize=10)
+
+    result = await topic_safety_check_input(
+        llms=fake_llm_topic,
+        llm_task_manager=mock_task_manager,
+        model_name="test_model",
+        context={"user_message": "What is AI?"},
+        events=[],
+        model_caches={"test_model": cache},
+    )
+
+    assert result["on_topic"] is True
+    assert cache.size() == 1
+
+
+@pytest.mark.asyncio
+async def test_topic_safety_cache_hit(fake_llm_topic, mock_task_manager):
+    cache = LFUCache(maxsize=10)
+
+    system_prompt_with_restriction = (
+        "Is this on topic?\n\n"
+        'If any of the above conditions are violated, please respond with "off-topic". '
+        'Otherwise, respond with "on-topic". '
+        'You must respond with "on-topic" or "off-topic".'
+    )
+
+    messages = [
+        {"type": "system", "content": system_prompt_with_restriction},
+        {"type": "user", "content": "What is AI?"},
+    ]
+    cache_entry = {
+        "result": {"on_topic": False},
+        "llm_stats": {
+            "total_tokens": 50,
+            "prompt_tokens": 40,
+            "completion_tokens": 10,
+        },
+        "llm_metadata": None,
+    }
+    cache_key = create_normalized_cache_key(messages)
+    cache.put(cache_key, cache_entry)
+
+    llm_stats = LLMStats()
+    llm_stats_var.set(llm_stats)
+
+    llm_call_info = LLMCallInfo(task="topic_safety_check_input $model=test_model")
+    llm_call_info_var.set(llm_call_info)
+
+    result = await topic_safety_check_input(
+        llms=fake_llm_topic,
+        llm_task_manager=mock_task_manager,
+        model_name="test_model",
+        context={"user_message": "What is AI?"},
+        events=[],
+        model_caches={"test_model": cache},
+    )
+
+    assert result["on_topic"] is False
+    assert llm_stats.get_stat("total_calls") == 1
+    assert llm_stats.get_stat("total_tokens") == 50
+
+    llm_call_info = llm_call_info_var.get()
+    assert llm_call_info.from_cache is True
+
+
+@pytest.mark.asyncio
+async def test_topic_safety_without_cache(fake_llm_topic, mock_task_manager):
+    result = await topic_safety_check_input(
+        llms=fake_llm_topic,
+        llm_task_manager=mock_task_manager,
+        model_name="test_model",
+        context={"user_message": "What is AI?"},
+        events=[],
+    )
+
+    assert result["on_topic"] is True
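One detail worth noting from the cache-hit test above: the topic safety cache key is computed from the full message list, so the rendered system prompt (including the appended on-topic/off-topic instruction) and the conversation history presumably have to match, up to whatever normalization create_normalized_cache_key applies, for a lookup to hit. A small illustration, with example message contents only:

from nemoguardrails.llm.cache.utils import create_normalized_cache_key

# Example messages only; the key covers the entire list, so any change to the
# rendered system prompt or to the conversation history yields a different key.
messages = [
    {"type": "system", "content": "Is this on topic?"},
    {"type": "user", "content": "What is AI?"},
]
key = create_normalized_cache_key(messages)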
