
Commit 0102164

Merge branch 'main' into jal/simple-hf-lock

2 parents: a50fd16 + 2120112

19 files changed: +230 additions, -112 deletions

.pre-commit-config.yaml

Lines changed: 3 additions & 3 deletions
@@ -7,11 +7,11 @@ repos:
       - id: ruff-format
         name: "Ruff formatter"
         args: [--config=pyproject.toml]
-        files: '^(mellea|tests|cli|docs).*\.(py|ipynb)$'
+        files: '^(mellea|test|cli|docs).*\.(py|ipynb)$'
       - id: ruff
         name: "Ruff linter"
         args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
-        files: '^(mellea|tests).*\.(py|ipynb)$'
+        files: '^(mellea).*\.(py|ipynb)$'

   - repo: local
     hooks:
@@ -20,7 +20,7 @@ repos:
         entry: uv run --no-sync mypy mellea
         pass_filenames: false
         language: system
-        files: '\.py$'
+        files: '^(mellea|test|cli|docs).*\.(py|ipynb)$'

   - repo: https://github.com/astral-sh/uv-pre-commit
     rev: 0.7.8
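To see what the tightened `files` patterns cover, here is a small, hedged Python check. The paths are illustrative only; pre-commit applies these regexes to repository-relative paths. Note that the test directory in this commit is `test/`, not `tests/`, so the old `tests` alternative never matched it.

import re

# Patterns copied from the updated hooks above.
ruff_format = re.compile(r"^(mellea|test|cli|docs).*\.(py|ipynb)$")
ruff_lint = re.compile(r"^(mellea).*\.(py|ipynb)$")
mypy_hook = re.compile(r"^(mellea|test|cli|docs).*\.(py|ipynb)$")

# Illustrative paths; only the first is guaranteed to exist in this diff.
for path in [
    "mellea/backends/openai.py",
    "test/backends/test_huggingface.py",
    "docs/examples/demo.ipynb",
]:
    print(
        path,
        bool(ruff_format.match(path)),
        bool(ruff_lint.match(path)),
        bool(mypy_hook.match(path)),
    )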

mellea/backends/adapters/adapter.py

Lines changed: 1 addition & 2 deletions
@@ -2,11 +2,10 @@

 import abc
 import pathlib
-from typing import Any, TypeVar
+from typing import TypeVar

 import granite_common.intrinsics
 import yaml
-from litellm import cast

 from mellea.backends import Backend
 from mellea.backends.adapters.catalog import AdapterType, fetch_intrinsic_metadata

mellea/backends/huggingface.py

Lines changed: 11 additions & 0 deletions
@@ -281,6 +281,11 @@ async def _generate_from_intrinsic(
         if not ctx.is_chat_context:
             raise Exception("Does not yet support non-chat contexts.")

+        if len(model_options.items()) > 0:
+            FancyLogger.get_logger().info(
+                "passing in model options when generating with an adapter; some model options may be overwritten / ignored"
+            )
+
         linearized_ctx = ctx.view_for_generation()
         assert linearized_ctx is not None, (
             "If ctx.is_chat_context, then the context should be linearizable."
@@ -341,6 +346,12 @@ async def _generate_from_intrinsic(
             "messages": conversation,
             "extra_body": {"documents": docs},
         }
+
+        # Convert other parameters from Mellea proprietary format to standard format.
+        for model_option in model_options:
+            if model_option == ModelOption.TEMPERATURE:
+                request_json["temperature"] = model_options[model_option]
+
         rewritten = rewriter.transform(request_json, **action.intrinsic_kwargs)

         # TODO: Handle caching here. granite_common doesn't tell us what changed,
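As a standalone illustration of the translation step above, a minimal hedged sketch; the helper name is hypothetical, since the actual change inlines this loop inside `_generate_from_intrinsic`:

from mellea.backends.types import ModelOption


def apply_model_options(request_json: dict, model_options: dict) -> dict:
    """Copy recognized Mellea model options onto an OpenAI-style request body.

    Only temperature is translated for now; other options may be overwritten or
    ignored, which is what the new FancyLogger message warns the caller about.
    """
    for model_option in model_options:
        if model_option == ModelOption.TEMPERATURE:
            request_json["temperature"] = model_options[model_option]
    return request_json


# Example: the deterministic options used by the RAG intrinsics become temperature=0.0.
request = apply_model_options(
    {"messages": [], "extra_body": {"documents": []}},
    {ModelOption.TEMPERATURE: 0.0},
)
assert request["temperature"] == 0.0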

mellea/backends/litellm.py

Lines changed: 2 additions & 2 deletions
@@ -54,12 +54,12 @@ def __init__(
         base_url: str | None = "http://localhost:11434",
         model_options: dict | None = None,
     ):
-        """Initialize and OpenAI compatible backend. For any additional kwargs that you need to pass the the client, pass them as a part of **kwargs.
+        """Initialize an OpenAI compatible backend using the [LiteLLM Python SDK](https://docs.litellm.ai/docs/#litellm-python-sdk).

         Note: If getting `Unclosed client session`, set `export DISABLE_AIOHTTP_TRANSPORT=True` in your environment. See: https://github.com/BerriAI/litellm/issues/13251.

         Args:
-            model_id : The LiteLLM model identifier. Make sure that all necessary credentials are in OS environment variables.
+            model_id : The LiteLLM model identifier; in most cases requires some combination of `<provider>/<model_creator>/<model_name>`. Make sure that all necessary credentials are in OS environment variables.
             formatter: A custom formatter based on backend.If None, defaults to TemplateFormatter
             base_url : Base url for LLM API. Defaults to None.
             model_options : Generation options to pass to the LLM. Defaults to None.
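For context, a hedged usage sketch of the `<provider>/<model_creator>/<model_name>` convention described in the updated docstring. The class name `LiteLLMBackend` and the example model id are assumptions for illustration, not taken from this diff:

# Hedged sketch; class name and model id below are illustrative assumptions.
from mellea.backends.litellm import LiteLLMBackend  # assumed class name

backend = LiteLLMBackend(
    # <provider>/<model_creator>/<model_name>, credentials read from the environment.
    model_id="ollama/ibm-granite/granite-3.3-8b-instruct",
    base_url="http://localhost:11434",  # default shown in the diff
    model_options={"temperature": 0.0},
)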

mellea/backends/openai.py

Lines changed: 6 additions & 0 deletions
@@ -435,6 +435,12 @@ async def _generate_from_intrinsic(
             "extra_body": {"documents": docs},
         }

+        # Convert other parameters from Mellea proprietary format to standard format.
+        if model_options is not None:
+            for model_option in model_options:
+                if model_option == ModelOption.TEMPERATURE:
+                    request_json["temperature"] = model_options[model_option]
+
         rewritten = rewriter.transform(request_json, **action.intrinsic_kwargs)

         self.load_adapter(adapter.qualified_name)

mellea/stdlib/intrinsics/rag.py

Lines changed: 3 additions & 1 deletion
@@ -9,6 +9,7 @@
     AdapterType,
     GraniteCommonAdapter,
 )
+from mellea.backends.types import ModelOption
 from mellea.stdlib.base import ChatContext, Document
 from mellea.stdlib.chat import Message
 from mellea.stdlib.intrinsics.intrinsic import Intrinsic
@@ -63,6 +64,7 @@ def _call_intrinsic(
         intrinsic,
         context,
         backend,
+        model_options={ModelOption.TEMPERATURE: 0.0},
         # No rejection sampling, please
         strategy=None,
     )
@@ -277,7 +279,7 @@ def rewrite_answer_for_relevance(
         backend,
         kwargs={
             "answer_relevance_category": result_json["answer_relevance_category"],
-            "answer_relevance_analysis": result_json["answer_relevance_category"],
+            "answer_relevance_analysis": result_json["answer_relevance_analysis"],
             "correction_method": correction_method,
         },
     )
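The pinned option is small but deliberate: intrinsic calls are generated with no rejection sampling, presumably so their JSON outputs stay reproducible. A minimal hedged sketch of what `_call_intrinsic` now forwards (the variable name is illustrative):

from mellea.backends.types import ModelOption

# What _call_intrinsic now passes through to the backend; per the backend
# changes above, this becomes request_json["temperature"] = 0.0.
deterministic_options = {ModelOption.TEMPERATURE: 0.0}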

mellea/templates/prompts/default/LLMaJRequirement.jinja2

Lines changed: 0 additions & 15 deletions
This file was deleted.

test/backends/test_huggingface.py

Lines changed: 68 additions & 33 deletions
@@ -17,13 +17,22 @@
 from mellea.backends.formatter import TemplateFormatter
 from mellea.backends.huggingface import LocalHFBackend, _assert_correct_adapters
 from mellea.backends.types import ModelOption
-from mellea.stdlib.base import (CBlock, ChatContext, Context, ModelOutputThunk,
-                                SimpleContext)
+from mellea.stdlib.base import (
+    CBlock,
+    ChatContext,
+    Context,
+    ModelOutputThunk,
+    SimpleContext,
+)
 from mellea.stdlib.chat import Message
 from mellea.stdlib.intrinsics.intrinsic import Intrinsic
-from mellea.stdlib.requirement import (ALoraRequirement, LLMaJRequirement,
-                                       Requirement, ValidationResult,
-                                       default_output_to_bool)
+from mellea.stdlib.requirement import (
+    ALoraRequirement,
+    LLMaJRequirement,
+    Requirement,
+    ValidationResult,
+    default_output_to_bool,
+)


 @pytest.fixture(scope="module")
@@ -40,9 +49,7 @@ def backend():
         )
     )
     backend.add_adapter(
-        GraniteCommonAdapter(
-            "answerability", base_model_name=backend.base_model_name
-        )
+        GraniteCommonAdapter("answerability", base_model_name=backend.base_model_name)
     )
     return backend

@@ -54,6 +61,7 @@ def session(backend):
     yield session
     session.reset()

+
 @pytest.mark.qualitative
 def test_adapters(backend):
     assert len(backend._added_adapters.items()) > 0
@@ -305,6 +313,7 @@ async def test_async_avalue(session):
     assert m1_final_val is not None
     assert m1_final_val == mot1.value

+
 @pytest.mark.qualitative
 async def test_generate_with_lock(backend):
     # Enable the faulthandler for this test.
@@ -319,23 +328,20 @@ async def test_generate_with_lock(backend):
     b._added_adapters = {}
     b._loaded_adapters = {}
     b.add_adapter(
-        GraniteCommonAdapter(
-            "requirement_check", base_model_name=b.base_model_name
-        )
+        GraniteCommonAdapter("requirement_check", base_model_name=b.base_model_name)
     )
     b.add_adapter(
-        GraniteCommonAdapter(
-            "answerability", base_model_name=b.base_model_name
-        )
+        GraniteCommonAdapter("answerability", base_model_name=b.base_model_name)
     )

     memoized = dict()
     gen_func = model.generate
+
     def mock_func(input_ids, *args, **kwargs):
         """Mocks the generate function. Must call `populate_mocked_dict` with each input that must be cached before using this."""
         for key, val in memoized.items():
             if torch.equal(key, input_ids):
-                time.sleep(random.uniform(.1, .5)) # Simulate a bit of work.
+                time.sleep(random.uniform(0.1, 0.5))  # Simulate a bit of work.
                 return val
         assert False, "did not get a cached response"

@@ -347,7 +353,9 @@ def populate_mocked_dict(input_ids, *args, **kwargs):
         return output

     model.generate = Mock(side_effect=populate_mocked_dict)
-    assert not isinstance(backend._model, Mock), "mocking went wrong; backend fixture changed; other tests may fail"
+    assert not isinstance(backend._model, Mock), (
+        "mocking went wrong; backend fixture changed; other tests may fail"
+    )

     # Set up the inputs.
     ctx = ChatContext().add(Message("user", "hello"))
@@ -362,18 +370,22 @@ def call_backend_generate():
             b.generate_from_context(act, ctx),
             b.generate_from_context(req_intrinsic, ctx),
             b.generate_from_context(answerability_intrinsic, ctx),
-            b.generate_from_raw([raw_act], ctx, model_options={ModelOption.MAX_NEW_TOKENS: 3})
+            b.generate_from_raw(
+                [raw_act], ctx, model_options={ModelOption.MAX_NEW_TOKENS: 3}
+            ),
         ]

     # Call once to populate the memoized mock.
     outputs = await asyncio.gather(*call_backend_generate())
     for output in outputs:
         mot = output[0]
-        await mot.avalue() # Ensure all values are computed.
+        await mot.avalue()  # Ensure all values are computed.

     # Use the memoized mock that errors if not precomputed.
     model.generate = Mock(side_effect=mock_func)
-    count = 5 # Use a high number to try to put pressure on the lock and catch deadlocks.
+    count = (
+        5  # Use a high number to try to put pressure on the lock and catch deadlocks.
+    )
     coros: list[Coroutine[Any, Any, tuple[ModelOutputThunk, Context]]] = []
     for _ in range(count):
         coros.extend(call_backend_generate())
@@ -388,10 +400,11 @@ def call_backend_generate():

     faulthandler.disable()

+
 @pytest.mark.qualitative
 async def test_generate_with_lock_does_not_block_when_awaiting_value(backend):
-    """This is a tricky test to setup. 
-    
+    """This is a tricky test to setup.
+
     It's purpose is to ensure that a long-running generation doesn't get blocked
     when awaiting the `model_output_thunk.avalue()` of a different generation request.

@@ -417,14 +430,28 @@ async def test_generate_with_lock_does_not_block_when_awaiting_value(backend):
     # - a streaming generation that will take a long time to resolve.
     # - a regular generation that should be able to happen while the streaming is happening.
     # - two intrinsics that shouldn't be able to happen concurrently.
-    reg_mot_stream, _ = await backend.generate_from_context(act, ctx, model_options={ModelOption.STREAM: True, ModelOption.MAX_NEW_TOKENS: token_generation_length, "min_length": token_generation_length})
+    reg_mot_stream, _ = await backend.generate_from_context(
+        act,
+        ctx,
+        model_options={
+            ModelOption.STREAM: True,
+            ModelOption.MAX_NEW_TOKENS: token_generation_length,
+            "min_length": token_generation_length,
+        },
+    )
     reg_mot, _ = await backend.generate_from_context(act, ctx)
-    req_mot, _ = await backend.generate_from_context(req_intrinsic, ctx, model_options={ModelOption.STREAM: True})
-    answerability_mot, _ = await backend.generate_from_context(answerability_intrinsic, ctx, model_options={ModelOption.STREAM: True})
+    req_mot, _ = await backend.generate_from_context(
+        req_intrinsic, ctx, model_options={ModelOption.STREAM: True}
+    )
+    answerability_mot, _ = await backend.generate_from_context(
+        answerability_intrinsic, ctx, model_options={ModelOption.STREAM: True}
+    )

     # Ensure the stream is generating but not yet completing.
     await reg_mot_stream.astream()
-    assert not reg_mot_stream.is_computed(), "generation completed too early, see test for more details"
+    assert not reg_mot_stream.is_computed(), (
+        "generation completed too early, see test for more details"
+    )

     # Awaiting this shouldn't cause a deadlock. Add the timeout so the test can fail.
     # If the test fails, this means that the streaming generation wasn't able to complete,
@@ -442,11 +469,12 @@ async def test_generate_with_lock_does_not_block_when_awaiting_value(backend):
             raise e
         else:
             raise Exception("timeout ended too early, see test for more details")
-    
+
     for output in [reg_mot_stream, reg_mot, req_mot, answerability_mot]:
         if not output.is_computed():
             await output.avalue()  # Ensure everything gets computed.

+
 @pytest.mark.qualitative
 async def test_error_during_generate_with_lock(backend):
     # Create local versions of these objects so that mocking
@@ -459,20 +487,21 @@ async def test_error_during_generate_with_lock(backend):
     b._added_adapters = {}
     b._loaded_adapters = {}
     b.add_adapter(
-        GraniteCommonAdapter(
-            "requirement_check", base_model_name=b.base_model_name
-        )
+        GraniteCommonAdapter("requirement_check", base_model_name=b.base_model_name)
     )

     regular_generate = b._model.generate
+
     def generate_and_raise_exc(*args, **kwargs):
         """Will generate like usual for the intrinsic request. Will fail for the regular generation request."""
         if "max_new_tokens" in kwargs:
             return regular_generate(*args, **kwargs)  # type: ignore
         raise Exception("Oops!")

     b._model.generate = Mock(side_effect=generate_and_raise_exc)
-    assert not isinstance(backend._model, Mock), "mocking went wrong; backend fixture changed; other tests may fail"
+    assert not isinstance(backend._model, Mock), (
+        "mocking went wrong; backend fixture changed; other tests may fail"
+    )

     # Set up the inputs.
     ctx = ChatContext().add(Message("user", "hello"))
@@ -487,9 +516,10 @@ def generate_and_raise_exc(*args, **kwargs):

     await req_mot.avalue()

+
 def test_assert_correct_adapters():
     model = Mock()
-    
+
     # Test scenarios with no active adapters.
     model.active_adapters = Mock(return_value=[])
     _assert_correct_adapters("", model)
@@ -505,11 +535,16 @@ def test_assert_correct_adapters():
     _assert_correct_adapters("new", model)

     # Test scenarios when no adapters have been loaded.
-    model.active_adapters = Mock(side_effect=ValueError("No adapter loaded. Please load an adapter first."))
-    _assert_correct_adapters("", model) # This will fail if peft ever changes the error message.
+    model.active_adapters = Mock(
+        side_effect=ValueError("No adapter loaded. Please load an adapter first.")
+    )
+    _assert_correct_adapters(
+        "", model
+    )  # This will fail if peft ever changes the error message.
     with pytest.raises(AssertionError):
         _assert_correct_adapters("new", model)

+
 if __name__ == "__main__":
     import pytest

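Most of this diff is ruff reformatting, but the lock tests rely on a memoized-mock pattern worth calling out: `model.generate` is first wrapped to cache real outputs keyed by `input_ids`, then swapped for a replay mock so many concurrent requests can hammer the generation lock without doing real inference. A condensed, hedged sketch of that pattern follows; the factory name and wiring are illustrative, not the exact test code:

import random
import time

import torch


def make_memoized_generate(model):
    """Return (populate, replay) wrappers around model.generate.

    `populate` runs the real generate and caches each output keyed by its input_ids;
    `replay` serves only cached outputs (with a small sleep to simulate work) and
    fails loudly on anything it has not seen before.
    """
    memoized: dict = {}
    real_generate = model.generate

    def populate(input_ids, *args, **kwargs):
        output = real_generate(input_ids, *args, **kwargs)
        memoized[input_ids] = output
        return output

    def replay(input_ids, *args, **kwargs):
        for key, val in memoized.items():
            if torch.equal(key, input_ids):
                time.sleep(random.uniform(0.1, 0.5))  # simulate a bit of work
                return val
        raise AssertionError("did not get a cached response")

    return populate, replay


# Usage, roughly mirroring test_generate_with_lock:
#   populate, replay = make_memoized_generate(backend._model)
#   backend._model.generate = Mock(side_effect=populate)  # warm the cache once
#   ... run each request once and await every ModelOutputThunk ...
#   backend._model.generate = Mock(side_effect=replay)    # then replay under concurrency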