Refactors converting output values into strings

skrawcz · elijahbenizzy · commit ff6d5ef21029 · 2023-10-04T21:02:03.000-07:00
Both the driver and the materializer need to convert
functions and variables into their string names.

This is a little messy, but it centralizes the logic in `common`.
I didn't bother creating another file because I didn't know what to call it,
so putting it under `common` seems fine.

Otherwise, I added tests to ensure that the new functionality works,
and left the existing tests in place to ensure nothing broke.
diff --git a/hamilton/common/__init__.py b/hamilton/common/__init__.py
@@ -0,0 +1,58 @@
+# code in this module should not depend on much
+from typing import Any, Callable, List, Optional, Set, Tuple, Union
+
+
+def convert_output_value(
+    output_value: Union[str, Callable, Any], module_set: Set[str]
+) -> Tuple[Optional[str], Optional[str]]:
+    """Converts output values that one can request into strings.
+
+    If the value is a function, it checks that the function's module is in the passed-in module set.
+
+    :param output_value: the value we want to convert into a string. We don't annotate driver.Variable here for
+       import reasons.
+    :param module_set: the set of modules functions could come from.
+    :return: a tuple, (string value, string error). One or the other is returned, never both.
+    """
+    if isinstance(output_value, str):
+        return output_value, None
+    elif hasattr(output_value, "name"):
+        return output_value.name, None
+    elif isinstance(output_value, Callable):
+        if output_value.__module__ in module_set:
+            return output_value.__name__, None
+        else:
+            return None, (
+                f"Function {output_value.__module__}.{output_value.__name__} is a function not "
+                f"in a "
+                f"module given to the materializer. Valid choices are {module_set}."
+            )
+    else:
+        return None, (
+            f"Materializer dependency {output_value} is not a string, a function, or a driver.Variable."
+        )
+
+
+def convert_output_values(
+    output_values: List[Union[str, Callable, Any]], module_set: Set[str]
+) -> List[str]:
+    """Checks & converts output values to strings. This is used in building dependencies for the DAG.
+
+    :param output_values: the values to convert.
+    :param module_set: the modules any functions could come from.
+    :return: the final values
+    :raises ValueError: if there are values that can't be used/converted.
+    """
+    final_values = []
+    errors = []
+    for final_var in output_values:
+        _val, _error = convert_output_value(final_var, module_set)
+        if _val:
+            final_values.append(_val)
+        if _error:
+            errors.append(_error)
+    if errors:
+        errors.sort()
+        error_str = f"{len(errors)} errors encountered:\n  " + "\n  ".join(errors)
+        raise ValueError(error_str)
+    return final_values
diff --git a/hamilton/driver.py b/hamilton/driver.py
@@ -14,6 +14,7 @@
 
 import pandas as pd
 
+from hamilton import common
 from hamilton.execution import executors, graph_functions, grouping, state
 from hamilton.io import materialization
 
@@ -419,31 +420,8 @@ def _create_final_vars(self, final_vars: List[Union[str, Callable, Variable]]) -
         :param final_vars:
         :return: list of strings in the order that final_vars was provided.
         """
-        _final_vars = []
-        errors = []
-        module_set = {_module.__name__ for _module in self.graph_modules}
-        for final_var in final_vars:
-            if isinstance(final_var, str):
-                _final_vars.append(final_var)
-            elif isinstance(final_var, Variable):
-                _final_vars.append(final_var.name)
-            elif isinstance(final_var, Callable):
-                if final_var.__module__ in module_set:
-                    _final_vars.append(final_var.__name__)
-                else:
-                    errors.append(
-                        f"Function {final_var.__module__}.{final_var.__name__} is a function not "
-                        f"in a "
-                        f"module given to the driver. Valid choices are {module_set}."
-                    )
-            else:
-                errors.append(
-                    f"Final var {final_var} is not a string, a function, or a driver.Variable."
-                )
-        if errors:
-            errors.sort()
-            error_str = f"{len(errors)} errors encountered:\n  " + "\n  ".join(errors)
-            raise ValueError(error_str)
+        _module_set = {_module.__name__ for _module in self.graph_modules}
+        _final_vars = common.convert_output_values(final_vars, _module_set)
         return _final_vars
 
     def capture_execute_telemetry(
diff --git a/hamilton/io/materialization.py b/hamilton/io/materialization.py
@@ -1,9 +1,8 @@
 import sys
 import typing
-from types import ModuleType
-from typing import Any, Callable, Dict, List, Optional, Set, Type, Union
+from typing import Any, Dict, List, Optional, Set, Type, Union
 
-from hamilton import base, graph, node
+from hamilton import base, common, graph, node
 from hamilton.function_modifiers.adapters import SaveToDecorator
 from hamilton.function_modifiers.dependencies import SingleDependency, value
 from hamilton.graph import FunctionGraph
@@ -68,7 +67,7 @@ def __init__(
         self.dependencies = dependencies
         self.data_saver_kwargs = self._process_kwargs(data_saver_kwargs)
 
-    def sanitize_dependencies(self, module_set: Set[ModuleType]) -> "MaterializerFactory":
+    def sanitize_dependencies(self, module_set: Set[str]) -> "MaterializerFactory":
         """Sanitizes the dependencies to ensure they're strings.
 
         This replaces the internal value for self.dependencies and returns a new object.
@@ -77,32 +76,9 @@ def sanitize_dependencies(self, module_set: Set[ModuleType]) -> "MaterializerFac
         :param module_set: modules that "functions" could come from if that's passed in.
         :return: new object with sanitized_dependencies.
         """
-        _final_vars = []
-        errors = []
-        for final_var in self.dependencies:
-            if isinstance(final_var, str):
-                _final_vars.append(final_var)
-            elif hasattr(final_var, "name"):
-                _final_vars.append(final_var.name)
-            elif isinstance(final_var, Callable):
-                if final_var.__module__ in module_set:
-                    _final_vars.append(final_var.__name__)
-                else:
-                    errors.append(
-                        f"Function {final_var.__module__}.{final_var.__name__} is a function not "
-                        f"in a "
-                        f"module given to the materializer. Valid choices are {module_set}."
-                    )
-            else:
-                errors.append(
-                    f"Materializer dependency {final_var} is not a string, a function, or a driver.Variable."
-                )
-        if errors:
-            errors.sort()
-            error_str = f"{len(errors)} errors encountered:\n  " + "\n  ".join(errors)
-            raise ValueError(error_str)
+        final_vars = common.convert_output_values(self.dependencies, module_set)
         return MaterializerFactory(
-            self.id, self.savers, self.result_builder, _final_vars, **self.data_saver_kwargs
+            self.id, self.savers, self.result_builder, final_vars, **self.data_saver_kwargs
         )
 
     @staticmethod
diff --git a/tests/io/test_materialization.py b/tests/io/test_materialization.py
@@ -1,6 +1,10 @@
 import dataclasses
 from typing import Any, Collection, Dict, List, Optional, Type
 
+import pytest
+
+import tests.resources.cyclic_functions
+import tests.resources.test_default_args
 from hamilton import base, graph, node
 from hamilton.io import materialization
 from hamilton.io.data_adapters import DataSaver
@@ -152,3 +156,39 @@ def second_node() -> dict:
     assert "materializer_2" in fn_graph_modified.nodes
     assert "first_node" in fn_graph_modified.nodes
     assert "second_node" in fn_graph_modified.nodes
+
+
+def test_sanitize_materializer_dependencies_happy():
+    """Tests that we return new objects & appropriately sanitize dependency types - converting them as necessary."""
+    factory_1 = MaterializerFactory(
+        "materializer_1",
+        [MockDataSaver],
+        dependencies=[
+            tests.resources.test_default_args.A,
+            tests.resources.test_default_args.B,
+            "C",
+        ],
+        result_builder=JoinBuilder(),
+        storage_key="test_modify_function_graph_2",
+    )
+    s = {tests.resources.test_default_args.__name__}
+    actual = factory_1.sanitize_dependencies(s)
+    assert actual.id == factory_1.id
+    assert actual.savers == factory_1.savers
+    assert actual.result_builder == factory_1.result_builder
+    assert actual.dependencies == ["A", "B", "C"]
+    assert actual is not factory_1
+
+
+def test_sanitize_materializer_dependencies_error():
+    """Tests that we error when bad cases are encountered."""
+    factory_1 = MaterializerFactory(
+        "materializer_1",
+        [MockDataSaver],
+        dependencies=["B", tests.resources.cyclic_functions.A],
+        result_builder=JoinBuilder(),
+        storage_key="test_modify_function_graph_2",
+    )
+    with pytest.raises(ValueError):
+        s = {tests.resources.test_default_args.__name__}
+        factory_1.sanitize_dependencies(s)
diff --git a/tests/test_common.py b/tests/test_common.py
@@ -0,0 +1,60 @@
+import pytest
+
+import tests.resources.cyclic_functions
+import tests.resources.test_default_args
+from hamilton import common, driver
+
+
+class Object:
+    """Dummy class to test with."""
+
+    def __repr__(self):
+        return "'object'"
+
+
+@pytest.mark.parametrize(
+    "value_to_convert, module_set, expected_value, expected_error",
+    [
+        ("a", {"amodule"}, "a", None),
+        (
+            tests.resources.test_default_args.A,
+            {tests.resources.test_default_args.__name__},
+            "A",
+            None,
+        ),
+        (driver.Variable("A", int), {"amodule"}, "A", None),
+        (
+            Object(),
+            {"amodule"},
+            None,
+            "Materializer dependency 'object' is not a string, a function, or a driver.Variable.",
+        ),
+        (
+            tests.resources.cyclic_functions.A,
+            {tests.resources.test_default_args.__name__},
+            None,
+            "Function tests.resources.cyclic_functions.A is a function not in a module given to the materializer. Valid choices are {'tests.resources.test_default_args'}.",
+        ),
+    ],
+)
+def test_convert_output_value(value_to_convert, module_set, expected_value, expected_error):
+    actual_value, actual_error = common.convert_output_value(value_to_convert, module_set)
+    assert actual_value == expected_value
+    assert actual_error == expected_error
+
+
+def test_convert_output_values_happy():
+    """Tests that we loop as expected without issue"""
+    actual = common.convert_output_values(
+        [tests.resources.test_default_args.A, "B"], {tests.resources.test_default_args.__name__}
+    )
+    assert actual == ["A", "B"]
+
+
+def test_convert_output_values_error():
+    """Tests that we error when bad cases are encountered."""
+    with pytest.raises(ValueError):
+        common.convert_output_values(
+            [tests.resources.test_default_args.A, tests.resources.cyclic_functions.A],
+            {tests.resources.test_default_args.__name__},
+        )