dagster-io · OwenKephart · Jan 23, 2025 · Jan 16, 2025 · graphite-app · Jan 22, 2025
diff --git a/python_modules/libraries/dagster-dg/dagster_dg/docs.py b/python_modules/libraries/dagster-dg/dagster_dg/docs.py
@@ -1,24 +1,72 @@
 import tempfile
 import webbrowser
-from collections.abc import Mapping, Sequence, Set
-from dataclasses import dataclass
-from typing import Any, Optional, Union
+from collections.abc import Iterator, Mapping, Sequence, Set
+from typing import Any, Union
 
 import markdown
 import yaml
+from dagster._utils.source_position import SourcePositionTree
+from dagster._utils.yaml_utils import parse_yaml_with_source_positions
 
 from dagster_dg.component import RemoteComponentType
 
 REF_BASE = "#/$defs/"
-COMMENTED_MAPPING_TAG = "!commented_mapping"
+JSON_SCHEMA_EXTRA_REQUIRED_SCOPE_KEY = "dagster_required_scope"
 
 
-@dataclass(frozen=True)
-class CommentedObject:
-    key: str
-    value: Union[Sequence["CommentedObject"], Any]
-    comment: Optional[str]
-    top_level: bool = False
+def _subschemas_on_path(
+    valpath: Sequence[Union[str, int]], json_schema: Mapping[str, Any], subschema: Mapping[str, Any]
+) -> Iterator[Mapping[str, Any]]:
+    """Given a valpath and the json schema of a given target type, returns the subschemas at each step of the path."""
+    # List[ComplexType] (e.g.) will contain a reference to the complex type schema in the
+    # top-level $defs, so we dereference it here.
+    if "$ref" in subschema:
+        # depending on the pydantic version, the extras may be stored with the reference or not
+        extras = {k: v for k, v in subschema.items() if k != "$ref"}
+        subschema = {**json_schema["$defs"].get(subschema["$ref"][len(REF_BASE) :]), **extras}
+
+    yield subschema
+    if len(valpath) == 0:
+        return
+
+    # Optional[ComplexType] (e.g.) will contain multiple schemas in the "anyOf" field
+    if "anyOf" in subschema:
+        for inner in subschema["anyOf"]:
+            yield from _subschemas_on_path(valpath, json_schema, inner)
+
+    el = valpath[0]
+    if isinstance(el, str):
+        # valpath: ['field']
+        # field: X
+        inner = subschema.get("properties", {}).get(el)
+    elif isinstance(el, int):
+        # valpath: ['field', 0]
+        # field: List[X]
+        inner = subschema.get("items")
+    else:
+        raise ValueError(f"Invalid valpath element: {el}")
+
+    # the path wasn't valid, or unspecified
+    if not inner:
+        return
+
+    _, *rest = valpath
+    yield from _subschemas_on_path(rest, json_schema, inner)
+
+
+def _get_additional_required_scope(subschema: Mapping[str, Any]) -> Set[str]:
+    raw = subschema.get(JSON_SCHEMA_EXTRA_REQUIRED_SCOPE_KEY)
+    return set(raw) if raw else set()
+
+
+def get_required_scope(
+    valpath: Sequence[Union[str, int]], json_schema: Mapping[str, Any]
+) -> Set[str]:
+    """Given a valpath and the json schema of a given target type, determines the available rendering scope."""
+    required_scope = set()
+    for subschema in _subschemas_on_path(valpath, json_schema, json_schema):
+        required_scope |= _get_additional_required_scope(subschema)
+    return required_scope
 
 
 def _dereference_schema(
@@ -30,38 +78,22 @@ def _dereference_schema(
         return subschema
 
 
-def _commented_object_for_subschema(
-    name: str,
-    json_schema: Mapping[str, Any],
-    subschema: Mapping[str, Any],
-    required_scope: Optional[Set[str]] = None,
-) -> Union[CommentedObject, Any]:
-    additional_scope = subschema.get("dagster_required_scope")
-    required_scope = (required_scope or set()) | set(additional_scope or [])
-
+def _sample_value_for_subschema(
+    json_schema: Mapping[str, Any], subschema: Mapping[str, Any]
+) -> Any:
     subschema = _dereference_schema(json_schema, subschema)
     if "anyOf" in subschema:
         # TODO: handle anyOf fields more gracefully, for now just choose first option
-        return _commented_object_for_subschema(
-            name, json_schema, subschema["anyOf"][0], required_scope=required_scope
-        )
+        return _sample_value_for_subschema(json_schema, subschema["anyOf"][0])
 
     objtype = subschema["type"]
     if objtype == "object":
-        return CommentedObject(
-            key=name,
-            value={
-                k: _commented_object_for_subschema(k, json_schema, v)
-                for k, v in subschema.get("properties", {}).items()
-            },
-            comment=f"Available scope: {required_scope}" if required_scope else None,
-        )
+        return {
+            k: _sample_value_for_subschema(json_schema, v)
+            for k, v in subschema.get("properties", {}).items()
+        }
     elif objtype == "array":
-        return [
-            _commented_object_for_subschema(
-                name, json_schema, subschema["items"], required_scope=required_scope
-            )
-        ]
+        return [_sample_value_for_subschema(json_schema, subschema["items"])]
     elif objtype == "string":
         return "..."
     elif objtype == "integer":
@@ -83,44 +115,32 @@ def write_line_break(self) -> None:
             super().write_line_break()
         super().write_line_break()
 
-    def _get_tag(self) -> str:
-        return getattr(self.event, "tag", "")
-
-    def expect_node(self, root=False, sequence=False, mapping=False, simple_key=False):
-        # for commented mappings, emit comment above the value
-        tag = self._get_tag()
-        if tag.startswith(COMMENTED_MAPPING_TAG):
-            self.write_indicator(f"# {tag[len(COMMENTED_MAPPING_TAG) + 1:]}", True)
-            self.write_line_break()
-
-        return super().expect_node(root, sequence, mapping, simple_key)
-
-    def process_tag(self):
-        tag = self._get_tag()
-        # ignore the mapping tag as it's handled specially
-        if tag.startswith(COMMENTED_MAPPING_TAG):
-            return
-        else:
-            super().process_tag()
 
-
-def commented_object_representer(dumper: yaml.SafeDumper, obj: CommentedObject) -> yaml.nodes.Node:
-    mapping = obj.value if isinstance(obj.value, dict) else {obj.key: obj.value}
-
-    if obj.comment is not None:
-        return dumper.represent_mapping(f"{COMMENTED_MAPPING_TAG}|{obj.comment}", mapping)
-    else:
-        return dumper.represent_dict(mapping)
-
-
-ComponentDumper.add_representer(CommentedObject, commented_object_representer)
+def _get_source_position_comments(
+    valpath: Sequence[Union[str, int]], tree: SourcePositionTree, json_schema: Mapping[str, Any]
+) -> Iterator[tuple[int, str]]:
+    available_scope = get_required_scope(valpath[1:], json_schema)
+    if available_scope:
+        yield (tree.position.start.line - 1, f"Available scope: {available_scope}")
+    for child_path, child_tree in tree.children.items():
+        yield from _get_source_position_comments([*valpath, child_path], child_tree, json_schema)
 
 
 def generate_sample_yaml(component_type: str, json_schema: Mapping[str, Any]) -> str:
-    params_obj = _commented_object_for_subschema("params", json_schema, json_schema)
-    return yaml.dump(
-        {"type": component_type, "params": params_obj}, Dumper=ComponentDumper, sort_keys=False
+    raw = yaml.dump(
+        {"type": component_type, "params": _sample_value_for_subschema(json_schema, json_schema)},
+        Dumper=ComponentDumper,
+        sort_keys=False,
     )
+    parsed = parse_yaml_with_source_positions(raw)
+    comments = dict(_get_source_position_comments([], parsed.source_position_tree, json_schema))
+    commented_lines = []
+    for line_num, line in enumerate(raw.split("\n")):
+        if line_num in comments:
+            commented_lines.append(f"{line} # {comments[line_num]}")
+        else:
+            commented_lines.append(line)
+    return "\n".join(commented_lines)
 
 
 def render_markdown_in_browser(markdown_content: str) -> None:

diff --git a/python_modules/libraries/dagster-dg/dagster_dg_tests/utils_tests/test_sample_yaml.py b/python_modules/libraries/dagster-dg/dagster_dg_tests/utils_tests/test_sample_yaml.py
@@ -11,7 +11,7 @@ class SampleSubSchema(ComponentSchemaBaseModel):
 
 
 class SampleSchema(ComponentSchemaBaseModel):
-    sub_scoped: Annotated[SampleSubSchema, ResolvableFieldInfo(additional_scope={"outer_scope"})]
+    sub_scoped: Annotated[SampleSubSchema, ResolvableFieldInfo(required_scope={"outer_scope"})]
     sub_optional: SampleSubSchema
     sub_list: Sequence[SampleSubSchema]
 
@@ -26,8 +26,8 @@ def test_generate_sample_yaml():
 
 params:
   sub_scoped: # Available scope: {'outer_scope'}
-    str_field: '...'
-    int_field: 0
+    str_field: '...' # Available scope: {'outer_scope'}
+    int_field: 0 # Available scope: {'outer_scope'}
   sub_optional:
     str_field: '...'
     int_field: 0