From 76abd8c8855fc8f886bb4b64219a05e5aa34497c Mon Sep 17 00:00:00 2001 From: Omri Steiner Date: Mon, 7 Jul 2025 23:24:40 +0200 Subject: [PATCH 1/2] internal/encoding/yaml: encode YAML anchors as CUE definitions This commit supports encoding YAML documents such as: a: &a 3 b: *a To this CUE document: #a: 3 a: #a b: #a Fixes #3818 Signed-off-by: Omri Steiner --- internal/encoding/yaml/decode.go | 137 +++++++++++++++++++--- internal/encoding/yaml/decode_test.go | 74 +++++++++--- internal/encoding/yaml/testdata/merge.out | 20 ++-- 3 files changed, 190 insertions(+), 41 deletions(-) diff --git a/internal/encoding/yaml/decode.go b/internal/encoding/yaml/decode.go index 82872405b96..9283f063a13 100644 --- a/internal/encoding/yaml/decode.go +++ b/internal/encoding/yaml/decode.go @@ -64,6 +64,15 @@ type decoder struct { // forceNewline ensures that the next position will be on a new line. forceNewline bool + + // anchorFields contains the anchors that are gathered as we walk the YAML nodes. + // These are only added to the AST when we're done processing the whole document. + anchorFields []ast.Field + // anchorNames maps anchor nodes to their names. + anchorNames map[*yaml.Node]string + // anchorTakenNames keeps track of anchor names that have been taken. + // It is used to ensure unique anchor names. 
+ anchorTakenNames map[string]struct{} } // TODO(mvdan): this can be io.Reader really, except that token.Pos is offset-based, @@ -83,9 +92,11 @@ func NewDecoder(filename string, b []byte) *decoder { tokFile := token.NewFile(filename, 0, len(b)+1) tokFile.SetLinesForContent(b) return &decoder{ - tokFile: tokFile, - tokLines: append(tokFile.Lines(), len(b)), - yamlDecoder: *yaml.NewDecoder(bytes.NewReader(b)), + tokFile: tokFile, + tokLines: append(tokFile.Lines(), len(b)), + yamlDecoder: *yaml.NewDecoder(bytes.NewReader(b)), + anchorNames: make(map[*yaml.Node]string), + anchorTakenNames: make(map[string]struct{}), } } @@ -176,24 +187,35 @@ func Unmarshal(filename string, data []byte) (ast.Expr, error) { return n, nil } -func (d *decoder) extract(yn *yaml.Node) (ast.Expr, error) { - d.addHeadCommentsToPending(yn) - var expr ast.Expr - var err error +func (d *decoder) extractNoAnchor(yn *yaml.Node) (ast.Expr, error) { switch yn.Kind { case yaml.DocumentNode: - expr, err = d.document(yn) + return d.document(yn) case yaml.SequenceNode: - expr, err = d.sequence(yn) + return d.sequence(yn) case yaml.MappingNode: - expr, err = d.mapping(yn) + return d.mapping(yn) case yaml.ScalarNode: - expr, err = d.scalar(yn) + return d.scalar(yn) case yaml.AliasNode: - expr, err = d.alias(yn) + return d.referenceAlias(yn) default: return nil, d.posErrorf(yn, "unknown yaml node kind: %d", yn.Kind) } +} + +func (d *decoder) extract(yn *yaml.Node) (ast.Expr, error) { + d.addHeadCommentsToPending(yn) + + var expr ast.Expr + var err error + + if yn.Anchor == "" { + expr, err = d.extractNoAnchor(yn) + } else { + expr, err = d.anchor(yn) + } + if err != nil { return nil, err } @@ -324,7 +346,39 @@ func (d *decoder) document(yn *yaml.Node) (ast.Expr, error) { if n := len(yn.Content); n != 1 { return nil, d.posErrorf(yn, "yaml document nodes are meant to have one content node but have %d", n) } - return d.extract(yn.Content[0]) + + expr, err := d.extract(yn.Content[0]) + if err != nil { + return 
nil, err + } + + return d.addAnchorNodes(expr) +} + +// addAnchorNodes prepends anchor nodes at the top of the document. +func (d *decoder) addAnchorNodes(expr ast.Expr) (ast.Expr, error) { + elements := []ast.Decl{} + + for _, field := range d.anchorFields { + elements = append(elements, &field) + } + + switch x := expr.(type) { + case *ast.StructLit: + x.Elts = append(elements, x.Elts...) + case *ast.ListLit: + if len(elements) > 0 { + expr = &ast.StructLit{ + Elts: append(elements, x), + } + } + default: + // If the whole YAML document is not a map / seq, then it can't have anchors. + // maybe assert that `anchorFields` is empty? + break + } + + return expr, nil } func (d *decoder) sequence(yn *yaml.Node) (ast.Expr, error) { @@ -458,7 +512,7 @@ func (d *decoder) label(yn *yaml.Node) (ast.Label, error) { if yn.Alias.Kind != yaml.ScalarNode { return nil, d.posErrorf(yn, "invalid map key: %v", yn.Alias.ShortTag()) } - expr, err = d.alias(yn) + expr, err = d.inlineAlias(yn) value = yn.Alias.Value default: return nil, d.posErrorf(yn, "invalid map key: %v", yn.ShortTag()) @@ -639,7 +693,10 @@ func (d *decoder) makeNum(yn *yaml.Node, val string, kind token.Token) (expr ast return expr } -func (d *decoder) alias(yn *yaml.Node) (ast.Expr, error) { +// inlineAlias expands an alias node in place, returning the expanded node. +// Sometimes we have to resort to this, for example when the alias +// is inside a map key, since CUE does not support structs as map keys. +func (d *decoder) inlineAlias(yn *yaml.Node) (ast.Expr, error) { if d.extractingAliases[yn] { // TODO this could actually be allowed in some circumstances. 
return nil, d.posErrorf(yn, "anchor %q value contains itself", yn.Value) @@ -649,11 +706,59 @@ func (d *decoder) alias(yn *yaml.Node) (ast.Expr, error) { } d.extractingAliases[yn] = true var node ast.Expr - node, err := d.extract(yn.Alias) + node, err := d.extractNoAnchor(yn.Alias) delete(d.extractingAliases, yn) return node, err } +// referenceAlias replaces an alias with a reference to the identifier of its anchor. +func (d *decoder) referenceAlias(yn *yaml.Node) (ast.Expr, error) { + anchor, ok := d.anchorNames[yn.Alias] + if !ok { + return nil, d.posErrorf(yn, "anchor %q not found", yn.Alias.Anchor) + } + + return &ast.Ident{ + NamePos: d.pos(yn), + Name: anchor, + }, nil +} + +func (d *decoder) anchor(yn *yaml.Node) (ast.Expr, error) { + var anchorIdent string + + // Pick a non-conflicting anchor name. + for i := 1; ; i++ { + if i == 1 { + anchorIdent = "#" + yn.Anchor + } else { + anchorIdent = "#" + yn.Anchor + "_" + strconv.Itoa(i) + } + if _, ok := d.anchorTakenNames[anchorIdent]; !ok { + d.anchorTakenNames[anchorIdent] = struct{}{} + break + } + } + d.anchorNames[yn] = anchorIdent + + // Process the node itself, but don't put it into the AST just yet, + // store it for later to be used as an anchor identifier. + pos := d.pos(yn) + expr, err := d.extractNoAnchor(yn) + if err != nil { + return nil, err + } + d.anchorFields = append(d.anchorFields, ast.Field{ + Label: &ast.Ident{Name: anchorIdent}, + Value: expr, + }) + + return &ast.Ident{ + NamePos: pos, + Name: anchorIdent, + }, nil +} + func labelStr(l ast.Label) string { switch l := l.(type) { case *ast.Ident: diff --git a/internal/encoding/yaml/decode_test.go b/internal/encoding/yaml/decode_test.go index 15f7d3ea1a9..3ee8e67f9a1 100644 --- a/internal/encoding/yaml/decode_test.go +++ b/internal/encoding/yaml/decode_test.go @@ -474,26 +474,67 @@ Null: 1 // Anchors and aliases. 
{ "a: &x 1\nb: &y 2\nc: *x\nd: *y\n", - `a: 1 -b: 2 -c: 1 -d: 2`, + `#x: 1 +#y: 2 +a: #x +b: #y +c: #x +d: #y`, }, { "a: &a {c: 1}\nb: *a", - `a: {c: 1} -b: { - c: 1 -}`, + `#a: {c: 1} +a: #a +b: #a`, }, { "a: &a [1, 2]\nb: *a", - "a: [1, 2]\nb: [1, 2]", // TODO: a: [1, 2], b: a + "#a: [1, 2]\na: #a\nb: #a", }, { `a: &a "b" *a : "c"`, - `a: "b" -b: "c"`, + `#a: "b" +a: #a +b: "c"`, + }, + { + `- 3 +- &a 4 +- *a`, + `#a: 4, [ + 3, + #a, + #a, +]`, + }, + // Test nested anchors + { + `foo: &a + bar: &b + baz: 1 +a: *a +b: *b +`, + `#b: { + baz: 1 +} +#a: { + bar: #b +} +foo: #a +a: #a +b: #b`, }, + { + `a: + - &b c`, + `#b: "c" +a: [#b]`, + }, + // Recursive anchor - make sure we don't infinitely recurse on such input. + {"a: &a\n b: *a\n", `#a: { + b: #a +} +a: #a`}, { "foo: ''", @@ -778,10 +819,12 @@ a: // yaml-test-suite 3GZX: Spec Example 7.1. Alias Nodes { "First occurrence: &anchor Foo\nSecond occurrence: *anchor\nOverride anchor: &anchor Bar\nReuse anchor: *anchor\n", - `"First occurrence": "Foo" -"Second occurrence": "Foo" -"Override anchor": "Bar" -"Reuse anchor": "Bar"`, + `#anchor: "Foo" +#anchor_2: "Bar" +"First occurrence": #anchor +"Second occurrence": #anchor +"Override anchor": #anchor_2 +"Reuse anchor": #anchor_2`, }, } @@ -926,7 +969,6 @@ var unmarshalErrorTests = []struct { {"v:\n- [A,", "test.yaml:2: did not find expected node content"}, {"a:\n- b: *,", "test.yaml:2: did not find expected alphabetic or numeric character"}, {"a: *b\n", "test.yaml: unknown anchor 'b' referenced"}, - {"a: &a\n b: *a\n", `test.yaml:2: anchor "a" value contains itself`}, {"a: &a { b: c }\n*a : foo", "test.yaml:2: invalid map key: !!map"}, {"a: &a [b]\n*a : foo", "test.yaml:2: invalid map key: !!seq"}, {"value: -", "test.yaml: block sequence entries are not allowed in this context"}, diff --git a/internal/encoding/yaml/testdata/merge.out b/internal/encoding/yaml/testdata/merge.out index bc55dde7a3a..333adc99ff0 100644 --- a/internal/encoding/yaml/testdata/merge.out +++ 
b/internal/encoding/yaml/testdata/merge.out @@ -1,15 +1,17 @@ +#CENTER: {x: 1, y: 2} +#LEFT: {x: 0, y: 2} +#BIG: {r: 10} +#SMALL: {r: 1} + // From http://yaml.org/type/merge.html // Test anchors: { - list: [{ - x: 1, y: 2 - }, { - x: 0, y: 2 - }, { - r: 10 - }, { - r: 1 - }] + list: [ + #CENTER, + #LEFT, + #BIG, + #SMALL, + ] } // All the following maps are equal: From 84379d28320112039133d472e74854ca58f7ffc3 Mon Sep 17 00:00:00 2001 From: Omri Steiner Date: Mon, 14 Jul 2025 17:17:56 +0200 Subject: [PATCH 2/2] internal/encoding/yaml: declare YAML anchor definitions closer to where they're defined Signed-off-by: Omri Steiner --- internal/encoding/yaml/decode.go | 43 ++++++++++++++++----------- internal/encoding/yaml/decode_test.go | 4 +-- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/internal/encoding/yaml/decode.go b/internal/encoding/yaml/decode.go index 9283f063a13..12a12e793c5 100644 --- a/internal/encoding/yaml/decode.go +++ b/internal/encoding/yaml/decode.go @@ -162,7 +162,7 @@ func (d *decoder) Decode() (ast.Expr, error) { return nil, err } d.yamlNonEmpty = true - return d.extract(&yn) + return d.extract(&yn, true) } // Unmarshal parses a single YAML value to a CUE expression. 
@@ -187,14 +187,14 @@ func Unmarshal(filename string, data []byte) (ast.Expr, error) { return n, nil } -func (d *decoder) extractNoAnchor(yn *yaml.Node) (ast.Expr, error) { +func (d *decoder) extractNoAnchor(yn *yaml.Node, isTopLevel bool) (ast.Expr, error) { switch yn.Kind { case yaml.DocumentNode: return d.document(yn) case yaml.SequenceNode: return d.sequence(yn) case yaml.MappingNode: - return d.mapping(yn) + return d.mapping(yn, isTopLevel) case yaml.ScalarNode: return d.scalar(yn) case yaml.AliasNode: @@ -204,16 +204,16 @@ func (d *decoder) extractNoAnchor(yn *yaml.Node) (ast.Expr, error) { } } -func (d *decoder) extract(yn *yaml.Node) (ast.Expr, error) { +func (d *decoder) extract(yn *yaml.Node, isTopLevel bool) (ast.Expr, error) { d.addHeadCommentsToPending(yn) var expr ast.Expr var err error if yn.Anchor == "" { - expr, err = d.extractNoAnchor(yn) + expr, err = d.extractNoAnchor(yn, isTopLevel) } else { - expr, err = d.anchor(yn) + expr, err = d.anchor(yn, isTopLevel) } if err != nil { @@ -347,7 +347,7 @@ func (d *decoder) document(yn *yaml.Node) (ast.Expr, error) { return nil, d.posErrorf(yn, "yaml document nodes are meant to have one content node but have %d", n) } - expr, err := d.extract(yn.Content[0]) + expr, err := d.extract(yn.Content[0], true) if err != nil { return nil, err } @@ -394,7 +394,7 @@ func (d *decoder) sequence(yn *yaml.Node) (ast.Expr, error) { closeSameLine := true for _, c := range yn.Content { d.forceNewline = multiline - elem, err := d.extract(c) + elem, err := d.extract(c, false) if err != nil { return nil, err } @@ -408,14 +408,14 @@ func (d *decoder) sequence(yn *yaml.Node) (ast.Expr, error) { return list, nil } -func (d *decoder) mapping(yn *yaml.Node) (ast.Expr, error) { +func (d *decoder) mapping(yn *yaml.Node, isTopLevel bool) (ast.Expr, error) { strct := &ast.StructLit{} multiline := false if len(yn.Content) > 0 { multiline = yn.Line < yn.Content[len(yn.Content)-1].Line } - if err := d.insertMap(yn, strct, multiline, 
false); err != nil { + if err := d.insertMap(yn, strct, multiline, false, isTopLevel); err != nil { return nil, err } // TODO(mvdan): moving these positions above insertMap breaks a few tests, why? @@ -428,7 +428,7 @@ func (d *decoder) mapping(yn *yaml.Node) (ast.Expr, error) { return strct, nil } -func (d *decoder) insertMap(yn *yaml.Node, m *ast.StructLit, multiline, mergeValues bool) error { +func (d *decoder) insertMap(yn *yaml.Node, m *ast.StructLit, multiline, mergeValues bool, isTopLevel bool) error { l := len(yn.Content) outer: for i := 0; i < l; i += 2 { @@ -459,7 +459,7 @@ outer: f := decl.(*ast.Field) name, _, err := ast.LabelName(f.Label) if err == nil && name == key { - f.Value, err = d.extract(yv) + f.Value, err = d.extract(yv, false) if err != nil { return err } @@ -468,12 +468,19 @@ outer: } } - value, err := d.extract(yv) + value, err := d.extract(yv, false) if err != nil { return err } field.Value = value + if isTopLevel { + for _, field := range d.anchorFields { + m.Elts = append(m.Elts, &field) + } + d.anchorFields = nil + } + m.Elts = append(m.Elts, field) } return nil @@ -482,9 +489,9 @@ outer: func (d *decoder) merge(yn *yaml.Node, m *ast.StructLit, multiline bool) error { switch yn.Kind { case yaml.MappingNode: - return d.insertMap(yn, m, multiline, true) + return d.insertMap(yn, m, multiline, true, false) case yaml.AliasNode: - return d.insertMap(yn.Alias, m, multiline, true) + return d.insertMap(yn.Alias, m, multiline, true, false) case yaml.SequenceNode: // Step backwards as earlier nodes take precedence. 
for _, c := range slices.Backward(yn.Content) { @@ -706,7 +713,7 @@ func (d *decoder) inlineAlias(yn *yaml.Node) (ast.Expr, error) { } d.extractingAliases[yn] = true var node ast.Expr - node, err := d.extractNoAnchor(yn.Alias) + node, err := d.extractNoAnchor(yn.Alias, false) delete(d.extractingAliases, yn) return node, err } @@ -724,7 +731,7 @@ func (d *decoder) referenceAlias(yn *yaml.Node) (ast.Expr, error) { }, nil } -func (d *decoder) anchor(yn *yaml.Node) (ast.Expr, error) { +func (d *decoder) anchor(yn *yaml.Node, isTopLevel bool) (ast.Expr, error) { var anchorIdent string // Pick a non-conflicting anchor name. @@ -744,7 +751,7 @@ func (d *decoder) anchor(yn *yaml.Node) (ast.Expr, error) { // Process the node itself, but don't put it into the AST just yet, // store it for later to be used as an anchor identifier. pos := d.pos(yn) - expr, err := d.extractNoAnchor(yn) + expr, err := d.extractNoAnchor(yn, isTopLevel) if err != nil { return nil, err } diff --git a/internal/encoding/yaml/decode_test.go b/internal/encoding/yaml/decode_test.go index 3ee8e67f9a1..ee9ed3a1d76 100644 --- a/internal/encoding/yaml/decode_test.go +++ b/internal/encoding/yaml/decode_test.go @@ -475,8 +475,8 @@ Null: 1 { "a: &x 1\nb: &y 2\nc: *x\nd: *y\n", `#x: 1 -#y: 2 a: #x +#y: 2 b: #y c: #x d: #y`, @@ -820,9 +820,9 @@ a: { "First occurrence: &anchor Foo\nSecond occurrence: *anchor\nOverride anchor: &anchor Bar\nReuse anchor: *anchor\n", `#anchor: "Foo" -#anchor_2: "Bar" "First occurrence": #anchor "Second occurrence": #anchor +#anchor_2: "Bar" "Override anchor": #anchor_2 "Reuse anchor": #anchor_2`, },