From 29c33d1ffe46d4d37eed2278c889c471e13058ac Mon Sep 17 00:00:00 2001 From: Andrea Nodari Date: Sat, 23 May 2026 01:40:04 +0200 Subject: [PATCH 01/19] Add command to convert a repo from SHA1 to SHA256 Entire-Checkpoint: 198bc99d4b0b --- .entire/.gitignore | 1 + README.md | 3 + cmd/git-sync/convert_sha256.go | 106 ++ .../internal/sha256convert/sha256convert.go | 1016 +++++++++++++++++ .../sha256convert/sha256convert_test.go | 783 +++++++++++++ cmd/git-sync/root.go | 1 + docs/convert-sha256.md | 297 +++++ 7 files changed, 2207 insertions(+) create mode 100644 cmd/git-sync/convert_sha256.go create mode 100644 cmd/git-sync/internal/sha256convert/sha256convert.go create mode 100644 cmd/git-sync/internal/sha256convert/sha256convert_test.go create mode 100644 docs/convert-sha256.md diff --git a/.entire/.gitignore b/.entire/.gitignore index 2cffdefa..e66987d2 100644 --- a/.entire/.gitignore +++ b/.entire/.gitignore @@ -2,3 +2,4 @@ tmp/ settings.local.json metadata/ logs/ +redactors/local/ diff --git a/README.md b/README.md index b3a0255c..e1a2484e 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,8 @@ The main commands are: `sync` automatically bootstraps an empty target, so the same command covers initial seeding and ongoing sync. To preview what would happen without pushing, run `git-sync plan` — it takes the same flags as `sync`, and `--mode replicate` previews a `replicate` run. +For one-off SHA1 → SHA256 repo conversion, `git-sync convert-sha256` fetches from an HTTP source and writes a new SHA256 bare repo on disk, with optional commit-message hash rewrites, an origin-notes ref, and a sidecar mapping file. See [docs/convert-sha256.md](docs/convert-sha256.md). + For command examples, JSON output, auth, protocol flags, and advanced command notes, see [docs/usage.md](docs/usage.md). 
## Library API @@ -93,6 +95,7 @@ Extended and environment-specific test instructions are in [docs/testing.md](doc - [docs/usage.md](docs/usage.md) — CLI commands, examples, sync behavior, JSON output, auth, protocol notes - [docs/architecture.md](docs/architecture.md) — product rationale, package layout, operation modes vs transfer modes, memory model - [docs/protocol.md](docs/protocol.md) — smart HTTP, pkt-line, capability negotiation, sideband, relay framing +- [docs/convert-sha256.md](docs/convert-sha256.md) — one-off SHA1 → SHA256 repo conversion, mapping outputs, sharp edges - [docs/testing.md](docs/testing.md) — test suites and integration coverage ## FAQ
diff --git a/cmd/git-sync/convert_sha256.go b/cmd/git-sync/convert_sha256.go
new file mode 100644
index 00000000..3c6cc29b
--- /dev/null
+++ b/cmd/git-sync/convert_sha256.go
@@ -0,0 +1,111 @@
+package main
+
+import (
+	"errors"
+	"fmt"
+
+	gitsync "entire.io/entire/git-sync"
+	"entire.io/entire/git-sync/cmd/git-sync/internal/sha256convert"
+	"entire.io/entire/git-sync/internal/validation"
+	"github.com/spf13/cobra"
+)
+
+// newConvertSHA256Cmd builds the "convert-sha256" subcommand. Source URL and
+// target directory come from --source-url/--target-dir or, when those flags
+// are unset, from the first and second positional arguments. The remaining
+// flags populate a sha256convert.Request that RunE hands to sha256convert.Run.
+func newConvertSHA256Cmd() *cobra.Command {
+	var (
+		req         = sha256convert.Request{}
+		mappings    []string
+		branches    string
+		jsonOutput  bool
+		protocolVal = newProtocolFlag()
+	)
+
+	cmd := &cobra.Command{
+		Use:   "convert-sha256 [flags] <source-url> <target-dir>",
+		Short: "One-off SHA1 → SHA256 conversion of a remote repo into a local bare repo",
+		Long: `convert-sha256 fetches a pack from a SHA1 HTTP source and writes a new
+SHA256 bare repository on disk at <target-dir>. Every reachable object is
+re-hashed under SHA256 and tree/commit/tag references are rewritten.
+
+The conversion is lossy in two ways the caller should be aware of: the old
+SHA1 IDs survive only in the origin-notes ref and the optional --write-mapping
+file, and any GPG signatures on commits or tags are dropped (they sign over
+the original SHA1 content and would be invalid post-rewrite). Submodule
+gitlinks that point at a commit outside this repository cannot be embedded in
+a SHA256 tree; if the source repo contains any, the command exits with an
+error so the caller can scope around the offending refs.`,
+		Args:          cobra.MaximumNArgs(2),
+		SilenceErrors: true,
+		SilenceUsage:  true,
+		RunE: func(cmd *cobra.Command, args []string) error {
+			req.ProtocolMode = gitsync.ProtocolMode(protocolVal)
+			// Positional arguments only fill in what the flags left empty.
+			if req.SourceURL == "" && len(args) > 0 {
+				req.SourceURL = args[0]
+			}
+			if req.TargetDir == "" && len(args) > 1 {
+				req.TargetDir = args[1]
+			}
+			if req.SourceURL == "" || req.TargetDir == "" {
+				return errors.New("convert-sha256 requires a source URL and a target directory")
+			}
+			if branches != "" {
+				req.Branches = splitCSV(branches)
+			}
+			for _, raw := range mappings {
+				mapping, err := validation.ParseMapping(raw)
+				if err != nil {
+					return fmt.Errorf("parse mapping %q: %w", raw, err)
+				}
+				req.Mappings = append(req.Mappings, gitsync.RefMapping{
+					Source: mapping.Source,
+					Target: mapping.Target,
+				})
+			}
+
+			result, err := sha256convert.Run(cmd.Context(), req)
+			if err != nil {
+				return fmt.Errorf("convert-sha256: %w", err)
+			}
+			printOutput(jsonOutput, result)
+			return nil
+		},
+	}
+
+	cmd.Flags().StringVar(&req.SourceURL, "source-url", "", "source repository URL")
+	cmd.Flags().BoolVar(&req.SourceFollowInfoRefsRedirect, "source-follow-info-refs-redirect",
+		envBool("GITSYNC_SOURCE_FOLLOW_INFO_REFS_REDIRECT"),
+		"send follow-up source RPCs to the final /info/refs redirect host")
+	cmd.Flags().StringVar(&req.SourceAuth.Token, "source-token",
+		envOr("GITSYNC_SOURCE_TOKEN", ""), "source token/password")
+	cmd.Flags().StringVar(&req.SourceAuth.Username, "source-username",
+		envOr("GITSYNC_SOURCE_USERNAME", "git"), "source basic auth username")
+	cmd.Flags().StringVar(&req.SourceAuth.BearerToken, "source-bearer-token",
+		envOr("GITSYNC_SOURCE_BEARER_TOKEN", ""), "source bearer token")
+	cmd.Flags().BoolVar(&req.SourceAuth.SkipTLSVerify, "source-insecure-skip-tls-verify",
+		envBool("GITSYNC_SOURCE_INSECURE_SKIP_TLS_VERIFY"),
+		"skip TLS certificate verification for the source")
+	cmd.Flags().StringVar(&req.TargetDir, "target-dir", "", "directory to initialize as a SHA256 bare repository")
+
+	cmd.Flags().StringVar(&branches, "branch", "", "comma-separated branch list; default is all source branches")
+	cmd.Flags().StringArrayVar(&mappings, "map", nil, "ref mapping in src:dst form; short names map branches, full refs map exact refs")
+	cmd.Flags().BoolVar(&req.IncludeTags, "tags", false, "include annotated and lightweight tags")
+	allRefsFlag(cmd, allRefsUsageScopeOnly, &req.AllRefs)
+	excludeRefPrefixFlag(cmd, &req.ExcludeRefPrefixes)
+	addProtocolFlag(cmd, &protocolVal)
+	cmd.Flags().BoolVarP(&req.Verbose, "verbose", "v", false, "verbose logging")
+	cmd.Flags().BoolVar(&req.KeepSourceObjects, "keep-source-objects", false,
+		"keep the temporary SHA1 store on disk after conversion (for debugging)")
+	cmd.Flags().StringVar(&req.MappingFile, "write-mapping", "",
+		"write the full SHA1 → SHA256 mapping as a TSV to this path; useful for rewriting external references")
+	cmd.Flags().BoolVar(&req.SkipMessageRewrite, "no-rewrite-messages", false,
+		"do not rewrite SHA1 hash references found in commit and tag messages")
+	cmd.Flags().BoolVar(&req.SkipOriginNotes, "no-origin-notes", false,
+		"do not write a refs/notes/sha1-origin ref recording each commit's original SHA1")
+	cmd.Flags().BoolVar(&jsonOutput, "json", false, "print JSON output")
+
+	return cmd
+}
diff --git a/cmd/git-sync/internal/sha256convert/sha256convert.go b/cmd/git-sync/internal/sha256convert/sha256convert.go
new file mode 100644
index 00000000..241ec2d1
--- /dev/null
+++ b/cmd/git-sync/internal/sha256convert/sha256convert.go
@@ -0,0 +1,1016 @@
+// Package sha256convert implements a one-off SHA1 → SHA256 conversion for a
+// single repository.
It fetches a pack from a remote SHA1 HTTP endpoint into +// a temporary on-disk SHA1 bare repo, then walks every reachable object and +// re-emits it under SHA256 into a new bare repo at the user-supplied path. +// +// The tool is intentionally scoped: the hash mapping is persisted only via +// the optional origin-notes ref and mapping file, GPG signatures on commits +// and tags are dropped (they sign over the original SHA1 byte stream and +// would be invalid post-rewrite), and submodule gitlinks are translated only +// when the referenced commit happens to live in the same repo. A run that +// encounters an unresolvable submodule entry fails so the caller can choose +// which refs to exclude. +package sha256convert + +import ( + "bufio" + "bytes" + "compress/zlib" + "context" + "crypto/sha256" + "encoding/hex" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "os" + "path/filepath" + "regexp" + "sort" + "strconv" + "strings" + "time" + + git "github.com/go-git/go-git/v6" + "github.com/go-git/go-git/v6/plumbing" + "github.com/go-git/go-git/v6/plumbing/filemode" + formatcfg "github.com/go-git/go-git/v6/plumbing/format/config" + "github.com/go-git/go-git/v6/plumbing/object" + "github.com/go-git/go-git/v6/plumbing/storer" + transporthttp "github.com/go-git/go-git/v6/plumbing/transport/http" + "github.com/go-git/go-git/v6/storage/filesystem" + + gitsync "entire.io/entire/git-sync" + "entire.io/entire/git-sync/internal/auth" + "entire.io/entire/git-sync/internal/convert" + "entire.io/entire/git-sync/internal/gitproto" + "entire.io/entire/git-sync/internal/planner" +) + +// Request describes a single SHA1 → SHA256 conversion. 
+type Request struct {
+	// SourceURL is the remote to read from; Run rejects an empty value.
+	SourceURL string
+	// SourceAuth carries the credentials and TLS setting used for the source.
+	SourceAuth gitsync.EndpointAuth
+	// SourceFollowInfoRefsRedirect is copied onto the source connection's
+	// FollowInfoRefsRedirect field.
+	SourceFollowInfoRefsRedirect bool
+	// TargetDir is where the SHA256 bare repo is initialized; it must be
+	// missing or empty.
+	TargetDir string
+
+	// Branches, IncludeTags, AllRefs, ExcludeRefPrefixes and Mappings are
+	// passed through to the planner to select which source refs to convert.
+	Branches           []string
+	IncludeTags        bool
+	AllRefs            bool
+	ExcludeRefPrefixes []string
+	Mappings           []gitsync.RefMapping
+
+	// ProtocolMode selects the wire protocol; empty means gitsync.ProtocolAuto.
+	ProtocolMode gitsync.ProtocolMode
+	// Verbose is forwarded to the ref service.
+	Verbose bool
+	// KeepSourceObjects keeps the temporary SHA1 store after a successful run.
+	KeepSourceObjects bool
+
+	// MappingFile, when non-empty, is a path to which a TSV of every
+	// translated object's SHA1 → SHA256 mapping is written. Useful for
+	// rewriting external systems that reference old commit hashes.
+	MappingFile string
+
+	// SkipMessageRewrite disables the inline rewrite of SHA1 hashes found
+	// in commit and tag messages. Off by default (rewriting is on).
+	SkipMessageRewrite bool
+
+	// SkipOriginNotes disables the refs/notes/sha1-origin output that
+	// records each translated commit's original SHA1. Off by default
+	// (notes are written).
+	SkipOriginNotes bool
+
+	// Out receives human-readable status lines. Nil means os.Stderr.
+	Out io.Writer
+}
+
+// Counts tallies converted objects by kind.
+type Counts struct {
+	Blobs   int `json:"blobs"`
+	Trees   int `json:"trees"`
+	Commits int `json:"commits"`
+	Tags    int `json:"tags"`
+}
+
+// Result is the conversion summary, suitable for JSON output.
+type Result struct {
+	SourceURL string `json:"sourceUrl"`
+	TargetDir string `json:"targetDir"`
+	// Protocol is the protocol reported by the source ref service.
+	Protocol string `json:"protocol"`
+	// RefsConverted is the number of refs written to the target.
+	RefsConverted int    `json:"refsConverted"`
+	Counts        Counts `json:"counts"`
+	// SignaturesStripped counts removed commit/tag signatures and mergetag headers.
+	SignaturesStripped int `json:"signaturesStripped"`
+	// MessageRewrites counts hash substitutions made in commit/tag messages.
+	MessageRewrites int `json:"messageRewrites"`
+	// OriginNotesRef, MappingFile and TempDir are set only when the
+	// corresponding output was produced or kept.
+	OriginNotesRef string `json:"originNotesRef,omitempty"`
+	MappingFile    string `json:"mappingFile,omitempty"`
+	TempDir        string `json:"tempDir,omitempty"`
+}
+
+// Lines satisfies the human-readable output contract used by other git-sync subcommands.
+// The four summary lines (target, source, object counts, refs written) are
+// always present; the signature warning, message-rewrite count, origin-notes
+// hint, mapping path and kept temp dir are appended only when non-zero/non-empty.
+func (r Result) Lines() []string {
+	lines := []string{
+		fmt.Sprintf("sha256 bare repo: %s", r.TargetDir),
+		fmt.Sprintf("source: %s (%s)", r.SourceURL, r.Protocol),
+		fmt.Sprintf("converted: %d blobs, %d trees, %d commits, %d tags",
+			r.Counts.Blobs, r.Counts.Trees, r.Counts.Commits, r.Counts.Tags),
+		fmt.Sprintf("refs written: %d", r.RefsConverted),
+	}
+	if r.SignaturesStripped > 0 {
+		lines = append(lines, fmt.Sprintf("warning: stripped %d GPG signature(s); they no longer match the rewritten object content", r.SignaturesStripped))
+	}
+	if r.MessageRewrites > 0 {
+		lines = append(lines, fmt.Sprintf("rewrote %d SHA1 hash reference(s) in commit/tag messages", r.MessageRewrites))
+	}
+	if r.OriginNotesRef != "" {
+		// `git notes --ref` takes the name without the refs/notes/ prefix;
+		// <commit> is the placeholder for the object whose note is shown.
+		lines = append(lines, fmt.Sprintf("origin notes ref: %s (use `git notes --ref=%s show <commit>` to recover old SHA1)",
+			r.OriginNotesRef, strings.TrimPrefix(r.OriginNotesRef, "refs/notes/")))
+	}
+	if r.MappingFile != "" {
+		lines = append(lines, fmt.Sprintf("mapping written to: %s", r.MappingFile))
+	}
+	if r.TempDir != "" {
+		lines = append(lines, fmt.Sprintf("kept source objects: %s", r.TempDir))
+	}
+	return lines
+}
+
+// Run performs the conversion described by req.
+// It validates the request, initializes a temporary SHA1 bare repo and the
+// SHA256 target, fetches the selected refs, translates every reachable
+// object, then writes refs, HEAD and the optional notes/mapping outputs.
+// The temporary store is removed on return unless req.KeepSourceObjects is
+// set and the run succeeds. On error the returned Result is the zero value.
+//
+// NOTE(review): nothing here removes req.TargetDir on failure, so a failed
+// run leaves a partially written repo that ensureEmptyTarget rejects on retry.
+func Run(ctx context.Context, req Request) (Result, error) {
+	if req.SourceURL == "" {
+		return Result{}, errors.New("convert-sha256 requires --source-url")
+	}
+	if req.TargetDir == "" {
+		return Result{}, errors.New("convert-sha256 requires a target directory")
+	}
+	out := req.Out
+	if out == nil {
+		out = os.Stderr
+	}
+
+	if err := ensureEmptyTarget(req.TargetDir); err != nil {
+		return Result{}, err
+	}
+
+	tempDir, err := os.MkdirTemp("", "git-sync-sha256-src-")
+	if err != nil {
+		return Result{}, fmt.Errorf("create temp dir: %w", err)
+	}
+	// cleanupTemp is flipped to false only at the very end, so every early
+	// return below removes the temporary SHA1 store.
+	cleanupTemp := true
+	defer func() {
+		if cleanupTemp {
+			_ = os.RemoveAll(tempDir)
+		}
+	}()
+
+	srcRepo, err := git.PlainInit(tempDir, true)
+	if err != nil {
+		return Result{}, fmt.Errorf("init temporary SHA1 store: %w", err)
+	}
+
+	dstRepo, err := git.PlainInit(req.TargetDir, true, git.WithObjectFormat(formatcfg.SHA256))
+	if err != nil {
+		return Result{}, fmt.Errorf("init SHA256 target at %s: %w", req.TargetDir, err)
+	}
+
+	// Source connection + ref discovery -----------------------------------
+	planCfg := planner.PlanConfig{
+		// Slices are copied so the planner never aliases the caller's request.
+		Branches:           append([]string(nil), req.Branches...),
+		Mappings:           toPlannerMappings(req.Mappings),
+		IncludeTags:        req.IncludeTags,
+		AllRefs:            req.AllRefs,
+		ExcludeRefPrefixes: append([]string(nil), req.ExcludeRefPrefixes...),
+	}
+	conn, refService, sourceRefList, err := openSource(ctx, req, planCfg)
+	if err != nil {
+		return Result{}, err
+	}
+	defer conn.Close()
+	refService.Verbose = req.Verbose
+
+	sourceRefs := gitproto.RefHashMap(sourceRefList)
+	desired, _, err := planner.BuildDesiredRefs(sourceRefs, planCfg)
+	if err != nil {
+		return Result{}, fmt.Errorf("build desired refs: %w", err)
+	}
+	if len(desired) == 0 {
+		return Result{}, errors.New("no source refs matched the requested scope")
+	}
+
+	// Fetch into temp SHA1 store ------------------------------------------
+	fmt.Fprintf(out, "fetching %d ref(s) from %s ...\n", len(desired), req.SourceURL)
+	gpDesired := convert.DesiredRefs(desired)
+	// An "already up to date" result is not a failure for this one-off fetch.
+	if err := refService.FetchToStore(ctx, srcRepo.Storer, conn, gpDesired, nil); err != nil &&
+		!errors.Is(err, git.NoErrAlreadyUpToDate) {
+		return Result{}, fmt.Errorf("fetch source pack: %w", err)
+	}
+
+	// Discover + translate ------------------------------------------------
+	tr, err := newTranslator(srcRepo.Storer, dstRepo.Storer, req.TargetDir, !req.SkipMessageRewrite)
+	if err != nil {
+		return Result{}, err
+	}
+	rootSHA1s := make([]plumbing.Hash, 0, len(desired))
+	for _, d := range desired {
+		rootSHA1s = append(rootSHA1s, d.SourceHash)
+	}
+	// Discovery runs to completion before any translation so the reachable
+	// set is final when message references are resolved against it.
+	fmt.Fprintln(out, "discovering reachable objects ...")
+	if err := tr.discover(rootSHA1s); err != nil {
+		return Result{}, fmt.Errorf("discover reachable: %w", err)
+	}
+	fmt.Fprintln(out, "translating objects to sha256 ...")
+	for _, d := range desired {
+		if _, err := tr.translate(d.SourceHash); err != nil {
+			return Result{}, fmt.Errorf("translate %s: %w", d.SourceRef, err)
+		}
+	}
+
+	// Write refs ---------------------------------------------------------
+	refsWritten, err := writeRefs(dstRepo.Storer, desired, tr.mapping)
+	if err != nil {
+		return Result{}, fmt.Errorf("write target refs: %w", err)
+	}
+
+	// Point HEAD at the source's symbolic HEAD if it landed in the
+	// converted ref set. PlainInit defaults HEAD to refs/heads/master,
+	// which often doesn't exist (e.g. repos using "main" as the default).
+	if refService.HeadTarget != "" {
+		if _, ok := desired[refService.HeadTarget]; ok {
+			head := plumbing.NewSymbolicReference(plumbing.HEAD, refService.HeadTarget)
+			if err := dstRepo.Storer.SetReference(head); err != nil {
+				return Result{}, fmt.Errorf("set HEAD: %w", err)
+			}
+		}
+	}
+
+	res := Result{
+		SourceURL:          req.SourceURL,
+		TargetDir:          req.TargetDir,
+		Protocol:           refService.Protocol,
+		RefsConverted:      refsWritten,
+		Counts:             tr.counts,
+		SignaturesStripped: tr.signaturesStripped,
+		MessageRewrites:    tr.messageRewrites,
+	}
+
+	// Origin notes are written only when at least one commit was translated.
+	if !req.SkipOriginNotes && len(tr.commits) > 0 {
+		notesRef, err := tr.writeOriginNotes(originNotesRef)
+		if err != nil {
+			return Result{}, fmt.Errorf("write origin notes: %w", err)
+		}
+		// The notes ref is pointed at tr.lastNotesCommit, presumably set by
+		// writeOriginNotes (defined outside this chunk).
+		if err := dstRepo.Storer.SetReference(plumbing.NewHashReference(plumbing.ReferenceName(notesRef), tr.lastNotesCommit)); err != nil {
+			return Result{}, fmt.Errorf("set %s: %w", notesRef, err)
+		}
+		res.OriginNotesRef = notesRef
+	}
+
+	if req.MappingFile != "" {
+		if err := tr.writeMappingFile(req.MappingFile); err != nil {
+			return Result{}, fmt.Errorf("write mapping file: %w", err)
+		}
+		res.MappingFile = req.MappingFile
+	}
+
+	if req.KeepSourceObjects {
+		cleanupTemp = false
+		res.TempDir = tempDir
+	}
+	return res, nil
+}
+
+// originNotesRef is the notes ref that records each commit's original SHA1.
+const originNotesRef = "refs/notes/sha1-origin"
+
+// ensureEmptyTarget refuses to init into a non-empty directory so the user
+// doesn't quietly accumulate objects into an existing repo.
+func ensureEmptyTarget(path string) error {
+	entries, err := os.ReadDir(path)
+	switch {
+	case err == nil:
+		// The directory exists: it must not contain anything yet.
+		if len(entries) > 0 {
+			return fmt.Errorf("target directory %s is not empty", path)
+		}
+		return nil
+	case os.IsNotExist(err):
+		// Missing directory: create it, including any parents.
+		if mkErr := os.MkdirAll(path, 0o755); mkErr != nil {
+			return fmt.Errorf("create target dir: %w", mkErr)
+		}
+		return nil
+	default:
+		return fmt.Errorf("read target dir: %w", err)
+	}
+}
+
+// openSource parses and validates the source URL, resolves credentials, and
+// lists the source refs restricted to the planner's ref prefixes. On success
+// the caller owns the returned connection; on a listing failure it is closed
+// here before returning.
+func openSource(ctx context.Context, req Request, planCfg planner.PlanConfig) (gitproto.Conn, *gitproto.RefService, []*plumbing.Reference, error) {
+	endpoint, err := url.Parse(req.SourceURL)
+	if err != nil {
+		return nil, nil, nil, fmt.Errorf("parse source URL: %w", err)
+	}
+	switch endpoint.Scheme {
+	case "http", "https":
+		// Supported transports.
+	default:
+		return nil, nil, nil, fmt.Errorf("convert-sha256 currently supports HTTP/HTTPS sources only; got %q", endpoint.Scheme)
+	}
+
+	creds := auth.Endpoint{
+		Username:      req.SourceAuth.Username,
+		Token:         req.SourceAuth.Token,
+		BearerToken:   req.SourceAuth.BearerToken,
+		SkipTLSVerify: req.SourceAuth.SkipTLSVerify,
+	}
+	authMethod, err := auth.Resolve(creds, endpoint)
+	if err != nil {
+		return nil, nil, nil, fmt.Errorf("resolve source auth: %w", err)
+	}
+
+	transport := gitproto.NewHTTPTransport(req.SourceAuth.SkipTLSVerify)
+	client := &http.Client{Transport: transport}
+	conn := gitproto.NewHTTPConnWithClient(endpoint, "source", normalizeAuth(authMethod), client)
+	conn.FollowInfoRefsRedirect = req.SourceFollowInfoRefsRedirect
+
+	// An unset protocol mode falls back to automatic negotiation.
+	protocol := string(req.ProtocolMode)
+	if protocol == "" {
+		protocol = string(gitsync.ProtocolAuto)
+	}
+
+	refs, svc, err := gitproto.ListSourceRefs(ctx, conn, protocol, planner.RefPrefixes(planCfg))
+	if err != nil {
+		_ = conn.Close()
+		return nil, nil, nil, fmt.Errorf("list source refs: %w", err)
+	}
+	return conn, svc, refs, nil
+}
+
+// normalizeAuth converts an auth.Method into a gitproto.AuthMethod. The two
+// go-git HTTP auth types are passed through unchanged; anything else is
+// wrapped in authAdapter, which forwards Authorizer calls.
+func normalizeAuth(m auth.Method) gitproto.AuthMethod {
+	if m == nil {
+		return nil
+	}
+	switch concrete := m.(type) {
+	case *transporthttp.BasicAuth:
+		return concrete
+	case *transporthttp.TokenAuth:
+		return concrete
+	}
+	return authAdapter{m: m}
+}
+
+// authAdapter exposes an auth.Method through the Authorizer method.
+type authAdapter struct{ m auth.Method }
+
+// Authorizer delegates to the wrapped auth.Method.
+func (a authAdapter) Authorizer(req *http.Request) error { return a.m.Authorizer(req) }
+
+// toPlannerMappings copies the public ref mappings into the planner's
+// equivalent type, preserving order. The result is never nil.
+func toPlannerMappings(in []gitsync.RefMapping) []planner.RefMapping {
+	converted := make([]planner.RefMapping, len(in))
+	for i := range in {
+		converted[i] = planner.RefMapping{Source: in[i].Source, Target: in[i].Target}
+	}
+	return converted
+}
+
+// translator walks the SHA1 source store, rewrites object content with
+// SHA256-mapped hashes, and writes the result as loose objects under the
+// target bare repo. Loose object writing is done by hand because go-git
+// v6 alpha 3's objfile.Writer hardcodes SHA1 in prepareForWrite (see
+// plumbing/format/objfile/writer.go:68), which would store every SHA256
+// object at a SHA1-derived path.
+type translator struct {
+	src        *filesystem.Storage
+	dst        *filesystem.Storage
+	objectsDir string
+	// reachable holds every in-scope SHA1 with its object type, built up
+	// front by a discovery pass that walks tree/commit/tag dependencies
+	// from the desired ref tips. It is the authoritative "what's in
+	// scope" set: abbreviated SHA1 prefixes in commit/tag messages are
+	// resolved against this set so a unique match is fixed before any
+	// encoding starts, and so message-reference edges can be added to
+	// the translation DFS in topological order.
+	reachable map[plumbing.Hash]plumbing.ObjectType
+	mapping   map[plumbing.Hash]plumbing.Hash
+	// inProgress detects cycles in the translation DFS. Real Git
+	// histories cannot form cycles (the parent/tree/tag-target edges
+	// are a DAG by construction, and SHA1 message-reference cycles are
+	// cryptographically infeasible), but a defensive guard turns
+	// surprising input into a clear error instead of a stack overflow.
+	inProgress map[plumbing.Hash]struct{}
+	// commits records every translated commit's old SHA1, in DFS order,
+	// for use by writeOriginNotes. We track separately rather than walking
+	// the full mapping because notes only attach meaningfully to commits.
+	commits            []plumbing.Hash
+	counts             Counts
+	signaturesStripped int
+	messageRewrites    int
+	rewriteMessages    bool
+	lastNotesCommit    plumbing.Hash
+}
+
+// newTranslator builds a translator over the given source and target
+// storers. Both must be filesystem-backed; objects are written beneath
+// targetDir/objects.
+func newTranslator(src, dst storer.Storer, targetDir string, rewriteMessages bool) (*translator, error) {
+	sourceStore, isFS := src.(*filesystem.Storage)
+	if !isFS {
+		return nil, fmt.Errorf("source storage is not filesystem-backed (%T)", src)
+	}
+	targetStore, isFS := dst.(*filesystem.Storage)
+	if !isFS {
+		return nil, fmt.Errorf("target storage is not filesystem-backed (%T)", dst)
+	}
+	tr := &translator{
+		src:             sourceStore,
+		dst:             targetStore,
+		objectsDir:      filepath.Join(targetDir, "objects"),
+		reachable:       make(map[plumbing.Hash]plumbing.ObjectType),
+		mapping:         make(map[plumbing.Hash]plumbing.Hash),
+		inProgress:      make(map[plumbing.Hash]struct{}),
+		rewriteMessages: rewriteMessages,
+	}
+	return tr, nil
+}
+
+// discover walks every object reachable from roots (via tree entries,
+// commit tree+parent links, and tag targets) and records each one in
+// t.reachable with its object type. Submodule gitlinks are followed
+// only when the referenced commit exists in the same source store, to
+// stay consistent with translateTree's handling. Message-reference
+// edges are not part of this pass — those are added during translation.
+func (t *translator) discover(roots []plumbing.Hash) error {
+	for _, root := range roots {
+		if err := t.visit(root); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// visit records sha1 in t.reachable and recurses into the objects it
+// references. Already-seen objects return immediately, so each object is
+// decoded at most once.
+func (t *translator) visit(sha1 plumbing.Hash) error {
+	if _, seen := t.reachable[sha1]; seen {
+		return nil
+	}
+	obj, err := t.src.EncodedObject(plumbing.AnyObject, sha1)
+	if err != nil {
+		return fmt.Errorf("discover %s: %w", sha1, err)
+	}
+	t.reachable[sha1] = obj.Type()
+	switch obj.Type() {
+	case plumbing.BlobObject:
+		return nil
+	case plumbing.TreeObject:
+		tree := &object.Tree{}
+		if err := tree.Decode(obj); err != nil {
+			return fmt.Errorf("discover decode tree %s: %w", sha1, err)
+		}
+		for _, e := range tree.Entries {
+			if e.Mode == filemode.Submodule {
+				// Gitlinks are followed only when the commit is present
+				// locally; absent ones are reported later by translateTree.
+				if _, err := t.src.EncodedObject(plumbing.CommitObject, e.Hash); err == nil {
+					if err := t.visit(e.Hash); err != nil {
+						return err
+					}
+				}
+				continue
+			}
+			if err := t.visit(e.Hash); err != nil {
+				return err
+			}
+		}
+	case plumbing.CommitObject:
+		c := &object.Commit{}
+		if err := c.Decode(obj); err != nil {
+			return fmt.Errorf("discover decode commit %s: %w", sha1, err)
+		}
+		if err := t.visit(c.TreeHash); err != nil {
+			return err
+		}
+		for _, p := range c.ParentHashes {
+			if err := t.visit(p); err != nil {
+				return err
+			}
+		}
+	case plumbing.TagObject:
+		tag := &object.Tag{}
+		if err := tag.Decode(obj); err != nil {
+			return fmt.Errorf("discover decode tag %s: %w", sha1, err)
+		}
+		if err := t.visit(tag.Target); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// errTranslationCycle is returned (wrapped) by translate when the DFS
+// re-enters an object that is still being translated. Message-reference
+// callers test for it with errors.Is and skip the reference.
+var errTranslationCycle = errors.New("translation cycle detected")
+
+// translate returns the SHA256 hash for the source object sha1, translating
+// it (and everything it depends on) first if that has not happened yet.
+// Results are memoized in t.mapping.
+func (t *translator) translate(sha1 plumbing.Hash) (plumbing.Hash, error) {
+	if newH, ok := t.mapping[sha1]; ok {
+		return newH, nil
+	}
+	if _, busy := t.inProgress[sha1]; busy {
+		// Structural edges (parent, tree, tag target) form a DAG, so
+		// they can never lead back here. Message-reference edges can:
+		// an abbreviated hash in an ancestor's message may resolve, by
+		// coincidence or by construction, to one of its own
+		// descendants. translateCommit/translateTag treat this error
+		// as "leave that reference alone"; on any other path it means
+		// an unexpected graph shape, surfaced instead of overflowing
+		// the stack.
+		return plumbing.ZeroHash, fmt.Errorf("%w at %s", errTranslationCycle, sha1)
+	}
+	t.inProgress[sha1] = struct{}{}
+	defer delete(t.inProgress, sha1)
+
+	obj, err := t.src.EncodedObject(plumbing.AnyObject, sha1)
+	if err != nil {
+		return plumbing.ZeroHash, fmt.Errorf("lookup %s: %w", sha1, err)
+	}
+	switch obj.Type() {
+	case plumbing.BlobObject:
+		return t.translateBlob(sha1, obj)
+	case plumbing.TreeObject:
+		return t.translateTree(sha1, obj)
+	case plumbing.CommitObject:
+		return t.translateCommit(sha1, obj)
+	case plumbing.TagObject:
+		return t.translateTag(sha1, obj)
+	default:
+		return plumbing.ZeroHash, fmt.Errorf("unexpected object type %v for %s", obj.Type(), sha1)
+	}
+}
+
+// translateBlob copies the blob's bytes unchanged into the target store;
+// only the object name differs under SHA256.
+func (t *translator) translateBlob(sha1 plumbing.Hash, src plumbing.EncodedObject) (plumbing.Hash, error) {
+	r, err := src.Reader()
+	if err != nil {
+		return plumbing.ZeroHash, fmt.Errorf("blob reader: %w", err)
+	}
+	defer r.Close()
+	body, err := io.ReadAll(r)
+	if err != nil {
+		return plumbing.ZeroHash, fmt.Errorf("blob read: %w", err)
+	}
+	newHash, err := t.writeLoose(plumbing.BlobObject, body)
+	if err != nil {
+		return plumbing.ZeroHash, fmt.Errorf("blob store: %w", err)
+	}
+	t.mapping[sha1] = newHash
+	t.counts.Blobs++
+	return newHash, nil
+}
+
+// translateTree translates every entry of the tree and re-encodes it with
+// the entries' SHA256 hashes. It fails on a submodule gitlink whose commit
+// is not present in the source store.
+func (t *translator) translateTree(sha1 plumbing.Hash, src plumbing.EncodedObject) (plumbing.Hash, error) {
+	tree := &object.Tree{}
+	if err := tree.Decode(src); err != nil {
+		return plumbing.ZeroHash, fmt.Errorf("decode tree %s: %w", sha1, err)
+	}
+	for i, entry := range tree.Entries {
+		if entry.Mode == filemode.Submodule {
+			// Submodule gitlinks reference a commit in a different repo.
+			// We can only translate if that commit happens to live in our
+			// SHA1 store too (rare, e.g. vendored). Otherwise the SHA1
+			// pointer can't be embedded in a SHA256 tree, so we error
+			// out and let the caller scope around it.
+			if mapped, ok := t.mapping[entry.Hash]; ok {
+				tree.Entries[i].Hash = mapped
+				continue
+			}
+			if _, err := t.src.EncodedObject(plumbing.CommitObject, entry.Hash); err == nil {
+				newH, err := t.translate(entry.Hash)
+				if err != nil {
+					return plumbing.ZeroHash, err
+				}
+				tree.Entries[i].Hash = newH
+				continue
+			}
+			return plumbing.ZeroHash, fmt.Errorf(
+				"tree %s contains submodule gitlink %q at %s that is not present in the source repo; "+
+					"exclude refs that reference it or convert the submodule repository first",
+				sha1, entry.Name, entry.Hash)
+		}
+		newH, err := t.translate(entry.Hash)
+		if err != nil {
+			return plumbing.ZeroHash, fmt.Errorf("tree %s entry %q: %w", sha1, entry.Name, err)
+		}
+		tree.Entries[i].Hash = newH
+	}
+	body, err := encodeBody(plumbing.TreeObject, tree.Encode)
+	if err != nil {
+		return plumbing.ZeroHash, fmt.Errorf("encode tree %s: %w", sha1, err)
+	}
+	newHash, err := t.writeLoose(plumbing.TreeObject, body)
+	if err != nil {
+		return plumbing.ZeroHash, fmt.Errorf("store tree %s: %w", sha1, err)
+	}
+	t.mapping[sha1] = newHash
+	t.counts.Trees++
+	return newHash, nil
+}
+
+// translateCommit translates the commit's tree and parents, optionally
+// rewrites hash references in its message, strips signatures and mergetag
+// headers, and stores the re-encoded commit.
+func (t *translator) translateCommit(sha1 plumbing.Hash, src plumbing.EncodedObject) (plumbing.Hash, error) {
+	c := &object.Commit{}
+	if err := c.Decode(src); err != nil {
+		return plumbing.ZeroHash, fmt.Errorf("decode commit %s: %w", sha1, err)
+	}
+	newTree, err := t.translate(c.TreeHash)
+	if err != nil {
+		return plumbing.ZeroHash, fmt.Errorf("commit %s tree: %w", sha1, err)
+	}
+	c.TreeHash = newTree
+	for i, p := range c.ParentHashes {
+		newP, err := t.translate(p)
+		if err != nil {
+			return plumbing.ZeroHash, fmt.Errorf("commit %s parent %s: %w", sha1, p, err)
+		}
+		c.ParentHashes[i] = newP
+	}
+	if t.rewriteMessages {
+		// Translate every in-scope SHA1 mentioned in this commit's
+		// message before rewriting it. This makes the message-reference
+		// edge part of the translation DFS, so the mapping contains
+		// each referenced object by the time we substitute. Without
+		// it, sibling-branch references (cherry-picks, etc.) would
+		// only resolve when ref iteration happened to process the
+		// referenced commit's branch first.
+		for _, ref := range t.extractMessageReferences(c.Message) {
+			if _, err := t.translate(ref); err != nil {
+				if errors.Is(err, errTranslationCycle) {
+					// The reference leads back to an object that is
+					// still being translated (for example a short hex
+					// run that happens to match a descendant). It stays
+					// unmapped, so rewriteHashesInMessage leaves the
+					// text untouched instead of the whole run failing.
+					continue
+				}
+				return plumbing.ZeroHash, fmt.Errorf("commit %s message ref %s: %w", sha1, ref, err)
+			}
+		}
+		if rewritten, n := t.rewriteHashesInMessage(c.Message); n > 0 {
+			c.Message = rewritten
+			t.messageRewrites += n
+		}
+	}
+	if c.Signature != "" {
+		c.Signature = ""
+		t.signaturesStripped++
+	}
+	if c.SignatureSHA256 != "" {
+		c.SignatureSHA256 = ""
+		t.signaturesStripped++
+	}
+	// "mergetag" extra headers embed a copy of a signed annotated tag with
+	// its own signature. Drop them too — they reference the pre-rewrite
+	// commit/tag content and cannot be re-signed here.
+	if len(c.ExtraHeaders) > 0 {
+		// In-place filter: reuses the backing array of c.ExtraHeaders.
+		filtered := c.ExtraHeaders[:0]
+		for _, h := range c.ExtraHeaders {
+			if h.Key == "mergetag" {
+				t.signaturesStripped++
+				continue
+			}
+			filtered = append(filtered, h)
+		}
+		c.ExtraHeaders = filtered
+	}
+	body, err := encodeBody(plumbing.CommitObject, c.Encode)
+	if err != nil {
+		return plumbing.ZeroHash, fmt.Errorf("encode commit %s: %w", sha1, err)
+	}
+	newHash, err := t.writeLoose(plumbing.CommitObject, body)
+	if err != nil {
+		return plumbing.ZeroHash, fmt.Errorf("store commit %s: %w", sha1, err)
+	}
+	t.mapping[sha1] = newHash
+	t.commits = append(t.commits, sha1)
+	t.counts.Commits++
+	return newHash, nil
+}
+
+// translateTag translates the tag's target, optionally rewrites hash
+// references in its message, strips signatures, and stores the re-encoded tag.
+func (t *translator) translateTag(sha1 plumbing.Hash, src plumbing.EncodedObject) (plumbing.Hash, error) {
+	tag := &object.Tag{}
+	if err := tag.Decode(src); err != nil {
+		return plumbing.ZeroHash, fmt.Errorf("decode tag %s: %w", sha1, err)
+	}
+	newTarget, err := t.translate(tag.Target)
+	if err != nil {
+		return plumbing.ZeroHash, fmt.Errorf("tag %s target: %w", sha1, err)
+	}
+	tag.Target = newTarget
+	if t.rewriteMessages {
+		// Same as translateCommit: translate every in-scope message
+		// reference before rewriting, so cross-branch references
+		// always resolve regardless of ref iteration order.
+		for _, ref := range t.extractMessageReferences(tag.Message) {
+			if _, err := t.translate(ref); err != nil {
+				if errors.Is(err, errTranslationCycle) {
+					// See translateCommit: a reference that loops back
+					// into the active DFS is left unrewritten.
+					continue
+				}
+				return plumbing.ZeroHash, fmt.Errorf("tag %s message ref %s: %w", sha1, ref, err)
+			}
+		}
+		if rewritten, n := t.rewriteHashesInMessage(tag.Message); n > 0 {
+			tag.Message = rewritten
+			t.messageRewrites += n
+		}
+	}
+	if tag.Signature != "" {
+		tag.Signature = ""
+		t.signaturesStripped++
+	}
+	if tag.SignatureSHA256 != "" {
+		tag.SignatureSHA256 = ""
+		t.signaturesStripped++
+	}
+	body, err := encodeBody(plumbing.TagObject, tag.Encode)
+	if err != nil {
+		return plumbing.ZeroHash, fmt.Errorf("encode tag %s: %w", sha1, err)
+	}
+	newHash, err := t.writeLoose(plumbing.TagObject, body)
+	if err != nil {
+		return plumbing.ZeroHash, fmt.Errorf("store tag %s: %w", sha1, err)
+	}
+	t.mapping[sha1] = newHash
+	t.counts.Tags++
+	return newHash, nil
+}
+
+// encodeBody runs an object's go-git Encode method into a SHA1-hasher
+// MemoryObject (the hasher we use to capture bytes is irrelevant; we only
+// read the body back out) and returns just the payload bytes — without the
+// "<type> <size>\x00" header. writeLoose adds the SHA256-correct header.
+func encodeBody(typ plumbing.ObjectType, encode func(plumbing.EncodedObject) error) ([]byte, error) {
+	scratch := plumbing.NewMemoryObject(plumbing.FromObjectFormat(formatcfg.SHA1))
+	scratch.SetType(typ)
+	if err := encode(scratch); err != nil {
+		return nil, err
+	}
+	r, err := scratch.Reader()
+	if err != nil {
+		return nil, err
+	}
+	defer r.Close()
+	return io.ReadAll(r)
+}
+
+// writeLoose writes a single object as a SHA256-named loose object under
+// objects/<first two hex digits>/<remaining hex digits>. Bypasses go-git's
+// objfile.Writer, which would hash with SHA1. Atomic via tempfile+rename,
+// idempotent on duplicate hashes.
+func (t *translator) writeLoose(typ plumbing.ObjectType, body []byte) (plumbing.Hash, error) {
+	// Loose-object header: type name, a space, the decimal body length, NUL.
+	header := strconv.AppendInt(append(typ.Bytes(), ' '), int64(len(body)), 10)
+	header = append(header, 0)
+
+	// The object ID is the SHA256 over header followed by body.
+	hasher := sha256.New()
+	hasher.Write(header)
+	hasher.Write(body)
+	digest := hasher.Sum(nil)
+
+	id, ok := plumbing.FromBytes(digest)
+	if !ok {
+		return plumbing.ZeroHash, fmt.Errorf("internal: bad sha256 sum length %d", len(digest))
+	}
+
+	name := hex.EncodeToString(digest)
+	fanoutDir := filepath.Join(t.objectsDir, name[:2])
+	objPath := filepath.Join(fanoutDir, name[2:])
+
+	// Content-addressed store: a file already present under this name is
+	// treated as the same object, so there is nothing left to write.
+	if _, statErr := os.Stat(objPath); statErr == nil {
+		return id, nil
+	}
+	if err := os.MkdirAll(fanoutDir, 0o755); err != nil {
+		return plumbing.ZeroHash, fmt.Errorf("mkdir %s: %w", fanoutDir, err)
+	}
+
+	// Deflate header+body in memory before touching the filesystem.
+	var deflated bytes.Buffer
+	zw := zlib.NewWriter(&deflated)
+	for _, part := range []struct {
+		label string
+		data  []byte
+	}{{"header", header}, {"body", body}} {
+		if _, err := zw.Write(part.data); err != nil {
+			return plumbing.ZeroHash, fmt.Errorf("zlib write %s: %w", part.label, err)
+		}
+	}
+	if err := zw.Close(); err != nil {
+		return plumbing.ZeroHash, fmt.Errorf("zlib close: %w", err)
+	}
+
+	// Write to a temp file in the destination directory, then rename it
+	// into place. The temp file is removed on every failure path.
+	tmp, err := os.CreateTemp(fanoutDir, "tmp_obj_")
+	if err != nil {
+		return plumbing.ZeroHash, fmt.Errorf("create temp object: %w", err)
+	}
+	tmpPath := tmp.Name()
+	_, writeErr := tmp.Write(deflated.Bytes())
+	closeErr := tmp.Close()
+	switch {
+	case writeErr != nil:
+		// A write failure takes precedence; the close result is ignored.
+		_ = os.Remove(tmpPath)
+		return plumbing.ZeroHash, fmt.Errorf("write temp object: %w", writeErr)
+	case closeErr != nil:
+		_ = os.Remove(tmpPath)
+		return plumbing.ZeroHash, fmt.Errorf("close temp object: %w", closeErr)
+	}
+	if err := os.Rename(tmpPath, objPath); err != nil {
+		_ = os.Remove(tmpPath)
+		return plumbing.ZeroHash, fmt.Errorf("rename %s: %w", objPath, err)
+	}
+	return id, nil
+}
+
+// hashPattern matches hex runs that could be a git object hash. Git's
+// default abbreviation is 7 chars; 40 is a full SHA1.
+// We only rewrite a
+// match if the prefix uniquely identifies a commit or tag in the
+// reachable set, so false positives on incidental hex strings are
+// essentially impossible (a random hex would have to collide with a
+// real source SHA1).
+var hashPattern = regexp.MustCompile(`\b[0-9a-f]{7,40}\b`)
+
+// rewriteHashesInMessage replaces every short or full SHA1 in msg that
+// uniquely names a commit or tag in t.reachable with the full SHA256 hex
+// recorded for it in t.mapping. It returns the rewritten message together
+// with the number of substitutions performed.
+//
+// Uniqueness is judged against t.reachable, not t.mapping, so an
+// abbreviated prefix gets the same verdict mid-translation as it would
+// once every object has been translated; the answer cannot depend on
+// processing order.
+//
+// Performance: every abbreviated match triggers a linear scan of the
+// reachable set. Fine for repos up to ~100k commits; slower past that.
+// If this ever matters, build a sorted-prefix index over the reachable
+// SHA1 hex strings once and binary-search it.
+func (t *translator) rewriteHashesInMessage(msg string) (string, int) {
+	replaced := 0
+	substitute := func(candidate string) string {
+		if oldHash, inScope := t.resolveMessageRef(candidate); inScope {
+			if mapped, translated := t.mapping[oldHash]; translated {
+				replaced++
+				return mapped.String()
+			}
+			// In scope per the reachable set but not yet placed by the
+			// translation DFS. translateCommit/translateTag add
+			// message-reference edges before encoding, so this should
+			// not happen; if it does, the hex is left untouched.
+		}
+		return candidate
+	}
+	// Run the replacement before reading the counter: the closure mutates it.
+	rewritten := hashPattern.ReplaceAllStringFunc(msg, substitute)
+	return rewritten, replaced
+}
+
+// resolveMessageRef returns the unique commit/tag SHA1 in t.reachable
+// that matches the given hex prefix.
+// Returns (zero, false) for no
+// match, an ambiguous prefix, or a match that is not a commit or tag
+// (incidental hex strings that happen to collide with a blob or tree
+// hash are not rewritten).
+func (t *translator) resolveMessageRef(prefix string) (plumbing.Hash, bool) {
+	// Only commits and tags are worth referring to from a message.
+	referable := func(typ plumbing.ObjectType) bool {
+		return typ == plumbing.CommitObject || typ == plumbing.TagObject
+	}
+
+	// Full-length hash: a direct lookup, no scan needed.
+	if len(prefix) == 40 {
+		full, ok := plumbing.FromHex(prefix)
+		if !ok {
+			return plumbing.ZeroHash, false
+		}
+		if typ, known := t.reachable[full]; known && referable(typ) {
+			return full, true
+		}
+		return plumbing.ZeroHash, false
+	}
+
+	// Abbreviated hash: scan the reachable set and bail out as soon as a
+	// second candidate shows the prefix is ambiguous.
+	var (
+		found plumbing.Hash
+		hits  int
+	)
+	for candidate, typ := range t.reachable {
+		if !referable(typ) || !strings.HasPrefix(candidate.String(), prefix) {
+			continue
+		}
+		hits++
+		if hits > 1 {
+			return plumbing.ZeroHash, false
+		}
+		found = candidate
+	}
+	if hits == 0 {
+		return plumbing.ZeroHash, false
+	}
+	return found, true
+}
+
+// extractMessageReferences lists, in order of first appearance and without
+// duplicates, the commit/tag SHA1s that msg mentions by hex prefix.
+// translateCommit/translateTag use it to add message-reference edges to
+// the translation DFS, so the mapping is fully populated by the time the
+// message is rewritten.
+func (t *translator) extractMessageReferences(msg string) []plumbing.Hash {
+	var refs []plumbing.Hash
+	visited := map[plumbing.Hash]struct{}{}
+	for _, candidate := range hashPattern.FindAllString(msg, -1) {
+		ref, ok := t.resolveMessageRef(candidate)
+		if !ok {
+			continue
+		}
+		if _, already := visited[ref]; !already {
+			visited[ref] = struct{}{}
+			refs = append(refs, ref)
+		}
+	}
+	return refs
+}
+
+// writeOriginNotes writes a `git notes` ref to dst that records each
+// translated commit's original SHA1, keyed by its new SHA256. Standard
+// git tooling (`git log --notes=REF`, `git notes --ref=REF show
+// COMMIT`) can then surface the old hash to anyone with the repo.
+//
+// The notes tree is flat (no fanout).
+// Git supports either layout, and a
+// flat layout keeps this code small; on repos with millions of commits
+// lookups slow down to a linear tree scan, but the data is preserved.
+//
+// Source commits that translate to the same SHA256 commit (for example,
+// commits that differed only in data stripped during translation) share a
+// single note listing every original SHA1, one per line in sorted order.
+// Emitting one tree entry per source commit would otherwise put duplicate
+// names in the notes tree.
+//
+// On success it returns refName and stores the notes commit hash in
+// t.lastNotesCommit. It returns ("", nil) when there are no translated
+// commits to annotate. The ref itself is not updated in this function;
+// presumably the caller points refName at t.lastNotesCommit.
+func (t *translator) writeOriginNotes(refName string) (string, error) {
+	if len(t.commits) == 0 {
+		return "", nil
+	}
+	// Group the original SHA1s by the commit they translated to, so each
+	// new commit yields exactly one tree entry.
+	origins := make(map[plumbing.Hash][]string, len(t.commits))
+	for _, oldSHA1 := range t.commits {
+		newCommit, ok := t.mapping[oldSHA1]
+		if !ok {
+			continue
+		}
+		origins[newCommit] = append(origins[newCommit], oldSHA1.String())
+	}
+	if len(origins) == 0 {
+		return "", nil
+	}
+
+	// One note blob per new commit: the original SHA1 hex values, each
+	// followed by a newline. The tree entry path is the commit's new hash.
+	treeEntries := make([]object.TreeEntry, 0, len(origins))
+	for newCommit, sha1s := range origins {
+		sort.Strings(sha1s)
+		note := strings.Join(sha1s, "\n") + "\n"
+		blobHash, err := t.writeLoose(plumbing.BlobObject, []byte(note))
+		if err != nil {
+			return "", fmt.Errorf("note blob for %s: %w", sha1s[0], err)
+		}
+		treeEntries = append(treeEntries, object.TreeEntry{
+			Name: newCommit.String(),
+			Mode: filemode.Regular,
+			Hash: blobHash,
+		})
+	}
+	sort.Slice(treeEntries, func(i, j int) bool {
+		return treeEntries[i].Name < treeEntries[j].Name
+	})
+	tree := &object.Tree{Entries: treeEntries}
+	treeBody, err := encodeBody(plumbing.TreeObject, tree.Encode)
+	if err != nil {
+		return "", fmt.Errorf("encode notes tree: %w", err)
+	}
+	treeHash, err := t.writeLoose(plumbing.TreeObject, treeBody)
+	if err != nil {
+		return "", fmt.Errorf("store notes tree: %w", err)
+	}
+
+	// The notes commit is parentless and stamped with the current time,
+	// so its hash differs from run to run.
+	now := time.Now().UTC()
+	sig := object.Signature{Name: "git-sync", Email: "noreply@entire.io", When: now}
+	commit := &object.Commit{
+		Author:    sig,
+		Committer: sig,
+		Message:   "git-sync convert-sha256: SHA1 origin notes\n",
+		TreeHash:  treeHash,
+	}
+	commitBody, err := encodeBody(plumbing.CommitObject, commit.Encode)
+	if err != nil {
+		return "", fmt.Errorf("encode notes commit: %w", err)
+	}
+	commitHash, err := t.writeLoose(plumbing.CommitObject, commitBody)
+	if err != nil {
+		return "", fmt.Errorf("store notes commit: %w", err)
+	}
+	t.lastNotesCommit = commitHash
+	return refName, nil
+}
+
+// writeMappingFile dumps the SHA1 → SHA256 mapping as a TSV. Lines are
+// sorted by SHA1 so diffs across runs are stable. Includes every
+// translated object (blob/tree/commit/tag), so external tooling can use
+// it for content-addressed lookups regardless of object kind.
+//
+// The parent directory of path is created if needed. Write, flush and
+// close failures are all reported: a close error on a freshly written
+// file can mean the data never reached disk.
+func (t *translator) writeMappingFile(path string) (err error) {
+	type pair struct{ sha1, sha256 string }
+	pairs := make([]pair, 0, len(t.mapping))
+	for old, newH := range t.mapping {
+		pairs = append(pairs, pair{sha1: old.String(), sha256: newH.String()})
+	}
+	sort.Slice(pairs, func(i, j int) bool { return pairs[i].sha1 < pairs[j].sha1 })
+
+	if dir := filepath.Dir(path); dir != "" {
+		if err = os.MkdirAll(dir, 0o755); err != nil {
+			return fmt.Errorf("mkdir %s: %w", dir, err)
+		}
+	}
+	f, err := os.Create(path)
+	if err != nil {
+		return fmt.Errorf("create %s: %w", path, err)
+	}
+	// Surface the close error unless an earlier failure is already being
+	// returned.
+	defer func() {
+		if cerr := f.Close(); cerr != nil && err == nil {
+			err = fmt.Errorf("close %s: %w", path, cerr)
+		}
+	}()
+
+	w := bufio.NewWriter(f)
+	if _, err = fmt.Fprintln(w, "# sha1\tsha256"); err != nil {
+		return fmt.Errorf("write %s: %w", path, err)
+	}
+	for _, p := range pairs {
+		if _, err = fmt.Fprintf(w, "%s\t%s\n", p.sha1, p.sha256); err != nil {
+			return fmt.Errorf("write %s: %w", path, err)
+		}
+	}
+	if err = w.Flush(); err != nil {
+		return fmt.Errorf("write %s: %w", path, err)
+	}
+	return nil
+}
+
+// writeRefs points every desired target ref at the translated hash of its
+// source tip and returns how many refs were written. Refs are processed in
+// sorted name order, so a failure always leaves the same prefix of refs
+// written. It stops at the first ref whose source tip is missing from
+// mapping or whose SetReference call fails.
+func writeRefs(
+	dst storer.Storer,
+	desired map[plumbing.ReferenceName]planner.DesiredRef,
+	mapping map[plumbing.Hash]plumbing.Hash,
+) (int, error) {
+	names := make([]plumbing.ReferenceName, 0, len(desired))
+	for name := range desired {
+		names = append(names, name)
+	}
+	sort.Slice(names, func(i, j int) bool { return names[i] < names[j] })
+
+	written := 0
+	for _, name := range names {
+		d := desired[name]
+		newHash, ok := mapping[d.SourceHash]
+		if !ok {
+			return written, fmt.Errorf("ref %s tip %s missing from translation map", d.TargetRef, d.SourceHash)
+		}
+		if err := dst.SetReference(plumbing.NewHashReference(d.TargetRef, newHash)); err != nil {
+			return written, fmt.Errorf("set ref %s: %w", d.TargetRef, err)
+		}
+		written++
+	}
+	return written, nil
+}
+
diff --git
a/cmd/git-sync/internal/sha256convert/sha256convert_test.go b/cmd/git-sync/internal/sha256convert/sha256convert_test.go new file mode 100644 index 00000000..9a48e129 --- /dev/null +++ b/cmd/git-sync/internal/sha256convert/sha256convert_test.go @@ -0,0 +1,783 @@ +package sha256convert + +import ( + "bytes" + "compress/zlib" + "context" + "crypto/sha256" + "encoding/hex" + "fmt" + "io" + "net/http" + "net/http/cgi" + "net/http/httptest" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + "time" + + git "github.com/go-git/go-git/v6" + "github.com/go-git/go-git/v6/plumbing" + "github.com/go-git/go-git/v6/plumbing/filemode" + formatcfg "github.com/go-git/go-git/v6/plumbing/format/config" + "github.com/go-git/go-git/v6/plumbing/object" + "github.com/go-git/go-git/v6/storage/filesystem" +) + +// TestTranslator builds a small SHA1 source repo with blobs, trees, commits, +// and an annotated tag — including signed commit/tag — then runs the +// translator and asserts both the bookkeeping counts and the on-disk +// invariant: every loose object's filename equals sha256(headered content). +// That invariant is the one go-git v6 alpha 3 gets wrong via its +// SetEncodedObject path; verifying it directly prevents regressing back +// onto the broken loose-object writer. 
+func TestTranslator(t *testing.T) {
+	root := t.TempDir()
+	srcDir := filepath.Join(root, "src.git")
+	dstDir := filepath.Join(root, "dst.git")
+
+	srcRepo, err := git.PlainInit(srcDir, true)
+	if err != nil {
+		t.Fatalf("init SHA1 source: %v", err)
+	}
+	dstRepo, err := git.PlainInit(dstDir, true, git.WithObjectFormat(formatcfg.SHA256))
+	if err != nil {
+		t.Fatalf("init SHA256 target: %v", err)
+	}
+
+	// Source history: one blob, one tree, two commits (the first carrying
+	// a fake PGP signature) and a signed annotated tag on the second.
+	blobHash := writeBlob(t, srcRepo.Storer, []byte("hello world\n"))
+	treeHash := writeTree(t, srcRepo.Storer, []object.TreeEntry{
+		{Name: "README", Mode: filemode.Regular, Hash: blobHash},
+	})
+
+	sig := object.Signature{Name: "Test", Email: "test@example.com", When: time.Unix(1700000000, 0).UTC()}
+	commit1 := &object.Commit{
+		Author:    sig,
+		Committer: sig,
+		Message:   "initial\n",
+		TreeHash:  treeHash,
+		Signature: "-----BEGIN PGP SIGNATURE-----\nfake sig data\n-----END PGP SIGNATURE-----",
+	}
+	c1Hash := writeObject(t, srcRepo.Storer, commit1.Encode)
+
+	commit2 := &object.Commit{
+		Author:       sig,
+		Committer:    sig,
+		Message:      "second\n",
+		TreeHash:     treeHash,
+		ParentHashes: []plumbing.Hash{c1Hash},
+	}
+	c2Hash := writeObject(t, srcRepo.Storer, commit2.Encode)
+
+	tag := &object.Tag{
+		Name:       "v1",
+		Tagger:     sig,
+		Message:    "annotated tag\n",
+		TargetType: plumbing.CommitObject,
+		Target:     c2Hash,
+		Signature:  "-----BEGIN PGP SIGNATURE-----\nfake tag sig\n-----END PGP SIGNATURE-----",
+	}
+	tagHash := writeObject(t, srcRepo.Storer, tag.Encode)
+
+	// Translate starting from the tag; the counts asserted below cover
+	// the tag, both commits, the tree and the blob.
+	tr, err := newTranslator(srcRepo.Storer, dstRepo.Storer, dstDir, false)
+	if err != nil {
+		t.Fatalf("newTranslator: %v", err)
+	}
+	if err := tr.discover([]plumbing.Hash{tagHash}); err != nil {
+		t.Fatalf("discover: %v", err)
+	}
+	newTagHash, err := tr.translate(tagHash)
+	if err != nil {
+		t.Fatalf("translate tag: %v", err)
+	}
+
+	wantCounts := Counts{Blobs: 1, Trees: 1, Commits: 2, Tags: 1}
+	if tr.counts != wantCounts {
+		t.Errorf("counts: got %+v, want %+v", tr.counts, wantCounts)
+	}
+	// commit1 and the tag each carry one signature; commit2 has none.
+	if tr.signaturesStripped != 2 {
+		t.Errorf("signatures stripped: got %d, want 2 (commit + tag)", tr.signaturesStripped)
+	}
+
+	// Idempotency: translating the same hash again must reuse the mapping
+	// without writing more objects or bumping counters.
+	startBlobs := tr.counts.Blobs
+	if _, err := tr.translate(tagHash); err != nil {
+		t.Fatalf("re-translate tag: %v", err)
+	}
+	if tr.counts.Blobs != startBlobs {
+		t.Errorf("re-translate increased blob count; memoization broken")
+	}
+
+	// Every translated hash must point at a loose object whose filename
+	// equals sha256(headered content). This is the precise invariant the
+	// go-git bug violates — keep it as a test.
+	objectsDir := filepath.Join(dstDir, "objects")
+	verified := 0
+	for _, h := range tr.mapping {
+		assertLooseObjectHashMatches(t, objectsDir, h)
+		verified++
+	}
+	if verified == 0 {
+		t.Fatal("no objects in mapping; nothing was verified")
+	}
+
+	// The translated tag must decode under the SHA256 target and point at
+	// a SHA256 commit whose tree resolves to a SHA256 tree.
+	tagObj, err := object.GetTag(dstRepo.Storer, newTagHash)
+	if err != nil {
+		t.Fatalf("read translated tag: %v", err)
+	}
+	if tagObj.Signature != "" {
+		t.Errorf("translated tag still carries a signature: %q", tagObj.Signature)
+	}
+	if tagObj.Target != tr.mapping[c2Hash] {
+		t.Errorf("translated tag target: got %s, want %s", tagObj.Target, tr.mapping[c2Hash])
+	}
+
+	// Follow the tag to commit2's translation and check that its parent
+	// and tree links were rewritten to the mapped hashes.
+	commit, err := object.GetCommit(dstRepo.Storer, tagObj.Target)
+	if err != nil {
+		t.Fatalf("read translated commit: %v", err)
+	}
+	if commit.Signature != "" {
+		t.Errorf("translated commit still carries a signature: %q", commit.Signature)
+	}
+	if len(commit.ParentHashes) != 1 || commit.ParentHashes[0] != tr.mapping[c1Hash] {
+		t.Errorf("translated commit parents: got %v, want [%s]", commit.ParentHashes, tr.mapping[c1Hash])
+	}
+	if commit.TreeHash != tr.mapping[treeHash] {
+		t.Errorf("translated commit tree: got %s, want %s", commit.TreeHash, tr.mapping[treeHash])
+	}
+}
+
+// TestTranslator_RewritesMessageHashes confirms that SHA1 hash references
+// in commit and tag messages — both full 40-char and short forms — are
+// rewritten to the corresponding SHA256 when those SHA1s are translated
+// objects in the same conversion, and that ambiguous/unknown short
+// prefixes are left alone.
+func TestTranslator_RewritesMessageHashes(t *testing.T) {
+	root := t.TempDir()
+	srcDir := filepath.Join(root, "src.git")
+	dstDir := filepath.Join(root, "dst.git")
+
+	srcRepo, err := git.PlainInit(srcDir, true)
+	if err != nil {
+		t.Fatalf("init SHA1 source: %v", err)
+	}
+	dstRepo, err := git.PlainInit(dstDir, true, git.WithObjectFormat(formatcfg.SHA256))
+	if err != nil {
+		t.Fatalf("init SHA256 target: %v", err)
+	}
+
+	// Both commits share one tree; only their messages and parents differ.
+	blobHash := writeBlob(t, srcRepo.Storer, []byte("x\n"))
+	treeHash := writeTree(t, srcRepo.Storer, []object.TreeEntry{
+		{Name: "f", Mode: filemode.Regular, Hash: blobHash},
+	})
+	sig := object.Signature{Name: "Test", Email: "t@example.com", When: time.Unix(1700000000, 0).UTC()}
+	parent := &object.Commit{Author: sig, Committer: sig, Message: "first\n", TreeHash: treeHash}
+	parentSHA1 := writeObject(t, srcRepo.Storer, parent.Encode)
+
+	// Child commit's message references the parent by full hash, by 7-char
+	// short prefix, and includes an unrelated 7-char hex string that should
+	// not match anything in the mapping.
+	parentHex := parentSHA1.String()
+	childMsg := fmt.Sprintf(
+		"reverts %s\nsee short %s for context\nunrelated hex 1234567 follows\n",
+		parentHex, parentHex[:7])
+	child := &object.Commit{
+		Author:       sig,
+		Committer:    sig,
+		Message:      childMsg,
+		TreeHash:     treeHash,
+		ParentHashes: []plumbing.Hash{parentSHA1},
+	}
+	childSHA1 := writeObject(t, srcRepo.Storer, child.Encode)
+
+	// NOTE(review): the final `true` argument presumably enables message
+	// rewriting (the skip-rewrite test passes false) — confirm against
+	// newTranslator's signature.
+	tr, err := newTranslator(srcRepo.Storer, dstRepo.Storer, dstDir, true)
+	if err != nil {
+		t.Fatalf("newTranslator: %v", err)
+	}
+	if err := tr.discover([]plumbing.Hash{childSHA1}); err != nil {
+		t.Fatalf("discover: %v", err)
+	}
+	if _, err := tr.translate(childSHA1); err != nil {
+		t.Fatalf("translate child: %v", err)
+	}
+
+	// 2 references should have been rewritten (full + short). The unrelated
+	// 7-char hex string is not in the mapping, so it stays.
+	if tr.messageRewrites != 2 {
+		t.Errorf("message rewrites: got %d, want 2", tr.messageRewrites)
+	}
+
+	// Read the translated child back and inspect its message text.
+	childNew := tr.mapping[childSHA1]
+	parentNew := tr.mapping[parentSHA1]
+	gotChild, err := object.GetCommit(dstRepo.Storer, childNew)
+	if err != nil {
+		t.Fatalf("read translated child: %v", err)
+	}
+	if !strings.Contains(gotChild.Message, parentNew.String()) {
+		t.Errorf("child message missing full SHA256 of parent:\n%s", gotChild.Message)
+	}
+	if strings.Contains(gotChild.Message, parentHex) {
+		t.Errorf("child message still contains original parent SHA1:\n%s", gotChild.Message)
+	}
+	if !strings.Contains(gotChild.Message, "1234567") {
+		t.Errorf("unrelated short hex was wrongly substituted:\n%s", gotChild.Message)
+	}
+}
+
+// TestTranslator_RewritesCrossBranchReferences is the test that proves the
+// discovery-plus-topological-DFS design fixes the cross-branch limitation
+// the older inline-only rewriter had. Two unrelated branches share no
+// ancestry. Branch A has a single commit cA. Branch B has commit cB whose
+// message references cA by both full and abbreviated SHA1. We translate B
+// first, *then* A — the order under which the older code would have left
+// cB's message un-rewritten because cA was not yet in the mapping when cB
+// was encoded. With message-reference edges in the DFS, translating cB
+// pulls cA in via t.translate, so the mapping is populated and the
+// rewrite succeeds.
+func TestTranslator_RewritesCrossBranchReferences(t *testing.T) {
+	root := t.TempDir()
+	srcDir := filepath.Join(root, "src.git")
+	dstDir := filepath.Join(root, "dst.git")
+	// Fail fast on setup errors instead of dereferencing a nil repo later.
+	srcRepo, err := git.PlainInit(srcDir, true)
+	if err != nil {
+		t.Fatalf("init SHA1 source: %v", err)
+	}
+	dstRepo, err := git.PlainInit(dstDir, true, git.WithObjectFormat(formatcfg.SHA256))
+	if err != nil {
+		t.Fatalf("init SHA256 target: %v", err)
+	}
+
+	blobA := writeBlob(t, srcRepo.Storer, []byte("a\n"))
+	treeA := writeTree(t, srcRepo.Storer, []object.TreeEntry{
+		{Name: "a", Mode: filemode.Regular, Hash: blobA},
+	})
+	blobB := writeBlob(t, srcRepo.Storer, []byte("b\n"))
+	treeB := writeTree(t, srcRepo.Storer, []object.TreeEntry{
+		{Name: "b", Mode: filemode.Regular, Hash: blobB},
+	})
+
+	sig := object.Signature{Name: "Test", Email: "t@example.com", When: time.Unix(1700000000, 0).UTC()}
+	cA := writeObject(t, srcRepo.Storer, (&object.Commit{
+		Author: sig, Committer: sig, Message: "branch A tip\n", TreeHash: treeA,
+	}).Encode)
+	// cB has no parent in common with cA — they are siblings under
+	// no ancestor, exactly the case where ancestor-only inline
+	// rewriting would have failed.
+	cAHex := cA.String()
+	cB := writeObject(t, srcRepo.Storer, (&object.Commit{
+		Author:    sig,
+		Committer: sig,
+		Message: fmt.Sprintf("branch B tip\n\nCherry-picked from %s\nsee short %s\n",
+			cAHex, cAHex[:8]),
+		TreeHash: treeB,
+	}).Encode)
+
+	tr, err := newTranslator(srcRepo.Storer, dstRepo.Storer, dstDir, true)
+	if err != nil {
+		t.Fatalf("newTranslator: %v", err)
+	}
+	// Discovery must see both branches so the reachable set covers cA
+	// before cB is encoded.
+	if err := tr.discover([]plumbing.Hash{cB, cA}); err != nil {
+		t.Fatalf("discover: %v", err)
+	}
+	// Translate B first — the order that would have left the rewrite
+	// stranded under the old design.
+	if _, err := tr.translate(cB); err != nil {
+		t.Fatalf("translate cB: %v", err)
+	}
+	if _, err := tr.translate(cA); err != nil {
+		t.Fatalf("translate cA: %v", err)
+	}
+
+	if tr.messageRewrites != 2 {
+		t.Errorf("expected 2 rewrites (full + short SHA1 of cA), got %d", tr.messageRewrites)
+	}
+	cBNew := tr.mapping[cB]
+	cANew := tr.mapping[cA]
+	if cBNew.IsZero() || cANew.IsZero() {
+		t.Fatalf("missing mapping entries: cB=%s cA=%s", cBNew, cANew)
+	}
+	gotB, err := object.GetCommit(dstRepo.Storer, cBNew)
+	if err != nil {
+		t.Fatalf("read cB: %v", err)
+	}
+	if !strings.Contains(gotB.Message, cANew.String()) {
+		t.Errorf("cB's message missing cA's SHA256:\n%s", gotB.Message)
+	}
+	if strings.Contains(gotB.Message, cAHex) {
+		t.Errorf("cB's message still contains cA's original SHA1:\n%s", gotB.Message)
+	}
+}
+
+// TestTranslator_SkipMessageRewrite confirms that with rewriteMessages
+// false, the translator leaves message content (including SHA1 hashes)
+// untouched.
+func TestTranslator_SkipMessageRewrite(t *testing.T) {
+	root := t.TempDir()
+	dstDir := filepath.Join(root, "dst.git")
+	srcRepo, err := git.PlainInit(filepath.Join(root, "src.git"), true)
+	if err != nil {
+		t.Fatalf("init SHA1 source: %v", err)
+	}
+	dstRepo, err := git.PlainInit(dstDir, true, git.WithObjectFormat(formatcfg.SHA256))
+	if err != nil {
+		t.Fatalf("init SHA256 target: %v", err)
+	}
+
+	blob := writeBlob(t, srcRepo.Storer, []byte("x\n"))
+	tree := writeTree(t, srcRepo.Storer, []object.TreeEntry{{Name: "f", Mode: filemode.Regular, Hash: blob}})
+	sig := object.Signature{Name: "Test", Email: "t@example.com", When: time.Unix(1, 0).UTC()}
+	parent := writeObject(t, srcRepo.Storer, (&object.Commit{Author: sig, Committer: sig, Message: "p\n", TreeHash: tree}).Encode)
+	parentHex := parent.String()
+
+	child := &object.Commit{
+		Author: sig, Committer: sig, TreeHash: tree, ParentHashes: []plumbing.Hash{parent},
+		Message: "reverts " + parentHex + "\n",
+	}
+	childSHA1 := writeObject(t, srcRepo.Storer, child.Encode)
+
+	tr, err := newTranslator(srcRepo.Storer, dstRepo.Storer, dstDir, false)
+	if err != nil {
+		t.Fatalf("newTranslator: %v", err)
+	}
+	if err := tr.discover([]plumbing.Hash{childSHA1}); err != nil {
+		t.Fatalf("discover: %v", err)
+	}
+	if _, err := tr.translate(childSHA1); err != nil {
+		t.Fatalf("translate: %v", err)
+	}
+	if tr.messageRewrites != 0 {
+		t.Errorf("expected no rewrites when disabled; got %d", tr.messageRewrites)
+	}
+	got, err := object.GetCommit(dstRepo.Storer, tr.mapping[childSHA1])
+	if err != nil {
+		t.Fatalf("read translated child: %v", err)
+	}
+	if !strings.Contains(got.Message, parentHex) {
+		t.Errorf("rewrite-disabled run still mutated the message: %q", got.Message)
+	}
+}
+
+// TestTranslator_WriteOriginNotes builds a small history and verifies that
+// the notes tree contains one entry per translated commit and that each
+// entry resolves to a blob whose content is the commit's original SHA1.
+func TestTranslator_WriteOriginNotes(t *testing.T) {
+	root := t.TempDir()
+	dstDir := filepath.Join(root, "dst.git")
+	srcRepo, err := git.PlainInit(filepath.Join(root, "src.git"), true)
+	if err != nil {
+		t.Fatalf("init SHA1 source: %v", err)
+	}
+	dstRepo, err := git.PlainInit(dstDir, true, git.WithObjectFormat(formatcfg.SHA256))
+	if err != nil {
+		t.Fatalf("init SHA256 target: %v", err)
+	}
+
+	blob := writeBlob(t, srcRepo.Storer, []byte("hi\n"))
+	tree := writeTree(t, srcRepo.Storer, []object.TreeEntry{{Name: "f", Mode: filemode.Regular, Hash: blob}})
+	sig := object.Signature{Name: "Test", Email: "t@example.com", When: time.Unix(1700000000, 0).UTC()}
+	c1 := writeObject(t, srcRepo.Storer, (&object.Commit{Author: sig, Committer: sig, Message: "c1\n", TreeHash: tree}).Encode)
+	c2 := writeObject(t, srcRepo.Storer, (&object.Commit{Author: sig, Committer: sig, Message: "c2\n", TreeHash: tree, ParentHashes: []plumbing.Hash{c1}}).Encode)
+
+	tr, err := newTranslator(srcRepo.Storer, dstRepo.Storer, dstDir, false)
+	if err != nil {
+		t.Fatalf("newTranslator: %v", err)
+	}
+	if err := tr.discover([]plumbing.Hash{c2}); err != nil {
+		t.Fatalf("discover: %v", err)
+	}
+	if _, err := tr.translate(c2); err != nil {
+		t.Fatalf("translate: %v", err)
+	}
+
+	refName, err := tr.writeOriginNotes(originNotesRef)
+	if err != nil {
+		t.Fatalf("writeOriginNotes: %v", err)
+	}
+	if refName != originNotesRef {
+		t.Errorf("ref name: got %q, want %q", refName, originNotesRef)
+	}
+	notesCommit, err := object.GetCommit(dstRepo.Storer, tr.lastNotesCommit)
+	if err != nil {
+		t.Fatalf("read notes commit: %v", err)
+	}
+	notesTree, err := notesCommit.Tree()
+	if err != nil {
+		t.Fatalf("read notes tree: %v", err)
+	}
+	if len(notesTree.Entries) != 2 {
+		t.Fatalf("notes entries: got %d, want 2", len(notesTree.Entries))
+	}
+	// Each source commit's note is filed under its translated hash and
+	// must contain exactly that commit's original SHA1.
+	for _, orig := range []plumbing.Hash{c1, c2} {
+		mapped := tr.mapping[orig]
+		entry, err := notesTree.FindEntry(mapped.String())
+		if err != nil {
+			t.Fatalf("no notes entry for %s: %v", mapped, err)
+		}
+		noteBlob, err := object.GetBlob(dstRepo.Storer, entry.Hash)
+		if err != nil {
+			t.Fatalf("read note blob: %v", err)
+		}
+		reader, err := noteBlob.Reader()
+		if err != nil {
+			t.Fatalf("open note blob: %v", err)
+		}
+		buf, err := io.ReadAll(reader)
+		_ = reader.Close() // read-only handle; nothing to recover from a close error
+		if err != nil {
+			t.Fatalf("read note content: %v", err)
+		}
+		if got := strings.TrimSpace(string(buf)); got != orig.String() {
+			t.Errorf("note for %s: got %q, want %q", mapped, got, orig.String())
+		}
+	}
+}
+
+// TestTranslator_WriteMappingFile checks the sidecar TSV format: header
+// line, sorted by SHA1, one entry per translated object.
+func TestTranslator_WriteMappingFile(t *testing.T) {
+	root := t.TempDir()
+	dstDir := filepath.Join(root, "dst.git")
+	// Fail fast on setup errors instead of dereferencing a nil repo later.
+	srcRepo, err := git.PlainInit(filepath.Join(root, "src.git"), true)
+	if err != nil {
+		t.Fatalf("init SHA1 source: %v", err)
+	}
+	dstRepo, err := git.PlainInit(dstDir, true, git.WithObjectFormat(formatcfg.SHA256))
+	if err != nil {
+		t.Fatalf("init SHA256 target: %v", err)
+	}
+
+	blob := writeBlob(t, srcRepo.Storer, []byte("hi\n"))
+	tree := writeTree(t, srcRepo.Storer, []object.TreeEntry{{Name: "f", Mode: filemode.Regular, Hash: blob}})
+	sig := object.Signature{Name: "Test", Email: "t@example.com", When: time.Unix(1700000000, 0).UTC()}
+	commit := writeObject(t, srcRepo.Storer, (&object.Commit{Author: sig, Committer: sig, Message: "c\n", TreeHash: tree}).Encode)
+
+	tr, err := newTranslator(srcRepo.Storer, dstRepo.Storer, dstDir, false)
+	if err != nil {
+		t.Fatalf("newTranslator: %v", err)
+	}
+	if err := tr.discover([]plumbing.Hash{commit}); err != nil {
+		t.Fatalf("discover: %v", err)
+	}
+	if _, err := tr.translate(commit); err != nil {
+		t.Fatalf("translate: %v", err)
+	}
+
+	path := filepath.Join(root, "mapping.tsv")
+	if err := tr.writeMappingFile(path); err != nil {
+		t.Fatalf("writeMappingFile: %v", err)
+	}
+	raw, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("read mapping: %v", err)
+	}
+	lines := strings.Split(strings.TrimRight(string(raw), "\n"), "\n")
+	if !strings.HasPrefix(lines[0], "#") {
+		t.Errorf("first line should be a header comment, got %q", lines[0])
+	}
+	data := lines[1:]
+	if len(data) != len(tr.mapping) {
+		t.Errorf("mapping line count: got %d, want %d", len(data), len(tr.mapping))
+	}
+	// Sorted by SHA1.
+	for i := 1; i < len(data); i++ {
+		prev := strings.Split(data[i-1], "\t")[0]
+		cur := strings.Split(data[i], "\t")[0]
+		if prev >= cur {
+			t.Errorf("mapping not sorted: %q >= %q", prev, cur)
+		}
+	}
+	// Every translated hash present.
+	mapped := map[string]string{}
+	for _, line := range data {
+		parts := strings.Split(line, "\t")
+		if len(parts) != 2 {
+			t.Errorf("malformed line %q", line)
+			continue
+		}
+		mapped[parts[0]] = parts[1]
+	}
+	for old, newH := range tr.mapping {
+		if mapped[old.String()] != newH.String() {
+			t.Errorf("missing or wrong mapping for %s: got %q, want %s", old, mapped[old.String()], newH)
+		}
+	}
+}
+
+// TestTranslator_UnresolvableSubmodule confirms that a tree entry with
+// Submodule mode pointing at a commit not in the source repo causes a
+// clear error rather than silently producing a malformed SHA256 tree.
+func TestTranslator_UnresolvableSubmodule(t *testing.T) {
+	root := t.TempDir()
+	srcDir := filepath.Join(root, "src.git")
+	dstDir := filepath.Join(root, "dst.git")
+
+	srcRepo, err := git.PlainInit(srcDir, true)
+	if err != nil {
+		t.Fatalf("init SHA1 source: %v", err)
+	}
+	dstRepo, err := git.PlainInit(dstDir, true, git.WithObjectFormat(formatcfg.SHA256))
+	if err != nil {
+		t.Fatalf("init SHA256 target: %v", err)
+	}
+
+	blobHash := writeBlob(t, srcRepo.Storer, []byte("contents\n"))
+	// External-looking SHA1 — not in source.
+	external := plumbing.NewHash("0123456789abcdef0123456789abcdef01234567")
+	treeHash := writeTree(t, srcRepo.Storer, []object.TreeEntry{
+		{Name: "file", Mode: filemode.Regular, Hash: blobHash},
+		{Name: "sub", Mode: filemode.Submodule, Hash: external},
+	})
+
+	tr, err := newTranslator(srcRepo.Storer, dstRepo.Storer, dstDir, false)
+	if err != nil {
+		t.Fatalf("newTranslator: %v", err)
+	}
+	if err := tr.discover([]plumbing.Hash{treeHash}); err != nil {
+		t.Fatalf("discover: %v", err)
+	}
+	_, err = tr.translate(treeHash)
+	if err == nil {
+		t.Fatal("expected error for unresolvable submodule, got nil")
+	}
+	if !strings.Contains(err.Error(), "submodule") {
+		t.Errorf("error should mention submodule; got: %v", err)
+	}
+}
+
+// --- helpers ---
+
+// writeBlob stores content as a blob in storer and returns its hash,
+// failing the test on any error.
+func writeBlob(t *testing.T, storer interface {
+	NewEncodedObject() plumbing.EncodedObject
+	SetEncodedObject(plumbing.EncodedObject) (plumbing.Hash, error)
+}, content []byte) plumbing.Hash {
+	t.Helper()
+	obj := storer.NewEncodedObject()
+	obj.SetType(plumbing.BlobObject)
+	obj.SetSize(int64(len(content)))
+	w, err := obj.Writer()
+	if err != nil {
+		t.Fatalf("blob writer: %v", err)
+	}
+	if _, err := w.Write(content); err != nil {
+		t.Fatalf("blob write: %v", err)
+	}
+	if err := w.Close(); err != nil {
+		t.Fatalf("blob close: %v", err)
+	}
+	h, err := storer.SetEncodedObject(obj)
+	if err != nil {
+		t.Fatalf("blob store: %v", err)
+	}
+	return h
+}
+
+// writeTree stores a tree with the given entries in storer and returns its
+// hash. The entries are encoded in the order given: this helper does not
+// sort them, so callers must pass them already sorted by name.
+func writeTree(t *testing.T, storer interface {
+	NewEncodedObject() plumbing.EncodedObject
+	SetEncodedObject(plumbing.EncodedObject) (plumbing.Hash, error)
+}, entries []object.TreeEntry) plumbing.Hash {
+	t.Helper()
+	tree := &object.Tree{Entries: entries}
+	return writeObject(t, storer, tree.Encode)
+}
+
+// writeObject encodes an object into a fresh EncodedObject via encode,
+// stores it in storer and returns its hash, failing the test on any error.
+func writeObject(t *testing.T, storer interface {
+	NewEncodedObject() plumbing.EncodedObject
+	SetEncodedObject(plumbing.EncodedObject) (plumbing.Hash, error)
+}, encode func(plumbing.EncodedObject) error) plumbing.Hash {
+	t.Helper()
+	obj := storer.NewEncodedObject()
+	if err := encode(obj); err != nil {
+		t.Fatalf("encode: %v", err)
+	}
+	h, err := storer.SetEncodedObject(obj)
+	if err != nil {
+		t.Fatalf("store: %v", err)
+	}
+	return h
+}
+
+// assertLooseObjectHashMatches reads the on-disk loose object for h, zlib-
+// decompresses it, and confirms sha256(decompressed bytes) == h. The
+// decompressed bytes include the object header (type name, space, decimal
+// size, NUL byte), which is what git hashes — so this is a direct check on
+// the loose writer's correctness.
+func assertLooseObjectHashMatches(t *testing.T, objectsDir string, h plumbing.Hash) {
+	t.Helper()
+	// Named `want` rather than `hex` so the encoding/hex package is not
+	// shadowed inside this function.
+	want := h.String()
+	if len(want) != 64 {
+		t.Errorf("hash %s is not 64 hex chars (sha256)", want)
+		return
+	}
+	path := filepath.Join(objectsDir, want[:2], want[2:])
+	raw, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("read %s: %v", path, err)
+	}
+	zr, err := zlib.NewReader(bytes.NewReader(raw))
+	if err != nil {
+		t.Fatalf("zlib %s: %v", path, err)
+	}
+	defer zr.Close()
+	plain, err := io.ReadAll(zr)
+	if err != nil {
+		t.Fatalf("decompress %s: %v", path, err)
+	}
+	sum := sha256.Sum256(plain)
+	got := makeHex(sum[:])
+	if got != want {
+		t.Errorf("loose object %s: sha256(content) = %s; filename and content disagree", want, got)
+	}
+}
+
+// makeHex returns the lowercase hex encoding of b.
+func makeHex(b []byte) string {
+	return hex.EncodeToString(b)
+}
+
+// --- Integration test (gated) ---
+
+const gitHTTPBackendEnv = "GITSYNC_E2E_SHA256_HTTP_BACKEND"
+
+// TestRun_GitHTTPBackend exercises the full convert-sha256 pipeline against
+// a local git http-backend serving a real SHA1 source repo. Gated like the
+// other end-to-end git-http-backend tests to keep the default test runs
+// hermetic (no external binaries required).
+func TestRun_GitHTTPBackend(t *testing.T) { + if os.Getenv(gitHTTPBackendEnv) == "" { + t.Skipf("set %s=1 to run the convert-sha256 git-http-backend integration test", gitHTTPBackendEnv) + } + gitBin, err := exec.LookPath("git") + if err != nil { + t.Skipf("git binary not available: %v", err) + } + + root := t.TempDir() + srcBare := filepath.Join(root, "source.git") + worktree := filepath.Join(root, "work") + dstDir := filepath.Join(root, "target.git") + + mustGit(t, root, "init", "--bare", srcBare) + mustGit(t, root, "init", "-b", "main", worktree) + mustGit(t, worktree, "config", "user.name", "convert-sha256 test") + mustGit(t, worktree, "config", "user.email", "test@example.com") + mustWrite(t, filepath.Join(worktree, "README"), "hello\n") + mustGit(t, worktree, "add", "README") + mustGit(t, worktree, "commit", "-m", "initial") + // Capture the first commit's SHA1 so the second commit's message can + // reference it (both full and abbreviated). The conversion should + // rewrite both to the new SHA256 hash. 
+ firstSHA1 := strings.TrimSpace(mustGitOutput(t, worktree, "rev-parse", "HEAD")) + mustWrite(t, filepath.Join(worktree, "second.txt"), "world\n") + mustGit(t, worktree, "add", "second.txt") + mustGit(t, worktree, "commit", "-m", + fmt.Sprintf("second\n\nreverts %s\nsee short %s", firstSHA1, firstSHA1[:7])) + mustGit(t, worktree, "tag", "-a", "v1", "-m", "first tag") + mustGit(t, worktree, "remote", "add", "origin", srcBare) + mustGit(t, worktree, "push", "origin", "HEAD:refs/heads/main") + mustGit(t, worktree, "push", "origin", "v1") + + srv := newCGIBackend(t, gitBin, root) + defer srv.Close() + + mappingPath := filepath.Join(root, "mapping.tsv") + res, err := Run(context.Background(), Request{ + SourceURL: srv.URL + "/source.git", + TargetDir: dstDir, + IncludeTags: true, + MappingFile: mappingPath, + Out: io.Discard, + }) + if err != nil { + t.Fatalf("convert-sha256 run: %v", err) + } + if res.Counts.Commits < 2 { + t.Errorf("expected at least 2 commits converted, got %+v", res.Counts) + } + if res.Counts.Tags != 1 { + t.Errorf("expected 1 tag converted, got %d", res.Counts.Tags) + } + if res.RefsConverted < 2 { + t.Errorf("expected at least 2 refs (main + v1), got %d", res.RefsConverted) + } + + // The converted repo must be self-consistent under SHA256. + fsckOut, err := exec.Command(gitBin, "-C", dstDir, "fsck", "--full").CombinedOutput() + if err != nil { + t.Fatalf("git fsck failed: %v\n%s", err, fsckOut) + } + if strings.Contains(string(fsckOut), "error") || strings.Contains(string(fsckOut), "bad sha") { + t.Fatalf("git fsck reported errors:\n%s", fsckOut) + } + + // Sanity: extensions.objectformat is set, and git can walk the history. 
+ format := mustGitOutput(t, dstDir, "config", "extensions.objectformat") + if strings.TrimSpace(format) != "sha256" { + t.Errorf("extensions.objectformat: got %q, want %q", strings.TrimSpace(format), "sha256") + } + log := mustGitOutput(t, dstDir, "log", "--oneline", "refs/heads/main") + if !strings.Contains(log, "initial") || !strings.Contains(log, "second") { + t.Errorf("git log missing expected commit subjects:\n%s", log) + } + tagShow := mustGitOutput(t, dstDir, "cat-file", "-p", "refs/tags/v1") + if !strings.Contains(tagShow, "first tag") { + t.Errorf("annotated tag did not round-trip:\n%s", tagShow) + } + + // Message rewriting: the second commit's body referenced firstSHA1 + // twice (full + 7-char short). Both should now be SHA256 hashes. + if res.MessageRewrites != 2 { + t.Errorf("message rewrites: got %d, want 2", res.MessageRewrites) + } + secondMsg := mustGitOutput(t, dstDir, "log", "-1", "--format=%B", "refs/heads/main") + if strings.Contains(secondMsg, firstSHA1) { + t.Errorf("second commit message still contains the original SHA1:\n%s", secondMsg) + } + + // Origin notes: the ref exists, and the head commit's note resolves + // to the original SHA1 it was rewritten from. + if res.OriginNotesRef != "refs/notes/sha1-origin" { + t.Errorf("OriginNotesRef: got %q, want refs/notes/sha1-origin", res.OriginNotesRef) + } + headSHA256 := strings.TrimSpace(mustGitOutput(t, dstDir, "rev-parse", "refs/heads/main")) + note := strings.TrimSpace(mustGitOutput(t, dstDir, "notes", "--ref=sha1-origin", "show", headSHA256)) + // The note for the second (head) commit holds its pre-conversion SHA1. + headSHA1 := strings.TrimSpace(mustGitOutput(t, srcBare, "rev-parse", "refs/heads/main")) + if note != headSHA1 { + t.Errorf("origin note for head: got %q, want %q", note, headSHA1) + } + + // Mapping file: present, sorted, has at least one entry per + // translated commit/tree/blob/tag. 
+ if res.MappingFile != mappingPath { + t.Errorf("MappingFile: got %q, want %q", res.MappingFile, mappingPath) + } + mapping, err := os.ReadFile(mappingPath) + if err != nil { + t.Fatalf("read mapping file: %v", err) + } + if !strings.Contains(string(mapping), headSHA1) { + t.Errorf("mapping file missing head SHA1 %s:\n%s", headSHA1, mapping) + } +} + +func mustGit(t *testing.T, dir string, args ...string) { + t.Helper() + cmd := exec.Command("git", args...) + cmd.Dir = dir + cmd.Env = append(os.Environ(), "GIT_TERMINAL_PROMPT=0") + if out, err := cmd.CombinedOutput(); err != nil { + t.Fatalf("git %s: %v\n%s", strings.Join(args, " "), err, out) + } +} + +func mustGitOutput(t *testing.T, dir string, args ...string) string { + t.Helper() + cmd := exec.Command("git", args...) + cmd.Dir = dir + cmd.Env = append(os.Environ(), "GIT_TERMINAL_PROMPT=0") + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("git %s: %v\n%s", strings.Join(args, " "), err, out) + } + return string(out) +} + +func mustWrite(t *testing.T, path, content string) { + t.Helper() + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatalf("write %s: %v", path, err) + } +} + +type cgiBackend struct { + *httptest.Server +} + +func newCGIBackend(t *testing.T, gitBin, root string) *cgiBackend { + t.Helper() + handler := &cgi.Handler{ + Path: gitBin, + Args: []string{"http-backend"}, + Env: []string{ + "GIT_PROJECT_ROOT=" + root, + "GIT_HTTP_EXPORT_ALL=1", + }, + } + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + handler.ServeHTTP(w, r) + })) + return &cgiBackend{Server: srv} +} + +// Compile-time sanity: confirm the storers the translator expects are still +// the filesystem-backed type that PlainInit returns. If a future go-git +// release changes the concrete storer, the type assertion in newTranslator +// will start failing in this package's tests rather than only at runtime +// against a real repo. 
+var _ = (*filesystem.Storage)(nil) diff --git a/cmd/git-sync/root.go b/cmd/git-sync/root.go index 651eb4d6..f0c73da5 100644 --- a/cmd/git-sync/root.go +++ b/cmd/git-sync/root.go @@ -36,6 +36,7 @@ seed an empty target (bootstrap), or inspect either side (probe, fetch).`, cmd.AddCommand(newBootstrapCmd()) cmd.AddCommand(newProbeCmd()) cmd.AddCommand(newFetchCmd()) + cmd.AddCommand(newConvertSHA256Cmd()) cmd.AddCommand(newVersionCmd()) return cmd diff --git a/docs/convert-sha256.md b/docs/convert-sha256.md new file mode 100644 index 00000000..da0dbed0 --- /dev/null +++ b/docs/convert-sha256.md @@ -0,0 +1,297 @@ +# SHA1 → SHA256 Conversion + +`git-sync convert-sha256` is a one-off migration command that fetches a pack +from a SHA1 HTTP source and writes a new SHA256 bare repository on disk. +Every reachable object is re-hashed under SHA256 and tree, commit, and tag +references are rewritten accordingly. + +The command is intentionally narrow: it does not push to a remote, it does +not modify the source, and it is meant to be run once per repo. Resulting +SHA256 hashes have no relation to the original SHA1 hashes beyond a +mapping that the command can optionally emit. + +## Quick Start + +```bash +git-sync convert-sha256 --tags \ + https://github.com/source-org/source-repo.git \ + /path/to/out.git +``` + +The target directory must not exist or must be empty. The result is a bare +repository with `extensions.objectformat = sha256` and a `refs/notes/sha1-origin` +ref recording each commit's pre-conversion SHA1. + +For a private source, pass the token via the environment so it isn't +exposed in `ps`: + +```bash +GITSYNC_SOURCE_TOKEN=ghp_xxx git-sync convert-sha256 --tags \ + https://github.com/source-org/private-repo.git \ + /path/to/out.git +``` + +## What It Does + +1. Probes the source via smart HTTP and discovers refs matching the + requested scope (`--branch`, `--tags`, `--all-refs`, `--map`, + `--exclude-ref-prefix`). +2. 
Fetches a single self-contained pack via `upload-pack` and lands it in + a temporary on-disk SHA1 bare repo. The temp directory is cleaned up + on exit unless `--keep-source-objects` is passed. +3. Initializes the target as a bare SHA256 repository + (`git init --object-format=sha256` equivalent). +4. Runs a **discovery pass** that walks every reachable object from + each desired ref tip and records its SHA1 and object type. This + gives the rewriter an authoritative "what's in scope" set so + abbreviated message references can be resolved consistently and + message-reference edges can be added to the translation graph. +5. Translates every reachable object in topological order via a + memoized DFS: + - **Blobs**: re-hashed under SHA256; content unchanged. + - **Trees**: each entry's hash translated via the in-memory mapping; + submodule gitlinks translated too when the referenced commit is in + this repo, otherwise the run errors. + - **Commits**: `tree` and `parent` hashes translated; GPG signatures + dropped; `mergetag` extra headers dropped; in-scope SHA1 + references in the message are translated first (so their SHA256s + are known) and then substituted into the message. + - **Tags**: target hash translated; signatures dropped; tag message + hashes rewritten with the same edge mechanism as commits. +6. Writes refs in the SHA256 target at the translated tip hashes. HEAD + is repointed at the source's symbolic HEAD when that ref made it + into the conversion. +7. Optionally writes the SHA1 → SHA256 mapping as a TSV sidecar + (`--write-mapping <path>`). + +The temp SHA1 store is on disk, not in memory, so peak RAM is bounded +by the in-memory mapping plus a small fixed delta-resolution cache. +Large repos still work; expect runtime dominated by the network fetch +and the loose-object write throughput.
+ +## Handling External SHA1 References + +A SHA1 → SHA256 cutover is destructive for external systems that +reference commits by hash: PR descriptions, issue trackers, deploy +logs, container labels, doc links, and so on all stop resolving. The +command offers three on-ramps for migrating those out of band. + +### 1. Inline message rewriting (default on) + +Commit and tag messages are scanned for 7-to-40-character hex runs. +When a run uniquely matches a commit or tag SHA1 in the conversion's +reachable set, it is replaced with the full SHA256 hex. Examples that +get rewritten: + +``` +Reverts: a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0 → full SHA256 +Cherry-picked from a1b2c3d → full SHA256 +``` + +Two design notes: + +- **Uniqueness is decided against the reachable set, not the + in-flight mapping.** The discovery pass enumerates every reachable + SHA1 before any encoding starts, so abbreviated prefixes get the + same verdict regardless of how far the translation has progressed. + If `a1b2c3d` matches two different commits in scope, it is treated + as ambiguous and left alone — never rewritten on the basis of which + one happened to be translated first. +- **Cross-branch references work the same as ancestor references.** + Each in-scope SHA1 mentioned in a commit's message is added as a + dependency edge in the translation DFS — that commit is translated + before the referencing commit is encoded. So a cherry-pick from a + sibling branch resolves just as reliably as a revert of an + ancestor. (Cycles in this graph are cryptographically impossible: + for both A's message to contain B's SHA1 and vice versa, you would + need to know each hash before computing it.) + +False positives are essentially impossible because a run is only +substituted if its prefix uniquely matches a real SHA1 of a commit or +tag in scope. Blob and tree hashes are excluded from the match set +so incidental hex strings that collide with content hashes are not +rewritten. 
Disable with `--no-rewrite-messages` if you prefer +untouched messages. + +### 2. Origin notes ref (default on) + +`refs/notes/sha1-origin` is written after translation and holds, for +each translated commit, the pre-conversion SHA1 hex keyed by the new +SHA256 hash. Standard git tooling can read it: + +```bash +git -C /path/to/out.git notes --ref=sha1-origin show <sha256> +# prints the original SHA1 + +git -C /path/to/out.git log --notes=sha1-origin +# shows the original SHA1 below each commit's body +``` + +Notes attach meaningfully only to commits, so blobs, trees, and tags +are not represented in this ref. Disable with `--no-origin-notes`. + +### 3. Sidecar mapping file (opt in) + +`--write-mapping <path>` emits a TSV with one line per translated +object, sorted by SHA1: + +``` +# sha1 sha256 +00027b675386b21c4ca05316145671fb7034d251 d80415fa21bebb... +000bb155604d06f1c48fc7feb4b025d991ef3366 a23cf98db5abfa... +... +``` + +Useful for bulk rewriting external systems: feed the file to a +script that walks Jira tickets, PR bodies, deploy manifests, or any +other system that holds frozen SHA1 references.
+ +## Flags + +``` +--source-url source repository URL +--source-token source password/token (prefer env) +--source-username source basic auth username (default git) +--source-bearer-token source bearer token +--source-insecure-skip-tls-verify skip TLS verification (testing only) +--source-follow-info-refs-redirect follow /info/refs cross-host redirects +--target-dir SHA256 bare repo directory (must be empty) + +--branch comma-separated branch list +--tags include annotated and lightweight tags +--all-refs include every refs/* on the source +--exclude-ref-prefix subtract refs by prefix; repeatable +--map ref mapping in src:dst form; repeatable + +--protocol protocol mode (auto, v1, v2) +--write-mapping write SHA1 → SHA256 TSV to this path +--no-rewrite-messages skip inline hash rewrites in messages +--no-origin-notes skip refs/notes/sha1-origin +--keep-source-objects leave the temp SHA1 store on disk +--json machine-readable output +--verbose, -v verbose logging +``` + +Environment fallbacks: `GITSYNC_SOURCE_TOKEN`, `GITSYNC_SOURCE_USERNAME`, +`GITSYNC_SOURCE_BEARER_TOKEN`, `GITSYNC_SOURCE_INSECURE_SKIP_TLS_VERIFY`, +`GITSYNC_SOURCE_FOLLOW_INFO_REFS_REDIRECT`, `GITSYNC_PROTOCOL`. + +## Sharp Edges + +**GPG signatures are stripped.** A signature is bytes signed over the +commit's pre-conversion content (including the SHA1 `tree` and +`parent` lines). After rewriting, the bytes no longer match the +signature, so verification would always fail. Rather than persist +invalid signatures, the command drops them and prints a warning count. +This matches upstream `git`'s own SHA256 conversion behavior. Tags +with embedded signatures and `mergetag` extra headers are handled the +same way. + +**Submodule gitlinks must resolve in-repo.** Tree entries with mode +`160000` reference a commit in another repository, but a SHA1 hash +cannot be embedded in a SHA256 tree. 
The command translates the +pointer if the referenced commit happens to live in the same store +(rare; sometimes seen in vendored modules), and otherwise exits with +an error naming the offending tree, entry, and hash. Scope around +those refs with `--exclude-ref-prefix` or `--branch`, or convert the +submodule repository first. + +**External SHA1 references break silently.** See the section above for +mitigations. References inside the repo (commit and tag messages) are +rewritten when they uniquely identify a commit or tag in scope. +Anything outside the repo — PR descriptions, issue trackers, deploy +manifests, container labels — is not the converter's job; use the +mapping file to drive those rewrites. + +**Replace refs and notes refs become detached.** `refs/replace/<sha1>` +encodes a SHA1 in the ref name itself, so the name doesn't match +under SHA256 and the replacement never triggers. `refs/notes/*` paths +encode the target object's hash as a tree path, so existing notes +copied under `--all-refs` survive as data but no longer attach to +their original commits. Neither is a correctness issue, just lost +behavior. + +**HEAD can dangle.** When the source's symbolic HEAD branch is not in +the desired ref set, the target's HEAD is left at `refs/heads/master` +(go-git's PlainInit default) and resolves to nothing. Either include +the HEAD branch in scope or set it manually after conversion with +`git -C <target-dir> symbolic-ref HEAD refs/heads/<branch>`. + +**Storage is all loose objects.** The command writes one file per +object. Correct, but on filesystems that dislike millions of small +files this is slow. Run `git -C <target-dir> gc --aggressive` afterwards +to pack the converted repo down to a single packfile.
+ +## Verifying the Output + +Standard git tooling works against the converted repo without +additional flags — the `extensions.objectformat` setting in the local +config is enough for git to switch hashing: + +```bash +git -C /path/to/out.git fsck --full # zero errors expected +git -C /path/to/out.git config extensions.objectformat # prints sha256 +git -C /path/to/out.git log --oneline -5 # SHA256 hashes +git -C /path/to/out.git log --notes=sha1-origin -5 # with original SHA1 +``` + +To use the result as a working repo: + +```bash +git clone /path/to/out.git /path/to/checkout +``` + +To serve it from a host that accepts SHA256: + +```bash +git -C /path/to/out.git push --mirror <remote-url> +``` + +## Implementation Notes + +The translator works in four phases: + +1. **Pack fetch.** A single self-contained pack is streamed into a + filesystem-backed SHA1 storer via go-git's pack parser, so deltas + are resolved up front and the SHA1 source is randomly addressable + for the rest of the run. + +2. **Discovery.** A non-encoding DFS walks every object reachable + from each desired ref tip via tree entries, commit + tree+parent links, and tag targets. Each visited SHA1 is recorded + in a `reachable map[Hash]ObjectType`. This set is the authoritative + "what is in scope" answer used by both submodule resolution and + message-reference rewriting — uniqueness of abbreviated SHA1 + prefixes is decided against this set once, never against the + in-flight mapping. + +3. **Translation.** Memoized recursive DFS from each desired ref tip. + Blobs are copied as-is and re-hashed; trees, commits, and tags are + decoded, their embedded hashes rewritten via the SHA1 → SHA256 + mapping, signatures stripped, and messages rewritten. The DFS + recursion includes message-reference edges: for each commit or + tag whose message mentions a SHA1 of a commit or tag in the + reachable set, that referenced object is translated first.
This + guarantees the mapping is populated before the substitution + happens, so cross-branch references resolve as reliably as + ancestor references. Each translated object is written as a loose + object under `objects/<xx>/<rest>` in the target. + +4. **Refs and side outputs.** Refs and HEAD are written at the + translated tip hashes; the origin notes commit (if enabled) is + built and stored under `refs/notes/sha1-origin`; the mapping file + (if requested) is written. + +A defensive `inProgress` set guards against cycles during phase 3. +Real Git histories cannot form cycles (parent, tree, and tag-target +edges are a DAG by construction, and SHA1 message-reference cycles +are cryptographically infeasible), so a trip into this branch is a +hard error rather than a silent skip. + +Note: loose object writing is done by hand rather than via go-git's +`SetEncodedObject`. The underlying `plumbing/format/objfile.Writer` +in `go-git/v6@v6.0.0-alpha.3` hardcodes SHA1 in its hasher, which +would put every translated object at a SHA1-derived path even though +the content references SHA256. This is verified by a unit test that +recomputes `sha256` of every loose object's decompressed content and +compares against the filename.
From 3310d43898d2c0169e9a42e379808457067a18a7 Mon Sep 17 00:00:00 2001 From: Andrea Nodari Date: Sat, 23 May 2026 14:23:52 +0200 Subject: [PATCH 02/19] Improvements Entire-Checkpoint: dc4592acaadf --- cmd/git-sync/convert_sha256.go | 36 +- .../internal/sha256convert/sha256convert.go | 349 +++++++++++------- .../sha256convert/sha256convert_test.go | 103 ++++-- docs/convert-sha256.md | 319 ++++++++-------- 4 files changed, 433 insertions(+), 374 deletions(-) diff --git a/cmd/git-sync/convert_sha256.go b/cmd/git-sync/convert_sha256.go index 3c6cc29b..080f1ce9 100644 --- a/cmd/git-sync/convert_sha256.go +++ b/cmd/git-sync/convert_sha256.go @@ -6,15 +6,12 @@ import ( gitsync "entire.io/entire/git-sync" "entire.io/entire/git-sync/cmd/git-sync/internal/sha256convert" - "entire.io/entire/git-sync/internal/validation" "github.com/spf13/cobra" ) func newConvertSHA256Cmd() *cobra.Command { var ( req = sha256convert.Request{} - mappings []string - branches string jsonOutput bool protocolVal = newProtocolFlag() ) @@ -26,13 +23,18 @@ func newConvertSHA256Cmd() *cobra.Command { SHA256 bare repository on disk at . Every reachable object is re-hashed under SHA256 and tree/commit/tag references are rewritten. +All branches and tags on the source are always converted — partial scope +risks stranding cross-branch references in commit messages. Pass +--all-refs to also include refs/notes/*, refs/pull/*, and other custom +namespaces; pass --exclude-ref-prefix to subtract specific namespaces +from --all-refs. + The conversion is destructive in two ways the caller should be aware of: -no SHA1↔SHA256 mapping is persisted, and any GPG signatures on commits or -tags are dropped (they sign over the original SHA1 content and would be -invalid post-rewrite). 
Submodule gitlinks that point at a commit outside -this repository cannot be embedded in a SHA256 tree; if the source repo -contains any, the command exits with an error so the caller can scope -around the offending refs.`, +GPG signatures on commits and tags are dropped (they sign over the +original SHA1 content and would be invalid post-rewrite), and submodule +gitlinks that point at a commit outside this repository cannot be +embedded in a SHA256 tree — the command exits with an error if it finds +any so the caller can convert the submodule repository first.`, Args: cobra.MaximumNArgs(2), SilenceErrors: true, SilenceUsage: true, @@ -47,19 +49,6 @@ around the offending refs.`, if req.SourceURL == "" || req.TargetDir == "" { return errors.New("convert-sha256 requires a source URL and a target directory") } - if branches != "" { - req.Branches = splitCSV(branches) - } - for _, raw := range mappings { - mapping, err := validation.ParseMapping(raw) - if err != nil { - return fmt.Errorf("parse mapping %q: %w", raw, err) - } - req.Mappings = append(req.Mappings, gitsync.RefMapping{ - Source: mapping.Source, - Target: mapping.Target, - }) - } result, err := sha256convert.Run(cmd.Context(), req) if err != nil { @@ -85,9 +74,6 @@ around the offending refs.`, "skip TLS certificate verification for the source") cmd.Flags().StringVar(&req.TargetDir, "target-dir", "", "directory to initialize as a SHA256 bare repository") - cmd.Flags().StringVar(&branches, "branch", "", "comma-separated branch list; default is all source branches") - cmd.Flags().StringArrayVar(&mappings, "map", nil, "ref mapping in src:dst form; short names map branches, full refs map exact refs") - cmd.Flags().BoolVar(&req.IncludeTags, "tags", false, "include annotated and lightweight tags") allRefsFlag(cmd, allRefsUsageScopeOnly, &req.AllRefs) excludeRefPrefixFlag(cmd, &req.ExcludeRefPrefixes) addProtocolFlag(cmd, &protocolVal) diff --git a/cmd/git-sync/internal/sha256convert/sha256convert.go 
b/cmd/git-sync/internal/sha256convert/sha256convert.go index 241ec2d1..34b3eecb 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert.go @@ -49,17 +49,20 @@ import ( ) // Request describes a single SHA1 → SHA256 conversion. +// +// Scope is intentionally fixed: every branch and every annotated/lightweight +// tag on the source is always converted. Partial scope risks stranding +// cross-branch references in commit messages, which defeats the point of a +// one-off cutover. AllRefs additionally pulls in refs/notes, refs/pull, and +// other custom namespaces; ExcludeRefPrefixes subtracts from that. type Request struct { SourceURL string SourceAuth gitsync.EndpointAuth SourceFollowInfoRefsRedirect bool TargetDir string - Branches []string - IncludeTags bool AllRefs bool ExcludeRefPrefixes []string - Mappings []gitsync.RefMapping ProtocolMode gitsync.ProtocolMode Verbose bool @@ -93,16 +96,17 @@ type Counts struct { // Result is the conversion summary, suitable for JSON output. 
type Result struct { - SourceURL string `json:"sourceUrl"` - TargetDir string `json:"targetDir"` - Protocol string `json:"protocol"` - RefsConverted int `json:"refsConverted"` - Counts Counts `json:"counts"` - SignaturesStripped int `json:"signaturesStripped"` - MessageRewrites int `json:"messageRewrites"` - OriginNotesRef string `json:"originNotesRef,omitempty"` - MappingFile string `json:"mappingFile,omitempty"` - TempDir string `json:"tempDir,omitempty"` + SourceURL string `json:"sourceUrl"` + TargetDir string `json:"targetDir"` + Protocol string `json:"protocol"` + RefsConverted int `json:"refsConverted"` + Counts Counts `json:"counts"` + SignaturesStripped int `json:"signaturesStripped"` + MessageRewrites int `json:"messageRewrites"` + AmbiguousMessageRefs []string `json:"ambiguousMessageRefs,omitempty"` + OriginNotesRef string `json:"originNotesRef,omitempty"` + MappingFile string `json:"mappingFile,omitempty"` + TempDir string `json:"tempDir,omitempty"` } // Lines satisfies the human-readable output contract used by other git-sync subcommands. @@ -120,6 +124,21 @@ func (r Result) Lines() []string { if r.MessageRewrites > 0 { lines = append(lines, fmt.Sprintf("rewrote %d SHA1 hash reference(s) in commit/tag messages", r.MessageRewrites)) } + if n := len(r.AmbiguousMessageRefs); n > 0 { + preview := r.AmbiguousMessageRefs + const max = 5 + extra := 0 + if len(preview) > max { + extra = len(preview) - max + preview = preview[:max] + } + line := fmt.Sprintf("warning: %d ambiguous SHA1 hex prefix(es) in messages left unrewritten (look up via the mapping file): %s", + n, strings.Join(preview, ", ")) + if extra > 0 { + line += fmt.Sprintf(", ... 
(%d more)", extra) + } + lines = append(lines, line) + } if r.OriginNotesRef != "" { lines = append(lines, fmt.Sprintf("origin notes ref: %s (use `git notes --ref=%s show ` to recover old SHA1)", r.OriginNotesRef, strings.TrimPrefix(r.OriginNotesRef, "refs/notes/"))) @@ -166,16 +185,12 @@ func Run(ctx context.Context, req Request) (Result, error) { return Result{}, fmt.Errorf("init temporary SHA1 store: %w", err) } - dstRepo, err := git.PlainInit(req.TargetDir, true, git.WithObjectFormat(formatcfg.SHA256)) - if err != nil { - return Result{}, fmt.Errorf("init SHA256 target at %s: %w", req.TargetDir, err) - } - // Source connection + ref discovery ----------------------------------- + // Scope is fixed: always include every branch and every tag. AllRefs + // extends to refs/notes/*, refs/pull/*, and other namespaces; + // ExcludeRefPrefixes can subtract from that under AllRefs. planCfg := planner.PlanConfig{ - Branches: append([]string(nil), req.Branches...), - Mappings: toPlannerMappings(req.Mappings), - IncludeTags: req.IncludeTags, + IncludeTags: true, AllRefs: req.AllRefs, ExcludeRefPrefixes: append([]string(nil), req.ExcludeRefPrefixes...), } @@ -203,19 +218,30 @@ func Run(ctx context.Context, req Request) (Result, error) { return Result{}, fmt.Errorf("fetch source pack: %w", err) } - // Discover + translate ------------------------------------------------ - tr, err := newTranslator(srcRepo.Storer, dstRepo.Storer, req.TargetDir, !req.SkipMessageRewrite) - if err != nil { - return Result{}, err - } + // Discover reachable set before initing the target. Submodule + // errors surface here, so a failed run leaves the target dir + // untouched (it was only ensured-empty so far) rather than half + // converted. 
rootSHA1s := make([]plumbing.Hash, 0, len(desired)) for _, d := range desired { rootSHA1s = append(rootSHA1s, d.SourceHash) } fmt.Fprintln(out, "discovering reachable objects ...") - if err := tr.discover(rootSHA1s); err != nil { + reachable, err := discoverReachable(srcRepo.Storer, rootSHA1s) + if err != nil { return Result{}, fmt.Errorf("discover reachable: %w", err) } + + // Discovery succeeded — safe to materialize the SHA256 target. + dstRepo, err := git.PlainInit(req.TargetDir, true, git.WithObjectFormat(formatcfg.SHA256)) + if err != nil { + return Result{}, fmt.Errorf("init SHA256 target at %s: %w", req.TargetDir, err) + } + + tr, err := newTranslator(srcRepo.Storer, dstRepo.Storer, req.TargetDir, !req.SkipMessageRewrite, reachable) + if err != nil { + return Result{}, err + } fmt.Fprintln(out, "translating objects to sha256 ...") for _, d := range desired { if _, err := tr.translate(d.SourceHash); err != nil { @@ -250,6 +276,14 @@ func Run(ctx context.Context, req Request) (Result, error) { SignaturesStripped: tr.signaturesStripped, MessageRewrites: tr.messageRewrites, } + if len(tr.ambiguousMessageRefs) > 0 { + amb := make([]string, 0, len(tr.ambiguousMessageRefs)) + for s := range tr.ambiguousMessageRefs { + amb = append(amb, s) + } + sort.Strings(amb) + res.AmbiguousMessageRefs = amb + } if !req.SkipOriginNotes && len(tr.commits) > 0 { notesRef, err := tr.writeOriginNotes(originNotesRef) @@ -350,14 +384,6 @@ type authAdapter struct{ m auth.Method } func (a authAdapter) Authorizer(req *http.Request) error { return a.m.Authorizer(req) } -func toPlannerMappings(in []gitsync.RefMapping) []planner.RefMapping { - out := make([]planner.RefMapping, 0, len(in)) - for _, m := range in { - out = append(out, planner.RefMapping{Source: m.Source, Target: m.Target}) - } - return out -} - // translator walks the SHA1 source store, rewrites object content with // SHA256-mapped hashes, and writes the result as loose objects under the // target bare repo. 
Loose object writing is done by hand because go-git @@ -369,7 +395,7 @@ type translator struct { dst *filesystem.Storage objectsDir string // reachable holds every in-scope SHA1 with its object type, built up - // front by a discovery pass that walks tree/commit/tag dependencies + // front by discoverReachable, which walks tree/commit/tag dependencies // from the desired ref tips. It is the authoritative "what's in // scope" set: abbreviated SHA1 prefixes in commit/tag messages are // resolved against this set so a unique match is fixed before any @@ -386,15 +412,21 @@ type translator struct { // commits records every translated commit's old SHA1, in DFS order, // for use by writeOriginNotes. We track separately rather than walking // the full mapping because notes only attach meaningfully to commits. - commits []plumbing.Hash - counts Counts - signaturesStripped int - messageRewrites int - rewriteMessages bool - lastNotesCommit plumbing.Hash + commits []plumbing.Hash + // ambiguousMessageRefs collects every hex prefix in a commit/tag + // message that matched more than one in-scope SHA1 and was + // therefore left unrewritten. Surfaced to the user as a warning + // so they know which references to investigate via the mapping + // file. 
+ ambiguousMessageRefs map[string]struct{} + counts Counts + signaturesStripped int + messageRewrites int + rewriteMessages bool + lastNotesCommit plumbing.Hash } -func newTranslator(src, dst storer.Storer, targetDir string, rewriteMessages bool) (*translator, error) { +func newTranslator(src, dst storer.Storer, targetDir string, rewriteMessages bool, reachable map[plumbing.Hash]plumbing.ObjectType) (*translator, error) { srcFS, ok := src.(*filesystem.Storage) if !ok { return nil, fmt.Errorf("source storage is not filesystem-backed (%T)", src) @@ -403,85 +435,106 @@ func newTranslator(src, dst storer.Storer, targetDir string, rewriteMessages boo if !ok { return nil, fmt.Errorf("target storage is not filesystem-backed (%T)", dst) } + if reachable == nil { + reachable = make(map[plumbing.Hash]plumbing.ObjectType) + } return &translator{ - src: srcFS, - dst: dstFS, - objectsDir: filepath.Join(targetDir, "objects"), - reachable: make(map[plumbing.Hash]plumbing.ObjectType), - mapping: make(map[plumbing.Hash]plumbing.Hash), - inProgress: make(map[plumbing.Hash]struct{}), - rewriteMessages: rewriteMessages, + src: srcFS, + dst: dstFS, + objectsDir: filepath.Join(targetDir, "objects"), + reachable: reachable, + mapping: make(map[plumbing.Hash]plumbing.Hash), + inProgress: make(map[plumbing.Hash]struct{}), + ambiguousMessageRefs: make(map[string]struct{}), + rewriteMessages: rewriteMessages, }, nil } -// discover walks every object reachable from roots (via tree entries, -// commit tree+parent links, and tag targets) and records each one in -// t.reachable with its object type. Submodule gitlinks are followed -// only when the referenced commit exists in the same source store, to -// stay consistent with translateTree's handling. Message-reference -// edges are not part of this pass — those are added during translation. 
-func (t *translator) discover(roots []plumbing.Hash) error { - for _, root := range roots { - if err := t.visit(root); err != nil { - return err - } - } - return nil -} - -func (t *translator) visit(sha1 plumbing.Hash) error { - if _, seen := t.reachable[sha1]; seen { - return nil - } - obj, err := t.src.EncodedObject(plumbing.AnyObject, sha1) - if err != nil { - return fmt.Errorf("discover %s: %w", sha1, err) +// discoverReachable walks every object reachable from roots (via tree +// entries, commit tree+parent links, and tag targets) and returns a +// (SHA1 → object type) map covering the full in-scope set. +// +// Submodule gitlinks: a tree entry with mode 160000 points at a commit +// in another repository, and a SHA1 hash cannot be embedded in a +// SHA256 tree. If the referenced commit happens to live in this +// source store (rare; vendored modules), it is recursively visited +// like any other commit. Otherwise discovery returns an error here, +// before the target bare repo is initialized — failing fast keeps +// half-converted state off disk. +// +// Message-reference edges are not part of this pass; those are added +// during translation, where the partial mapping is updated as we go. 
+func discoverReachable(src storer.Storer, roots []plumbing.Hash) (map[plumbing.Hash]plumbing.ObjectType, error) { + srcFS, ok := src.(*filesystem.Storage) + if !ok { + return nil, fmt.Errorf("source storage is not filesystem-backed (%T)", src) } - t.reachable[sha1] = obj.Type() - switch obj.Type() { - case plumbing.BlobObject: - return nil - case plumbing.TreeObject: - tree := &object.Tree{} - if err := tree.Decode(obj); err != nil { - return fmt.Errorf("discover decode tree %s: %w", sha1, err) - } - for _, e := range tree.Entries { - if e.Mode == filemode.Submodule { - if _, err := t.src.EncodedObject(plumbing.CommitObject, e.Hash); err == nil { - if err := t.visit(e.Hash); err != nil { - return err + reachable := make(map[plumbing.Hash]plumbing.ObjectType) + var visit func(plumbing.Hash) error + visit = func(sha1 plumbing.Hash) error { + if _, seen := reachable[sha1]; seen { + return nil + } + obj, err := srcFS.EncodedObject(plumbing.AnyObject, sha1) + if err != nil { + return fmt.Errorf("discover %s: %w", sha1, err) + } + reachable[sha1] = obj.Type() + switch obj.Type() { + case plumbing.BlobObject: + return nil + case plumbing.TreeObject: + tree := &object.Tree{} + if err := tree.Decode(obj); err != nil { + return fmt.Errorf("discover decode tree %s: %w", sha1, err) + } + for _, e := range tree.Entries { + if e.Mode == filemode.Submodule { + if _, err := srcFS.EncodedObject(plumbing.CommitObject, e.Hash); err == nil { + if err := visit(e.Hash); err != nil { + return err + } + continue } + return fmt.Errorf( + "tree %s contains a submodule gitlink %q at %s that is not present in the source repo; "+ + "convert the submodule repository first so its commit hashes are available in SHA256", + sha1, e.Name, e.Hash) } - continue + if err := visit(e.Hash); err != nil { + return err + } + } + case plumbing.CommitObject: + c := &object.Commit{} + if err := c.Decode(obj); err != nil { + return fmt.Errorf("discover decode commit %s: %w", sha1, err) } - if err := 
t.visit(e.Hash); err != nil { + if err := visit(c.TreeHash); err != nil { return err } - } - case plumbing.CommitObject: - c := &object.Commit{} - if err := c.Decode(obj); err != nil { - return fmt.Errorf("discover decode commit %s: %w", sha1, err) - } - if err := t.visit(c.TreeHash); err != nil { - return err - } - for _, p := range c.ParentHashes { - if err := t.visit(p); err != nil { + for _, p := range c.ParentHashes { + if err := visit(p); err != nil { + return err + } + } + case plumbing.TagObject: + tag := &object.Tag{} + if err := tag.Decode(obj); err != nil { + return fmt.Errorf("discover decode tag %s: %w", sha1, err) + } + if err := visit(tag.Target); err != nil { return err } } - case plumbing.TagObject: - tag := &object.Tag{} - if err := tag.Decode(obj); err != nil { - return fmt.Errorf("discover decode tag %s: %w", sha1, err) - } - if err := t.visit(tag.Target); err != nil { - return err + return nil + } + for _, r := range roots { + if err := visit(r); err != nil { + return nil, err } } - return nil + return reachable, nil } func (t *translator) translate(sha1 plumbing.Hash) (plumbing.Hash, error) { @@ -788,10 +841,25 @@ func (t *translator) writeLoose(typ plumbing.ObjectType, body []byte) (plumbing. // real source SHA1). var hashPattern = regexp.MustCompile(`\b[0-9a-f]{7,40}\b`) +// matchResult is the 3-state outcome of resolving a hex prefix in a +// commit/tag message against the reachable set. We distinguish +// "ambiguous" from "no match" so the caller can warn the user about +// prefixes that *could* be rewritten if they were a couple of chars +// longer. +type matchResult int + +const ( + matchNone matchResult = iota + matchUnique + matchAmbiguous +) + // rewriteHashesInMessage scans msg for short and full SHA1 hashes, // replacing any that uniquely identify a commit or tag in t.reachable // with the corresponding full SHA256 hex from t.mapping. Returns the -// rewritten message and the number of substitutions made. 
+// rewritten message and the number of substitutions made. Ambiguous +// prefixes are recorded in t.ambiguousMessageRefs so the caller can +// surface a warning at the end of the run. // // Uniqueness is decided against t.reachable rather than t.mapping so // that abbreviated prefixes get the same verdict during translation as @@ -805,44 +873,50 @@ var hashPattern = regexp.MustCompile(`\b[0-9a-f]{7,40}\b`) func (t *translator) rewriteHashesInMessage(msg string) (string, int) { count := 0 out := hashPattern.ReplaceAllStringFunc(msg, func(s string) string { - sha1, ok := t.resolveMessageRef(s) - if !ok { + sha1, result := t.resolveMessageRef(s) + switch result { + case matchAmbiguous: + t.ambiguousMessageRefs[s] = struct{}{} return s - } - newHash, ok := t.mapping[sha1] - if !ok { - // The reachable set says this SHA1 is in scope, but the - // translation DFS hasn't placed it yet. Shouldn't happen - // because translateCommit/translateTag add message-reference - // edges before encoding — leave the hex untouched if it - // somehow does. + case matchUnique: + newHash, ok := t.mapping[sha1] + if !ok { + // The reachable set says this SHA1 is in scope, but + // the translation DFS hasn't placed it yet. Shouldn't + // happen because translateCommit/translateTag add + // message-reference edges before encoding — leave the + // hex untouched if it somehow does. + return s + } + count++ + return newHash.String() + default: return s } - count++ - return newHash.String() }) return out, count } -// resolveMessageRef returns the unique commit/tag SHA1 in t.reachable -// that matches the given hex prefix. Returns (zero, false) for no -// match, an ambiguous prefix, or a match that is not a commit or tag -// (incidental hex strings that happen to collide with a blob or tree -// hash are not rewritten). -func (t *translator) resolveMessageRef(prefix string) (plumbing.Hash, bool) { +// resolveMessageRef classifies a hex prefix against the reachable set. 
+// Returns matchUnique with the resolved SHA1 when exactly one commit +// or tag in scope matches; matchAmbiguous when more than one does; +// matchNone otherwise (no match, or the match is a blob/tree — those +// are filtered so incidental hex collisions on content hashes aren't +// rewritten). +func (t *translator) resolveMessageRef(prefix string) (plumbing.Hash, matchResult) { if len(prefix) == 40 { sha1, ok := plumbing.FromHex(prefix) if !ok { - return plumbing.ZeroHash, false + return plumbing.ZeroHash, matchNone } typ, in := t.reachable[sha1] if !in { - return plumbing.ZeroHash, false + return plumbing.ZeroHash, matchNone } if typ != plumbing.CommitObject && typ != plumbing.TagObject { - return plumbing.ZeroHash, false + return plumbing.ZeroHash, matchNone } - return sha1, true + return sha1, matchUnique } var match plumbing.Hash matches := 0 @@ -853,27 +927,28 @@ func (t *translator) resolveMessageRef(prefix string) (plumbing.Hash, bool) { if strings.HasPrefix(sha1.String(), prefix) { matches++ if matches > 1 { - return plumbing.ZeroHash, false + return plumbing.ZeroHash, matchAmbiguous } match = sha1 } } - if matches != 1 { - return plumbing.ZeroHash, false + if matches == 1 { + return match, matchUnique } - return match, true + return plumbing.ZeroHash, matchNone } // extractMessageReferences returns the unique commit/tag SHA1s mentioned // by hex prefix in msg. Used by translateCommit/translateTag to add // message-reference edges to the translation DFS so the mapping is -// fully populated by the time the message is rewritten. +// fully populated by the time the message is rewritten. Ambiguous +// prefixes generate no edge — they cannot be rewritten anyway. 
func (t *translator) extractMessageReferences(msg string) []plumbing.Hash { seen := make(map[plumbing.Hash]struct{}) var out []plumbing.Hash for _, match := range hashPattern.FindAllString(msg, -1) { - sha1, ok := t.resolveMessageRef(match) - if !ok { + sha1, result := t.resolveMessageRef(match) + if result != matchUnique { continue } if _, dup := seen[sha1]; dup { diff --git a/cmd/git-sync/internal/sha256convert/sha256convert_test.go b/cmd/git-sync/internal/sha256convert/sha256convert_test.go index 9a48e129..c53f5d70 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert_test.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert_test.go @@ -81,12 +81,13 @@ func TestTranslator(t *testing.T) { } tagHash := writeObject(t, srcRepo.Storer, tag.Encode) - tr, err := newTranslator(srcRepo.Storer, dstRepo.Storer, dstDir, false) + reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{tagHash}) if err != nil { - t.Fatalf("newTranslator: %v", err) + t.Fatalf("discoverReachable: %v", err) } - if err := tr.discover([]plumbing.Hash{tagHash}); err != nil { - t.Fatalf("discover: %v", err) + tr, err := newTranslator(srcRepo.Storer, dstRepo.Storer, dstDir, false, reachable) + if err != nil { + t.Fatalf("newTranslator: %v", err) } newTagHash, err := tr.translate(tagHash) if err != nil { @@ -195,12 +196,13 @@ func TestTranslator_RewritesMessageHashes(t *testing.T) { } childSHA1 := writeObject(t, srcRepo.Storer, child.Encode) - tr, err := newTranslator(srcRepo.Storer, dstRepo.Storer, dstDir, true) + reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{childSHA1}) if err != nil { - t.Fatalf("newTranslator: %v", err) + t.Fatalf("discoverReachable: %v", err) } - if err := tr.discover([]plumbing.Hash{childSHA1}); err != nil { - t.Fatalf("discover: %v", err) + tr, err := newTranslator(srcRepo.Storer, dstRepo.Storer, dstDir, true, reachable) + if err != nil { + t.Fatalf("newTranslator: %v", err) } if _, err := tr.translate(childSHA1); err != nil { 
t.Fatalf("translate child: %v", err) @@ -271,12 +273,13 @@ func TestTranslator_RewritesCrossBranchReferences(t *testing.T) { TreeHash: treeB, }).Encode) - tr, _ := newTranslator(srcRepo.Storer, dstRepo.Storer, dstDir, true) // Discovery must see both branches so the reachable set covers cA // before cB is encoded. - if err := tr.discover([]plumbing.Hash{cB, cA}); err != nil { - t.Fatalf("discover: %v", err) + reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{cB, cA}) + if err != nil { + t.Fatalf("discoverReachable: %v", err) } + tr, _ := newTranslator(srcRepo.Storer, dstRepo.Storer, dstDir, true, reachable) // Translate B first — the order that would have left the rewrite // stranded under the old design. if _, err := tr.translate(cB); err != nil { @@ -326,10 +329,11 @@ func TestTranslator_SkipMessageRewrite(t *testing.T) { } childSHA1 := writeObject(t, srcRepo.Storer, child.Encode) - tr, _ := newTranslator(srcRepo.Storer, dstRepo.Storer, filepath.Join(root, "dst.git"), false) - if err := tr.discover([]plumbing.Hash{childSHA1}); err != nil { - t.Fatalf("discover: %v", err) + reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{childSHA1}) + if err != nil { + t.Fatalf("discoverReachable: %v", err) } + tr, _ := newTranslator(srcRepo.Storer, dstRepo.Storer, filepath.Join(root, "dst.git"), false, reachable) if _, err := tr.translate(childSHA1); err != nil { t.Fatalf("translate: %v", err) } @@ -356,10 +360,11 @@ func TestTranslator_WriteOriginNotes(t *testing.T) { c1 := writeObject(t, srcRepo.Storer, (&object.Commit{Author: sig, Committer: sig, Message: "c1\n", TreeHash: tree}).Encode) c2 := writeObject(t, srcRepo.Storer, (&object.Commit{Author: sig, Committer: sig, Message: "c2\n", TreeHash: tree, ParentHashes: []plumbing.Hash{c1}}).Encode) - tr, _ := newTranslator(srcRepo.Storer, dstRepo.Storer, filepath.Join(root, "dst.git"), false) - if err := tr.discover([]plumbing.Hash{c2}); err != nil { - t.Fatalf("discover: %v", err) + reachable, 
err := discoverReachable(srcRepo.Storer, []plumbing.Hash{c2}) + if err != nil { + t.Fatalf("discoverReachable: %v", err) } + tr, _ := newTranslator(srcRepo.Storer, dstRepo.Storer, filepath.Join(root, "dst.git"), false, reachable) if _, err := tr.translate(c2); err != nil { t.Fatalf("translate: %v", err) } @@ -420,10 +425,11 @@ func TestTranslator_WriteMappingFile(t *testing.T) { sig := object.Signature{Name: "Test", Email: "t@example.com", When: time.Unix(1700000000, 0).UTC()} commit := writeObject(t, srcRepo.Storer, (&object.Commit{Author: sig, Committer: sig, Message: "c\n", TreeHash: tree}).Encode) - tr, _ := newTranslator(srcRepo.Storer, dstRepo.Storer, filepath.Join(root, "dst.git"), false) - if err := tr.discover([]plumbing.Hash{commit}); err != nil { - t.Fatalf("discover: %v", err) + reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{commit}) + if err != nil { + t.Fatalf("discoverReachable: %v", err) } + tr, _ := newTranslator(srcRepo.Storer, dstRepo.Storer, filepath.Join(root, "dst.git"), false, reachable) if _, err := tr.translate(commit); err != nil { t.Fatalf("translate: %v", err) } @@ -469,22 +475,53 @@ func TestTranslator_WriteMappingFile(t *testing.T) { } } +// TestTranslator_AmbiguousMessageRefWarning verifies that when an +// abbreviated SHA1 prefix in a commit message matches more than one +// in-scope commit, the prefix is left unrewritten and recorded in +// t.ambiguousMessageRefs so the caller can surface a warning. +// +// We can't easily force a real SHA1 prefix collision in a test, so +// we install two synthetic entries in the reachable map after the +// translator is constructed and then run rewriteHashesInMessage +// directly. This exercises the same code path the production +// pipeline takes. 
+func TestTranslator_AmbiguousMessageRefWarning(t *testing.T) { + root := t.TempDir() + srcRepo, _ := git.PlainInit(filepath.Join(root, "src.git"), true) + dstRepo, _ := git.PlainInit(filepath.Join(root, "dst.git"), true, git.WithObjectFormat(formatcfg.SHA256)) + tr, _ := newTranslator(srcRepo.Storer, dstRepo.Storer, filepath.Join(root, "dst.git"), true, nil) + + // Two real-looking SHA1 hashes that share the prefix "deadbee". + one := plumbing.NewHash("deadbee100000000000000000000000000000001") + two := plumbing.NewHash("deadbee200000000000000000000000000000002") + tr.reachable[one] = plumbing.CommitObject + tr.reachable[two] = plumbing.CommitObject + + out, count := tr.rewriteHashesInMessage("see commit deadbee for details\n") + if count != 0 { + t.Errorf("ambiguous prefix should not be rewritten; got count=%d", count) + } + if !strings.Contains(out, "deadbee") { + t.Errorf("ambiguous prefix should be left in message; got %q", out) + } + if _, recorded := tr.ambiguousMessageRefs["deadbee"]; !recorded { + t.Errorf("expected %q to be recorded in ambiguousMessageRefs, got %v", + "deadbee", tr.ambiguousMessageRefs) + } +} + // TestTranslator_UnresolvableSubmodule confirms that a tree entry with -// Submodule mode pointing at a commit not in the source repo causes a -// clear error rather than silently producing a malformed SHA256 tree. +// Submodule mode pointing at a commit not in the source repo is +// rejected during discovery (fail-fast), before any object is written +// to the target. 
func TestTranslator_UnresolvableSubmodule(t *testing.T) { root := t.TempDir() srcDir := filepath.Join(root, "src.git") - dstDir := filepath.Join(root, "dst.git") srcRepo, err := git.PlainInit(srcDir, true) if err != nil { t.Fatalf("init SHA1 source: %v", err) } - dstRepo, err := git.PlainInit(dstDir, true, git.WithObjectFormat(formatcfg.SHA256)) - if err != nil { - t.Fatalf("init SHA256 target: %v", err) - } blobHash := writeBlob(t, srcRepo.Storer, []byte("contents\n")) // External-looking SHA1 — not in source. @@ -494,16 +531,9 @@ func TestTranslator_UnresolvableSubmodule(t *testing.T) { {Name: "sub", Mode: filemode.Submodule, Hash: external}, }) - tr, err := newTranslator(srcRepo.Storer, dstRepo.Storer, dstDir, false) - if err != nil { - t.Fatalf("newTranslator: %v", err) - } - if err := tr.discover([]plumbing.Hash{treeHash}); err != nil { - t.Fatalf("discover: %v", err) - } - _, err = tr.translate(treeHash) + _, err = discoverReachable(srcRepo.Storer, []plumbing.Hash{treeHash}) if err == nil { - t.Fatal("expected error for unresolvable submodule, got nil") + t.Fatal("expected discoverReachable to fail on unresolvable submodule, got nil") } if !strings.Contains(err.Error(), "submodule") { t.Errorf("error should mention submodule; got: %v", err) @@ -649,7 +679,6 @@ func TestRun_GitHTTPBackend(t *testing.T) { res, err := Run(context.Background(), Request{ SourceURL: srv.URL + "/source.git", TargetDir: dstDir, - IncludeTags: true, MappingFile: mappingPath, Out: io.Discard, }) diff --git a/docs/convert-sha256.md b/docs/convert-sha256.md index da0dbed0..33f539d8 100644 --- a/docs/convert-sha256.md +++ b/docs/convert-sha256.md @@ -3,121 +3,103 @@ `git-sync convert-sha256` is a one-off migration command that fetches a pack from a SHA1 HTTP source and writes a new SHA256 bare repository on disk. Every reachable object is re-hashed under SHA256 and tree, commit, and tag -references are rewritten accordingly. 
- -The command is intentionally narrow: it does not push to a remote, it does -not modify the source, and it is meant to be run once per repo. Resulting +references are rewritten accordingly. The command does not push to a +remote, does not modify the source, and is meant to run once per repo. SHA256 hashes have no relation to the original SHA1 hashes beyond a -mapping that the command can optionally emit. +mapping the command can optionally emit. ## Quick Start ```bash -git-sync convert-sha256 --tags \ +git-sync convert-sha256 \ https://github.com/source-org/source-repo.git \ /path/to/out.git ``` The target directory must not exist or must be empty. The result is a bare -repository with `extensions.objectformat = sha256` and a `refs/notes/sha1-origin` -ref recording each commit's pre-conversion SHA1. +repository with `extensions.objectformat = sha256` and a +`refs/notes/sha1-origin` ref recording each commit's pre-conversion SHA1. + +Scope is fixed: every branch and every tag on the source is always +converted. Pass `--all-refs` to also include `refs/notes/*`, +`refs/pull/*`, and other custom namespaces; pair with +`--exclude-ref-prefix` to subtract specific namespaces (e.g. +`--exclude-ref-prefix refs/pull/` on GitHub mirrors). For a private source, pass the token via the environment so it isn't exposed in `ps`: ```bash -GITSYNC_SOURCE_TOKEN=ghp_xxx git-sync convert-sha256 --tags \ +GITSYNC_SOURCE_TOKEN=ghp_xxx git-sync convert-sha256 \ https://github.com/source-org/private-repo.git \ /path/to/out.git ``` ## What It Does -1. Probes the source via smart HTTP and discovers refs matching the - requested scope (`--branch`, `--tags`, `--all-refs`, `--map`, - `--exclude-ref-prefix`). -2. Fetches a single self-contained pack via `upload-pack` and lands it in - a temporary on-disk SHA1 bare repo. The temp directory is cleaned up - on exit unless `--keep-source-objects` is passed. -3. Initializes the target as a bare SHA256 repository +1. 
Probes the source via smart HTTP and lists every in-scope ref. +2. Fetches a single self-contained pack via `upload-pack` into a + temporary on-disk SHA1 bare repo (cleaned up at the end unless + `--keep-source-objects` is passed). +3. Discovers every reachable object — walking trees, commits, and tags + — and records each one's SHA1 and object type. Submodule gitlinks + are checked here; unresolvable ones fail-fast before any output is + written. +4. Initializes the target as a bare SHA256 repository (`git init --object-format=sha256` equivalent). -4. Runs a **discovery pass** that walks every reachable object from - each desired ref tip and records its SHA1 and object type. This - gives the rewriter an authoritative "what's in scope" set so - abbreviated message references can be resolved consistently and - message-reference edges can be added to the translation graph. -5. Translates every reachable object in topological order via a - memoized DFS: +5. Translates every reachable object in topological order via memoized + DFS: - **Blobs**: re-hashed under SHA256; content unchanged. - - **Trees**: each entry's hash translated via the in-memory mapping; - submodule gitlinks left as-is when the referenced commit is in - this repo, otherwise the run errors. + - **Trees**: each entry's hash translated. - **Commits**: `tree` and `parent` hashes translated; GPG signatures - dropped; `mergetag` extra headers dropped; in-scope SHA1 - references in the message are translated first (so their SHA256s - are known) and then substituted into the message. - - **Tags**: target hash translated; signatures dropped; tag message - hashes rewritten with the same edge mechanism as commits. -6. Writes refs in the SHA256 target at the translated tip hashes. HEAD - is repointed at the source's symbolic HEAD when that ref made it - into the conversion. -7. Optionally writes the SHA1 → SHA256 mapping as a TSV sidecar - (`--write-mapping `). 
- -The temp SHA1 store is on disk, not in memory, so peak RAM is bounded -by the in-memory mapping plus a small fixed delta-resolution cache. -Large repos still work; expect runtime dominated by the network fetch -and the loose-object write throughput. - -## Handling External SHA1 References - -A SHA1 → SHA256 cutover is destructive for external systems that -reference commits by hash: PR descriptions, issue trackers, deploy -logs, container labels, doc links, and so on all stop resolving. The -command offers three on-ramps for migrating those out of band. - -### 1. Inline message rewriting (default on) + and `mergetag` headers dropped; in-scope SHA1 references in the + message are translated first and then substituted. + - **Tags**: target hash translated; signatures dropped; message + hashes rewritten the same way. +6. Writes refs at the translated tip hashes; repoints HEAD to the + source's symbolic HEAD; builds `refs/notes/sha1-origin` (unless + `--no-origin-notes`); emits the `--write-mapping` TSV (if requested). + +## Side Outputs + +The conversion deliberately decouples SHA1 from SHA256 — two runs of +this tool against the same source produce SHA256 hashes that share +nothing with the originals. Three on-ramps help bridge the gap. + +### Inline message rewriting (default on) Commit and tag messages are scanned for 7-to-40-character hex runs. -When a run uniquely matches a commit or tag SHA1 in the conversion's -reachable set, it is replaced with the full SHA256 hex. 
Examples that -get rewritten: +When a run uniquely matches a commit or tag SHA1 in the reachable set, +it is replaced with the full SHA256 hex: ``` Reverts: a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0 → full SHA256 Cherry-picked from a1b2c3d → full SHA256 ``` -Two design notes: - -- **Uniqueness is decided against the reachable set, not the - in-flight mapping.** The discovery pass enumerates every reachable - SHA1 before any encoding starts, so abbreviated prefixes get the - same verdict regardless of how far the translation has progressed. - If `a1b2c3d` matches two different commits in scope, it is treated - as ambiguous and left alone — never rewritten on the basis of which - one happened to be translated first. -- **Cross-branch references work the same as ancestor references.** - Each in-scope SHA1 mentioned in a commit's message is added as a - dependency edge in the translation DFS — that commit is translated - before the referencing commit is encoded. So a cherry-pick from a - sibling branch resolves just as reliably as a revert of an - ancestor. (Cycles in this graph are cryptographically impossible: - for both A's message to contain B's SHA1 and vice versa, you would - need to know each hash before computing it.) - -False positives are essentially impossible because a run is only -substituted if its prefix uniquely matches a real SHA1 of a commit or -tag in scope. Blob and tree hashes are excluded from the match set -so incidental hex strings that collide with content hashes are not -rewritten. Disable with `--no-rewrite-messages` if you prefer -untouched messages. - -### 2. Origin notes ref (default on) - -`refs/notes/sha1-origin` is written after translation and holds, for -each translated commit, the pre-conversion SHA1 hex keyed by the new -SHA256 hash. 
Standard git tooling can read it: +Two properties make this robust: + +- **Uniqueness is decided against the reachable set, not the in-flight + mapping.** The discovery pass enumerates every reachable SHA1 before + any encoding starts, so abbreviated prefixes get the same verdict + regardless of how far the translation has progressed. Ambiguous + prefixes are left unrewritten and reported (warning on stderr + + `--json`'s `ambiguousMessageRefs`); look them up in the mapping file. +- **Cross-branch references resolve.** Each in-scope SHA1 mentioned in + a message is added as a dependency edge in the translation DFS, so + the referenced commit is translated before the referencing commit is + encoded. A cherry-pick from a sibling branch resolves just as + reliably as a revert of an ancestor. + +False positives are essentially impossible: a run is substituted only +if its prefix uniquely matches a commit or tag in scope. Blob and tree +hashes are excluded from the match set. Disable with +`--no-rewrite-messages` if you prefer untouched messages. + +### Origin notes ref (default on) + +`refs/notes/sha1-origin` holds, for each translated commit, the +pre-conversion SHA1 keyed by the new SHA256: ```bash git -C /path/to/out.git notes --ref=sha1-origin show @@ -127,10 +109,10 @@ git -C /path/to/out.git log --notes=sha1-origin # shows the original SHA1 below each commit's body ``` -Notes attach meaningfully only to commits, so blobs, trees, and tags -are not represented in this ref. Disable with `--no-origin-notes`. +Notes attach meaningfully only to commits; blobs, trees, and tags are +not represented. Disable with `--no-origin-notes`. -### 3. Sidecar mapping file (opt in) +### Sidecar mapping file (opt in via `--write-mapping`) `--write-mapping ` emits a TSV with one line per translated object, sorted by SHA1: @@ -142,9 +124,9 @@ object, sorted by SHA1: ... 
``` -Useful for bulk rewriting external systems: feed the file to a -script that walks Jira tickets, PR bodies, deploy manifests, or any -other system that holds frozen SHA1 references. +Useful for bulk rewriting external systems: feed the file to a script +that walks Jira tickets, PR bodies, deploy manifests, or any other +system that holds frozen SHA1 references. ## Flags @@ -157,11 +139,9 @@ other system that holds frozen SHA1 references. --source-follow-info-refs-redirect follow /info/refs cross-host redirects --target-dir SHA256 bare repo directory (must be empty) ---branch comma-separated branch list ---tags include annotated and lightweight tags ---all-refs include every refs/* on the source +--all-refs also include refs/* outside heads/tags + (notes, pulls, custom namespaces) --exclude-ref-prefix subtract refs by prefix; repeatable ---map ref mapping in src:dst form; repeatable --protocol protocol mode (auto, v1, v2) --write-mapping write SHA1 → SHA256 TSV to this path @@ -172,6 +152,9 @@ other system that holds frozen SHA1 references. --verbose, -v verbose logging ``` +There are no `--branch`, `--tags`, or `--map` flags: scope is fixed to +every branch and every tag on the source. + Environment fallbacks: `GITSYNC_SOURCE_TOKEN`, `GITSYNC_SOURCE_USERNAME`, `GITSYNC_SOURCE_BEARER_TOKEN`, `GITSYNC_SOURCE_INSECURE_SKIP_TLS_VERIFY`, `GITSYNC_SOURCE_FOLLOW_INFO_REFS_REDIRECT`, `GITSYNC_PROTOCOL`. @@ -179,48 +162,61 @@ Environment fallbacks: `GITSYNC_SOURCE_TOKEN`, `GITSYNC_SOURCE_USERNAME`, ## Sharp Edges **GPG signatures are stripped.** A signature is bytes signed over the -commit's pre-conversion content (including the SHA1 `tree` and -`parent` lines). After rewriting, the bytes no longer match the -signature, so verification would always fail. Rather than persist -invalid signatures, the command drops them and prints a warning count. -This matches upstream `git`'s own SHA256 conversion behavior. 
Tags -with embedded signatures and `mergetag` extra headers are handled the -same way. +commit's pre-conversion content (including the SHA1 hashes in `tree` +and `parent` lines). After rewriting, the bytes no longer match the +signature, so verification would always fail; the command drops them +and prints a count. Signed annotated tags lose their signature the +same way. `mergetag` headers on merge commits — which embed a signed +tag with its own signature — are removed entirely, since the embedded +tag references original SHA1s and the signature was computed over +those original bytes. **Submodule gitlinks must resolve in-repo.** Tree entries with mode `160000` reference a commit in another repository, but a SHA1 hash -cannot be embedded in a SHA256 tree. The command translates the -pointer if the referenced commit happens to live in the same store -(rare; sometimes seen in vendored modules), and otherwise exits with -an error naming the offending tree, entry, and hash. Scope around -those refs with `--exclude-ref-prefix` or `--branch`, or convert the -submodule repository first. - -**External SHA1 references break silently.** See the section above for -mitigations. References inside the repo (commit and tag messages) are -rewritten when they uniquely identify a commit or tag in scope. -Anything outside the repo — PR descriptions, issue trackers, deploy -manifests, container labels — is not the converter's job; use the -mapping file to drive those rewrites. - -**Replace refs and notes refs become detached.** `refs/replace/` -encodes a SHA1 in the ref name itself, so the name doesn't match -under SHA256 and the replacement never triggers. `refs/notes/*` paths -encode the target object's hash as a tree path, so existing notes -copied under `--all-refs` survive as data but no longer attach to -their original commits. Neither is a correctness issue, just lost -behavior. 
- -**HEAD can dangle.** When the source's symbolic HEAD branch is not in -the desired ref set, the target's HEAD is left at `refs/heads/master` -(go-git's PlainInit default) and resolves to nothing. Either include -the HEAD branch in scope or set it manually after conversion with -`git -C symbolic-ref HEAD refs/heads/`. - -**Storage is all loose objects.** The command writes one file per -object. Correct, but on filesystems that dislike millions of small -files this is slow. Run `git -C gc --aggressive` afterwards -to pack the converted repo down to a single packfile. +cannot be embedded in a SHA256 tree. The command fails fast in the +discovery pass — before the target bare repo is initialized — naming +the offending tree, entry, and hash. Convert the submodule repository +first so its commit hashes are available in SHA256. + +**Replace refs and source notes refs become detached.** +`refs/replace/<sha1>` encodes a SHA1 in the ref name, so the name +doesn't match under SHA256 and the replacement never triggers. +`refs/notes/*` trees from the source (copied under `--all-refs`) +encode the target object's hash as the entry name, so notes survive +as data but no longer attach to their original commits. Use the +tool's own `refs/notes/sha1-origin` for the inverse lookup. + +## Operational Notes + +**One-off, not incremental.** Each run produces a fresh SHA256 repo +from scratch — there is no "fetch the new SHA1 commits and append to +the existing SHA256 repo" mode. Realistic use: convert once, then +make the converted repo the new canonical store. Branch and tag +hashes are deterministic across runs against the same source state; +only `refs/notes/sha1-origin` differs because its wrapper commit +carries `time.Now()` as the committer timestamp. + +**Loose-object storage.** Every translated object is written as a +loose file under `objects/<aa>/<rest>` — no pack file is produced. +Correct, but slow on filesystems that dislike millions of small files. 
+Run `git -C <target> gc --aggressive` afterwards to pack the converted +repo down to a single packfile. + +**Memory linear in reachable object count.** Two `map[Hash]…` +structures stay live for the whole run: `reachable` (SHA1 → object +type, built by discovery) and `mapping` (SHA1 → SHA256, built by +translation). At cobra scale (~5k objects), kilobytes; at Linux kernel +scale (~16M objects), roughly 2 GB peak. + +**Discovery adds a ~1.5× decode pass.** Every reachable object is +decoded twice: once in discovery (no encoding) and once in translation +(decode + encode). The cost buys consistent uniqueness verdicts for +message rewriting and submodule fail-fast. + +**Abbreviated-prefix lookup is a linear scan.** Each abbreviated SHA1 +in a message triggers an O(reachable) scan to check uniqueness. Fine +to ~100k commits; slower past that. A sorted-prefix index would make +it O(log N), an easy optimization if someone hits the wall. ## Verifying the Output @@ -249,49 +245,22 @@ git -C /path/to/out.git push --mirror ## Implementation Notes -The translator works in four phases: - -1. **Pack fetch.** A single self-contained pack is streamed into a - filesystem-backed SHA1 storer via go-git's pack parser, so deltas - are resolved up front and the SHA1 source is randomly addressable - for the rest of the run. - -2. **Discovery.** A non-encoding DFS walks every object reachable - from each desired ref tip via tree entries, commit - tree+parent links, and tag targets. Each visited SHA1 is recorded - in a `reachable map[Hash]ObjectType`. This set is the authoritative - "what is in scope" answer used by both submodule resolution and - message-reference rewriting — uniqueness of abbreviated SHA1 - prefixes is decided against this set once, never against the - in-flight mapping. - -3. **Translation.** Memoized recursive DFS from each desired ref tip.
- Blobs are copied as-is and re-hashed; trees, commits, and tags are - decoded, their embedded hashes rewritten via the SHA1 → SHA256 - mapping, signatures stripped, and messages rewritten. The DFS - recursion includes message-reference edges: for each commit or - tag whose message mentions a SHA1 of a commit or tag in the - reachable set, that referenced object is translated first. This - guarantees the mapping is populated before the substitution - happens, so cross-branch references resolve as reliably as - ancestor references. Each translated object is written as a loose - object under `objects/<xx>/<rest>` in the target. - -4. **Refs and side outputs.** Refs and HEAD are written at the - translated tip hashes; the origin notes commit (if enabled) is - built and stored under `refs/notes/sha1-origin`; the mapping file - (if requested) is written. - -A defensive `inProgress` set guards against cycles during phase 3. -Real Git histories cannot form cycles (parent, tree, and tag-target -edges are a DAG by construction, and SHA1 message-reference cycles -are cryptographically infeasible), so a trip into this branch is a -hard error rather than a silent skip. - -Note: loose object writing is done by hand rather than via go-git's +The pipeline runs in four phases (pack fetch → discovery → target init → +translation), with refs and side outputs written at the end. Submodule +errors surface in discovery, before the target repo is materialized. + +Translation is a memoized recursive DFS. Tree, parent, tag-target, and +message-reference edges are all part of the DFS, so the mapping is +populated by the time any object's bytes are encoded. A defensive +`inProgress` set guards against cycles; real Git histories can't form +them (parent/tree/tag-target edges are a DAG, and SHA1 message- +reference cycles are cryptographically infeasible), but a trip into +the guard becomes a hard error rather than a stack overflow.
+ +Loose object writing is done by hand rather than via go-git's `SetEncodedObject`. The underlying `plumbing/format/objfile.Writer` in `go-git/v6@v6.0.0-alpha.3` hardcodes SHA1 in its hasher, which would put every translated object at a SHA1-derived path even though -the content references SHA256. This is verified by a unit test that -recomputes `sha256` of every loose object's decompressed content and -compares against the filename. +the content references SHA256. A unit test recomputes `sha256` of +every loose object's decompressed content and compares against the +filename to prevent regression. From b67f4c49b811d9413eae4b60ba82f9b47372e90d Mon Sep 17 00:00:00 2001 From: Andrea Nodari Date: Sat, 23 May 2026 14:35:40 +0200 Subject: [PATCH 03/19] `--progress` Entire-Checkpoint: fb6dc8f73ee0 --- cmd/git-sync/convert_sha256.go | 2 + .../internal/sha256convert/sha256convert.go | 126 ++++++++++++++++-- .../sha256convert/sha256convert_test.go | 22 +-- docs/convert-sha256.md | 1 + 4 files changed, 128 insertions(+), 23 deletions(-) diff --git a/cmd/git-sync/convert_sha256.go b/cmd/git-sync/convert_sha256.go index 080f1ce9..464f011f 100644 --- a/cmd/git-sync/convert_sha256.go +++ b/cmd/git-sync/convert_sha256.go @@ -78,6 +78,8 @@ any so the caller can convert the submodule repository first.`, excludeRefPrefixFlag(cmd, &req.ExcludeRefPrefixes) addProtocolFlag(cmd, &protocolVal) cmd.Flags().BoolVarP(&req.Verbose, "verbose", "v", false, "verbose logging") + cmd.Flags().BoolVar(&req.Progress, "progress", false, + "show live per-phase object counts on stderr (TTY only)") cmd.Flags().BoolVar(&req.KeepSourceObjects, "keep-source-objects", false, "keep the temporary SHA1 store on disk after conversion (for debugging)") cmd.Flags().StringVar(&req.MappingFile, "write-mapping", "", diff --git a/cmd/git-sync/internal/sha256convert/sha256convert.go b/cmd/git-sync/internal/sha256convert/sha256convert.go index 34b3eecb..e1b2a1fa 100644 --- 
a/cmd/git-sync/internal/sha256convert/sha256convert.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert.go @@ -30,6 +30,7 @@ import ( "sort" "strconv" "strings" + "sync/atomic" "time" git "github.com/go-git/go-git/v6" @@ -66,6 +67,7 @@ type Request struct { ProtocolMode gitsync.ProtocolMode Verbose bool + Progress bool KeepSourceObjects bool // MappingFile, when non-empty, is a path to which a TSV of every @@ -227,7 +229,20 @@ func Run(ctx context.Context, req Request) (Result, error) { rootSHA1s = append(rootSHA1s, d.SourceHash) } fmt.Fprintln(out, "discovering reachable objects ...") - reachable, err := discoverReachable(srcRepo.Storer, rootSHA1s) + progressActive := req.Progress && isTTY(out) + var discCounter *atomic.Int64 + var stopDisc func() + if progressActive { + c := new(atomic.Int64) + discCounter = c + stopDisc = startProgressTick(out, func() string { + return fmt.Sprintf(" discovered %d objects", c.Load()) + }) + } + reachable, err := discoverReachable(srcRepo.Storer, rootSHA1s, discCounter) + if stopDisc != nil { + stopDisc() + } if err != nil { return Result{}, fmt.Errorf("discover reachable: %w", err) } @@ -243,11 +258,24 @@ func Run(ctx context.Context, req Request) (Result, error) { return Result{}, err } fmt.Fprintln(out, "translating objects to sha256 ...") + var stopTr func() + if progressActive { + stopTr = startProgressTick(out, func() string { + return fmt.Sprintf(" translated %d blobs, %d trees, %d commits, %d tags", + tr.blobs.Load(), tr.trees.Load(), tr.commitsCount.Load(), tr.tags.Load()) + }) + } for _, d := range desired { if _, err := tr.translate(d.SourceHash); err != nil { + if stopTr != nil { + stopTr() + } return Result{}, fmt.Errorf("translate %s: %w", d.SourceRef, err) } } + if stopTr != nil { + stopTr() + } // Write refs --------------------------------------------------------- refsWritten, err := writeRefs(dstRepo.Storer, desired, tr.mapping) @@ -272,7 +300,7 @@ func Run(ctx context.Context, req Request) (Result, error) 
{ TargetDir: req.TargetDir, Protocol: refService.Protocol, RefsConverted: refsWritten, - Counts: tr.counts, + Counts: tr.snapshotCounts(), SignaturesStripped: tr.signaturesStripped, MessageRewrites: tr.messageRewrites, } @@ -419,11 +447,26 @@ type translator struct { // so they know which references to investigate via the mapping // file. ambiguousMessageRefs map[string]struct{} - counts Counts - signaturesStripped int - messageRewrites int - rewriteMessages bool - lastNotesCommit plumbing.Hash + // Live counts updated atomically so the --progress ticker goroutine + // can sample them without racing against translation. Snapshot into + // a Counts struct at the end of the run. + blobs atomic.Int64 + trees atomic.Int64 + commitsCount atomic.Int64 + tags atomic.Int64 + signaturesStripped int + messageRewrites int + rewriteMessages bool + lastNotesCommit plumbing.Hash +} + +func (t *translator) snapshotCounts() Counts { + return Counts{ + Blobs: int(t.blobs.Load()), + Trees: int(t.trees.Load()), + Commits: int(t.commitsCount.Load()), + Tags: int(t.tags.Load()), + } } func newTranslator(src, dst storer.Storer, targetDir string, rewriteMessages bool, reachable map[plumbing.Hash]plumbing.ObjectType) (*translator, error) { @@ -464,7 +507,10 @@ func newTranslator(src, dst storer.Storer, targetDir string, rewriteMessages boo // // Message-reference edges are not part of this pass; those are added // during translation, where the partial mapping is updated as we go. -func discoverReachable(src storer.Storer, roots []plumbing.Hash) (map[plumbing.Hash]plumbing.ObjectType, error) { +// +// If progress is non-nil, it is incremented once per object visited. +// The --progress ticker samples this counter from another goroutine. 
+func discoverReachable(src storer.Storer, roots []plumbing.Hash, progress *atomic.Int64) (map[plumbing.Hash]plumbing.ObjectType, error) { srcFS, ok := src.(*filesystem.Storage) if !ok { return nil, fmt.Errorf("source storage is not filesystem-backed (%T)", src) @@ -480,6 +526,9 @@ func discoverReachable(src storer.Storer, roots []plumbing.Hash) (map[plumbing.H return fmt.Errorf("discover %s: %w", sha1, err) } reachable[sha1] = obj.Type() + if progress != nil { + progress.Add(1) + } switch obj.Type() { case plumbing.BlobObject: return nil @@ -587,7 +636,7 @@ func (t *translator) translateBlob(sha1 plumbing.Hash, src plumbing.EncodedObjec return plumbing.ZeroHash, fmt.Errorf("blob store: %w", err) } t.mapping[sha1] = newHash - t.counts.Blobs++ + t.blobs.Add(1) return newHash, nil } @@ -635,7 +684,7 @@ func (t *translator) translateTree(sha1 plumbing.Hash, src plumbing.EncodedObjec return plumbing.ZeroHash, fmt.Errorf("store tree %s: %w", sha1, err) } t.mapping[sha1] = newHash - t.counts.Trees++ + t.trees.Add(1) return newHash, nil } @@ -706,7 +755,7 @@ func (t *translator) translateCommit(sha1 plumbing.Hash, src plumbing.EncodedObj } t.mapping[sha1] = newHash t.commits = append(t.commits, sha1) - t.counts.Commits++ + t.commitsCount.Add(1) return newHash, nil } @@ -751,7 +800,7 @@ func (t *translator) translateTag(sha1 plumbing.Hash, src plumbing.EncodedObject return plumbing.ZeroHash, fmt.Errorf("store tag %s: %w", sha1, err) } t.mapping[sha1] = newHash - t.counts.Tags++ + t.tags.Add(1) return newHash, nil } @@ -1036,6 +1085,59 @@ func (t *translator) writeOriginNotes(refName string) (string, error) { return refName, nil } +// startProgressTick spawns a goroutine that, every 500 ms, rewrites a +// single line in place on out with the string returned by render. The +// returned stop function halts the goroutine and emits a trailing +// newline so subsequent prints start on a fresh row. 
+// +// Only intended for TTY output: the rendered line uses '\r\x1b[K' to +// overwrite itself, which looks fine on a terminal and ugly anywhere +// else. Callers gate on isTTY before calling. +func startProgressTick(out io.Writer, render func() string) func() { + stop := make(chan struct{}) + done := make(chan struct{}) + go func() { + defer close(done) + t := time.NewTicker(500 * time.Millisecond) + defer t.Stop() + for { + select { + case <-stop: + return + case <-t.C: + fmt.Fprintf(out, "\r\x1b[K%s", render()) + } + } + }() + stopOnce := false + return func() { + if stopOnce { + return + } + stopOnce = true + close(stop) + <-done + // Last frame + newline so subsequent output is on a clean row. + fmt.Fprintf(out, "\r\x1b[K%s\n", render()) + } +} + +// isTTY reports whether w is a writable terminal. The --progress +// ticker is suppressed on non-TTY destinations because the '\r'-style +// in-place updates would otherwise show up as literal control +// characters in log files and pipes. +func isTTY(w io.Writer) bool { + f, ok := w.(*os.File) + if !ok { + return false + } + fi, err := f.Stat() + if err != nil { + return false + } + return (fi.Mode() & os.ModeCharDevice) != 0 +} + // writeMappingFile dumps the SHA1 → SHA256 mapping as a TSV. Lines are // sorted by SHA1 so diffs across runs are stable. 
Includes every // translated object (blob/tree/commit/tag), so external tooling can use diff --git a/cmd/git-sync/internal/sha256convert/sha256convert_test.go b/cmd/git-sync/internal/sha256convert/sha256convert_test.go index c53f5d70..3bd78b5c 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert_test.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert_test.go @@ -81,7 +81,7 @@ func TestTranslator(t *testing.T) { } tagHash := writeObject(t, srcRepo.Storer, tag.Encode) - reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{tagHash}) + reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{tagHash}, nil) if err != nil { t.Fatalf("discoverReachable: %v", err) } @@ -95,8 +95,8 @@ func TestTranslator(t *testing.T) { } wantCounts := Counts{Blobs: 1, Trees: 1, Commits: 2, Tags: 1} - if tr.counts != wantCounts { - t.Errorf("counts: got %+v, want %+v", tr.counts, wantCounts) + if got := tr.snapshotCounts(); got != wantCounts { + t.Errorf("counts: got %+v, want %+v", got, wantCounts) } if tr.signaturesStripped != 2 { t.Errorf("signatures stripped: got %d, want 2 (commit + tag)", tr.signaturesStripped) @@ -104,11 +104,11 @@ func TestTranslator(t *testing.T) { // Idempotency: translating the same hash again must reuse the mapping // without writing more objects or bumping counters. 
- startBlobs := tr.counts.Blobs + startBlobs := tr.blobs.Load() if _, err := tr.translate(tagHash); err != nil { t.Fatalf("re-translate tag: %v", err) } - if tr.counts.Blobs != startBlobs { + if tr.blobs.Load() != startBlobs { t.Errorf("re-translate increased blob count; memoization broken") } @@ -196,7 +196,7 @@ func TestTranslator_RewritesMessageHashes(t *testing.T) { } childSHA1 := writeObject(t, srcRepo.Storer, child.Encode) - reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{childSHA1}) + reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{childSHA1}, nil) if err != nil { t.Fatalf("discoverReachable: %v", err) } @@ -275,7 +275,7 @@ func TestTranslator_RewritesCrossBranchReferences(t *testing.T) { // Discovery must see both branches so the reachable set covers cA // before cB is encoded. - reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{cB, cA}) + reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{cB, cA}, nil) if err != nil { t.Fatalf("discoverReachable: %v", err) } @@ -329,7 +329,7 @@ func TestTranslator_SkipMessageRewrite(t *testing.T) { } childSHA1 := writeObject(t, srcRepo.Storer, child.Encode) - reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{childSHA1}) + reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{childSHA1}, nil) if err != nil { t.Fatalf("discoverReachable: %v", err) } @@ -360,7 +360,7 @@ func TestTranslator_WriteOriginNotes(t *testing.T) { c1 := writeObject(t, srcRepo.Storer, (&object.Commit{Author: sig, Committer: sig, Message: "c1\n", TreeHash: tree}).Encode) c2 := writeObject(t, srcRepo.Storer, (&object.Commit{Author: sig, Committer: sig, Message: "c2\n", TreeHash: tree, ParentHashes: []plumbing.Hash{c1}}).Encode) - reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{c2}) + reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{c2}, nil) if err != nil { t.Fatalf("discoverReachable: %v", err) } @@ -425,7 
+425,7 @@ func TestTranslator_WriteMappingFile(t *testing.T) { sig := object.Signature{Name: "Test", Email: "t@example.com", When: time.Unix(1700000000, 0).UTC()} commit := writeObject(t, srcRepo.Storer, (&object.Commit{Author: sig, Committer: sig, Message: "c\n", TreeHash: tree}).Encode) - reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{commit}) + reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{commit}, nil) if err != nil { t.Fatalf("discoverReachable: %v", err) } @@ -531,7 +531,7 @@ func TestTranslator_UnresolvableSubmodule(t *testing.T) { {Name: "sub", Mode: filemode.Submodule, Hash: external}, }) - _, err = discoverReachable(srcRepo.Storer, []plumbing.Hash{treeHash}) + _, err = discoverReachable(srcRepo.Storer, []plumbing.Hash{treeHash}, nil) if err == nil { t.Fatal("expected discoverReachable to fail on unresolvable submodule, got nil") } diff --git a/docs/convert-sha256.md b/docs/convert-sha256.md index 33f539d8..e8a1cb83 100644 --- a/docs/convert-sha256.md +++ b/docs/convert-sha256.md @@ -148,6 +148,7 @@ system that holds frozen SHA1 references. 
--no-rewrite-messages skip inline hash rewrites in messages --no-origin-notes skip refs/notes/sha1-origin --keep-source-objects leave the temp SHA1 store on disk +--progress live per-phase object counts (TTY only) --json machine-readable output --verbose, -v verbose logging ``` From 44bd16c067b4a9833237697e3802030b1431c610 Mon Sep 17 00:00:00 2001 From: Andrea Nodari Date: Sat, 23 May 2026 14:44:35 +0200 Subject: [PATCH 04/19] Add --check Entire-Checkpoint: 80a4babf00ee --- cmd/git-sync/convert_sha256.go | 2 + .../internal/sha256convert/sha256convert.go | 114 ++++++++++++++++++ .../sha256convert/sha256convert_test.go | 22 ++++ docs/convert-sha256.md | 35 +++++- 4 files changed, 170 insertions(+), 3 deletions(-) diff --git a/cmd/git-sync/convert_sha256.go b/cmd/git-sync/convert_sha256.go index 464f011f..876ee4a2 100644 --- a/cmd/git-sync/convert_sha256.go +++ b/cmd/git-sync/convert_sha256.go @@ -80,6 +80,8 @@ any so the caller can convert the submodule repository first.`, cmd.Flags().BoolVarP(&req.Verbose, "verbose", "v", false, "verbose logging") cmd.Flags().BoolVar(&req.Progress, "progress", false, "show live per-phase object counts on stderr (TTY only)") + cmd.Flags().BoolVar(&req.Check, "check", false, + "verify the output after conversion (config, HEAD, refs, git fsck --full)") cmd.Flags().BoolVar(&req.KeepSourceObjects, "keep-source-objects", false, "keep the temporary SHA1 store on disk after conversion (for debugging)") cmd.Flags().StringVar(&req.MappingFile, "write-mapping", "", diff --git a/cmd/git-sync/internal/sha256convert/sha256convert.go b/cmd/git-sync/internal/sha256convert/sha256convert.go index e1b2a1fa..91d0a988 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert.go @@ -25,6 +25,7 @@ import ( "net/http" "net/url" "os" + "os/exec" "path/filepath" "regexp" "sort" @@ -68,6 +69,7 @@ type Request struct { ProtocolMode gitsync.ProtocolMode Verbose bool Progress bool + Check bool 
KeepSourceObjects bool // MappingFile, when non-empty, is a path to which a TSV of every @@ -108,9 +110,18 @@ type Result struct { AmbiguousMessageRefs []string `json:"ambiguousMessageRefs,omitempty"` OriginNotesRef string `json:"originNotesRef,omitempty"` MappingFile string `json:"mappingFile,omitempty"` + Checks []Check `json:"checks,omitempty"` TempDir string `json:"tempDir,omitempty"` } +// Check is one named verification step from --check, with the result +// and a short detail string suitable for logging/JSON output. +type Check struct { + Name string `json:"name"` + OK bool `json:"ok"` + Detail string `json:"detail,omitempty"` +} + // Lines satisfies the human-readable output contract used by other git-sync subcommands. func (r Result) Lines() []string { lines := []string{ @@ -335,9 +346,112 @@ func Run(ctx context.Context, req Request) (Result, error) { cleanupTemp = false res.TempDir = tempDir } + + if req.Check { + fmt.Fprintln(out, "verifying output ...") + res.Checks = runChecks(req.TargetDir, dstRepo, refsWritten) + for _, c := range res.Checks { + mark := "✓" + if !c.OK { + mark = "✗" + } + fmt.Fprintf(out, " %s %s: %s\n", mark, c.Name, c.Detail) + } + for _, c := range res.Checks { + if !c.OK { + return res, fmt.Errorf("check %q failed: %s", c.Name, c.Detail) + } + } + } + return res, nil } +// runChecks performs lightweight verification of the converted repo. +// Returns one Check per step. Callers print and/or fail-on-error based +// on these. No early return so users see the full picture even when an +// earlier check fails. +func runChecks(targetDir string, repo *git.Repository, refsExpected int) []Check { + checks := []Check{} + + // 1. Config: extensions.objectformat = sha256. 
+ cfgBytes, err := os.ReadFile(filepath.Join(targetDir, "config")) + switch { + case err != nil: + checks = append(checks, Check{Name: "config", OK: false, Detail: err.Error()}) + case !bytes.Contains(cfgBytes, []byte("objectformat = sha256")): + checks = append(checks, Check{Name: "config", OK: false, Detail: "extensions.objectformat = sha256 not set"}) + default: + checks = append(checks, Check{Name: "config", OK: true, Detail: "extensions.objectformat = sha256"}) + } + + // 2. HEAD resolves to an existing object. + head, err := repo.Reference(plumbing.HEAD, true) + switch { + case err != nil: + checks = append(checks, Check{Name: "HEAD", OK: false, Detail: err.Error()}) + case head.Hash().IsZero(): + checks = append(checks, Check{Name: "HEAD", OK: false, Detail: "resolves to zero hash"}) + default: + if _, err := repo.Storer.EncodedObject(plumbing.AnyObject, head.Hash()); err != nil { + checks = append(checks, Check{Name: "HEAD", OK: false, Detail: fmt.Sprintf("%s: %v", head.Hash(), err)}) + } else { + checks = append(checks, Check{Name: "HEAD", OK: true, Detail: head.Hash().String()}) + } + } + + // 3. Every written ref resolves to an existing object. + resolved := 0 + missing := "" + refs, err := repo.References() + if err != nil { + checks = append(checks, Check{Name: "refs", OK: false, Detail: err.Error()}) + } else { + _ = refs.ForEach(func(r *plumbing.Reference) error { + if r.Type() != plumbing.HashReference { + return nil + } + if r.Name() == plumbing.ReferenceName(originNotesRef) { + // Counted separately below; not in the refsExpected total. 
+ return nil + } + if _, err := repo.Storer.EncodedObject(plumbing.AnyObject, r.Hash()); err != nil { + if missing == "" { + missing = fmt.Sprintf("%s → %s: %v", r.Name(), r.Hash(), err) + } + return nil + } + resolved++ + return nil + }) + if missing != "" { + checks = append(checks, Check{Name: "refs", OK: false, Detail: missing}) + } else if resolved < refsExpected { + checks = append(checks, Check{Name: "refs", OK: false, Detail: fmt.Sprintf("only %d / %d refs resolved", resolved, refsExpected)}) + } else { + checks = append(checks, Check{Name: "refs", OK: true, Detail: fmt.Sprintf("%d / %d resolve to objects", resolved, refsExpected)}) + } + } + + // 4. git fsck --full (if git is on PATH). + gitBin, err := exec.LookPath("git") + if err != nil { + checks = append(checks, Check{Name: "git fsck --full", OK: true, Detail: "skipped (git not in PATH)"}) + return checks + } + cmd := exec.Command(gitBin, "-C", targetDir, "fsck", "--full") + fsckOut, err := cmd.CombinedOutput() + switch { + case err != nil: + checks = append(checks, Check{Name: "git fsck --full", OK: false, Detail: fmt.Sprintf("%v\n%s", err, fsckOut)}) + case bytes.Contains(fsckOut, []byte("error")) || bytes.Contains(fsckOut, []byte("bad sha")): + checks = append(checks, Check{Name: "git fsck --full", OK: false, Detail: strings.TrimSpace(string(fsckOut))}) + default: + checks = append(checks, Check{Name: "git fsck --full", OK: true, Detail: "clean"}) + } + return checks +} + const originNotesRef = "refs/notes/sha1-origin" // ensureEmptyTarget refuses to init into a non-empty directory so the user diff --git a/cmd/git-sync/internal/sha256convert/sha256convert_test.go b/cmd/git-sync/internal/sha256convert/sha256convert_test.go index 3bd78b5c..a47d8632 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert_test.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert_test.go @@ -680,6 +680,7 @@ func TestRun_GitHTTPBackend(t *testing.T) { SourceURL: srv.URL + "/source.git", TargetDir: dstDir, 
MappingFile: mappingPath, + Check: true, Out: io.Discard, }) if err != nil { @@ -753,6 +754,27 @@ func TestRun_GitHTTPBackend(t *testing.T) { if !strings.Contains(string(mapping), headSHA1) { t.Errorf("mapping file missing head SHA1 %s:\n%s", headSHA1, mapping) } + + // --check: every step should pass against a freshly-converted repo, + // including git fsck --full (available since we already need the + // git binary to drive the source side of this test). + if len(res.Checks) == 0 { + t.Fatal("expected Checks to be populated when --check is enabled") + } + for _, c := range res.Checks { + if !c.OK { + t.Errorf("check %q failed: %s", c.Name, c.Detail) + } + } + expected := map[string]bool{"config": false, "HEAD": false, "refs": false, "git fsck --full": false} + for _, c := range res.Checks { + expected[c.Name] = true + } + for name, present := range expected { + if !present { + t.Errorf("--check did not run %q step", name) + } + } } func mustGit(t *testing.T, dir string, args ...string) { diff --git a/docs/convert-sha256.md b/docs/convert-sha256.md index e8a1cb83..d39f43b5 100644 --- a/docs/convert-sha256.md +++ b/docs/convert-sha256.md @@ -147,6 +147,7 @@ system that holds frozen SHA1 references. --write-mapping write SHA1 → SHA256 TSV to this path --no-rewrite-messages skip inline hash rewrites in messages --no-origin-notes skip refs/notes/sha1-origin +--check verify the output (config, HEAD, refs, git fsck) --keep-source-objects leave the temp SHA1 store on disk --progress live per-phase object counts (TTY only) --json machine-readable output @@ -221,9 +222,37 @@ it O(log N), an easy optimization if someone hits the wall. 
## Verifying the Output -Standard git tooling works against the converted repo without -additional flags — the `extensions.objectformat` setting in the local -config is enough for git to switch hashing: +Pass `--check` and the command runs four sanity checks against the +converted repo at the end of the run, printing one line each: + +``` +verifying output ... + ✓ config: extensions.objectformat = sha256 + ✓ HEAD: ffe9fff421b77f2dcc049a95b3b8ba7b9da8976dd61bcf35e9fe2d993babc470 + ✓ refs: 37 / 37 resolve to objects + ✓ git fsck --full: clean +``` + +The checks are: + +1. **config** — `extensions.objectformat = sha256` is present in + `<target>/config`. +2. **HEAD** — resolves to a non-zero hash and that object exists in + the store. +3. **refs** — every written ref (except `refs/notes/sha1-origin`, + counted separately) resolves to an object in the store. The count + matches `RefsConverted`. +4. **git fsck --full** — the external `git` binary runs a full + integrity check. Skipped (and reported as such) when `git` isn't + on `PATH`; the conversion still succeeds. + +If any check fails the command exits non-zero. The full per-check +results are also in `--json`'s `checks` array. Without `--check` no +verification runs and the run completes as soon as the conversion +itself finishes. + +You can also run the checks by hand on a converted repo, with or +without `--check`: ```bash git -C /path/to/out.git fsck --full # zero errors expected From 7726110dec3f998cee62e578642f8aceb4b6044b Mon Sep 17 00:00:00 2001 From: Andrea Nodari Date: Mon, 25 May 2026 10:40:33 +0200 Subject: [PATCH 05/19] Allow users to tag each branch tip and sign it This allows to "sign" the conversion.
Entire-Checkpoint: 1857d92ce8c7 --- cmd/git-sync/convert_sha256.go | 4 + .../internal/sha256convert/sha256convert.go | 117 ++++++++++++++++-- .../sha256convert/sha256convert_test.go | 95 ++++++++++++++ docs/convert-sha256.md | 37 ++++++ 4 files changed, 244 insertions(+), 9 deletions(-) diff --git a/cmd/git-sync/convert_sha256.go b/cmd/git-sync/convert_sha256.go index 876ee4a2..738ac120 100644 --- a/cmd/git-sync/convert_sha256.go +++ b/cmd/git-sync/convert_sha256.go @@ -82,6 +82,10 @@ any so the caller can convert the submodule repository first.`, "show live per-phase object counts on stderr (TTY only)") cmd.Flags().BoolVar(&req.Check, "check", false, "verify the output after conversion (config, HEAD, refs, git fsck --full)") + cmd.Flags().BoolVar(&req.Sign, "sign", false, + "after conversion, sign each branch tip as refs/tags/converted/ via `git tag -s`") + cmd.Flags().StringVar(&req.SignKey, "sign-key", "", + "signing key id to pass to `git tag -s -u`; default uses the repo's user.signingkey") cmd.Flags().BoolVar(&req.KeepSourceObjects, "keep-source-objects", false, "keep the temporary SHA1 store on disk after conversion (for debugging)") cmd.Flags().StringVar(&req.MappingFile, "write-mapping", "", diff --git a/cmd/git-sync/internal/sha256convert/sha256convert.go b/cmd/git-sync/internal/sha256convert/sha256convert.go index 91d0a988..711fb823 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert.go @@ -66,10 +66,19 @@ type Request struct { AllRefs bool ExcludeRefPrefixes []string - ProtocolMode gitsync.ProtocolMode - Verbose bool - Progress bool - Check bool + ProtocolMode gitsync.ProtocolMode + Verbose bool + Progress bool + Check bool + + // Sign, when true, runs `git tag -s converted/ ` for + // every converted branch after the conversion completes, attesting + // the entire reachable history of each branch via its tip's parent + // chain. 
SignKey is passed to git as `-u `; leave empty to + // use the repo's default signing identity. + Sign bool + SignKey string + KeepSourceObjects bool // MappingFile, when non-empty, is a path to which a TSV of every @@ -110,6 +119,7 @@ type Result struct { AmbiguousMessageRefs []string `json:"ambiguousMessageRefs,omitempty"` OriginNotesRef string `json:"originNotesRef,omitempty"` MappingFile string `json:"mappingFile,omitempty"` + SignedTags []string `json:"signedTags,omitempty"` Checks []Check `json:"checks,omitempty"` TempDir string `json:"tempDir,omitempty"` } @@ -159,6 +169,21 @@ func (r Result) Lines() []string { if r.MappingFile != "" { lines = append(lines, fmt.Sprintf("mapping written to: %s", r.MappingFile)) } + if n := len(r.SignedTags); n > 0 { + preview := r.SignedTags + const max = 5 + extra := 0 + if len(preview) > max { + extra = len(preview) - max + preview = preview[:max] + } + line := fmt.Sprintf("signed %d branch attestation tag(s): %s", + n, strings.Join(preview, ", ")) + if extra > 0 { + line += fmt.Sprintf(", ... (%d more; full list in --json)", extra) + } + lines = append(lines, line) + } if r.TempDir != "" { lines = append(lines, fmt.Sprintf("kept source objects: %s", r.TempDir)) } @@ -342,6 +367,14 @@ func Run(ctx context.Context, req Request) (Result, error) { res.MappingFile = req.MappingFile } + if req.Sign { + signed, err := signBranchTips(out, req.TargetDir, req.SignKey, req.SourceURL, desired) + if err != nil { + return res, fmt.Errorf("sign: %w", err) + } + res.SignedTags = signed + } + if req.KeepSourceObjects { cleanupTemp = false res.TempDir = tempDir @@ -367,6 +400,62 @@ func Run(ctx context.Context, req Request) (Result, error) { return res, nil } +// signBranchTips runs `git tag -s converted/ ` for every +// branch in the desired set. 
The converter's signing identity (whatever +// `user.signingkey` / `gpg.format` is set to in the target repo, or the +// caller-supplied signKey) attests each branch's full reachable history +// via the parent chain encoded in the tip commit's bytes. +// +// stdin/stderr are inherited so gpg/ssh-agent prompts work +// interactively. A failure short-circuits the run; tags signed before +// the failure stay in the target repo. +func signBranchTips(out io.Writer, targetDir, signKey, sourceURL string, desired map[plumbing.ReferenceName]planner.DesiredRef) ([]string, error) { + gitBin, err := exec.LookPath("git") + if err != nil { + return nil, fmt.Errorf("git binary required to sign: %w", err) + } + // Iterate in a deterministic order so re-runs over the same source + // produce the same sequence of tags (modulo the signature payload, + // which carries the signer's timestamp). + branchNames := make([]string, 0, len(desired)) + for name := range desired { + if name.IsBranch() { + branchNames = append(branchNames, string(name)) + } + } + sort.Strings(branchNames) + + var signed []string + for _, refName := range branchNames { + shortName := plumbing.ReferenceName(refName).Short() + tagName := strings.TrimPrefix(attestationTagPrefix, "refs/tags/") + shortName + fmt.Fprintf(out, "signing %s ...\n", "refs/tags/"+tagName) + + msg := fmt.Sprintf( + "SHA1 → SHA256 conversion attestation for %s.\n\n"+ + "Source: %s\nProduced by git-sync convert-sha256.\n", + refName, sourceURL) + args := []string{"-C", targetDir, "tag", "-s", "-m", msg} + if signKey != "" { + args = append(args, "-u", signKey) + } + args = append(args, tagName, refName) + + cmd := exec.Command(gitBin, args...) + // Inherit stdio so gpg/ssh-agent passphrase prompts work. We + // intentionally do not capture stdout/stderr — the user needs + // to see them when authenticating. 
+ cmd.Stdin = os.Stdin + cmd.Stdout = os.Stderr // git tag -s is usually quiet on success + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + return signed, fmt.Errorf("git tag -s %s: %w", tagName, err) + } + signed = append(signed, "refs/tags/"+tagName) + } + return signed, nil +} + // runChecks performs lightweight verification of the converted repo. // Returns one Check per step. Callers print and/or fail-on-error based // on these. No early return so users see the full picture even when an @@ -400,7 +489,11 @@ func runChecks(targetDir string, repo *git.Repository, refsExpected int) []Check } } - // 3. Every written ref resolves to an existing object. + // 3. Every written ref resolves to an existing object. Skip refs we + // add as side outputs (the origin-notes ref and any + // refs/tags/converted/* attestation tags from --sign), since they + // are accounted for in their own Result fields and would otherwise + // make the displayed fraction misleading. resolved := 0 missing := "" refs, err := repo.References() @@ -411,13 +504,16 @@ func runChecks(targetDir string, repo *git.Repository, refsExpected int) []Check if r.Type() != plumbing.HashReference { return nil } - if r.Name() == plumbing.ReferenceName(originNotesRef) { - // Counted separately below; not in the refsExpected total. 
+ name := r.Name() + if name == plumbing.ReferenceName(originNotesRef) { + return nil + } + if strings.HasPrefix(string(name), attestationTagPrefix) { return nil } if _, err := repo.Storer.EncodedObject(plumbing.AnyObject, r.Hash()); err != nil { if missing == "" { - missing = fmt.Sprintf("%s → %s: %v", r.Name(), r.Hash(), err) + missing = fmt.Sprintf("%s → %s: %v", name, r.Hash(), err) } return nil } @@ -452,7 +548,10 @@ func runChecks(targetDir string, repo *git.Repository, refsExpected int) []Check return checks } -const originNotesRef = "refs/notes/sha1-origin" +const ( + originNotesRef = "refs/notes/sha1-origin" + attestationTagPrefix = "refs/tags/converted/" +) // ensureEmptyTarget refuses to init into a non-empty directory so the user // doesn't quietly accumulate objects into an existing repo. diff --git a/cmd/git-sync/internal/sha256convert/sha256convert_test.go b/cmd/git-sync/internal/sha256convert/sha256convert_test.go index a47d8632..e1dcdc6e 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert_test.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert_test.go @@ -777,6 +777,101 @@ func TestRun_GitHTTPBackend(t *testing.T) { } } +// TestRun_GitHTTPBackend_Sign verifies the --sign path end-to-end. SSH +// signing is used (not GPG) because it can be set up from scratch in the +// test with just ssh-keygen, no agent required. 
+func TestRun_GitHTTPBackend_Sign(t *testing.T) { + if os.Getenv(gitHTTPBackendEnv) == "" { + t.Skipf("set %s=1 to run the convert-sha256 git-http-backend integration test", gitHTTPBackendEnv) + } + gitBin, err := exec.LookPath("git") + if err != nil { + t.Skipf("git binary not available: %v", err) + } + sshKeygenBin, err := exec.LookPath("ssh-keygen") + if err != nil { + t.Skipf("ssh-keygen not available: %v", err) + } + + root := t.TempDir() + srcBare := filepath.Join(root, "source.git") + worktree := filepath.Join(root, "work") + dstDir := filepath.Join(root, "target.git") + + mustGit(t, root, "init", "--bare", srcBare) + mustGit(t, root, "init", "-b", "main", worktree) + mustGit(t, worktree, "config", "user.name", "convert-sha256 test") + mustGit(t, worktree, "config", "user.email", "test@example.com") + mustWrite(t, filepath.Join(worktree, "README"), "hello\n") + mustGit(t, worktree, "add", "README") + mustGit(t, worktree, "commit", "-m", "initial") + mustGit(t, worktree, "remote", "add", "origin", srcBare) + mustGit(t, worktree, "push", "origin", "HEAD:refs/heads/main") + + // Generate an ephemeral ed25519 SSH key for signing. + keyPath := filepath.Join(root, "signkey") + keygen := exec.Command(sshKeygenBin, "-q", "-t", "ed25519", "-N", "", "-f", keyPath, "-C", "test@example.com") + if out, err := keygen.CombinedOutput(); err != nil { + t.Fatalf("ssh-keygen: %v\n%s", err, out) + } + + // Write a global gitconfig that points git at SSH signing using the + // ephemeral key, and route GIT_CONFIG_GLOBAL at it so signBranchTips' + // subprocess inherits the config. 
+ globalCfg := filepath.Join(root, "global.gitconfig") + if err := os.WriteFile(globalCfg, []byte(fmt.Sprintf(` +[user] + name = Conversion Test + email = test@example.com + signingkey = %s +[gpg] + format = ssh +`, keyPath)), 0o600); err != nil { + t.Fatalf("write global gitconfig: %v", err) + } + t.Setenv("GIT_CONFIG_GLOBAL", globalCfg) + // Disable any system gitconfig so the test isn't influenced by host + // signing config. + t.Setenv("GIT_CONFIG_SYSTEM", "/dev/null") + + srv := newCGIBackend(t, gitBin, root) + defer srv.Close() + + res, err := Run(context.Background(), Request{ + SourceURL: srv.URL + "/source.git", + TargetDir: dstDir, + Sign: true, + Out: io.Discard, + }) + if err != nil { + t.Fatalf("convert-sha256 run: %v", err) + } + + wantTag := "refs/tags/converted/main" + if len(res.SignedTags) != 1 || res.SignedTags[0] != wantTag { + t.Errorf("SignedTags: got %v, want [%s]", res.SignedTags, wantTag) + } + + // The tag exists in the target and is an annotated, signed tag (the + // body contains a SSH SIGNATURE block; cat-file -p shows the tag + // object including the signature). + tagShow := mustGitOutput(t, dstDir, "cat-file", "-p", wantTag) + if !strings.Contains(tagShow, "BEGIN SSH SIGNATURE") { + t.Errorf("expected signed tag to contain an SSH SIGNATURE block:\n%s", tagShow) + } + if !strings.Contains(tagShow, "SHA1 → SHA256 conversion attestation") { + t.Errorf("expected signed tag message to contain attestation text:\n%s", tagShow) + } + + // Tag's target should be the branch tip (the SHA256 hash of the + // converted main). + mainTip := strings.TrimSpace(mustGitOutput(t, dstDir, "rev-parse", "refs/heads/main")) + tagTarget := strings.TrimSpace(mustGitOutput(t, dstDir, "rev-list", "-n", "1", wantTag)) + if tagTarget != mainTip { + t.Errorf("signed tag target: got %s, want %s (main tip)", tagTarget, mainTip) + } +} + func mustGit(t *testing.T, dir string, args ...string) { t.Helper() cmd := exec.Command("git", args...) 
diff --git a/docs/convert-sha256.md b/docs/convert-sha256.md index d39f43b5..3d7fa155 100644 --- a/docs/convert-sha256.md +++ b/docs/convert-sha256.md @@ -128,6 +128,41 @@ Useful for bulk rewriting external systems: feed the file to a script that walks Jira tickets, PR bodies, deploy manifests, or any other system that holds frozen SHA1 references. +### Branch-tip attestation tags (opt in via `--sign`) + +`--sign` shells out to `git tag -s converted/<branch> <branch>` for every +converted branch after the conversion completes. Each resulting +signed annotated tag is a cryptographic attestation by the converter +that the entire reachable history of that branch — every parent, tree, +and blob — is what the converter saw at conversion time. Anyone can +verify the chain afterwards with `git verify-tag refs/tags/converted/<branch>`. + +The mechanism is the standard one: parent hashes are part of each +commit's bytes, so the tip's hash transitively commits to the whole +history. Signing the tip attests every ancestor. + +Important nuance: the signature is by the *converter*, not by the +original authors (whose own signatures are necessarily lost — see +"GPG signatures are stripped" under Sharp Edges). The attestation +chain becomes "*X attests this is the conversion they produced*" +rather than "*the original authors wrote this commit*". For internal +mirrors or single-identity repos that's a strict improvement over +unsigned-everywhere; for broad public repos it is weaker than the +pre-conversion chain. + +Signing uses the target repo's git signing config (`user.signingkey`, +`gpg.format`) by default — same as `git commit -S` or a normal +`git tag -s`. Override with `--sign-key <key-id>`, which is passed to +`git tag -s -u <key-id>`. SSH signing (`gpg.format = ssh`) and OpenPGP +both work because we shell out to `git`. + +Requires the `git` binary on `PATH`. Signing failures (no key +configured, gpg/ssh-agent unavailable, etc.) 
abort the run after the +conversion has already completed — the target repo is left in a +valid converted state, just without the attestation tags. Re-run +`git tag -s converted/<branch> <branch>` manually once the signing +identity is set up. + ## Flags ``` @@ -148,6 +183,8 @@ system that holds frozen SHA1 references. --no-rewrite-messages skip inline hash rewrites in messages --no-origin-notes skip refs/notes/sha1-origin --check verify the output (config, HEAD, refs, git fsck) +--sign sign each branch tip via `git tag -s converted/<branch>` +--sign-key signing key id passed to `git tag -s -u <key-id>` --keep-source-objects leave the temp SHA1 store on disk --progress live per-phase object counts (TTY only) --json machine-readable output From 2b36d81c10adb57c92a4bb2fd2f73c7a1272345a Mon Sep 17 00:00:00 2001 From: Andrea Nodari Date: Mon, 25 May 2026 10:56:02 +0200 Subject: [PATCH 06/19] Fix lint findings golangci-lint v2.11.4 was unhappy across errcheck, exhaustive, gocritic, inamedparam, ireturn, maintidx, noctx, perfsprint, revive, and wrapcheck. Mostly mechanical: - Test helpers (initSHA1, initSHA256, mustTranslator) collapse the '_ :=' patterns into t.Fatalf-on-error wrappers. - exec.Command -> exec.CommandContext(ctx, ...); ctx threaded into runChecks and signBranchTips. - refs.ForEach return value checked instead of discarded. - if/else chain in runChecks rewritten as switch. - ireturn for openSource/normalizeAuth annotated, since both return shared transport interfaces by design. - maintidx on Run annotated; the function is a phase orchestrator, splitting it would obscure the pipeline. - exhaustive switches on plumbing.ObjectType annotated; the unhandled cases (OFSDelta/REFDelta/AnyObject/InvalidObject) can't reach a resolved storer. - Errors from io.ReadAll/MemoryObject.Reader/bufio.Flush/fmt.Fprintln/ auth.Method.Authorizer wrapped with fmt.Errorf. - Constant 'max' renamed to package-level 'previewMax' to stop shadowing the builtin in two places. 
- Named parameter added to interface methods that lint flagged. - Two fmt.Sprintf calls replaced with string concatenation. Entire-Checkpoint: 43bf90985cc5 --- .../internal/sha256convert/sha256convert.go | 89 +++++++++++------- .../sha256convert/sha256convert_test.go | 91 ++++++++++++++----- 2 files changed, 123 insertions(+), 57 deletions(-) diff --git a/cmd/git-sync/internal/sha256convert/sha256convert.go b/cmd/git-sync/internal/sha256convert/sha256convert.go index 711fb823..0f84c0fc 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert.go @@ -132,10 +132,15 @@ type Check struct { Detail string `json:"detail,omitempty"` } +// previewMax caps how many items from a potentially-long list (ambiguous +// prefixes, signed tags) are inlined into a Lines() summary before +// switching to a "(N more)" suffix. +const previewMax = 5 + // Lines satisfies the human-readable output contract used by other git-sync subcommands. func (r Result) Lines() []string { lines := []string{ - fmt.Sprintf("sha256 bare repo: %s", r.TargetDir), + "sha256 bare repo: " + r.TargetDir, fmt.Sprintf("source: %s (%s)", r.SourceURL, r.Protocol), fmt.Sprintf("converted: %d blobs, %d trees, %d commits, %d tags", r.Counts.Blobs, r.Counts.Trees, r.Counts.Commits, r.Counts.Tags), @@ -149,11 +154,10 @@ func (r Result) Lines() []string { } if n := len(r.AmbiguousMessageRefs); n > 0 { preview := r.AmbiguousMessageRefs - const max = 5 extra := 0 - if len(preview) > max { - extra = len(preview) - max - preview = preview[:max] + if len(preview) > previewMax { + extra = len(preview) - previewMax + preview = preview[:previewMax] } line := fmt.Sprintf("warning: %d ambiguous SHA1 hex prefix(es) in messages left unrewritten (look up via the mapping file): %s", n, strings.Join(preview, ", ")) @@ -167,15 +171,14 @@ func (r Result) Lines() []string { r.OriginNotesRef, strings.TrimPrefix(r.OriginNotesRef, "refs/notes/"))) } if r.MappingFile != "" { - 
lines = append(lines, fmt.Sprintf("mapping written to: %s", r.MappingFile)) + lines = append(lines, "mapping written to: "+r.MappingFile) } if n := len(r.SignedTags); n > 0 { preview := r.SignedTags - const max = 5 extra := 0 - if len(preview) > max { - extra = len(preview) - max - preview = preview[:max] + if len(preview) > previewMax { + extra = len(preview) - previewMax + preview = preview[:previewMax] } line := fmt.Sprintf("signed %d branch attestation tag(s): %s", n, strings.Join(preview, ", ")) @@ -185,12 +188,14 @@ func (r Result) Lines() []string { lines = append(lines, line) } if r.TempDir != "" { - lines = append(lines, fmt.Sprintf("kept source objects: %s", r.TempDir)) + lines = append(lines, "kept source objects: "+r.TempDir) } return lines } // Run performs the conversion described by req. +// +//nolint:maintidx // Run is a linear orchestrator over distinct phases (fetch → discover → init → translate → refs → notes → mapping → sign → check); each phase is short and isolated. Splitting into helpers would obscure the pipeline rather than clarify it. 
func Run(ctx context.Context, req Request) (Result, error) { if req.SourceURL == "" { return Result{}, errors.New("convert-sha256 requires --source-url") @@ -368,7 +373,7 @@ func Run(ctx context.Context, req Request) (Result, error) { } if req.Sign { - signed, err := signBranchTips(out, req.TargetDir, req.SignKey, req.SourceURL, desired) + signed, err := signBranchTips(ctx, out, req.TargetDir, req.SignKey, req.SourceURL, desired) if err != nil { return res, fmt.Errorf("sign: %w", err) } @@ -382,7 +387,7 @@ func Run(ctx context.Context, req Request) (Result, error) { if req.Check { fmt.Fprintln(out, "verifying output ...") - res.Checks = runChecks(req.TargetDir, dstRepo, refsWritten) + res.Checks = runChecks(ctx, req.TargetDir, dstRepo, refsWritten) for _, c := range res.Checks { mark := "✓" if !c.OK { @@ -409,7 +414,7 @@ func Run(ctx context.Context, req Request) (Result, error) { // stdin/stderr are inherited so gpg/ssh-agent prompts work // interactively. A failure short-circuits the run; tags signed before // the failure stay in the target repo. -func signBranchTips(out io.Writer, targetDir, signKey, sourceURL string, desired map[plumbing.ReferenceName]planner.DesiredRef) ([]string, error) { +func signBranchTips(ctx context.Context, out io.Writer, targetDir, signKey, sourceURL string, desired map[plumbing.ReferenceName]planner.DesiredRef) ([]string, error) { gitBin, err := exec.LookPath("git") if err != nil { return nil, fmt.Errorf("git binary required to sign: %w", err) @@ -441,7 +446,7 @@ func signBranchTips(out io.Writer, targetDir, signKey, sourceURL string, desired } args = append(args, tagName, refName) - cmd := exec.Command(gitBin, args...) + cmd := exec.CommandContext(ctx, gitBin, args...) // Inherit stdio so gpg/ssh-agent passphrase prompts work. We // intentionally do not capture stdout/stderr — the user needs // to see them when authenticating. 
@@ -460,7 +465,7 @@ func signBranchTips(out io.Writer, targetDir, signKey, sourceURL string, desired // Returns one Check per step. Callers print and/or fail-on-error based // on these. No early return so users see the full picture even when an // earlier check fails. -func runChecks(targetDir string, repo *git.Repository, refsExpected int) []Check { +func runChecks(ctx context.Context, targetDir string, repo *git.Repository, refsExpected int) []Check { checks := []Check{} // 1. Config: extensions.objectformat = sha256. @@ -500,7 +505,7 @@ func runChecks(targetDir string, repo *git.Repository, refsExpected int) []Check if err != nil { checks = append(checks, Check{Name: "refs", OK: false, Detail: err.Error()}) } else { - _ = refs.ForEach(func(r *plumbing.Reference) error { + walkErr := refs.ForEach(func(r *plumbing.Reference) error { if r.Type() != plumbing.HashReference { return nil } @@ -520,11 +525,14 @@ func runChecks(targetDir string, repo *git.Repository, refsExpected int) []Check resolved++ return nil }) - if missing != "" { + switch { + case walkErr != nil: + checks = append(checks, Check{Name: "refs", OK: false, Detail: walkErr.Error()}) + case missing != "": checks = append(checks, Check{Name: "refs", OK: false, Detail: missing}) - } else if resolved < refsExpected { + case resolved < refsExpected: checks = append(checks, Check{Name: "refs", OK: false, Detail: fmt.Sprintf("only %d / %d refs resolved", resolved, refsExpected)}) - } else { + default: checks = append(checks, Check{Name: "refs", OK: true, Detail: fmt.Sprintf("%d / %d resolve to objects", resolved, refsExpected)}) } } @@ -535,7 +543,7 @@ func runChecks(targetDir string, repo *git.Repository, refsExpected int) []Check checks = append(checks, Check{Name: "git fsck --full", OK: true, Detail: "skipped (git not in PATH)"}) return checks } - cmd := exec.Command(gitBin, "-C", targetDir, "fsck", "--full") + cmd := exec.CommandContext(ctx, gitBin, "-C", targetDir, "fsck", "--full") fsckOut, err := 
cmd.CombinedOutput() switch { case err != nil: @@ -572,6 +580,7 @@ func ensureEmptyTarget(path string) error { return nil } +//nolint:ireturn // gitproto.Conn is the shared transport interface; returning it directly mirrors the rest of git-sync. func openSource(ctx context.Context, req Request, planCfg planner.PlanConfig) (gitproto.Conn, *gitproto.RefService, []*plumbing.Reference, error) { ep, err := url.Parse(req.SourceURL) if err != nil { @@ -606,6 +615,7 @@ func openSource(ctx context.Context, req Request, planCfg planner.PlanConfig) (g return conn, svc, refs, nil } +//nolint:ireturn // gitproto.AuthMethod is the shared signing interface; returning it lets callers pass it straight through. func normalizeAuth(m auth.Method) gitproto.AuthMethod { if m == nil { return nil @@ -623,7 +633,12 @@ func normalizeAuth(m auth.Method) gitproto.AuthMethod { type authAdapter struct{ m auth.Method } -func (a authAdapter) Authorizer(req *http.Request) error { return a.m.Authorizer(req) } +func (a authAdapter) Authorizer(req *http.Request) error { + if err := a.m.Authorizer(req); err != nil { + return fmt.Errorf("authorize request: %w", err) + } + return nil +} // translator walks the SHA1 source store, rewrites object content with // SHA256-mapped hashes, and writes the result as loose objects under the @@ -742,7 +757,7 @@ func discoverReachable(src storer.Storer, roots []plumbing.Hash, progress *atomi if progress != nil { progress.Add(1) } - switch obj.Type() { + switch obj.Type() { //nolint:exhaustive // OFSDelta/REFDelta/AnyObject/InvalidObject cannot reach a resolved storage. 
case plumbing.BlobObject: return nil case plumbing.TreeObject: @@ -788,6 +803,8 @@ func discoverReachable(src storer.Storer, roots []plumbing.Hash, progress *atomi if err := visit(tag.Target); err != nil { return err } + default: + return fmt.Errorf("unexpected object type %v for %s during discovery", obj.Type(), sha1) } return nil } @@ -820,7 +837,7 @@ func (t *translator) translate(sha1 plumbing.Hash) (plumbing.Hash, error) { if err != nil { return plumbing.ZeroHash, fmt.Errorf("lookup %s: %w", sha1, err) } - switch obj.Type() { + switch obj.Type() { //nolint:exhaustive // OFSDelta/REFDelta/AnyObject/InvalidObject cannot reach a resolved storage. case plumbing.BlobObject: return t.translateBlob(sha1, obj) case plumbing.TreeObject: @@ -1029,10 +1046,14 @@ func encodeBody(typ plumbing.ObjectType, encode func(plumbing.EncodedObject) err } r, err := scratch.Reader() if err != nil { - return nil, err + return nil, fmt.Errorf("scratch reader: %w", err) } defer r.Close() - return io.ReadAll(r) + body, err := io.ReadAll(r) + if err != nil { + return nil, fmt.Errorf("read encoded body: %w", err) + } + return body, nil } // writeLoose writes a single object as a SHA256-named loose object under @@ -1137,6 +1158,8 @@ func (t *translator) rewriteHashesInMessage(msg string) (string, int) { out := hashPattern.ReplaceAllStringFunc(msg, func(s string) string { sha1, result := t.resolveMessageRef(s) switch result { + case matchNone: + return s case matchAmbiguous: t.ambiguousMessageRefs[s] = struct{}{} return s @@ -1238,8 +1261,8 @@ func (t *translator) writeOriginNotes(refName string) (string, error) { // We collect (sha256-of-new-commit → blob hash) pairs so the tree entry // path is the commit's new hash. 
type entry struct { - key plumbing.Hash - blob plumbing.Hash + key plumbing.Hash + blob plumbing.Hash } entries := make([]entry, 0, len(t.commits)) for _, oldSHA1 := range t.commits { @@ -1375,14 +1398,17 @@ func (t *translator) writeMappingFile(path string) error { defer f.Close() w := bufio.NewWriter(f) if _, err := fmt.Fprintln(w, "# sha1\tsha256"); err != nil { - return err + return fmt.Errorf("write mapping header: %w", err) } for _, p := range pairs { if _, err := fmt.Fprintf(w, "%s\t%s\n", p.sha1, p.sha256); err != nil { - return err + return fmt.Errorf("write mapping line: %w", err) } } - return w.Flush() + if err := w.Flush(); err != nil { + return fmt.Errorf("flush mapping file: %w", err) + } + return nil } func writeRefs( @@ -1403,4 +1429,3 @@ func writeRefs( } return written, nil } - diff --git a/cmd/git-sync/internal/sha256convert/sha256convert_test.go b/cmd/git-sync/internal/sha256convert/sha256convert_test.go index e1dcdc6e..510ad9ac 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert_test.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert_test.go @@ -23,6 +23,7 @@ import ( "github.com/go-git/go-git/v6/plumbing/filemode" formatcfg "github.com/go-git/go-git/v6/plumbing/format/config" "github.com/go-git/go-git/v6/plumbing/object" + gogitstorer "github.com/go-git/go-git/v6/plumbing/storer" "github.com/go-git/go-git/v6/storage/filesystem" ) @@ -245,8 +246,8 @@ func TestTranslator_RewritesCrossBranchReferences(t *testing.T) { root := t.TempDir() srcDir := filepath.Join(root, "src.git") dstDir := filepath.Join(root, "dst.git") - srcRepo, _ := git.PlainInit(srcDir, true) - dstRepo, _ := git.PlainInit(dstDir, true, git.WithObjectFormat(formatcfg.SHA256)) + srcRepo := initSHA1(t, srcDir) + dstRepo := initSHA256(t, dstDir) blobA := writeBlob(t, srcRepo.Storer, []byte("a\n")) treeA := writeTree(t, srcRepo.Storer, []object.TreeEntry{ @@ -279,7 +280,7 @@ func TestTranslator_RewritesCrossBranchReferences(t *testing.T) { if err != nil { 
t.Fatalf("discoverReachable: %v", err) } - tr, _ := newTranslator(srcRepo.Storer, dstRepo.Storer, dstDir, true, reachable) + tr := mustTranslator(t, srcRepo.Storer, dstRepo.Storer, dstDir, true, reachable) // Translate B first — the order that would have left the rewrite // stranded under the old design. if _, err := tr.translate(cB); err != nil { @@ -314,8 +315,8 @@ func TestTranslator_RewritesCrossBranchReferences(t *testing.T) { // untouched. func TestTranslator_SkipMessageRewrite(t *testing.T) { root := t.TempDir() - srcRepo, _ := git.PlainInit(filepath.Join(root, "src.git"), true) - dstRepo, _ := git.PlainInit(filepath.Join(root, "dst.git"), true, git.WithObjectFormat(formatcfg.SHA256)) + srcRepo := initSHA1(t, filepath.Join(root, "src.git")) + dstRepo := initSHA256(t, filepath.Join(root, "dst.git")) blob := writeBlob(t, srcRepo.Storer, []byte("x\n")) tree := writeTree(t, srcRepo.Storer, []object.TreeEntry{{Name: "f", Mode: filemode.Regular, Hash: blob}}) @@ -333,14 +334,17 @@ func TestTranslator_SkipMessageRewrite(t *testing.T) { if err != nil { t.Fatalf("discoverReachable: %v", err) } - tr, _ := newTranslator(srcRepo.Storer, dstRepo.Storer, filepath.Join(root, "dst.git"), false, reachable) + tr := mustTranslator(t, srcRepo.Storer, dstRepo.Storer, filepath.Join(root, "dst.git"), false, reachable) if _, err := tr.translate(childSHA1); err != nil { t.Fatalf("translate: %v", err) } if tr.messageRewrites != 0 { t.Errorf("expected no rewrites when disabled; got %d", tr.messageRewrites) } - got, _ := object.GetCommit(dstRepo.Storer, tr.mapping[childSHA1]) + got, err := object.GetCommit(dstRepo.Storer, tr.mapping[childSHA1]) + if err != nil { + t.Fatalf("read translated child: %v", err) + } if !strings.Contains(got.Message, parentHex) { t.Errorf("rewrite-disabled run still mutated the message: %q", got.Message) } @@ -351,8 +355,8 @@ func TestTranslator_SkipMessageRewrite(t *testing.T) { // entry resolves to a blob whose content is the commit's original SHA1. 
func TestTranslator_WriteOriginNotes(t *testing.T) { root := t.TempDir() - srcRepo, _ := git.PlainInit(filepath.Join(root, "src.git"), true) - dstRepo, _ := git.PlainInit(filepath.Join(root, "dst.git"), true, git.WithObjectFormat(formatcfg.SHA256)) + srcRepo := initSHA1(t, filepath.Join(root, "src.git")) + dstRepo := initSHA256(t, filepath.Join(root, "dst.git")) blob := writeBlob(t, srcRepo.Storer, []byte("hi\n")) tree := writeTree(t, srcRepo.Storer, []object.TreeEntry{{Name: "f", Mode: filemode.Regular, Hash: blob}}) @@ -364,7 +368,7 @@ func TestTranslator_WriteOriginNotes(t *testing.T) { if err != nil { t.Fatalf("discoverReachable: %v", err) } - tr, _ := newTranslator(srcRepo.Storer, dstRepo.Storer, filepath.Join(root, "dst.git"), false, reachable) + tr := mustTranslator(t, srcRepo.Storer, dstRepo.Storer, filepath.Join(root, "dst.git"), false, reachable) if _, err := tr.translate(c2); err != nil { t.Fatalf("translate: %v", err) } @@ -396,8 +400,15 @@ func TestTranslator_WriteOriginNotes(t *testing.T) { if err != nil { t.Fatalf("read note blob: %v", err) } - reader, _ := blob.Reader() - buf, _ := io.ReadAll(reader) + reader, err := blob.Reader() + if err != nil { + t.Fatalf("open note blob: %v", err) + } + buf, err := io.ReadAll(reader) + if err != nil { + _ = reader.Close() + t.Fatalf("read note blob: %v", err) + } _ = reader.Close() got := strings.TrimSpace(string(buf)) var origSHA1 plumbing.Hash @@ -417,8 +428,8 @@ func TestTranslator_WriteOriginNotes(t *testing.T) { // line, sorted by SHA1, one entry per translated object. 
func TestTranslator_WriteMappingFile(t *testing.T) { root := t.TempDir() - srcRepo, _ := git.PlainInit(filepath.Join(root, "src.git"), true) - dstRepo, _ := git.PlainInit(filepath.Join(root, "dst.git"), true, git.WithObjectFormat(formatcfg.SHA256)) + srcRepo := initSHA1(t, filepath.Join(root, "src.git")) + dstRepo := initSHA256(t, filepath.Join(root, "dst.git")) blob := writeBlob(t, srcRepo.Storer, []byte("hi\n")) tree := writeTree(t, srcRepo.Storer, []object.TreeEntry{{Name: "f", Mode: filemode.Regular, Hash: blob}}) @@ -429,7 +440,7 @@ func TestTranslator_WriteMappingFile(t *testing.T) { if err != nil { t.Fatalf("discoverReachable: %v", err) } - tr, _ := newTranslator(srcRepo.Storer, dstRepo.Storer, filepath.Join(root, "dst.git"), false, reachable) + tr := mustTranslator(t, srcRepo.Storer, dstRepo.Storer, filepath.Join(root, "dst.git"), false, reachable) if _, err := tr.translate(commit); err != nil { t.Fatalf("translate: %v", err) } @@ -487,9 +498,9 @@ func TestTranslator_WriteMappingFile(t *testing.T) { // pipeline takes. func TestTranslator_AmbiguousMessageRefWarning(t *testing.T) { root := t.TempDir() - srcRepo, _ := git.PlainInit(filepath.Join(root, "src.git"), true) - dstRepo, _ := git.PlainInit(filepath.Join(root, "dst.git"), true, git.WithObjectFormat(formatcfg.SHA256)) - tr, _ := newTranslator(srcRepo.Storer, dstRepo.Storer, filepath.Join(root, "dst.git"), true, nil) + srcRepo := initSHA1(t, filepath.Join(root, "src.git")) + dstRepo := initSHA256(t, filepath.Join(root, "dst.git")) + tr := mustTranslator(t, srcRepo.Storer, dstRepo.Storer, filepath.Join(root, "dst.git"), true, nil) // Two real-looking SHA1 hashes that share the prefix "deadbee". 
one := plumbing.NewHash("deadbee100000000000000000000000000000001") @@ -542,9 +553,39 @@ func TestTranslator_UnresolvableSubmodule(t *testing.T) { // --- helpers --- +// initSHA1 and initSHA256 are t.Fatalf-wrapping `git.PlainInit` shortcuts +// used to keep test bodies focused on the translator logic rather than +// error-handling boilerplate. +func initSHA1(t *testing.T, path string) *git.Repository { + t.Helper() + r, err := git.PlainInit(path, true) + if err != nil { + t.Fatalf("init SHA1 source at %s: %v", path, err) + } + return r +} + +func initSHA256(t *testing.T, path string) *git.Repository { + t.Helper() + r, err := git.PlainInit(path, true, git.WithObjectFormat(formatcfg.SHA256)) + if err != nil { + t.Fatalf("init SHA256 target at %s: %v", path, err) + } + return r +} + +func mustTranslator(t *testing.T, src, dst gogitstorer.Storer, dir string, rewrite bool, reachable map[plumbing.Hash]plumbing.ObjectType) *translator { + t.Helper() + tr, err := newTranslator(src, dst, dir, rewrite, reachable) + if err != nil { + t.Fatalf("newTranslator: %v", err) + } + return tr +} + func writeBlob(t *testing.T, storer interface { NewEncodedObject() plumbing.EncodedObject - SetEncodedObject(plumbing.EncodedObject) (plumbing.Hash, error) + SetEncodedObject(obj plumbing.EncodedObject) (plumbing.Hash, error) }, content []byte) plumbing.Hash { t.Helper() obj := storer.NewEncodedObject() @@ -569,7 +610,7 @@ func writeBlob(t *testing.T, storer interface { func writeTree(t *testing.T, storer interface { NewEncodedObject() plumbing.EncodedObject - SetEncodedObject(plumbing.EncodedObject) (plumbing.Hash, error) + SetEncodedObject(obj plumbing.EncodedObject) (plumbing.Hash, error) }, entries []object.TreeEntry) plumbing.Hash { t.Helper() tree := &object.Tree{Entries: entries} @@ -580,7 +621,7 @@ func writeTree(t *testing.T, storer interface { func writeObject(t *testing.T, storer interface { NewEncodedObject() plumbing.EncodedObject - SetEncodedObject(plumbing.EncodedObject) 
(plumbing.Hash, error) + SetEncodedObject(obj plumbing.EncodedObject) (plumbing.Hash, error) }, encode func(plumbing.EncodedObject) error) plumbing.Hash { t.Helper() obj := storer.NewEncodedObject() @@ -697,7 +738,7 @@ func TestRun_GitHTTPBackend(t *testing.T) { } // The converted repo must be self-consistent under SHA256. - fsckOut, err := exec.Command(gitBin, "-C", dstDir, "fsck", "--full").CombinedOutput() + fsckOut, err := exec.CommandContext(t.Context(), gitBin, "-C", dstDir, "fsck", "--full").CombinedOutput() if err != nil { t.Fatalf("git fsck failed: %v\n%s", err, fsckOut) } @@ -810,7 +851,7 @@ func TestRun_GitHTTPBackend_Sign(t *testing.T) { // Generate an ephemeral ed25519 SSH key for signing. keyPath := filepath.Join(root, "signkey") - keygen := exec.Command(sshKeygenBin, "-q", "-t", "ed25519", "-N", "", "-f", keyPath, "-C", "test@example.com") + keygen := exec.CommandContext(t.Context(), sshKeygenBin, "-q", "-t", "ed25519", "-N", "", "-f", keyPath, "-C", "test@example.com") if out, err := keygen.CombinedOutput(); err != nil { t.Fatalf("ssh-keygen: %v\n%s", err, out) } @@ -874,7 +915,7 @@ func TestRun_GitHTTPBackend_Sign(t *testing.T) { func mustGit(t *testing.T, dir string, args ...string) { t.Helper() - cmd := exec.Command("git", args...) + cmd := exec.CommandContext(t.Context(), "git", args...) cmd.Dir = dir cmd.Env = append(os.Environ(), "GIT_TERMINAL_PROMPT=0") if out, err := cmd.CombinedOutput(); err != nil { @@ -884,7 +925,7 @@ func mustGit(t *testing.T, dir string, args ...string) { func mustGitOutput(t *testing.T, dir string, args ...string) string { t.Helper() - cmd := exec.Command("git", args...) + cmd := exec.CommandContext(t.Context(), "git", args...) 
cmd.Dir = dir cmd.Env = append(os.Environ(), "GIT_TERMINAL_PROMPT=0") out, err := cmd.CombinedOutput() From a7d0e8681f0484626ae009c8ecb7f1c846f917e5 Mon Sep 17 00:00:00 2001 From: Andrea Nodari Date: Tue, 26 May 2026 10:03:07 +0200 Subject: [PATCH 07/19] Parse target config section-aware in --check The previous substring check would pass on a commented `# objectformat = sha256` line or an `oldobjectformat = sha256` key in another section. Use go-git's config decoder to look up `extensions.objectformat` properly. Co-Authored-By: Claude Opus 4.7 (1M context) Entire-Checkpoint: 122fd96da7bf --- .../internal/sha256convert/sha256convert.go | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/cmd/git-sync/internal/sha256convert/sha256convert.go b/cmd/git-sync/internal/sha256convert/sha256convert.go index 0f84c0fc..b7c0f565 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert.go @@ -468,15 +468,25 @@ func signBranchTips(ctx context.Context, out io.Writer, targetDir, signKey, sour func runChecks(ctx context.Context, targetDir string, repo *git.Repository, refsExpected int) []Check { checks := []Check{} - // 1. Config: extensions.objectformat = sha256. - cfgBytes, err := os.ReadFile(filepath.Join(targetDir, "config")) + // 1. Config: extensions.objectformat = sha256. Parse the file + // section-aware so we don't false-positive on a commented line or + // a similarly-named key in another section. 
+ cfgFile, err := os.Open(filepath.Join(targetDir, "config")) switch { case err != nil: checks = append(checks, Check{Name: "config", OK: false, Detail: err.Error()}) - case !bytes.Contains(cfgBytes, []byte("objectformat = sha256")): - checks = append(checks, Check{Name: "config", OK: false, Detail: "extensions.objectformat = sha256 not set"}) default: - checks = append(checks, Check{Name: "config", OK: true, Detail: "extensions.objectformat = sha256"}) + cfg := formatcfg.New() + decodeErr := formatcfg.NewDecoder(cfgFile).Decode(cfg) + _ = cfgFile.Close() + switch { + case decodeErr != nil: + checks = append(checks, Check{Name: "config", OK: false, Detail: fmt.Sprintf("parse config: %v", decodeErr)}) + case !strings.EqualFold(cfg.Section("extensions").Option("objectformat"), "sha256"): + checks = append(checks, Check{Name: "config", OK: false, Detail: "extensions.objectformat = sha256 not set"}) + default: + checks = append(checks, Check{Name: "config", OK: true, Detail: "extensions.objectformat = sha256"}) + } } // 2. HEAD resolves to an existing object. From f5a21ed0424b50e9f859a0157fa14da42e271946 Mon Sep 17 00:00:00 2001 From: Andrea Nodari Date: Tue, 26 May 2026 10:04:08 +0200 Subject: [PATCH 08/19] Match git fsck error lines by prefix in --check Substring-matching "error" against the full fsck output could false-positive on benign dangling/warning lines that happened to contain that word (e.g. a branch or path with "error" in it). Scan line-by-line for "error:" / "fatal:" / "missing " / "broken link" / "bad " prefixes, which is what git itself uses to signal real problems. 
Entire-Checkpoint: f16d85f433d8 --- .../internal/sha256convert/sha256convert.go | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/cmd/git-sync/internal/sha256convert/sha256convert.go b/cmd/git-sync/internal/sha256convert/sha256convert.go index b7c0f565..b29aa4ea 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert.go @@ -558,7 +558,11 @@ func runChecks(ctx context.Context, targetDir string, repo *git.Repository, refs switch { case err != nil: checks = append(checks, Check{Name: "git fsck --full", OK: false, Detail: fmt.Sprintf("%v\n%s", err, fsckOut)}) - case bytes.Contains(fsckOut, []byte("error")) || bytes.Contains(fsckOut, []byte("bad sha")): + case fsckHasError(fsckOut): + // Belt-and-braces against a hypothetical git version that prints + // "error:" / "fatal:" lines but exits zero. Match line prefixes + // rather than a substring so a branch or path containing "error" + // in a benign dangling/warning line doesn't trip the check. checks = append(checks, Check{Name: "git fsck --full", OK: false, Detail: strings.TrimSpace(string(fsckOut))}) default: checks = append(checks, Check{Name: "git fsck --full", OK: true, Detail: "clean"}) @@ -566,6 +570,23 @@ func runChecks(ctx context.Context, targetDir string, repo *git.Repository, refs return checks } +// fsckHasError reports whether git-fsck output contains a line that signals +// a real problem (an "error:" or "fatal:" prefix, or a "missing"/"bad" +// object report). Dangling and warning lines are ignored. 
+func fsckHasError(out []byte) bool { + scanner := bufio.NewScanner(bytes.NewReader(out)) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if strings.HasPrefix(line, "error:") || strings.HasPrefix(line, "fatal:") { + return true + } + if strings.HasPrefix(line, "missing ") || strings.HasPrefix(line, "broken link") || strings.HasPrefix(line, "bad ") { + return true + } + } + return false +} + const ( originNotesRef = "refs/notes/sha1-origin" attestationTagPrefix = "refs/tags/converted/" From ec94459dccbabfb77c2d4e980bbb979c2879438b Mon Sep 17 00:00:00 2001 From: Andrea Nodari Date: Tue, 26 May 2026 10:09:48 +0200 Subject: [PATCH 09/19] Honor ctx cancellation in object translation translate() is the single recursive entry point invoked from the tree-entry, commit-parent, tag-target, and message-ref loops. Check ctx.Err() at the top so Ctrl-C during a kernel-scale conversion returns promptly instead of running the whole DFS to completion. ctx is plumbed through newTranslator and stored on the translator (signatures of the many translateX helpers stay untouched). 
Entire-Checkpoint: 354e2e640545 --- .../internal/sha256convert/sha256convert.go | 15 +++++++++++++-- .../internal/sha256convert/sha256convert_test.go | 6 +++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/cmd/git-sync/internal/sha256convert/sha256convert.go b/cmd/git-sync/internal/sha256convert/sha256convert.go index b29aa4ea..88e2f548 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert.go @@ -294,7 +294,7 @@ func Run(ctx context.Context, req Request) (Result, error) { return Result{}, fmt.Errorf("init SHA256 target at %s: %w", req.TargetDir, err) } - tr, err := newTranslator(srcRepo.Storer, dstRepo.Storer, req.TargetDir, !req.SkipMessageRewrite, reachable) + tr, err := newTranslator(ctx, srcRepo.Storer, dstRepo.Storer, req.TargetDir, !req.SkipMessageRewrite, reachable) if err != nil { return Result{}, err } @@ -678,6 +678,10 @@ func (a authAdapter) Authorizer(req *http.Request) error { // plumbing/format/objfile/writer.go:68), which would store every SHA256 // object at a SHA1-derived path. type translator struct { + // ctx is checked at the top of every translate() call so a Ctrl-C + // during a million-object conversion is responsive. It is the same + // context passed to Run() and is not stored to outlive its caller. + ctx context.Context //nolint:containedctx // translate() is recursive and not directly called by Run; threading ctx through every signature is noisier than a single field used for cancellation only. 
src *filesystem.Storage dst *filesystem.Storage objectsDir string @@ -728,7 +732,7 @@ func (t *translator) snapshotCounts() Counts { } } -func newTranslator(src, dst storer.Storer, targetDir string, rewriteMessages bool, reachable map[plumbing.Hash]plumbing.ObjectType) (*translator, error) { +func newTranslator(ctx context.Context, src, dst storer.Storer, targetDir string, rewriteMessages bool, reachable map[plumbing.Hash]plumbing.ObjectType) (*translator, error) { srcFS, ok := src.(*filesystem.Storage) if !ok { return nil, fmt.Errorf("source storage is not filesystem-backed (%T)", src) @@ -741,6 +745,7 @@ func newTranslator(src, dst storer.Storer, targetDir string, rewriteMessages boo reachable = make(map[plumbing.Hash]plumbing.ObjectType) } return &translator{ + ctx: ctx, src: srcFS, dst: dstFS, objectsDir: filepath.Join(targetDir, "objects"), @@ -848,6 +853,12 @@ func discoverReachable(src storer.Storer, roots []plumbing.Hash, progress *atomi } func (t *translator) translate(sha1 plumbing.Hash) (plumbing.Hash, error) { + // Cheap per-object cancellation check so Ctrl-C during a long + // conversion (kernel-scale: ~10M objects) returns promptly rather + // than running the whole DFS to completion. 
+ if err := t.ctx.Err(); err != nil { + return plumbing.ZeroHash, fmt.Errorf("translate %s: %w", sha1, err) + } if newH, ok := t.mapping[sha1]; ok { return newH, nil } diff --git a/cmd/git-sync/internal/sha256convert/sha256convert_test.go b/cmd/git-sync/internal/sha256convert/sha256convert_test.go index 510ad9ac..21967fc7 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert_test.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert_test.go @@ -86,7 +86,7 @@ func TestTranslator(t *testing.T) { if err != nil { t.Fatalf("discoverReachable: %v", err) } - tr, err := newTranslator(srcRepo.Storer, dstRepo.Storer, dstDir, false, reachable) + tr, err := newTranslator(t.Context(), srcRepo.Storer, dstRepo.Storer, dstDir, false, reachable) if err != nil { t.Fatalf("newTranslator: %v", err) } @@ -201,7 +201,7 @@ func TestTranslator_RewritesMessageHashes(t *testing.T) { if err != nil { t.Fatalf("discoverReachable: %v", err) } - tr, err := newTranslator(srcRepo.Storer, dstRepo.Storer, dstDir, true, reachable) + tr, err := newTranslator(t.Context(), srcRepo.Storer, dstRepo.Storer, dstDir, true, reachable) if err != nil { t.Fatalf("newTranslator: %v", err) } @@ -576,7 +576,7 @@ func initSHA256(t *testing.T, path string) *git.Repository { func mustTranslator(t *testing.T, src, dst gogitstorer.Storer, dir string, rewrite bool, reachable map[plumbing.Hash]plumbing.ObjectType) *translator { t.Helper() - tr, err := newTranslator(src, dst, dir, rewrite, reachable) + tr, err := newTranslator(t.Context(), src, dst, dir, rewrite, reachable) if err != nil { t.Fatalf("newTranslator: %v", err) } From 7c1a7df3fea7372ea3c081f68282cde18fa2cb38 Mon Sep 17 00:00:00 2001 From: Andrea Nodari Date: Tue, 26 May 2026 10:19:01 +0200 Subject: [PATCH 10/19] Fall back when source doesn't advertise HEAD Some HTTP v1 servers don't return HEAD in their info/refs. Previously HEAD was left at PlainInit's default refs/heads/master, which then fails the --check HEAD step on any main-default repo. 
pickHEAD now selects in order: server-advertised HEAD (resolved via the matched DesiredRef.TargetRef so user-supplied ref mappings are honored), refs/heads/main, refs/heads/master, then the lexicographically first branch. Tags-only conversions still leave HEAD at the PlainInit default since there's no sensible branch to point at. Entire-Checkpoint: 161329c78d89 --- .../internal/sha256convert/sha256convert.go | 58 +++++++++-- .../sha256convert/sha256convert_test.go | 99 +++++++++++++++++++ 2 files changed, 148 insertions(+), 9 deletions(-) diff --git a/cmd/git-sync/internal/sha256convert/sha256convert.go b/cmd/git-sync/internal/sha256convert/sha256convert.go index 88e2f548..ef76d248 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert.go @@ -324,15 +324,13 @@ func Run(ctx context.Context, req Request) (Result, error) { return Result{}, fmt.Errorf("write target refs: %w", err) } - // Point HEAD at the source's symbolic HEAD if it landed in the - // converted ref set. PlainInit defaults HEAD to refs/heads/master, - // which often doesn't exist (e.g. repos using "main" as the default). - if refService.HeadTarget != "" { - if _, ok := desired[refService.HeadTarget]; ok { - head := plumbing.NewSymbolicReference(plumbing.HEAD, refService.HeadTarget) - if err := dstRepo.Storer.SetReference(head); err != nil { - return Result{}, fmt.Errorf("set HEAD: %w", err) - } + // Point HEAD at a ref that actually exists in the target. PlainInit + // defaults HEAD to refs/heads/master, which often doesn't exist + // (e.g. repos using "main"), and would then fail the --check HEAD + // step. See pickHEAD for the selection order. 
+ if headRef := pickHEAD(refService.HeadTarget, desired); headRef != "" { + if err := dstRepo.Storer.SetReference(plumbing.NewSymbolicReference(plumbing.HEAD, headRef)); err != nil { + return Result{}, fmt.Errorf("set HEAD: %w", err) } } @@ -1453,6 +1451,48 @@ func (t *translator) writeMappingFile(path string) error { return nil } +// pickHEAD chooses which target-side ref the bare repo's HEAD should +// symlink to. It returns "" when no suitable branch exists (e.g. a +// tags-only conversion), in which case the caller leaves HEAD at the +// PlainInit default. +// +// Selection order: +// 1. The source's advertised HEAD, if it landed in the converted set. +// Resolved via the desired entry's TargetRef so a user-supplied ref +// mapping is honored. +// 2. refs/heads/main, then refs/heads/master, if either is present in +// the converted target refs. Some HTTP v1 servers do not advertise +// HEAD, so we pattern-match on conventional defaults. +// 3. The lexicographically first refs/heads/* in the target set, for +// a deterministic fallback when neither convention is present. 
+func pickHEAD(advertised plumbing.ReferenceName, desired map[plumbing.ReferenceName]planner.DesiredRef) plumbing.ReferenceName { + if advertised != "" { + if d, ok := desired[advertised]; ok { + return d.TargetRef + } + } + branches := make(map[plumbing.ReferenceName]struct{}, len(desired)) + for _, d := range desired { + if d.TargetRef.IsBranch() { + branches[d.TargetRef] = struct{}{} + } + } + for _, candidate := range []plumbing.ReferenceName{"refs/heads/main", "refs/heads/master"} { + if _, ok := branches[candidate]; ok { + return candidate + } + } + if len(branches) == 0 { + return "" + } + names := make([]string, 0, len(branches)) + for name := range branches { + names = append(names, string(name)) + } + sort.Strings(names) + return plumbing.ReferenceName(names[0]) +} + func writeRefs( dst storer.Storer, desired map[plumbing.ReferenceName]planner.DesiredRef, diff --git a/cmd/git-sync/internal/sha256convert/sha256convert_test.go b/cmd/git-sync/internal/sha256convert/sha256convert_test.go index 21967fc7..aac4aa68 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert_test.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert_test.go @@ -25,6 +25,8 @@ import ( "github.com/go-git/go-git/v6/plumbing/object" gogitstorer "github.com/go-git/go-git/v6/plumbing/storer" "github.com/go-git/go-git/v6/storage/filesystem" + + "entire.io/entire/git-sync/internal/planner" ) // TestTranslator builds a small SHA1 source repo with blobs, trees, commits, @@ -968,3 +970,100 @@ func newCGIBackend(t *testing.T, gitBin, root string) *cgiBackend { // will start failing in this package's tests rather than only at runtime // against a real repo. 
var _ = (*filesystem.Storage)(nil) + +func TestPickHEAD(t *testing.T) { + branch := func(name string) planner.DesiredRef { + ref := plumbing.ReferenceName("refs/heads/" + name) + return planner.DesiredRef{Kind: planner.RefKindBranch, SourceRef: ref, TargetRef: ref} + } + tag := func(name string) planner.DesiredRef { + ref := plumbing.ReferenceName("refs/tags/" + name) + return planner.DesiredRef{Kind: planner.RefKindTag, SourceRef: ref, TargetRef: ref} + } + tests := []struct { + name string + advertised plumbing.ReferenceName + desired map[plumbing.ReferenceName]planner.DesiredRef + want plumbing.ReferenceName + }{ + { + name: "advertised HEAD wins when present in desired", + advertised: "refs/heads/develop", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/heads/main": branch("main"), + "refs/heads/develop": branch("develop"), + }, + want: "refs/heads/develop", + }, + { + name: "advertised HEAD respects ref mapping (target side)", + advertised: "refs/heads/source-name", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/heads/source-name": { + Kind: planner.RefKindBranch, + SourceRef: "refs/heads/source-name", + TargetRef: "refs/heads/target-name", + }, + }, + want: "refs/heads/target-name", + }, + { + name: "falls back to main when advertised HEAD missing", + advertised: "", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/heads/main": branch("main"), + "refs/heads/master": branch("master"), + }, + want: "refs/heads/main", + }, + { + name: "falls back to master when no main", + advertised: "", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/heads/master": branch("master"), + "refs/heads/feature": branch("feature"), + }, + want: "refs/heads/master", + }, + { + name: "falls back to first sorted branch when neither main nor master", + advertised: "", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/heads/zeta": branch("zeta"), + "refs/heads/alpha": branch("alpha"), + 
"refs/heads/beta": branch("beta"), + }, + want: "refs/heads/alpha", + }, + { + name: "advertised HEAD pointing outside desired falls back to convention", + advertised: "refs/heads/dropped", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/heads/main": branch("main"), + }, + want: "refs/heads/main", + }, + { + name: "tags-only conversion returns empty so HEAD stays at PlainInit default", + advertised: "", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/tags/v1.0": tag("v1.0"), + }, + want: "", + }, + { + name: "empty desired returns empty", + advertised: "", + desired: map[plumbing.ReferenceName]planner.DesiredRef{}, + want: "", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := pickHEAD(tt.advertised, tt.desired) + if got != tt.want { + t.Fatalf("pickHEAD = %q, want %q", got, tt.want) + } + }) + } +} From 9d7c4942188ca0e9a10089f4d98c5335680b5b77 Mon Sep 17 00:00:00 2001 From: Andrea Nodari Date: Tue, 26 May 2026 10:22:36 +0200 Subject: [PATCH 11/19] Address review nits in sha256convert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - signBranchTips: explain the os.Stderr departure from req.Out; TTY inheritance is required for gpg/ssh-agent prompts. - Don't double-count when a commit/tag has both Signature and SignatureSHA256 (two encodings of the same signature). Relabel the warning to "signature(s) / mergetag header(s)" so the count matches what it actually represents. - writeOriginNotes now honors SOURCE_DATE_EPOCH and otherwise pins the wrapper-commit timestamp to the Unix epoch, so the notes-ref hash is reproducible across runs over identical source state. The timestamp is bookkeeping; it carries no information about the underlying SHA1 history. - writeLoose uses zlib level 1, matching git's core.looseCompression default — loose objects are short-lived before gc packs them, so write speed > size. 
- Clarify encodeBody's "scratch MemoryObject" comment so the unused format argument is no longer mystifying on re-read. Entire-Checkpoint: cabf3e19b5f2 --- .../internal/sha256convert/sha256convert.go | 70 ++++++++++++++----- 1 file changed, 52 insertions(+), 18 deletions(-) diff --git a/cmd/git-sync/internal/sha256convert/sha256convert.go b/cmd/git-sync/internal/sha256convert/sha256convert.go index ef76d248..08765673 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert.go @@ -147,7 +147,10 @@ func (r Result) Lines() []string { fmt.Sprintf("refs written: %d", r.RefsConverted), } if r.SignaturesStripped > 0 { - lines = append(lines, fmt.Sprintf("warning: stripped %d GPG signature(s); they no longer match the rewritten object content", r.SignaturesStripped)) + // Mixes commit/tag signatures (GPG/SSH/X.509) and embedded + // mergetag headers — each counts as one signed artifact whose + // signature became invalid post-rewrite. + lines = append(lines, fmt.Sprintf("warning: stripped %d signature(s) / mergetag header(s); they no longer match the rewritten object content", r.SignaturesStripped)) } if r.MessageRewrites > 0 { lines = append(lines, fmt.Sprintf("rewrote %d SHA1 hash reference(s) in commit/tag messages", r.MessageRewrites)) @@ -445,9 +448,13 @@ func signBranchTips(ctx context.Context, out io.Writer, targetDir, signKey, sour args = append(args, tagName, refName) cmd := exec.CommandContext(ctx, gitBin, args...) - // Inherit stdio so gpg/ssh-agent passphrase prompts work. We - // intentionally do not capture stdout/stderr — the user needs - // to see them when authenticating. + // Deliberate departure from the req.Out plumbing the rest of + // Run uses: gpg/ssh-agent and pinentry need a real TTY for + // passphrase prompts, so we inherit the parent's stdio + // directly. The consequence is that callers passing + // req.Out = io.Discard (e.g. 
tests) still see subprocess + // output on real stderr — that's the cost of working + // authentication. cmd.Stdin = os.Stdin cmd.Stdout = os.Stderr // git tag -s is usually quiet on success cmd.Stderr = os.Stderr @@ -993,11 +1000,12 @@ func (t *translator) translateCommit(sha1 plumbing.Hash, src plumbing.EncodedObj t.messageRewrites += n } } - if c.Signature != "" { + // A commit can carry both Signature (SHA1 form, "gpgsig") and + // SignatureSHA256 ("gpgsig-sha256") in a transitional dual-hash + // repo, but they encode the same logical signature. Strip both + // fields if present, count once. + if c.Signature != "" || c.SignatureSHA256 != "" { c.Signature = "" - t.signaturesStripped++ - } - if c.SignatureSHA256 != "" { c.SignatureSHA256 = "" t.signaturesStripped++ } @@ -1053,11 +1061,10 @@ func (t *translator) translateTag(sha1 plumbing.Hash, src plumbing.EncodedObject t.messageRewrites += n } } - if tag.Signature != "" { + // Same as commits: Signature and SignatureSHA256 are two encodings + // of the same logical signature in a transitional dual-hash repo. + if tag.Signature != "" || tag.SignatureSHA256 != "" { tag.Signature = "" - t.signaturesStripped++ - } - if tag.SignatureSHA256 != "" { tag.SignatureSHA256 = "" t.signaturesStripped++ } @@ -1074,10 +1081,13 @@ func (t *translator) translateTag(sha1 plumbing.Hash, src plumbing.EncodedObject return newHash, nil } -// encodeBody runs an object's go-git Encode method into a SHA1-hasher -// MemoryObject (the hasher we use to capture bytes is irrelevant; we only -// read the body back out) and returns just the payload bytes — without the +// encodeBody runs an object's go-git Encode method into a scratch +// MemoryObject and returns just the payload bytes — without the // " \x00" header. writeLoose adds the SHA256-correct header. +// +// The format argument to NewMemoryObject is required by the constructor +// but unused here: we never ask the scratch object for its hash, only +// for its byte stream. 
func encodeBody(typ plumbing.ObjectType, encode func(plumbing.EncodedObject) error) ([]byte, error) { scratch := plumbing.NewMemoryObject(plumbing.FromObjectFormat(formatcfg.SHA1)) scratch.SetType(typ) @@ -1125,7 +1135,13 @@ func (t *translator) writeLoose(typ plumbing.ObjectType, body []byte) (plumbing. } var buf bytes.Buffer - zw := zlib.NewWriter(&buf) + // Level 1 matches git's core.looseCompression default. Loose objects + // are short-lived (gc rolls them into packs), so optimizing for write + // speed over size is the standard trade-off. + zw, err := zlib.NewWriterLevel(&buf, zlib.BestSpeed) + if err != nil { + return plumbing.ZeroHash, fmt.Errorf("zlib writer: %w", err) + } if _, err := zw.Write(header); err != nil { return plumbing.ZeroHash, fmt.Errorf("zlib write header: %w", err) } @@ -1285,6 +1301,20 @@ func (t *translator) extractMessageReferences(msg string) []plumbing.Hash { return out } +// notesCommitTime returns the committer/author timestamp for the +// synthetic notes wrapper commit. Reads SOURCE_DATE_EPOCH (the +// reproducible-builds convention) when set, falling back to the Unix +// epoch so two runs over identical source state always produce the +// same notes-ref hash. +func notesCommitTime() time.Time { + if raw := os.Getenv("SOURCE_DATE_EPOCH"); raw != "" { + if secs, err := strconv.ParseInt(raw, 10, 64); err == nil { + return time.Unix(secs, 0).UTC() + } + } + return time.Unix(0, 0).UTC() +} + // writeOriginNotes writes a `git notes` ref to dst that records each // translated commit's original SHA1, keyed by its new SHA256. 
Standard // git tooling (`git log --notes=`, `git notes --ref= show @@ -1341,8 +1371,12 @@ func (t *translator) writeOriginNotes(refName string) (string, error) { return "", fmt.Errorf("store notes tree: %w", err) } - now := time.Now().UTC() - sig := object.Signature{Name: "git-sync", Email: "noreply@entire.io", When: now} + // Honor SOURCE_DATE_EPOCH for reproducible builds; otherwise pin to + // the Unix epoch so the notes-ref hash is identical across runs over + // the same source state. The notes commit is bookkeeping — its + // timestamp carries no meaningful information about when the + // underlying SHA1 history was created. + sig := object.Signature{Name: "git-sync", Email: "noreply@entire.io", When: notesCommitTime()} commit := &object.Commit{ Author: sig, Committer: sig, From e0505920fdc42208cbffa6ab0368dd7b919144b7 Mon Sep 17 00:00:00 2001 From: Andrea Nodari Date: Tue, 26 May 2026 10:35:26 +0200 Subject: [PATCH 12/19] Reject --exclude-ref-prefix values that drop branches or tags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The convert-sha256 docs promise that every branch and tag is always converted, since dropping any of them risks stranding cross-branch SHA1 references in commit and tag messages — the exact invariant the message-rewrite pass exists to maintain. The previous code piped req.ExcludeRefPrefixes straight into planner.BuildDesiredRefs, which applies it to branch and tag selection too, so the flag silently broke the promise. Validate at the top of Run: refuse any prefix that, under the planner's HasPrefix matching, would catch a refs/heads/* or refs/tags/* name. The check covers bare "", "refs/", partial "refs/h", whole "refs/heads/" / "refs/tags/", and any sub-namespace under either. Help text on convert-sha256 now also calls the rejection out.
Entire-Checkpoint: ec9a87c6c8f1 --- cmd/git-sync/convert_sha256.go | 4 +- .../internal/sha256convert/sha256convert.go | 40 +++++++++++++ .../sha256convert/sha256convert_test.go | 58 +++++++++++++++++++ 3 files changed, 101 insertions(+), 1 deletion(-) diff --git a/cmd/git-sync/convert_sha256.go b/cmd/git-sync/convert_sha256.go index 738ac120..55321ec2 100644 --- a/cmd/git-sync/convert_sha256.go +++ b/cmd/git-sync/convert_sha256.go @@ -27,7 +27,9 @@ All branches and tags on the source are always converted — partial scope risks stranding cross-branch references in commit messages. Pass --all-refs to also include refs/notes/*, refs/pull/*, and other custom namespaces; pass --exclude-ref-prefix to subtract specific namespaces -from --all-refs. +from --all-refs. Exclude prefixes that would drop any branch or tag +(e.g. refs/heads/feature/, refs/tags/, refs/) are rejected at run time +to preserve the always-convert invariant. The conversion is destructive in two ways the caller should be aware of: GPG signatures on commits and tags are dropped (they sign over the diff --git a/cmd/git-sync/internal/sha256convert/sha256convert.go b/cmd/git-sync/internal/sha256convert/sha256convert.go index 08765673..3ef3ca31 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert.go @@ -206,6 +206,13 @@ func Run(ctx context.Context, req Request) (Result, error) { if req.TargetDir == "" { return Result{}, errors.New("convert-sha256 requires a target directory") } + // Enforce the documented invariant: every branch and every tag is + // always converted. Otherwise the partial set could strand + // cross-branch hash references in commit and tag messages, which + // the message-rewrite pass is built to keep intact. 
+ if bad := protectedExcludePrefixes(req.ExcludeRefPrefixes); len(bad) > 0 { + return Result{}, fmt.Errorf("convert-sha256 refuses --exclude-ref-prefix values that would drop branches or tags: %s (only namespaces outside refs/heads/ and refs/tags/ may be excluded)", strings.Join(bad, ", ")) + } out := req.Out if out == nil { out = os.Stderr @@ -597,6 +604,39 @@ const ( attestationTagPrefix = "refs/tags/converted/" ) +// protectedExcludePrefixes returns the subset of prefixes that, under +// planner.IsRefExcluded's string-prefix semantics, would knock out at +// least one branch or tag. A prefix matches a branch if either side +// is a string-prefix of the other against "refs/heads/" (and likewise +// for "refs/tags/"). That covers: +// +// - bare "" (excludes every ref) +// - "refs/" or "refs/h", "refs/heads/" (whole branch namespace) +// - "refs/heads/feature/" (some branches) +// - "refs/tags/" and any narrower suffix +// +// Returned in input order, with duplicates removed, so the error +// message shows the user exactly which flag values to drop. +func protectedExcludePrefixes(prefixes []string) []string { + protected := []string{"refs/heads/", "refs/tags/"} + var bad []string + seen := map[string]struct{}{} + for _, raw := range prefixes { + p := strings.TrimSpace(raw) + if _, dup := seen[p]; dup { + continue + } + for _, prot := range protected { + if strings.HasPrefix(p, prot) || strings.HasPrefix(prot, p) { + bad = append(bad, raw) + seen[p] = struct{}{} + break + } + } + } + return bad +} + // ensureEmptyTarget refuses to init into a non-empty directory so the user // doesn't quietly accumulate objects into an existing repo. 
func ensureEmptyTarget(path string) error { diff --git a/cmd/git-sync/internal/sha256convert/sha256convert_test.go b/cmd/git-sync/internal/sha256convert/sha256convert_test.go index aac4aa68..1bae8053 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert_test.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert_test.go @@ -971,6 +971,64 @@ func newCGIBackend(t *testing.T, gitBin, root string) *cgiBackend { // against a real repo. var _ = (*filesystem.Storage)(nil) +func TestProtectedExcludePrefixes(t *testing.T) { + tests := []struct { + name string + prefixes []string + want []string + }{ + {"nil input", nil, nil}, + {"single benign namespace", []string{"refs/pull/"}, nil}, + {"multiple benign namespaces", []string{"refs/pull/", "refs/notes/", "refs/changes/"}, nil}, + {"whole branches namespace banned", []string{"refs/heads/"}, []string{"refs/heads/"}}, + {"whole tags namespace banned", []string{"refs/tags/"}, []string{"refs/tags/"}}, + {"branch sub-namespace banned", []string{"refs/heads/feature/"}, []string{"refs/heads/feature/"}}, + {"tag sub-namespace banned", []string{"refs/tags/v1/"}, []string{"refs/tags/v1/"}}, + {"refs/ banned because it would drop everything", []string{"refs/"}, []string{"refs/"}}, + {"empty string banned (would drop every ref)", []string{""}, []string{""}}, + {"partial refs/h banned (covers refs/heads/)", []string{"refs/h"}, []string{"refs/h"}}, + {"mixed input reports only the bad ones, in order", []string{"refs/pull/", "refs/heads/", "refs/notes/", "refs/tags/v1.0"}, []string{"refs/heads/", "refs/tags/v1.0"}}, + {"duplicates collapsed", []string{"refs/heads/", "refs/heads/"}, []string{"refs/heads/"}}, + {"trims whitespace before matching", []string{" refs/heads/ "}, []string{" refs/heads/ "}}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := protectedExcludePrefixes(tt.prefixes) + if len(got) != len(tt.want) { + t.Fatalf("protectedExcludePrefixes(%v) = %v, want %v", tt.prefixes, got, 
tt.want) + } + for i := range got { + if got[i] != tt.want[i] { + t.Fatalf("protectedExcludePrefixes(%v)[%d] = %q, want %q", tt.prefixes, i, got[i], tt.want[i]) + } + } + }) + } +} + +func TestRun_RejectsExcludePrefixesThatDropBranchesOrTags(t *testing.T) { + // We never reach the network here — the validation fires before + // any I/O — so a non-empty target dir is the only thing the early + // path needs. + dst := t.TempDir() + req := Request{ + SourceURL: "http://example.invalid/repo.git", + TargetDir: filepath.Join(dst, "out"), + ExcludeRefPrefixes: []string{"refs/pull/", "refs/heads/feature/"}, + } + _, err := Run(t.Context(), req) + if err == nil { + t.Fatalf("Run accepted --exclude-ref-prefix refs/heads/feature/, expected refusal") + } + msg := err.Error() + if !strings.Contains(msg, "refs/heads/feature/") { + t.Fatalf("error did not name the offending prefix: %v", err) + } + if !strings.Contains(msg, "exclude-ref-prefix") { + t.Fatalf("error did not mention the flag: %v", err) + } +} + func TestPickHEAD(t *testing.T) { branch := func(name string) planner.DesiredRef { ref := plumbing.ReferenceName("refs/heads/" + name) From 635a94b593b8d8da8b0f909c4686cc11495173ea Mon Sep 17 00:00:00 2001 From: Andrea Nodari Date: Tue, 26 May 2026 10:38:58 +0200 Subject: [PATCH 13/19] Refuse source refs that collide with side outputs writeRefs lands the source ref set on the target, then writeOriginNotes and signBranchTips publish their own refs on top. A source repo that already advertised refs/notes/sha1-origin (under --all-refs) or any refs/tags/converted/* (always, since tags are mandatory) would have those refs silently clobbered. Add checkSideOutputCollision: run after planner.BuildDesiredRefs and before any object work, refusing with an actionable message that names the offending ref(s) and points at --no-origin-notes / --exclude-ref-prefix / dropping --sign as escapes. 
Also tighten --check to skip *only* the side-output refs we actually wrote: pass the {origin notes, signed tags} set into runChecks instead of pattern-matching by prefix, so a legitimate source ref that happened to share a namespace is not silently hidden from the resolved/expected fraction. Entire-Checkpoint: 250149a84f90 --- .../internal/sha256convert/sha256convert.go | 72 +++++++++++++---- .../sha256convert/sha256convert_test.go | 78 +++++++++++++++++++ 2 files changed, 137 insertions(+), 13 deletions(-) diff --git a/cmd/git-sync/internal/sha256convert/sha256convert.go b/cmd/git-sync/internal/sha256convert/sha256convert.go index 3ef3ca31..25afc307 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert.go @@ -263,6 +263,14 @@ func Run(ctx context.Context, req Request) (Result, error) { return Result{}, errors.New("no source refs matched the requested scope") } + // Refuse before any further I/O if the source carries refs that + // would collide with our side outputs. writeRefs runs before + // writeOriginNotes / signBranchTips, so without this check the + // later side-output write would silently clobber the source ref. + if err := checkSideOutputCollision(desired, req.SkipOriginNotes, req.Sign); err != nil { + return Result{}, err + } + // Fetch into temp SHA1 store ------------------------------------------ fmt.Fprintf(out, "fetching %d ref(s) from %s ...\n", len(desired), req.SourceURL) gpDesired := convert.DesiredRefs(desired) @@ -395,7 +403,17 @@ func Run(ctx context.Context, req Request) (Result, error) { if req.Check { fmt.Fprintln(out, "verifying output ...") - res.Checks = runChecks(ctx, req.TargetDir, dstRepo, refsWritten) + // Collect the side outputs this run actually wrote so the + // refs check knows which target refs to ignore. Anything not + // in here is assumed to be a translated source ref. 
+ sideOutputs := make(map[plumbing.ReferenceName]struct{}, 1+len(res.SignedTags)) + if res.OriginNotesRef != "" { + sideOutputs[plumbing.ReferenceName(res.OriginNotesRef)] = struct{}{} + } + for _, tag := range res.SignedTags { + sideOutputs[plumbing.ReferenceName(tag)] = struct{}{} + } + res.Checks = runChecks(ctx, req.TargetDir, dstRepo, refsWritten, sideOutputs) for _, c := range res.Checks { mark := "✓" if !c.OK { @@ -477,7 +495,12 @@ func signBranchTips(ctx context.Context, out io.Writer, targetDir, signKey, sour // Returns one Check per step. Callers print and/or fail-on-error based // on these. No early return so users see the full picture even when an // earlier check fails. -func runChecks(ctx context.Context, targetDir string, repo *git.Repository, refsExpected int) []Check { +// +// sideOutputs holds the exact refs the run created on top of the +// source set (the origin-notes ref, any --sign attestation tags), so +// the refs check can omit them from the resolved/expected fraction +// without false-positive-skipping a same-named source ref. +func runChecks(ctx context.Context, targetDir string, repo *git.Repository, refsExpected int, sideOutputs map[plumbing.ReferenceName]struct{}) []Check { checks := []Check{} // 1. Config: extensions.objectformat = sha256. Parse the file @@ -516,11 +539,12 @@ func runChecks(ctx context.Context, targetDir string, repo *git.Repository, refs } } - // 3. Every written ref resolves to an existing object. Skip refs we - // add as side outputs (the origin-notes ref and any - // refs/tags/converted/* attestation tags from --sign), since they - // are accounted for in their own Result fields and would otherwise - // make the displayed fraction misleading. + // 3. Every written ref resolves to an existing object. Skip the + // specific refs this run created as side outputs — they're + // accounted for in their own Result fields and would otherwise + // make the displayed fraction misleading. 
Skipping by exact name + // (not by prefix) avoids hiding a legitimate source ref that + // happened to share a namespace. resolved := 0 missing := "" refs, err := repo.References() @@ -531,16 +555,12 @@ func runChecks(ctx context.Context, targetDir string, repo *git.Repository, refs if r.Type() != plumbing.HashReference { return nil } - name := r.Name() - if name == plumbing.ReferenceName(originNotesRef) { - return nil - } - if strings.HasPrefix(string(name), attestationTagPrefix) { + if _, skip := sideOutputs[r.Name()]; skip { return nil } if _, err := repo.Storer.EncodedObject(plumbing.AnyObject, r.Hash()); err != nil { if missing == "" { - missing = fmt.Sprintf("%s → %s: %v", name, r.Hash(), err) + missing = fmt.Sprintf("%s → %s: %v", r.Name(), r.Hash(), err) } return nil } @@ -637,6 +657,32 @@ func protectedExcludePrefixes(prefixes []string) []string { return bad } +// checkSideOutputCollision refuses the conversion when the source set +// already contains a ref name this run would later write as a side +// output. Without this guard, writeRefs would publish the source's +// value first and writeOriginNotes / signBranchTips would silently +// overwrite it — losing the source ref and hiding the conflict. 
+func checkSideOutputCollision(desired map[plumbing.ReferenceName]planner.DesiredRef, skipOriginNotes, sign bool) error { + if !skipOriginNotes { + if _, conflict := desired[plumbing.ReferenceName(originNotesRef)]; conflict { + return fmt.Errorf("source already advertises %s; pass --no-origin-notes to keep that source ref, or --exclude-ref-prefix %s to drop it from the conversion", originNotesRef, originNotesRef) + } + } + if sign { + var clashes []string + for name := range desired { + if strings.HasPrefix(string(name), attestationTagPrefix) { + clashes = append(clashes, string(name)) + } + } + if len(clashes) > 0 { + sort.Strings(clashes) + return fmt.Errorf("source has %s under %s, which collides with the attestation tags --sign would create; drop --sign or rename the source tag(s)", strings.Join(clashes, ", "), attestationTagPrefix) + } + } + return nil +} + // ensureEmptyTarget refuses to init into a non-empty directory so the user // doesn't quietly accumulate objects into an existing repo. 
func ensureEmptyTarget(path string) error { diff --git a/cmd/git-sync/internal/sha256convert/sha256convert_test.go b/cmd/git-sync/internal/sha256convert/sha256convert_test.go index 1bae8053..3fd819ff 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert_test.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert_test.go @@ -1029,6 +1029,84 @@ func TestRun_RejectsExcludePrefixesThatDropBranchesOrTags(t *testing.T) { } } +func TestCheckSideOutputCollision(t *testing.T) { + mk := func(name string) planner.DesiredRef { + ref := plumbing.ReferenceName(name) + return planner.DesiredRef{SourceRef: ref, TargetRef: ref} + } + tests := []struct { + name string + desired map[plumbing.ReferenceName]planner.DesiredRef + skipOriginNotes bool + sign bool + wantErrSubstring string + }{ + { + name: "no collisions accepted", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/heads/main": mk("refs/heads/main"), + "refs/tags/v1": mk("refs/tags/v1"), + }, + wantErrSubstring: "", + }, + { + name: "origin-notes collision refused by default", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/heads/main": mk("refs/heads/main"), + "refs/notes/sha1-origin": mk("refs/notes/sha1-origin"), + }, + wantErrSubstring: "refs/notes/sha1-origin", + }, + { + name: "origin-notes collision allowed when --no-origin-notes set", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/notes/sha1-origin": mk("refs/notes/sha1-origin"), + }, + skipOriginNotes: true, + wantErrSubstring: "", + }, + { + name: "converted-tag collision refused only when --sign", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/heads/main": mk("refs/heads/main"), + "refs/tags/converted/main": mk("refs/tags/converted/main"), + }, + sign: true, + wantErrSubstring: "refs/tags/converted/main", + }, + { + name: "converted-tag without --sign passes through", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/tags/converted/main": 
mk("refs/tags/converted/main"), + }, + sign: false, + wantErrSubstring: "", + }, + { + name: "multiple converted-tag collisions listed in sorted order", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/tags/converted/zeta": mk("refs/tags/converted/zeta"), + "refs/tags/converted/alpha": mk("refs/tags/converted/alpha"), + }, + sign: true, + wantErrSubstring: "refs/tags/converted/alpha, refs/tags/converted/zeta", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := checkSideOutputCollision(tt.desired, tt.skipOriginNotes, tt.sign) + switch { + case tt.wantErrSubstring == "" && err != nil: + t.Fatalf("unexpected error: %v", err) + case tt.wantErrSubstring != "" && err == nil: + t.Fatalf("expected error containing %q, got nil", tt.wantErrSubstring) + case tt.wantErrSubstring != "" && !strings.Contains(err.Error(), tt.wantErrSubstring): + t.Fatalf("error %q does not contain %q", err.Error(), tt.wantErrSubstring) + } + }) + } +} + func TestPickHEAD(t *testing.T) { branch := func(name string) planner.DesiredRef { ref := plumbing.ReferenceName("refs/heads/" + name) From 48ba15336d83579d0f62f9c4c5b382ac510748c0 Mon Sep 17 00:00:00 2001 From: Andrea Nodari Date: Tue, 26 May 2026 10:42:22 +0200 Subject: [PATCH 14/19] gofmt: align map literal in collision test CI-blocking gofmt diff on the refs/notes/sha1-origin row. 
Entire-Checkpoint: 0ae6b701b541 --- cmd/git-sync/internal/sha256convert/sha256convert_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/git-sync/internal/sha256convert/sha256convert_test.go b/cmd/git-sync/internal/sha256convert/sha256convert_test.go index 3fd819ff..066ab6ba 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert_test.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert_test.go @@ -1052,8 +1052,8 @@ func TestCheckSideOutputCollision(t *testing.T) { { name: "origin-notes collision refused by default", desired: map[plumbing.ReferenceName]planner.DesiredRef{ - "refs/heads/main": mk("refs/heads/main"), - "refs/notes/sha1-origin": mk("refs/notes/sha1-origin"), + "refs/heads/main": mk("refs/heads/main"), + "refs/notes/sha1-origin": mk("refs/notes/sha1-origin"), }, wantErrSubstring: "refs/notes/sha1-origin", }, From f05a4e3edf29ebcb33c910f8fcc32365c94b61f7 Mon Sep 17 00:00:00 2001 From: Andrea Nodari Date: Tue, 26 May 2026 10:47:03 +0200 Subject: [PATCH 15/19] Resolve convert-sha256 positionals against unset fields The previous code bound positionals by fixed index (args[0] to SourceURL, args[1] to TargetDir), so `--source-url URL TARGET_DIR` left TargetDir empty (len(args) == 1, so args[1] was never read) and failed the "requires a source URL and a target directory" check. Consume positionals left-to-right against fields the user has not yet supplied via flags. The four flag/positional shapes (zero/one/two flags) now all yield the correct assignment. Extracted into resolveConvertSHA256Args for a unit test that covers every shape including the regression case.
Entire-Checkpoint: 60184126410a --- cmd/git-sync/convert_sha256.go | 31 ++++++++--- cmd/git-sync/convert_sha256_test.go | 84 +++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 8 deletions(-) create mode 100644 cmd/git-sync/convert_sha256_test.go diff --git a/cmd/git-sync/convert_sha256.go b/cmd/git-sync/convert_sha256.go index 55321ec2..f3f8da37 100644 --- a/cmd/git-sync/convert_sha256.go +++ b/cmd/git-sync/convert_sha256.go @@ -42,14 +42,8 @@ any so the caller can convert the submodule repository first.`, SilenceUsage: true, RunE: func(cmd *cobra.Command, args []string) error { req.ProtocolMode = gitsync.ProtocolMode(protocolVal) - if req.SourceURL == "" && len(args) > 0 { - req.SourceURL = args[0] - } - if req.TargetDir == "" && len(args) > 1 { - req.TargetDir = args[1] - } - if req.SourceURL == "" || req.TargetDir == "" { - return errors.New("convert-sha256 requires a source URL and a target directory") + if err := resolveConvertSHA256Args(&req, args); err != nil { + return err } result, err := sha256convert.Run(cmd.Context(), req) @@ -100,3 +94,24 @@ any so the caller can convert the submodule repository first.`, return cmd } + +// resolveConvertSHA256Args consumes positional args left-to-right, +// skipping fields the user already supplied via flags. Without that +// rule, `--source-url URL TARGET_DIR` would look like one positional and +// land in SourceURL — leaving TargetDir empty even though the user +// gave both. The two-flags-no-positionals and zero-flags-two-positionals +// shapes also work, as does the symmetric --target-dir + positional URL shape.
+func resolveConvertSHA256Args(req *sha256convert.Request, args []string) error { + positional := args + if req.SourceURL == "" && len(positional) > 0 { + req.SourceURL = positional[0] + positional = positional[1:] + } + if req.TargetDir == "" && len(positional) > 0 { + req.TargetDir = positional[0] + } + if req.SourceURL == "" || req.TargetDir == "" { + return errors.New("convert-sha256 requires a source URL and a target directory") + } + return nil +} diff --git a/cmd/git-sync/convert_sha256_test.go b/cmd/git-sync/convert_sha256_test.go new file mode 100644 index 00000000..ceb6fb3a --- /dev/null +++ b/cmd/git-sync/convert_sha256_test.go @@ -0,0 +1,84 @@ +package main + +import ( + "strings" + "testing" + + "entire.io/entire/git-sync/cmd/git-sync/internal/sha256convert" +) + +func TestResolveConvertSHA256Args(t *testing.T) { + const url = "http://example.invalid/repo.git" + const dir = "/tmp/out" + + tests := []struct { + name string + req sha256convert.Request + args []string + wantURL string + wantDir string + wantErr string + }{ + { + name: "both positionals", + args: []string{url, dir}, + wantURL: url, + wantDir: dir, + }, + { + name: "url flag plus positional dir — the reported bug", + req: sha256convert.Request{SourceURL: url}, + args: []string{dir}, + wantURL: url, + wantDir: dir, + }, + { + name: "dir flag plus positional url", + req: sha256convert.Request{TargetDir: dir}, + args: []string{url}, + wantURL: url, + wantDir: dir, + }, + { + name: "both flags, no positionals", + req: sha256convert.Request{SourceURL: url, TargetDir: dir}, + args: nil, + wantURL: url, + wantDir: dir, + }, + { + name: "missing dir", + req: sha256convert.Request{SourceURL: url}, + args: nil, + wantErr: "requires a source URL and a target directory", + }, + { + name: "missing both", + args: nil, + wantErr: "requires a source URL and a target directory", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + req := tt.req + err := 
resolveConvertSHA256Args(&req, tt.args) + switch { + case tt.wantErr == "" && err != nil: + t.Fatalf("unexpected error: %v", err) + case tt.wantErr != "" && err == nil: + t.Fatalf("expected error containing %q, got nil", tt.wantErr) + case tt.wantErr != "" && !strings.Contains(err.Error(), tt.wantErr): + t.Fatalf("error %q does not contain %q", err.Error(), tt.wantErr) + } + if tt.wantErr != "" { + return + } + if req.SourceURL != tt.wantURL { + t.Errorf("SourceURL: got %q, want %q", req.SourceURL, tt.wantURL) + } + if req.TargetDir != tt.wantDir { + t.Errorf("TargetDir: got %q, want %q", req.TargetDir, tt.wantDir) + } + }) + } +} From f6e944c2547e25603128a5c4fc295b627f4e2cbe Mon Sep 17 00:00:00 2001 From: Andrea Nodari Date: Tue, 26 May 2026 10:54:01 +0200 Subject: [PATCH 16/19] Allow gitproto.Conn and AuthMethod through ireturn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These are project-wide transport/auth interfaces, the same shape as the auth.Method entry that's already on the ireturn allowlist. Without the entries, mise run lint (which runs golangci-lint --fix) was racing: nolintlint stripped the //nolint:ireturn directives on openSource / normalizeAuth / newConn before ireturn flagged the functions, leaving lint red on every run. With the types allowed by policy, the directives become dead weight — drop them so the source documents intent once, in .golangci.yaml. 
Entire-Checkpoint: 9fa947bf886d --- .golangci.yaml | 2 ++ cmd/git-sync/internal/sha256convert/sha256convert.go | 2 -- internal/syncer/syncer.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.golangci.yaml b/.golangci.yaml index f7ade94e..5562d36b 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -98,6 +98,8 @@ linters: - github.com/go-git/go-git/v6/plumbing/storer.EncodedObjectIter - github.com/go-git/go-billy/v6.Filesystem - entire.io/entire/git-sync/internal/auth.Method + - entire.io/entire/git-sync/internal/gitproto.Conn + - entire.io/entire/git-sync/internal/gitproto.AuthMethod nolintlint: require-explanation: true require-specific: true diff --git a/cmd/git-sync/internal/sha256convert/sha256convert.go b/cmd/git-sync/internal/sha256convert/sha256convert.go index 25afc307..207b74bd 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert.go @@ -702,7 +702,6 @@ func ensureEmptyTarget(path string) error { return nil } -//nolint:ireturn // gitproto.Conn is the shared transport interface; returning it directly mirrors the rest of git-sync. func openSource(ctx context.Context, req Request, planCfg planner.PlanConfig) (gitproto.Conn, *gitproto.RefService, []*plumbing.Reference, error) { ep, err := url.Parse(req.SourceURL) if err != nil { @@ -737,7 +736,6 @@ func openSource(ctx context.Context, req Request, planCfg planner.PlanConfig) (g return conn, svc, refs, nil } -//nolint:ireturn // gitproto.AuthMethod is the shared signing interface; returning it lets callers pass it straight through. 
func normalizeAuth(m auth.Method) gitproto.AuthMethod { if m == nil { return nil diff --git a/internal/syncer/syncer.go b/internal/syncer/syncer.go index 6f990235..92761c43 100644 --- a/internal/syncer/syncer.go +++ b/internal/syncer/syncer.go @@ -349,7 +349,7 @@ func measurementLine(m Measurement) []string { // --- Session setup --- -func newConn(raw Endpoint, label string, stats *statsCollector, httpClient *http.Client) (gitproto.Conn, error) { //nolint:ireturn // transport selection intentionally returns the shared connection interface +func newConn(raw Endpoint, label string, stats *statsCollector, httpClient *http.Client) (gitproto.Conn, error) { ep, err := transport.ParseURL(raw.URL) if err != nil { return nil, fmt.Errorf("parse endpoint: %w", err) From fb4e4b66762eb7a4e740543d60f8447671735483 Mon Sep 17 00:00:00 2001 From: Andrea Nodari Date: Tue, 26 May 2026 11:37:00 +0200 Subject: [PATCH 17/19] Fix blocking issues in convert-sha256 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Submodule gitlinks: drop the "vendored" carve-out. Even when the linked-to commit lives in the source store, rewriting the gitlink to SHA256 produces a tree that fsck-passes but breaks `git submodule update` forever — the .gitmodules upstream still advertises only SHA1. Refuse any submodule gitlink in discoverReachable; keep a defensive guard in translateTree. 2. --check HEAD on tag-only conversions: pickHEAD returns "" when no branches landed, so HEAD stays at the PlainInit default refs/heads/master and the HEAD check guarantees a failure after an otherwise successful run. runChecks now takes a hasBranches bool and marks HEAD as skipped when false, with a "tags-only conversion" detail. 3. Partial signed-tags list dropped on error: signBranchTips returns the tags it created before failing, but Run was assigning res.SignedTags = signed only on the success path and the cobra wrapper dropped result entirely on err. 
Assign res.SignedTags before the err check, and have the cobra wrapper print the partial result on error so users see which converted/* tags landed and need cleanup. 4. --keep-source-objects on error paths: the flag's whole purpose is debugging failed conversions, but cleanupTemp was only flipped at the *end* of Run, so every error before that wiped the temp store. Hoist the cleanupTemp = false / res.TempDir assignment to right after MkdirTemp, and propagate `res` through every subsequent error return so the kept path surfaces in both Result and Lines() output. Entire-Checkpoint: 451aea1c737a --- cmd/git-sync/convert_sha256.go | 20 ++- .../internal/sha256convert/sha256convert.go | 168 ++++++++++-------- .../sha256convert/sha256convert_test.go | 70 ++++++++ 3 files changed, 179 insertions(+), 79 deletions(-) diff --git a/cmd/git-sync/convert_sha256.go b/cmd/git-sync/convert_sha256.go index f3f8da37..17d71c1c 100644 --- a/cmd/git-sync/convert_sha256.go +++ b/cmd/git-sync/convert_sha256.go @@ -33,10 +33,12 @@ to preserve the always-convert invariant. The conversion is destructive in two ways the caller should be aware of: GPG signatures on commits and tags are dropped (they sign over the -original SHA1 content and would be invalid post-rewrite), and submodule -gitlinks that point at a commit outside this repository cannot be -embedded in a SHA256 tree — the command exits with an error if it finds -any so the caller can convert the submodule repository first.`, +original SHA1 content and would be invalid post-rewrite), and any +submodule gitlink fails the run — its .gitmodules upstream still +advertises SHA1 hashes, so a rewritten SHA256 gitlink would point at a +hash the upstream cannot resolve and break ` + "`git submodule update`" + ` in +every clone. 
Exclude refs that reference submodules, or convert the +submodule repository first and re-point .gitmodules.`, Args: cobra.MaximumNArgs(2), SilenceErrors: true, SilenceUsage: true, @@ -47,10 +49,18 @@ any so the caller can convert the submodule repository first.`, } result, err := sha256convert.Run(cmd.Context(), req) + // Print whatever state Run produced even on error: signed + // tags landed before signBranchTips failed, --check + // findings, and the --keep-source-objects temp dir are + // all things the user needs to see to clean up or debug. + // Run zero-values fields it never touched, so this is + // safe to call on a half-populated result. + if result.SourceURL != "" || result.TargetDir != "" { + printOutput(jsonOutput, result) + } if err != nil { return fmt.Errorf("convert-sha256: %w", err) } - printOutput(jsonOutput, result) return nil }, } diff --git a/cmd/git-sync/internal/sha256convert/sha256convert.go b/cmd/git-sync/internal/sha256convert/sha256convert.go index 207b74bd..4b87ea7c 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert.go @@ -5,11 +5,12 @@ // // The tool is intentionally scoped: no hash mapping is persisted, GPG // signatures on commits and tags are dropped (they sign over the original -// SHA1 byte stream and would be invalid post-rewrite), and submodule -// gitlinks are left at their original SHA1 hash unless the referenced -// commit happens to live in the same repo. A run that encounters an -// unresolvable submodule entry fails so the caller can choose which refs -// to exclude. +// SHA1 byte stream and would be invalid post-rewrite), and any submodule +// gitlink fails the run so the caller chooses which refs to exclude. The +// linked-to repository's URL still points at an upstream SHA1 store, +// which has no way to resolve a SHA256-rewritten gitlink, so rewriting +// would produce a tree that fsck-passes but breaks +// `git submodule update`. 
package sha256convert import ( @@ -233,9 +234,20 @@ func Run(ctx context.Context, req Request) (Result, error) { } }() + // Build the result struct early so error paths can surface + // what little ran successfully. In particular, --keep-source-objects + // exists to debug failures, so cleanupTemp must flip and TempDir + // must be in the result *before* any later error return; otherwise + // the temp store gets wiped on exactly the runs that need it. + res := Result{SourceURL: req.SourceURL, TargetDir: req.TargetDir} + if req.KeepSourceObjects { + cleanupTemp = false + res.TempDir = tempDir + } + srcRepo, err := git.PlainInit(tempDir, true) if err != nil { - return Result{}, fmt.Errorf("init temporary SHA1 store: %w", err) + return res, fmt.Errorf("init temporary SHA1 store: %w", err) } // Source connection + ref discovery ----------------------------------- @@ -249,7 +261,7 @@ func Run(ctx context.Context, req Request) (Result, error) { } conn, refService, sourceRefList, err := openSource(ctx, req, planCfg) if err != nil { - return Result{}, err + return res, err } defer conn.Close() refService.Verbose = req.Verbose @@ -257,10 +269,10 @@ func Run(ctx context.Context, req Request) (Result, error) { sourceRefs := gitproto.RefHashMap(sourceRefList) desired, _, err := planner.BuildDesiredRefs(sourceRefs, planCfg) if err != nil { - return Result{}, fmt.Errorf("build desired refs: %w", err) + return res, fmt.Errorf("build desired refs: %w", err) } if len(desired) == 0 { - return Result{}, errors.New("no source refs matched the requested scope") + return res, errors.New("no source refs matched the requested scope") } // Refuse before any further I/O if the source carries refs that @@ -268,7 +280,7 @@ func Run(ctx context.Context, req Request) (Result, error) { // writeOriginNotes / signBranchTips, so without this check the // later side-output write would silently clobber the source ref. 
if err := checkSideOutputCollision(desired, req.SkipOriginNotes, req.Sign); err != nil { - return Result{}, err + return res, err } // Fetch into temp SHA1 store ------------------------------------------ @@ -276,7 +288,7 @@ func Run(ctx context.Context, req Request) (Result, error) { gpDesired := convert.DesiredRefs(desired) if err := refService.FetchToStore(ctx, srcRepo.Storer, conn, gpDesired, nil); err != nil && !errors.Is(err, git.NoErrAlreadyUpToDate) { - return Result{}, fmt.Errorf("fetch source pack: %w", err) + return res, fmt.Errorf("fetch source pack: %w", err) } // Discover reachable set before initing the target. Submodule @@ -303,18 +315,18 @@ func Run(ctx context.Context, req Request) (Result, error) { stopDisc() } if err != nil { - return Result{}, fmt.Errorf("discover reachable: %w", err) + return res, fmt.Errorf("discover reachable: %w", err) } // Discovery succeeded — safe to materialize the SHA256 target. dstRepo, err := git.PlainInit(req.TargetDir, true, git.WithObjectFormat(formatcfg.SHA256)) if err != nil { - return Result{}, fmt.Errorf("init SHA256 target at %s: %w", req.TargetDir, err) + return res, fmt.Errorf("init SHA256 target at %s: %w", req.TargetDir, err) } tr, err := newTranslator(ctx, srcRepo.Storer, dstRepo.Storer, req.TargetDir, !req.SkipMessageRewrite, reachable) if err != nil { - return Result{}, err + return res, err } fmt.Fprintln(out, "translating objects to sha256 ...") var stopTr func() @@ -329,7 +341,7 @@ func Run(ctx context.Context, req Request) (Result, error) { if stopTr != nil { stopTr() } - return Result{}, fmt.Errorf("translate %s: %w", d.SourceRef, err) + return res, fmt.Errorf("translate %s: %w", d.SourceRef, err) } } if stopTr != nil { @@ -339,7 +351,7 @@ func Run(ctx context.Context, req Request) (Result, error) { // Write refs --------------------------------------------------------- refsWritten, err := writeRefs(dstRepo.Storer, desired, tr.mapping) if err != nil { - return Result{}, fmt.Errorf("write target 
refs: %w", err) + return res, fmt.Errorf("write target refs: %w", err) } // Point HEAD at a ref that actually exists in the target. PlainInit @@ -348,19 +360,15 @@ func Run(ctx context.Context, req Request) (Result, error) { // step. See pickHEAD for the selection order. if headRef := pickHEAD(refService.HeadTarget, desired); headRef != "" { if err := dstRepo.Storer.SetReference(plumbing.NewSymbolicReference(plumbing.HEAD, headRef)); err != nil { - return Result{}, fmt.Errorf("set HEAD: %w", err) + return res, fmt.Errorf("set HEAD: %w", err) } } - res := Result{ - SourceURL: req.SourceURL, - TargetDir: req.TargetDir, - Protocol: refService.Protocol, - RefsConverted: refsWritten, - Counts: tr.snapshotCounts(), - SignaturesStripped: tr.signaturesStripped, - MessageRewrites: tr.messageRewrites, - } + res.Protocol = refService.Protocol + res.RefsConverted = refsWritten + res.Counts = tr.snapshotCounts() + res.SignaturesStripped = tr.signaturesStripped + res.MessageRewrites = tr.messageRewrites if len(tr.ambiguousMessageRefs) > 0 { amb := make([]string, 0, len(tr.ambiguousMessageRefs)) for s := range tr.ambiguousMessageRefs { @@ -373,32 +381,32 @@ func Run(ctx context.Context, req Request) (Result, error) { if !req.SkipOriginNotes && len(tr.commits) > 0 { notesRef, err := tr.writeOriginNotes(originNotesRef) if err != nil { - return Result{}, fmt.Errorf("write origin notes: %w", err) + return res, fmt.Errorf("write origin notes: %w", err) } if err := dstRepo.Storer.SetReference(plumbing.NewHashReference(plumbing.ReferenceName(notesRef), tr.lastNotesCommit)); err != nil { - return Result{}, fmt.Errorf("set %s: %w", notesRef, err) + return res, fmt.Errorf("set %s: %w", notesRef, err) } res.OriginNotesRef = notesRef } if req.MappingFile != "" { if err := tr.writeMappingFile(req.MappingFile); err != nil { - return Result{}, fmt.Errorf("write mapping file: %w", err) + return res, fmt.Errorf("write mapping file: %w", err) } res.MappingFile = req.MappingFile } if req.Sign { 
signed, err := signBranchTips(ctx, out, req.TargetDir, req.SignKey, req.SourceURL, desired) + // signBranchTips returns the tags it had already created + // when it failed mid-iteration. Surface that partial list + // even on error so the caller can clean up — without it, + // signed converted/* tags would be left on disk with no + // indication in either Result or the error. + res.SignedTags = signed if err != nil { return res, fmt.Errorf("sign: %w", err) } - res.SignedTags = signed - } - - if req.KeepSourceObjects { - cleanupTemp = false - res.TempDir = tempDir } if req.Check { @@ -413,7 +421,14 @@ func Run(ctx context.Context, req Request) (Result, error) { for _, tag := range res.SignedTags { sideOutputs[plumbing.ReferenceName(tag)] = struct{}{} } - res.Checks = runChecks(ctx, req.TargetDir, dstRepo, refsWritten, sideOutputs) + hasBranches := false + for _, d := range desired { + if d.TargetRef.IsBranch() { + hasBranches = true + break + } + } + res.Checks = runChecks(ctx, req.TargetDir, dstRepo, refsWritten, sideOutputs, hasBranches) for _, c := range res.Checks { mark := "✓" if !c.OK { @@ -500,7 +515,12 @@ func signBranchTips(ctx context.Context, out io.Writer, targetDir, signKey, sour // source set (the origin-notes ref, any --sign attestation tags), so // the refs check can omit them from the resolved/expected fraction // without false-positive-skipping a same-named source ref. -func runChecks(ctx context.Context, targetDir string, repo *git.Repository, refsExpected int, sideOutputs map[plumbing.ReferenceName]struct{}) []Check { +// +// hasBranches says whether any refs/heads/* landed in the target. If +// false, this is a tags-only conversion and HEAD is left at the +// PlainInit default (refs/heads/master, which won't exist); the HEAD +// check is then a no-op rather than a guaranteed failure. 
+func runChecks(ctx context.Context, targetDir string, repo *git.Repository, refsExpected int, sideOutputs map[plumbing.ReferenceName]struct{}, hasBranches bool) []Check { checks := []Check{} // 1. Config: extensions.objectformat = sha256. Parse the file @@ -524,18 +544,25 @@ func runChecks(ctx context.Context, targetDir string, repo *git.Repository, refs } } - // 2. HEAD resolves to an existing object. - head, err := repo.Reference(plumbing.HEAD, true) + // 2. HEAD resolves to an existing object. Skipped on tags-only + // conversions, where the target legitimately has no branch for + // HEAD to symlink to. switch { - case err != nil: - checks = append(checks, Check{Name: "HEAD", OK: false, Detail: err.Error()}) - case head.Hash().IsZero(): - checks = append(checks, Check{Name: "HEAD", OK: false, Detail: "resolves to zero hash"}) + case !hasBranches: + checks = append(checks, Check{Name: "HEAD", OK: true, Detail: "skipped (tags-only conversion; no branch to point at)"}) default: - if _, err := repo.Storer.EncodedObject(plumbing.AnyObject, head.Hash()); err != nil { - checks = append(checks, Check{Name: "HEAD", OK: false, Detail: fmt.Sprintf("%s: %v", head.Hash(), err)}) - } else { - checks = append(checks, Check{Name: "HEAD", OK: true, Detail: head.Hash().String()}) + head, err := repo.Reference(plumbing.HEAD, true) + switch { + case err != nil: + checks = append(checks, Check{Name: "HEAD", OK: false, Detail: err.Error()}) + case head.Hash().IsZero(): + checks = append(checks, Check{Name: "HEAD", OK: false, Detail: "resolves to zero hash"}) + default: + if _, err := repo.Storer.EncodedObject(plumbing.AnyObject, head.Hash()); err != nil { + checks = append(checks, Check{Name: "HEAD", OK: false, Detail: fmt.Sprintf("%s: %v", head.Hash(), err)}) + } else { + checks = append(checks, Check{Name: "HEAD", OK: true, Detail: head.Hash().String()}) + } } } @@ -892,15 +919,21 @@ func discoverReachable(src storer.Storer, roots []plumbing.Hash, progress *atomi } for _, e := 
range tree.Entries { if e.Mode == filemode.Submodule { - if _, err := srcFS.EncodedObject(plumbing.CommitObject, e.Hash); err == nil { - if err := visit(e.Hash); err != nil { - return err - } - continue - } + // A submodule gitlink stores a hash that refers to a + // commit in a *different* repository — the one named + // by the matching .gitmodules URL. Even when that + // commit happens to be in our source store, the URL + // still points at an upstream SHA1 repo, so rewriting + // the gitlink to SHA256 produces a tree that fsck- + // passes but breaks `git submodule update` forever: + // the upstream advertises only SHA1 hashes. The only + // safe answer is to refuse and let the caller scope + // the offending ref out (or convert the submodule + // upstream first and re-point .gitmodules). return fmt.Errorf( - "tree %s contains a submodule gitlink %q at %s that is not present in the source repo; "+ - "convert the submodule repository first so its commit hashes are available in SHA256", + "tree %s contains a submodule gitlink %q at %s; convert-sha256 cannot rewrite submodule pointers "+ + "because the linked-to repository would still advertise SHA1 hashes — "+ + "exclude refs that reference it or convert the submodule repository first", sha1, e.Name, e.Hash) } if err := visit(e.Hash); err != nil { @@ -1008,26 +1041,13 @@ func (t *translator) translateTree(sha1 plumbing.Hash, src plumbing.EncodedObjec } for i, entry := range tree.Entries { if entry.Mode == filemode.Submodule { - // Submodule gitlinks reference a commit in a different repo. - // We can only translate if that commit happens to live in our - // SHA1 store too (rare, e.g. vendored). Otherwise the SHA1 - // pointer can't be embedded in a SHA256 tree, so we error - // out and let the caller scope around it. 
- if _, ok := t.mapping[entry.Hash]; ok { - tree.Entries[i].Hash = t.mapping[entry.Hash] - continue - } - if _, err := t.src.EncodedObject(plumbing.CommitObject, entry.Hash); err == nil { - newH, err := t.translate(entry.Hash) - if err != nil { - return plumbing.ZeroHash, err - } - tree.Entries[i].Hash = newH - continue - } + // Should not be reachable: discoverReachable refuses any + // submodule gitlink up-front. Keep this as a defensive + // guard so the rewrite path never silently produces a + // SHA256 tree whose gitlink points at a hash the + // .gitmodules upstream repo cannot resolve. return plumbing.ZeroHash, fmt.Errorf( - "tree %s contains submodule gitlink %q at %s that is not present in the source repo; "+ - "exclude refs that reference it or convert the submodule repository first", + "tree %s contains submodule gitlink %q at %s; convert-sha256 refuses to rewrite submodule pointers", sha1, entry.Name, entry.Hash) } newH, err := t.translate(entry.Hash) diff --git a/cmd/git-sync/internal/sha256convert/sha256convert_test.go b/cmd/git-sync/internal/sha256convert/sha256convert_test.go index 066ab6ba..333225fd 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert_test.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert_test.go @@ -553,6 +553,45 @@ func TestTranslator_UnresolvableSubmodule(t *testing.T) { } } +// TestTranslator_VendoredSubmoduleStillRefused locks in the rule that +// even a submodule whose commit happens to live in the source store is +// rejected. The earlier "vendored" carve-out rewrote such gitlinks to +// SHA256, but .gitmodules still points at an upstream SHA1 repo, so +// `git submodule update` would fail in clones of the converted repo. 
+func TestTranslator_VendoredSubmoduleStillRefused(t *testing.T) { + root := t.TempDir() + srcDir := filepath.Join(root, "src.git") + + srcRepo, err := git.PlainInit(srcDir, true) + if err != nil { + t.Fatalf("init SHA1 source: %v", err) + } + + // Create a commit that lives in this source store, then point a + // tree's submodule gitlink at it. discoverReachable used to recurse + // into that commit ("vendored") and translate the gitlink; now it + // refuses regardless. + blobHash := writeBlob(t, srcRepo.Storer, []byte("inner\n")) + innerTree := writeTree(t, srcRepo.Storer, []object.TreeEntry{ + {Name: "f", Mode: filemode.Regular, Hash: blobHash}, + }) + sig := object.Signature{Name: "Test", Email: "t@example.com", When: time.Unix(1700000000, 0).UTC()} + innerCommit := &object.Commit{Author: sig, Committer: sig, Message: "inner\n", TreeHash: innerTree} + innerSHA1 := writeObject(t, srcRepo.Storer, innerCommit.Encode) + + outerTree := writeTree(t, srcRepo.Storer, []object.TreeEntry{ + {Name: "sub", Mode: filemode.Submodule, Hash: innerSHA1}, + }) + + _, err = discoverReachable(srcRepo.Storer, []plumbing.Hash{outerTree}, nil) + if err == nil { + t.Fatal("expected discoverReachable to refuse vendored submodule, got nil") + } + if !strings.Contains(err.Error(), "submodule") { + t.Errorf("error should mention submodule; got: %v", err) + } +} + // --- helpers --- // initSHA1 and initSHA256 are t.Fatalf-wrapping `git.PlainInit` shortcuts @@ -1107,6 +1146,37 @@ func TestCheckSideOutputCollision(t *testing.T) { } } +// TestRunChecks_TagOnlyConversionSkipsHEAD locks in the rule that a +// tags-only conversion does not fail --check on HEAD. PlainInit leaves +// HEAD pointing at refs/heads/master (which won't exist), and pickHEAD +// returns "" because the desired set has no branches; runChecks must +// detect that and mark HEAD as "skipped" rather than "missing". 
+func TestRunChecks_TagOnlyConversionSkipsHEAD(t *testing.T) { + dir := t.TempDir() + repo, err := git.PlainInit(dir, true, git.WithObjectFormat(formatcfg.SHA256)) + if err != nil { + t.Fatalf("init SHA256 target: %v", err) + } + + checks := runChecks(t.Context(), dir, repo, 0, nil, false) + var head Check + for _, c := range checks { + if c.Name == "HEAD" { + head = c + break + } + } + if head.Name == "" { + t.Fatalf("HEAD check missing from runChecks output") + } + if !head.OK { + t.Errorf("HEAD should be OK (skipped) for tags-only conversion, got %+v", head) + } + if !strings.Contains(head.Detail, "skipped") { + t.Errorf("HEAD check should record skipped reason, got %q", head.Detail) + } +} + func TestPickHEAD(t *testing.T) { branch := func(name string) planner.DesiredRef { ref := plumbing.ReferenceName("refs/heads/" + name) From b7f8e31a48e39dc4bcdaed4d3608ba750a2cb148 Mon Sep 17 00:00:00 2001 From: Andrea Nodari Date: Tue, 26 May 2026 11:46:57 +0200 Subject: [PATCH 18/19] Address strongly-recommended review items MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 5. discoverReachable now takes ctx and checks ctx.Err() at the top of every visit. Without it, Ctrl-C during the multi-minute discovery phase on kernel-scale repos was ignored — exactly the case the per-translate() check was added for. 6. writeLoose: document the durability tradeoff. No fsync, by design — convert-sha256 is a one-shot bulk operation, not incremental, processes millions of objects, and Run wipes the target on error so the only supported recovery is rerunning from clean state. 7. Target directory cleanup on error. ensureEmptyTarget refuses to write into a non-empty dir; without this fix, any failure after PlainInit left config/HEAD/refs behind and blocked a retry with no recovery hint. New deferred cleanup arms after PlainInit, disarms on success, and is suppressed by --keep-source-objects so users can inspect partial state. 
A --check failure also disarms cleanup since the conversion itself finished and the partial target is what the user needs to inspect. 8. hashPattern is now case-insensitive ((?i) prepended), and resolveMessageRef lowercases the input before lookup. An uppercase/mixed-case SHA1 reference in a commit or tag message is now rewritten the same as a lowercase one. 9. Check gained a Skipped bool. Skipped implies OK so callers gating on OK still treat it as non-fatal; callers needing a stricter audit signal can branch on Skipped first. Applied to the fsck-when-git-missing and HEAD-on-tags-only paths, with a "○" glyph in the progress output to distinguish from real passes. Tests cover discovery cancellation, fsck skipped, uppercase hash rewrite, and the existing tag-only HEAD check rewritten against the Skipped field. Entire-Checkpoint: 248018a9dd16 --- .../internal/sha256convert/sha256convert.go | 92 +++++++++--- .../sha256convert/sha256convert_test.go | 139 ++++++++++++++++-- 2 files changed, 203 insertions(+), 28 deletions(-) diff --git a/cmd/git-sync/internal/sha256convert/sha256convert.go b/cmd/git-sync/internal/sha256convert/sha256convert.go index 4b87ea7c..da18c2dd 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert.go @@ -127,10 +127,17 @@ type Result struct { // Check is one named verification step from --check, with the result // and a short detail string suitable for logging/JSON output. +// +// Skipped distinguishes "this check passed" from "this check did not +// run" — e.g. fsck when git is not on PATH, or HEAD on a tags-only +// conversion. Skipped implies OK so callers that only branch on OK +// still treat it as non-fatal; callers that need a stricter signal +// (CI gating, audit logs) should branch on Skipped first. 
type Check struct { - Name string `json:"name"` - OK bool `json:"ok"` - Detail string `json:"detail,omitempty"` + Name string `json:"name"` + OK bool `json:"ok"` + Skipped bool `json:"skipped,omitempty"` + Detail string `json:"detail,omitempty"` } // previewMax caps how many items from a potentially-long list (ambiguous @@ -234,6 +241,21 @@ func Run(ctx context.Context, req Request) (Result, error) { } }() + // cleanupTarget fires when set, wiping the SHA256 bare repo we + // initialize below. ensureEmptyTarget already verified the dir was + // empty going in, so a defensive RemoveAll on failure only ever + // removes content this run created. Without it, any error after + // PlainInit leaves config/objects/refs/HEAD behind, and the next + // retry hits ensureEmptyTarget's "not empty" refusal with no + // indication of how to recover. Suppressed by --keep-source-objects + // so users can inspect partial state. + cleanupTarget := false + defer func() { + if cleanupTarget && !req.KeepSourceObjects { + _ = os.RemoveAll(req.TargetDir) + } + }() + // Build the result struct early so error paths can surface // what little ran successfully. In particular, --keep-source-objects // exists to debug failures, so cleanupTemp must flip and TempDir @@ -310,7 +332,7 @@ func Run(ctx context.Context, req Request) (Result, error) { return fmt.Sprintf(" discovered %d objects", c.Load()) }) } - reachable, err := discoverReachable(srcRepo.Storer, rootSHA1s, discCounter) + reachable, err := discoverReachable(ctx, srcRepo.Storer, rootSHA1s, discCounter) if stopDisc != nil { stopDisc() } @@ -323,6 +345,10 @@ func Run(ctx context.Context, req Request) (Result, error) { if err != nil { return res, fmt.Errorf("init SHA256 target at %s: %w", req.TargetDir, err) } + // Anything that fails past here would leave the target dir + // non-empty (config + HEAD + maybe objects/refs), blocking a + // retry on ensureEmptyTarget; arm the deferred cleanup now. 
+ cleanupTarget = true tr, err := newTranslator(ctx, srcRepo.Storer, dstRepo.Storer, req.TargetDir, !req.SkipMessageRewrite, reachable) if err != nil { @@ -431,18 +457,28 @@ func Run(ctx context.Context, req Request) (Result, error) { res.Checks = runChecks(ctx, req.TargetDir, dstRepo, refsWritten, sideOutputs, hasBranches) for _, c := range res.Checks { mark := "✓" - if !c.OK { + switch { + case !c.OK: mark = "✗" + case c.Skipped: + mark = "○" } fmt.Fprintf(out, " %s %s: %s\n", mark, c.Name, c.Detail) } for _, c := range res.Checks { if !c.OK { + // The conversion finished; the failure is a + // post-hoc verification miss. Keep the target + // on disk so the user can inspect exactly what + // failed the check. + cleanupTarget = false return res, fmt.Errorf("check %q failed: %s", c.Name, c.Detail) } } } + // Run completed; keep the target dir. + cleanupTarget = false return res, nil } @@ -549,7 +585,7 @@ func runChecks(ctx context.Context, targetDir string, repo *git.Repository, refs // HEAD to symlink to. switch { case !hasBranches: - checks = append(checks, Check{Name: "HEAD", OK: true, Detail: "skipped (tags-only conversion; no branch to point at)"}) + checks = append(checks, Check{Name: "HEAD", OK: true, Skipped: true, Detail: "tags-only conversion; no branch to point at"}) default: head, err := repo.Reference(plumbing.HEAD, true) switch { @@ -609,7 +645,7 @@ func runChecks(ctx context.Context, targetDir string, repo *git.Repository, refs // 4. git fsck --full (if git is on PATH). 
gitBin, err := exec.LookPath("git") if err != nil { - checks = append(checks, Check{Name: "git fsck --full", OK: true, Detail: "skipped (git not in PATH)"}) + checks = append(checks, Check{Name: "git fsck --full", OK: true, Skipped: true, Detail: "git not in PATH"}) return checks } cmd := exec.CommandContext(ctx, gitBin, "-C", targetDir, "fsck", "--full") @@ -877,20 +913,18 @@ func newTranslator(ctx context.Context, src, dst storer.Storer, targetDir string // entries, commit tree+parent links, and tag targets) and returns a // (SHA1 → object type) map covering the full in-scope set. // -// Submodule gitlinks: a tree entry with mode 160000 points at a commit -// in another repository, and a SHA1 hash cannot be embedded in a -// SHA256 tree. If the referenced commit happens to live in this -// source store (rare; vendored modules), it is recursively visited -// like any other commit. Otherwise discovery returns an error here, -// before the target bare repo is initialized — failing fast keeps -// half-converted state off disk. +// Submodule gitlinks: any submodule entry (mode 160000) fails the run +// here, before the target bare repo is initialized — failing fast +// keeps half-converted state off disk. Rewriting the gitlink to SHA256 +// would produce a tree the upstream .gitmodules repo can never +// resolve, since it advertises only SHA1. // // Message-reference edges are not part of this pass; those are added // during translation, where the partial mapping is updated as we go. // // If progress is non-nil, it is incremented once per object visited. // The --progress ticker samples this counter from another goroutine. 
-func discoverReachable(src storer.Storer, roots []plumbing.Hash, progress *atomic.Int64) (map[plumbing.Hash]plumbing.ObjectType, error) { +func discoverReachable(ctx context.Context, src storer.Storer, roots []plumbing.Hash, progress *atomic.Int64) (map[plumbing.Hash]plumbing.ObjectType, error) { srcFS, ok := src.(*filesystem.Storage) if !ok { return nil, fmt.Errorf("source storage is not filesystem-backed (%T)", src) @@ -898,6 +932,13 @@ func discoverReachable(src storer.Storer, roots []plumbing.Hash, progress *atomi reachable := make(map[plumbing.Hash]plumbing.ObjectType) var visit func(plumbing.Hash) error visit = func(sha1 plumbing.Hash) error { + // Per-object cancellation check. Discovery on a kernel-scale + // repo runs for several minutes before translate() takes over, + // so without this Ctrl-C would not interrupt the run until + // the discovery phase finished on its own. + if err := ctx.Err(); err != nil { + return fmt.Errorf("discover %s: %w", sha1, err) + } if _, seen := reachable[sha1]; seen { return nil } @@ -1213,6 +1254,16 @@ func encodeBody(typ plumbing.ObjectType, encode func(plumbing.EncodedObject) err // writeLoose writes a single object as a SHA256-named loose object under // objects//. Bypasses go-git's objfile.Writer, which would hash // with SHA1. Atomic via tempfile+rename, idempotent on duplicate hashes. +// +// Durability is not guaranteed against power loss: we do not fsync the +// loose file or its parent directory before returning. The Stat-by-name +// idempotency shortcut would then accept a torn file from a previous +// crashed run as already-written. That trade-off is intentional — +// convert-sha256 is a single-shot bulk operation (not an incremental +// sync), it processes millions of objects on kernel-scale repos where +// per-object fsync would dominate runtime, and Run wipes the target +// directory on error (see the cleanupTarget defer in Run) so the only +// supported recovery is re-running from clean state. 
func (t *translator) writeLoose(typ plumbing.ObjectType, body []byte) (plumbing.Hash, error) { h := sha256.New() header := append(typ.Bytes(), ' ') @@ -1277,12 +1328,15 @@ func (t *translator) writeLoose(typ plumbing.ObjectType, body []byte) (plumbing. } // hashPattern matches hex runs that could be a git object hash. Git's -// default abbreviation is 7 chars; 40 is a full SHA1. We only rewrite a +// default abbreviation is 7 chars; 40 is a full SHA1. Case-insensitive +// so messages that paste an uppercase or mixed-case hash (e.g. from +// some commit graph viewers) still resolve — the lookup canonicalizes +// to lowercase before checking the reachable set. We only rewrite a // match if the prefix uniquely identifies a commit or tag in the // reachable set, so false positives on incidental hex strings are // essentially impossible (a random hex would have to collide with a // real source SHA1). -var hashPattern = regexp.MustCompile(`\b[0-9a-f]{7,40}\b`) +var hashPattern = regexp.MustCompile(`(?i)\b[0-9a-f]{7,40}\b`) // matchResult is the 3-state outcome of resolving a hex prefix in a // commit/tag message against the reachable set. We distinguish @@ -1349,6 +1403,10 @@ func (t *translator) rewriteHashesInMessage(msg string) (string, int) { // are filtered so incidental hex collisions on content hashes aren't // rewritten). func (t *translator) resolveMessageRef(prefix string) (plumbing.Hash, matchResult) { + // Canonicalize to lowercase: hashPattern is case-insensitive so + // the caller can match `ABCD1234` in a message, but reachable + // keys and plumbing.Hash.String() are always lowercase hex. 
+ prefix = strings.ToLower(prefix) if len(prefix) == 40 { sha1, ok := plumbing.FromHex(prefix) if !ok { diff --git a/cmd/git-sync/internal/sha256convert/sha256convert_test.go b/cmd/git-sync/internal/sha256convert/sha256convert_test.go index 333225fd..62cb79fb 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert_test.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert_test.go @@ -6,6 +6,7 @@ import ( "context" "crypto/sha256" "encoding/hex" + "errors" "fmt" "io" "net/http" @@ -84,7 +85,7 @@ func TestTranslator(t *testing.T) { } tagHash := writeObject(t, srcRepo.Storer, tag.Encode) - reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{tagHash}, nil) + reachable, err := discoverReachable(t.Context(), srcRepo.Storer, []plumbing.Hash{tagHash}, nil) if err != nil { t.Fatalf("discoverReachable: %v", err) } @@ -199,7 +200,7 @@ func TestTranslator_RewritesMessageHashes(t *testing.T) { } childSHA1 := writeObject(t, srcRepo.Storer, child.Encode) - reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{childSHA1}, nil) + reachable, err := discoverReachable(t.Context(), srcRepo.Storer, []plumbing.Hash{childSHA1}, nil) if err != nil { t.Fatalf("discoverReachable: %v", err) } @@ -278,7 +279,7 @@ func TestTranslator_RewritesCrossBranchReferences(t *testing.T) { // Discovery must see both branches so the reachable set covers cA // before cB is encoded. 
- reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{cB, cA}, nil) + reachable, err := discoverReachable(t.Context(), srcRepo.Storer, []plumbing.Hash{cB, cA}, nil) if err != nil { t.Fatalf("discoverReachable: %v", err) } @@ -332,7 +333,7 @@ func TestTranslator_SkipMessageRewrite(t *testing.T) { } childSHA1 := writeObject(t, srcRepo.Storer, child.Encode) - reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{childSHA1}, nil) + reachable, err := discoverReachable(t.Context(), srcRepo.Storer, []plumbing.Hash{childSHA1}, nil) if err != nil { t.Fatalf("discoverReachable: %v", err) } @@ -366,7 +367,7 @@ func TestTranslator_WriteOriginNotes(t *testing.T) { c1 := writeObject(t, srcRepo.Storer, (&object.Commit{Author: sig, Committer: sig, Message: "c1\n", TreeHash: tree}).Encode) c2 := writeObject(t, srcRepo.Storer, (&object.Commit{Author: sig, Committer: sig, Message: "c2\n", TreeHash: tree, ParentHashes: []plumbing.Hash{c1}}).Encode) - reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{c2}, nil) + reachable, err := discoverReachable(t.Context(), srcRepo.Storer, []plumbing.Hash{c2}, nil) if err != nil { t.Fatalf("discoverReachable: %v", err) } @@ -438,7 +439,7 @@ func TestTranslator_WriteMappingFile(t *testing.T) { sig := object.Signature{Name: "Test", Email: "t@example.com", When: time.Unix(1700000000, 0).UTC()} commit := writeObject(t, srcRepo.Storer, (&object.Commit{Author: sig, Committer: sig, Message: "c\n", TreeHash: tree}).Encode) - reachable, err := discoverReachable(srcRepo.Storer, []plumbing.Hash{commit}, nil) + reachable, err := discoverReachable(t.Context(), srcRepo.Storer, []plumbing.Hash{commit}, nil) if err != nil { t.Fatalf("discoverReachable: %v", err) } @@ -544,7 +545,7 @@ func TestTranslator_UnresolvableSubmodule(t *testing.T) { {Name: "sub", Mode: filemode.Submodule, Hash: external}, }) - _, err = discoverReachable(srcRepo.Storer, []plumbing.Hash{treeHash}, nil) + _, err = discoverReachable(t.Context(), 
srcRepo.Storer, []plumbing.Hash{treeHash}, nil) if err == nil { t.Fatal("expected discoverReachable to fail on unresolvable submodule, got nil") } @@ -583,7 +584,7 @@ func TestTranslator_VendoredSubmoduleStillRefused(t *testing.T) { {Name: "sub", Mode: filemode.Submodule, Hash: innerSHA1}, }) - _, err = discoverReachable(srcRepo.Storer, []plumbing.Hash{outerTree}, nil) + _, err = discoverReachable(t.Context(), srcRepo.Storer, []plumbing.Hash{outerTree}, nil) if err == nil { t.Fatal("expected discoverReachable to refuse vendored submodule, got nil") } @@ -1146,6 +1147,119 @@ func TestCheckSideOutputCollision(t *testing.T) { } } +// TestDiscoverReachable_HonorsCtxCancellation confirms discovery +// returns promptly when its context is canceled before it starts, +// matching the per-object check translate() already does. +func TestDiscoverReachable_HonorsCtxCancellation(t *testing.T) { + root := t.TempDir() + srcDir := filepath.Join(root, "src.git") + srcRepo, err := git.PlainInit(srcDir, true) + if err != nil { + t.Fatalf("init source: %v", err) + } + blob := writeBlob(t, srcRepo.Storer, []byte("x\n")) + tree := writeTree(t, srcRepo.Storer, []object.TreeEntry{{Name: "f", Mode: filemode.Regular, Hash: blob}}) + + ctx, cancel := context.WithCancel(t.Context()) + cancel() + _, err = discoverReachable(ctx, srcRepo.Storer, []plumbing.Hash{tree}, nil) + if err == nil { + t.Fatal("expected canceled ctx to surface as error") + } + if !errors.Is(err, context.Canceled) { + t.Errorf("error should wrap context.Canceled; got %v", err) + } +} + +// TestRunChecks_FsckSkippedWhenGitMissing locks in the Skipped flag +// for the fsck check. Callers that gate on Check.OK alone cannot +// tell a real fsck pass from a skip; the Skipped flag resolves that +// ambiguity.
+func TestRunChecks_FsckSkippedWhenGitMissing(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git binary not available; this test exercises the missing-git path via PATH override") + } + // Force LookPath("git") to fail by overriding PATH. + t.Setenv("PATH", "") + dir := t.TempDir() + repo, err := git.PlainInit(dir, true, git.WithObjectFormat(formatcfg.SHA256)) + if err != nil { + t.Fatalf("init: %v", err) + } + checks := runChecks(t.Context(), dir, repo, 0, nil, false) + var fsck Check + for _, c := range checks { + if c.Name == "git fsck --full" { + fsck = c + break + } + } + if fsck.Name == "" { + t.Fatalf("fsck check missing from output") + } + if !fsck.Skipped { + t.Errorf("fsck should be Skipped when git is missing, got %+v", fsck) + } + if !fsck.OK { + t.Errorf("Skipped implies OK; got %+v", fsck) + } +} + +// TestHashPattern_CaseInsensitive locks in the (?i) on hashPattern — +// uppercase or mixed-case SHA1 references in messages must resolve +// against the (lowercase-canonical) reachable set. +func TestHashPattern_CaseInsensitive(t *testing.T) { + root := t.TempDir() + srcDir := filepath.Join(root, "src.git") + dstDir := filepath.Join(root, "dst.git") + + srcRepo, err := git.PlainInit(srcDir, true) + if err != nil { + t.Fatalf("init src: %v", err) + } + dstRepo, err := git.PlainInit(dstDir, true, git.WithObjectFormat(formatcfg.SHA256)) + if err != nil { + t.Fatalf("init dst: %v", err) + } + blob := writeBlob(t, srcRepo.Storer, []byte("x\n")) + tree := writeTree(t, srcRepo.Storer, []object.TreeEntry{{Name: "f", Mode: filemode.Regular, Hash: blob}}) + sig := object.Signature{Name: "T", Email: "t@example.com", When: time.Unix(1700000000, 0).UTC()} + parent := &object.Commit{Author: sig, Committer: sig, Message: "first\n", TreeHash: tree} + parentHash := writeObject(t, srcRepo.Storer, parent.Encode) + + // Reference the parent with an UPPERCASE full hash. 
+ upper := strings.ToUpper(parentHash.String()) + child := &object.Commit{ + Author: sig, Committer: sig, + Message: "see " + upper + " for context\n", + TreeHash: tree, + ParentHashes: []plumbing.Hash{parentHash}, + } + childHash := writeObject(t, srcRepo.Storer, child.Encode) + + reachable, err := discoverReachable(t.Context(), srcRepo.Storer, []plumbing.Hash{childHash}, nil) + if err != nil { + t.Fatalf("discover: %v", err) + } + tr, err := newTranslator(t.Context(), srcRepo.Storer, dstRepo.Storer, dstDir, true, reachable) + if err != nil { + t.Fatalf("newTranslator: %v", err) + } + newChild, err := tr.translate(childHash) + if err != nil { + t.Fatalf("translate: %v", err) + } + if tr.messageRewrites != 1 { + t.Errorf("expected 1 rewrite (case-insensitive match), got %d", tr.messageRewrites) + } + c, err := object.GetCommit(dstRepo.Storer, newChild) + if err != nil { + t.Fatalf("read translated child: %v", err) + } + if strings.Contains(c.Message, upper) { + t.Errorf("uppercase SHA1 should have been rewritten; message: %q", c.Message) + } +} + // TestRunChecks_TagOnlyConversionSkipsHEAD locks in the rule that a // tags-only conversion does not fail --check on HEAD. 
PlainInit leaves // HEAD pointing at refs/heads/master (which won't exist), and pickHEAD @@ -1170,10 +1284,13 @@ func TestRunChecks_TagOnlyConversionSkipsHEAD(t *testing.T) { t.Fatalf("HEAD check missing from runChecks output") } if !head.OK { - t.Errorf("HEAD should be OK (skipped) for tags-only conversion, got %+v", head) + t.Errorf("HEAD should be OK for tags-only conversion, got %+v", head) + } + if !head.Skipped { + t.Errorf("HEAD should be marked Skipped on tags-only conversion, got %+v", head) } - if !strings.Contains(head.Detail, "skipped") { - t.Errorf("HEAD check should record skipped reason, got %q", head.Detail) + if !strings.Contains(head.Detail, "tags-only") { + t.Errorf("HEAD detail should explain the skip reason, got %q", head.Detail) } } From 26f70ca225237a1ba014e98a397e5f881cbfb1b8 Mon Sep 17 00:00:00 2001 From: Andrea Nodari Date: Tue, 26 May 2026 11:53:57 +0200 Subject: [PATCH 19/19] Tighten sha256convert: scanner, recursion, dead field, fsck match, close, memoization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 10. fsckHasError used bufio.Scanner with the default 64 KiB buffer, so a single long line (paths in repository-scale fsck output) would silently truncate and we'd miss whatever followed it. Switch to bytes.Split on raw newlines. 11. discoverReachable was recursive, which on long linear chains (kernel-scale: ~70k commits on the deepest single-parent path) grew the goroutine stack into the tens of MiB. Iterativize with an explicit work stack — memory now scales with the in-flight frontier, not the longest chain. translate() stays recursive: its edges are dynamic (tree entries + commit parents + tag targets + message-reference edges resolved against the in-progress mapping), and rewriting it would risk silent corruption of message rewrites. Documented the rationale and the depth math. 12. 
translator.dst was set in newTranslator but never read — a landmine inviting a future contributor to call go-git's SHA1-hardcoded SetEncodedObject. Field removed; the type assertion stays (with `_`) so a memory-backed dst is still refused with a clear error. 13. fsckHasError's error-line matcher was prefix-only and case- sensitive ("error:" / "fatal:"). Broaden to any line whose first token starts with "error" or "fatal" (case-insensitive), keep the "missing "/"broken link"/"bad " object reports for older git. Closer to the previous substring match's coverage without the path-substring false positive. 14. writeMappingFile dropped Close errors via a deferred Close. On NFS or quota-bound filesystems write failures surface at close time, not flush time, so the caller would think the mapping landed when it hadn't. Explicit Close on the success path, with the deferred Close kept as a best-effort net for the failure path. 15. resolveMessageRef ran a full O(len(reachable)) scan for every short hash prefix, hit twice per token (once by extractMessageReferences, once by rewriteHashesInMessage). reachable is frozen before translation starts, so the (prefix → matchResult) mapping is stable; memoize it on the translator. Halves the documented quadratic ceiling on message-token resolution. Tests cover the long-line + case-insensitive fsck path and the resolveMessageRef cache (second call returns the cached answer even after reachable is mutated). 
Entire-Checkpoint: bc9f9981f089 --- .../internal/sha256convert/sha256convert.go | 153 ++++++++++++------ .../sha256convert/sha256convert_test.go | 72 +++++++++ 2 files changed, 178 insertions(+), 47 deletions(-) diff --git a/cmd/git-sync/internal/sha256convert/sha256convert.go b/cmd/git-sync/internal/sha256convert/sha256convert.go index da18c2dd..ffeaebc1 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert.go @@ -665,17 +665,28 @@ func runChecks(ctx context.Context, targetDir string, repo *git.Repository, refs return checks } -// fsckHasError reports whether git-fsck output contains a line that signals -// a real problem (an "error:" or "fatal:" prefix, or a "missing"/"bad" -// object report). Dangling and warning lines are ignored. +// fsckHasError reports whether git-fsck output contains a line that +// signals a real problem. We match (case-insensitively) any line whose +// first token starts with "error" or "fatal" — covering "error:", +// "fatal:", and the rare "errorInX:" variants — plus the +// "missing " / "broken link" / "bad " object reports +// emitted by older git. Dangling and warning lines are intentionally +// ignored. +// +// Splits on raw newlines rather than using bufio.Scanner: a single +// line over the scanner's 64 KiB default token limit stops the scan +// (bufio.ErrTooLong), so every later line would go unchecked. 
func fsckHasError(out []byte) bool { - scanner := bufio.NewScanner(bytes.NewReader(out)) - for scanner.Scan() { - line := strings.TrimSpace(scanner.Text()) - if strings.HasPrefix(line, "error:") || strings.HasPrefix(line, "fatal:") { + for _, raw := range bytes.Split(out, []byte("\n")) { + line := strings.TrimSpace(string(raw)) + if line == "" { + continue + } + lower := strings.ToLower(line) + if strings.HasPrefix(lower, "error") || strings.HasPrefix(lower, "fatal") { return true } - if strings.HasPrefix(line, "missing ") || strings.HasPrefix(line, "broken link") || strings.HasPrefix(line, "bad ") { + if strings.HasPrefix(lower, "missing ") || strings.HasPrefix(lower, "broken link") || strings.HasPrefix(lower, "bad ") { return true } } @@ -835,7 +846,6 @@ type translator struct { // context passed to Run() and is not stored to outlive its caller. ctx context.Context //nolint:containedctx // translate() is recursive and not directly called by Run; threading ctx through every signature is noisier than a single field used for cancellation only. src *filesystem.Storage - dst *filesystem.Storage objectsDir string // reachable holds every in-scope SHA1 with its object type, built up // front by discoverReachable, which walks tree/commit/tag dependencies @@ -862,6 +872,13 @@ type translator struct { // so they know which references to investigate via the mapping // file. ambiguousMessageRefs map[string]struct{} + // resolveCache memoizes resolveMessageRef results. reachable is + // frozen before translation starts, so the (prefix → matchResult) + // mapping is stable for the lifetime of the translator. + // extractMessageReferences and rewriteHashesInMessage hit + // resolveMessageRef for the same tokens, and the abbreviated-hash + // path costs O(len(reachable)) per call — caching halves that. + resolveCache map[string]resolveCacheEntry // Live counts updated atomically so the --progress ticker goroutine // can sample them without racing against translation. 
Snapshot into // a Counts struct at the end of the run. @@ -889,8 +906,12 @@ func newTranslator(ctx context.Context, src, dst storer.Storer, targetDir string if !ok { return nil, fmt.Errorf("source storage is not filesystem-backed (%T)", src) } - dstFS, ok := dst.(*filesystem.Storage) - if !ok { + // Type-check that the target is filesystem-backed too — we write + // loose objects by hand into targetDir/objects, bypassing the + // storer, but a memory-backed dst here would silently leave the + // caller's expected destination empty. Result is discarded: the + // translator only references targetDir directly. + if _, ok := dst.(*filesystem.Storage); !ok { return nil, fmt.Errorf("target storage is not filesystem-backed (%T)", dst) } if reachable == nil { @@ -899,12 +920,12 @@ func newTranslator(ctx context.Context, src, dst storer.Storer, targetDir string return &translator{ ctx: ctx, src: srcFS, - dst: dstFS, objectsDir: filepath.Join(targetDir, "objects"), reachable: reachable, mapping: make(map[plumbing.Hash]plumbing.Hash), inProgress: make(map[plumbing.Hash]struct{}), ambiguousMessageRefs: make(map[string]struct{}), + resolveCache: make(map[string]resolveCacheEntry), rewriteMessages: rewriteMessages, }, nil } @@ -930,21 +951,31 @@ func discoverReachable(ctx context.Context, src storer.Storer, roots []plumbing. return nil, fmt.Errorf("source storage is not filesystem-backed (%T)", src) } reachable := make(map[plumbing.Hash]plumbing.ObjectType) - var visit func(plumbing.Hash) error - visit = func(sha1 plumbing.Hash) error { + + // Iterative DFS with an explicit stack. The previous recursive + // implementation walked deep linear histories (50k–100k commits + // is not unheard of) one Go stack frame deep per parent edge, + // growing the goroutine stack by tens of MiB on kernel-scale + // runs. The explicit stack keeps memory usage proportional to + // the in-flight frontier, not the longest chain. 
+ stack := make([]plumbing.Hash, 0, len(roots)) + stack = append(stack, roots...) + for len(stack) > 0 { // Per-object cancellation check. Discovery on a kernel-scale - // repo runs for several minutes before translate() takes over, - // so without this Ctrl-C would not interrupt the run until - // the discovery phase finished on its own. + // repo runs for several minutes before translate() takes + // over, so without this Ctrl-C would not interrupt the run + // until the discovery phase finished on its own. if err := ctx.Err(); err != nil { - return fmt.Errorf("discover %s: %w", sha1, err) + return nil, fmt.Errorf("discover: %w", err) } + sha1 := stack[len(stack)-1] + stack = stack[:len(stack)-1] if _, seen := reachable[sha1]; seen { - return nil + continue } obj, err := srcFS.EncodedObject(plumbing.AnyObject, sha1) if err != nil { - return fmt.Errorf("discover %s: %w", sha1, err) + return nil, fmt.Errorf("discover %s: %w", sha1, err) } reachable[sha1] = obj.Type() if progress != nil { @@ -952,11 +983,11 @@ func discoverReachable(ctx context.Context, src storer.Storer, roots []plumbing. } switch obj.Type() { //nolint:exhaustive // OFSDelta/REFDelta/AnyObject/InvalidObject cannot reach a resolved storage. case plumbing.BlobObject: - return nil + // No outgoing edges. case plumbing.TreeObject: tree := &object.Tree{} if err := tree.Decode(obj); err != nil { - return fmt.Errorf("discover decode tree %s: %w", sha1, err) + return nil, fmt.Errorf("discover decode tree %s: %w", sha1, err) } for _, e := range tree.Entries { if e.Mode == filemode.Submodule { @@ -971,50 +1002,49 @@ func discoverReachable(ctx context.Context, src storer.Storer, roots []plumbing. // safe answer is to refuse and let the caller scope // the offending ref out (or convert the submodule // upstream first and re-point .gitmodules). 
- return fmt.Errorf( + return nil, fmt.Errorf( "tree %s contains a submodule gitlink %q at %s; convert-sha256 cannot rewrite submodule pointers "+ "because the linked-to repository would still advertise SHA1 hashes — "+ "exclude refs that reference it or convert the submodule repository first", sha1, e.Name, e.Hash) } - if err := visit(e.Hash); err != nil { - return err - } + stack = append(stack, e.Hash) } case plumbing.CommitObject: c := &object.Commit{} if err := c.Decode(obj); err != nil { - return fmt.Errorf("discover decode commit %s: %w", sha1, err) - } - if err := visit(c.TreeHash); err != nil { - return err - } - for _, p := range c.ParentHashes { - if err := visit(p); err != nil { - return err - } + return nil, fmt.Errorf("discover decode commit %s: %w", sha1, err) } + stack = append(stack, c.TreeHash) + stack = append(stack, c.ParentHashes...) case plumbing.TagObject: tag := &object.Tag{} if err := tag.Decode(obj); err != nil { - return fmt.Errorf("discover decode tag %s: %w", sha1, err) - } - if err := visit(tag.Target); err != nil { - return err + return nil, fmt.Errorf("discover decode tag %s: %w", sha1, err) } + stack = append(stack, tag.Target) default: - return fmt.Errorf("unexpected object type %v for %s during discovery", obj.Type(), sha1) - } - return nil - } - for _, r := range roots { - if err := visit(r); err != nil { - return nil, err + return nil, fmt.Errorf("unexpected object type %v for %s during discovery", obj.Type(), sha1) } } return reachable, nil } +// translate is intentionally recursive. Unlike discoverReachable's +// purely-structural DFS, translate's edges are dynamic: tree entries, +// commit parents, tag targets, *and* message-reference edges resolved +// against the partial mapping built so far. 
Converting that to an +// explicit work stack would require an "after-children" callback per +// object type and is easy to get subtly wrong (re-encoding before all +// referenced hashes are placed silently corrupts the message rewrite). +// +// Recursion depth is bounded by the longest dependency chain in the +// source DAG — in practice the longest commit-parent chain, since +// trees and tags add at most one frame each. Linux kernel history is +// O(70k) commits along its deepest single-parent path; Go's growable +// stacks comfortably absorb that (~tens of MiB). Cycle detection above +// turns any unexpected graph shape into a clear error rather than a +// stack-overflow crash. func (t *translator) translate(sha1 plumbing.Hash) (plumbing.Hash, error) { // Cheap per-object cancellation check so Ctrl-C during a long // conversion (kernel-scale: ~10M objects) returns promptly rather @@ -1402,11 +1432,27 @@ func (t *translator) rewriteHashesInMessage(msg string) (string, int) { // matchNone otherwise (no match, or the match is a blob/tree — those // are filtered so incidental hex collisions on content hashes aren't // rewritten). +// resolveCacheEntry holds a memoized (Hash, matchResult) pair from +// resolveMessageRef. Stored in t.resolveCache keyed by lowercased prefix. +type resolveCacheEntry struct { + hash plumbing.Hash + result matchResult +} + func (t *translator) resolveMessageRef(prefix string) (plumbing.Hash, matchResult) { // Canonicalize to lowercase: hashPattern is case-insensitive so // the caller can match `ABCD1234` in a message, but reachable // keys and plumbing.Hash.String() are always lowercase hex. 
prefix = strings.ToLower(prefix) + if cached, ok := t.resolveCache[prefix]; ok { + return cached.hash, cached.result + } + hash, result := t.resolveMessageRefUncached(prefix) + t.resolveCache[prefix] = resolveCacheEntry{hash: hash, result: result} + return hash, result +} + +func (t *translator) resolveMessageRefUncached(prefix string) (plumbing.Hash, matchResult) { if len(prefix) == 40 { sha1, ok := plumbing.FromHex(prefix) if !ok { @@ -1631,7 +1677,16 @@ func (t *translator) writeMappingFile(path string) error { if err != nil { return fmt.Errorf("create %s: %w", path, err) } - defer f.Close() + // Close is best-effort on the failure path (the underlying issue + // will already have surfaced via Flush). On the success path the + // explicit Close below propagates its error — networked / quota'd + // filesystems can defer write failures until close. + closed := false + defer func() { + if !closed { + _ = f.Close() + } + }() w := bufio.NewWriter(f) if _, err := fmt.Fprintln(w, "# sha1\tsha256"); err != nil { return fmt.Errorf("write mapping header: %w", err) @@ -1644,6 +1699,10 @@ func (t *translator) writeMappingFile(path string) error { if err := w.Flush(); err != nil { return fmt.Errorf("flush mapping file: %w", err) } + if err := f.Close(); err != nil { + return fmt.Errorf("close mapping file: %w", err) + } + closed = true return nil } diff --git a/cmd/git-sync/internal/sha256convert/sha256convert_test.go b/cmd/git-sync/internal/sha256convert/sha256convert_test.go index 62cb79fb..d9a0e8fc 100644 --- a/cmd/git-sync/internal/sha256convert/sha256convert_test.go +++ b/cmd/git-sync/internal/sha256convert/sha256convert_test.go @@ -1260,6 +1260,78 @@ func TestHashPattern_CaseInsensitive(t *testing.T) { } } +// TestFsckHasError_HandlesLongLinesAndCase covers two fragility +// fixes: lines longer than bufio.Scanner's 64 KiB default must not be +// silently truncated (we use bytes.Split now), and the "error" / +// "fatal" prefix match must be case-insensitive so e.g. 
older or +// custom git builds emitting "ERROR:" still trip the check. +func TestFsckHasError_HandlesLongLinesAndCase(t *testing.T) { + t.Run("long line still scanned", func(t *testing.T) { + // 100 KiB of dangling-blob filler followed by an error line. + out := append(bytes.Repeat([]byte("a"), 100*1024), []byte("\nerror: bad ref\n")...) + if !fsckHasError(out) { + t.Errorf("fsckHasError should detect error line after a long preceding line") + } + }) + t.Run("uppercase ERROR matches", func(t *testing.T) { + if !fsckHasError([]byte("ERROR: corruption\n")) { + t.Errorf("fsckHasError should match uppercase ERROR") + } + }) + t.Run("fatal without colon matches", func(t *testing.T) { + if !fsckHasError([]byte("Fatal failure in pack\n")) { + t.Errorf("fsckHasError should match Fatal prefix even without colon") + } + }) + t.Run("dangling warnings are not errors", func(t *testing.T) { + if fsckHasError([]byte("dangling commit abc123\n")) { + t.Errorf("dangling lines should not trip fsckHasError") + } + }) +} + +// TestResolveMessageRef_Memoizes confirms a second call for the same +// prefix doesn't re-scan reachable. We do not have a counter on the +// scan, so we test the cache by mutating reachable between calls and +// verifying the second call returns the original result. (In real +// usage, reachable is frozen — this is just a behavioral observation +// to lock in cache effectiveness.) 
+func TestResolveMessageRef_Memoizes(t *testing.T) { + root := t.TempDir() + srcDir := filepath.Join(root, "src.git") + dstDir := filepath.Join(root, "dst.git") + srcRepo, err := git.PlainInit(srcDir, true) + if err != nil { + t.Fatalf("init src: %v", err) + } + dstRepo, err := git.PlainInit(dstDir, true, git.WithObjectFormat(formatcfg.SHA256)) + if err != nil { + t.Fatalf("init dst: %v", err) + } + reachable := map[plumbing.Hash]plumbing.ObjectType{ + plumbing.NewHash("abc1234567890abcdef1234567890abcdef12345"): plumbing.CommitObject, + } + tr, err := newTranslator(t.Context(), srcRepo.Storer, dstRepo.Storer, dstDir, true, reachable) + if err != nil { + t.Fatalf("newTranslator: %v", err) + } + + prefix := "abc12345" + h1, r1 := tr.resolveMessageRef(prefix) + // Mutate reachable; if the cache works, the next call must return + // the same answer as the first. + for k := range tr.reachable { + delete(tr.reachable, k) + } + h2, r2 := tr.resolveMessageRef(prefix) + if h1 != h2 || r1 != r2 { + t.Errorf("resolveMessageRef should return cached value; got first (%s, %v) vs second (%s, %v)", h1, r1, h2, r2) + } + if _, cached := tr.resolveCache[strings.ToLower(prefix)]; !cached { + t.Errorf("resolveCache should contain entry for %q", prefix) + } +} + // TestRunChecks_TagOnlyConversionSkipsHEAD locks in the rule that a // tags-only conversion does not fail --check on HEAD. PlainInit leaves // HEAD pointing at refs/heads/master (which won't exist), and pickHEAD