diff --git a/.entire/.gitignore b/.entire/.gitignore index 2cffdefa..e66987d2 100644 --- a/.entire/.gitignore +++ b/.entire/.gitignore @@ -2,3 +2,4 @@ tmp/ settings.local.json metadata/ logs/ +redactors/local/ diff --git a/.golangci.yaml b/.golangci.yaml index f7ade94e..5562d36b 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -98,6 +98,8 @@ linters: - github.com/go-git/go-git/v6/plumbing/storer.EncodedObjectIter - github.com/go-git/go-billy/v6.Filesystem - entire.io/entire/git-sync/internal/auth.Method + - entire.io/entire/git-sync/internal/gitproto.Conn + - entire.io/entire/git-sync/internal/gitproto.AuthMethod nolintlint: require-explanation: true require-specific: true diff --git a/README.md b/README.md index b3a0255c..e1a2484e 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,8 @@ The main commands are: `sync` automatically bootstraps an empty target, so the same command covers initial seeding and ongoing sync. To preview what would happen without pushing, run `git-sync plan` — it takes the same flags as `sync`, and `--mode replicate` previews a `replicate` run. +For one-off SHA1 → SHA256 repo conversion, `git-sync convert-sha256` fetches from an HTTP source and writes a new SHA256 bare repo on disk, with optional commit-message hash rewrites, an origin-notes ref, and a sidecar mapping file. See [docs/convert-sha256.md](docs/convert-sha256.md). + For command examples, JSON output, auth, protocol flags, and advanced command notes, see [docs/usage.md](docs/usage.md). 
## Library API @@ -93,6 +95,7 @@ Extended and environment-specific test instructions are in [docs/testing.md](doc - [docs/usage.md](docs/usage.md) — CLI commands, examples, sync behavior, JSON output, auth, protocol notes - [docs/architecture.md](docs/architecture.md) — product rationale, package layout, operation modes vs transfer modes, memory model - [docs/protocol.md](docs/protocol.md) — smart HTTP, pkt-line, capability negotiation, sideband, relay framing +- [docs/convert-sha256.md](docs/convert-sha256.md) — one-off SHA1 → SHA256 repo conversion, mapping outputs, sharp edges - [docs/testing.md](docs/testing.md) — test suites and integration coverage ## FAQ diff --git a/cmd/git-sync/convert_sha256.go b/cmd/git-sync/convert_sha256.go new file mode 100644 index 00000000..17d71c1c --- /dev/null +++ b/cmd/git-sync/convert_sha256.go @@ -0,0 +1,127 @@ +package main + +import ( + "errors" + "fmt" + + gitsync "entire.io/entire/git-sync" + "entire.io/entire/git-sync/cmd/git-sync/internal/sha256convert" + "github.com/spf13/cobra" +) + +func newConvertSHA256Cmd() *cobra.Command { + var ( + req = sha256convert.Request{} + jsonOutput bool + protocolVal = newProtocolFlag() + ) + + cmd := &cobra.Command{ + Use: "convert-sha256 [flags] ", + Short: "One-off SHA1 → SHA256 conversion of a remote repo into a local bare repo", + Long: `convert-sha256 fetches a pack from a SHA1 HTTP source and writes a new +SHA256 bare repository on disk at . Every reachable object is +re-hashed under SHA256 and tree/commit/tag references are rewritten. + +All branches and tags on the source are always converted — partial scope +risks stranding cross-branch references in commit messages. Pass +--all-refs to also include refs/notes/*, refs/pull/*, and other custom +namespaces; pass --exclude-ref-prefix to subtract specific namespaces +from --all-refs. Exclude prefixes that would drop any branch or tag +(e.g. 
refs/heads/feature/, refs/tags/, refs/) are rejected at run time +to preserve the always-convert invariant. + +The conversion is destructive in two ways the caller should be aware of: +GPG signatures on commits and tags are dropped (they sign over the +original SHA1 content and would be invalid post-rewrite), and any +submodule gitlink fails the run — its .gitmodules upstream still +advertises SHA1 hashes, so a rewritten SHA256 gitlink would point at a +hash the upstream cannot resolve and break ` + "`git submodule update`" + ` in +every clone. Exclude refs that reference submodules, or convert the +submodule repository first and re-point .gitmodules.`, + Args: cobra.MaximumNArgs(2), + SilenceErrors: true, + SilenceUsage: true, + RunE: func(cmd *cobra.Command, args []string) error { + req.ProtocolMode = gitsync.ProtocolMode(protocolVal) + if err := resolveConvertSHA256Args(&req, args); err != nil { + return err + } + + result, err := sha256convert.Run(cmd.Context(), req) + // Print whatever state Run produced even on error: signed + // tags landed before signBranchTips failed, --check + // findings, and the --keep-source-objects temp dir are + // all things the user needs to see to clean up or debug. + // Run zero-values fields it never touched, so this is + // safe to call on a half-populated result. 
+ if result.SourceURL != "" || result.TargetDir != "" { + printOutput(jsonOutput, result) + } + if err != nil { + return fmt.Errorf("convert-sha256: %w", err) + } + return nil + }, + } + + cmd.Flags().StringVar(&req.SourceURL, "source-url", "", "source repository URL") + cmd.Flags().BoolVar(&req.SourceFollowInfoRefsRedirect, "source-follow-info-refs-redirect", + envBool("GITSYNC_SOURCE_FOLLOW_INFO_REFS_REDIRECT"), + "send follow-up source RPCs to the final /info/refs redirect host") + cmd.Flags().StringVar(&req.SourceAuth.Token, "source-token", + envOr("GITSYNC_SOURCE_TOKEN", ""), "source token/password") + cmd.Flags().StringVar(&req.SourceAuth.Username, "source-username", + envOr("GITSYNC_SOURCE_USERNAME", "git"), "source basic auth username") + cmd.Flags().StringVar(&req.SourceAuth.BearerToken, "source-bearer-token", + envOr("GITSYNC_SOURCE_BEARER_TOKEN", ""), "source bearer token") + cmd.Flags().BoolVar(&req.SourceAuth.SkipTLSVerify, "source-insecure-skip-tls-verify", + envBool("GITSYNC_SOURCE_INSECURE_SKIP_TLS_VERIFY"), + "skip TLS certificate verification for the source") + cmd.Flags().StringVar(&req.TargetDir, "target-dir", "", "directory to initialize as a SHA256 bare repository") + + allRefsFlag(cmd, allRefsUsageScopeOnly, &req.AllRefs) + excludeRefPrefixFlag(cmd, &req.ExcludeRefPrefixes) + addProtocolFlag(cmd, &protocolVal) + cmd.Flags().BoolVarP(&req.Verbose, "verbose", "v", false, "verbose logging") + cmd.Flags().BoolVar(&req.Progress, "progress", false, + "show live per-phase object counts on stderr (TTY only)") + cmd.Flags().BoolVar(&req.Check, "check", false, + "verify the output after conversion (config, HEAD, refs, git fsck --full)") + cmd.Flags().BoolVar(&req.Sign, "sign", false, + "after conversion, sign each branch tip as refs/tags/converted/ via `git tag -s`") + cmd.Flags().StringVar(&req.SignKey, "sign-key", "", + "signing key id to pass to `git tag -s -u`; default uses the repo's user.signingkey") + cmd.Flags().BoolVar(&req.KeepSourceObjects, 
"keep-source-objects", false, + "keep the temporary SHA1 store on disk after conversion (for debugging)") + cmd.Flags().StringVar(&req.MappingFile, "write-mapping", "", + "write the full SHA1 → SHA256 mapping as a TSV to this path; useful for rewriting external references") + cmd.Flags().BoolVar(&req.SkipMessageRewrite, "no-rewrite-messages", false, + "do not rewrite SHA1 hash references found in commit and tag messages") + cmd.Flags().BoolVar(&req.SkipOriginNotes, "no-origin-notes", false, + "do not write a refs/notes/sha1-origin ref recording each commit's original SHA1") + cmd.Flags().BoolVar(&jsonOutput, "json", false, "print JSON output") + + return cmd +} + +// resolveConvertSHA256Args consumes positional args left-to-right, +// skipping fields the user already supplied via flags. Without that +// rule, `--source-url ` would look like one positional and +// land in SourceURL — leaving TargetDir empty even though the user +// gave both. The two-flags-no-positionals and zero-flags-two-positionals +// shapes also work, as do the symmetric --target-dir + positional URL. 
+func resolveConvertSHA256Args(req *sha256convert.Request, args []string) error { + positional := args + if req.SourceURL == "" && len(positional) > 0 { + req.SourceURL = positional[0] + positional = positional[1:] + } + if req.TargetDir == "" && len(positional) > 0 { + req.TargetDir = positional[0] + } + if req.SourceURL == "" || req.TargetDir == "" { + return errors.New("convert-sha256 requires a source URL and a target directory") + } + return nil +} diff --git a/cmd/git-sync/convert_sha256_test.go b/cmd/git-sync/convert_sha256_test.go new file mode 100644 index 00000000..ceb6fb3a --- /dev/null +++ b/cmd/git-sync/convert_sha256_test.go @@ -0,0 +1,84 @@ +package main + +import ( + "strings" + "testing" + + "entire.io/entire/git-sync/cmd/git-sync/internal/sha256convert" +) + +func TestResolveConvertSHA256Args(t *testing.T) { + const url = "http://example.invalid/repo.git" + const dir = "/tmp/out" + + tests := []struct { + name string + req sha256convert.Request + args []string + wantURL string + wantDir string + wantErr string + }{ + { + name: "both positionals", + args: []string{url, dir}, + wantURL: url, + wantDir: dir, + }, + { + name: "url flag plus positional dir — the reported bug", + req: sha256convert.Request{SourceURL: url}, + args: []string{dir}, + wantURL: url, + wantDir: dir, + }, + { + name: "dir flag plus positional url", + req: sha256convert.Request{TargetDir: dir}, + args: []string{url}, + wantURL: url, + wantDir: dir, + }, + { + name: "both flags, no positionals", + req: sha256convert.Request{SourceURL: url, TargetDir: dir}, + args: nil, + wantURL: url, + wantDir: dir, + }, + { + name: "missing dir", + req: sha256convert.Request{SourceURL: url}, + args: nil, + wantErr: "requires a source URL and a target directory", + }, + { + name: "missing both", + args: nil, + wantErr: "requires a source URL and a target directory", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + req := tt.req + err := 
resolveConvertSHA256Args(&req, tt.args) + switch { + case tt.wantErr == "" && err != nil: + t.Fatalf("unexpected error: %v", err) + case tt.wantErr != "" && err == nil: + t.Fatalf("expected error containing %q, got nil", tt.wantErr) + case tt.wantErr != "" && !strings.Contains(err.Error(), tt.wantErr): + t.Fatalf("error %q does not contain %q", err.Error(), tt.wantErr) + } + if tt.wantErr != "" { + return + } + if req.SourceURL != tt.wantURL { + t.Errorf("SourceURL: got %q, want %q", req.SourceURL, tt.wantURL) + } + if req.TargetDir != tt.wantDir { + t.Errorf("TargetDir: got %q, want %q", req.TargetDir, tt.wantDir) + } + }) + } +} diff --git a/cmd/git-sync/internal/sha256convert/sha256convert.go b/cmd/git-sync/internal/sha256convert/sha256convert.go new file mode 100644 index 00000000..ffeaebc1 --- /dev/null +++ b/cmd/git-sync/internal/sha256convert/sha256convert.go @@ -0,0 +1,1768 @@ +// Package sha256convert implements a one-off SHA1 → SHA256 conversion for a +// single repository. It fetches a pack from a remote SHA1 HTTP endpoint into +// a temporary on-disk SHA1 bare repo, then walks every reachable object and +// re-emits it under SHA256 into a new bare repo at the user-supplied path. +// +// The tool is intentionally scoped: the hash mapping is persisted only as the origin-notes ref and the optional mapping TSV, GPG +// signatures on commits and tags are dropped (they sign over the original +// SHA1 byte stream and would be invalid post-rewrite), and any submodule +// gitlink fails the run so the caller chooses which refs to exclude. The +// linked-to repository's URL still points at an upstream SHA1 store, +// which has no way to resolve a SHA256-rewritten gitlink, so rewriting +// would produce a tree that fsck-passes but breaks +// `git submodule update`. 
// Request describes a single SHA1 → SHA256 conversion.
//
// Scope is intentionally fixed: every branch and every annotated/lightweight
// tag on the source is always converted. Partial scope risks stranding
// cross-branch references in commit messages, which defeats the point of a
// one-off cutover. AllRefs additionally pulls in refs/notes, refs/pull, and
// other custom namespaces; ExcludeRefPrefixes subtracts from that.
type Request struct {
	// SourceURL is the repository to convert. Run rejects an empty value.
	SourceURL string
	// SourceAuth and SourceFollowInfoRefsRedirect configure the source
	// connection. NOTE(review): they are consumed by openSource, which is
	// not visible here — confirm exact semantics there.
	SourceAuth                   gitsync.EndpointAuth
	SourceFollowInfoRefsRedirect bool
	// TargetDir is where the SHA256 bare repo is initialized. Run rejects
	// an empty value and checks the directory with ensureEmptyTarget
	// before doing any other work.
	TargetDir string

	// AllRefs and ExcludeRefPrefixes are copied into the planner config.
	// Run refuses exclude prefixes that would drop a branch or a tag.
	AllRefs            bool
	ExcludeRefPrefixes []string

	// ProtocolMode selects the wire protocol. NOTE(review): consumed by
	// openSource — confirm there.
	ProtocolMode gitsync.ProtocolMode
	// Verbose is forwarded to the ref service's Verbose field.
	Verbose bool
	// Progress enables live per-phase counters; they are only shown when
	// the status writer is a TTY.
	Progress bool
	// Check runs the post-conversion verification steps. A failing check
	// makes Run return an error but leaves the target on disk.
	Check bool

	// Sign, when true, runs `git tag -s converted/<branch> <branch-ref>`
	// for every converted branch after the conversion completes,
	// attesting the entire reachable history of each branch via its tip's
	// parent chain. SignKey is passed to git as `-u <key>`; leave empty to
	// use the repo's default signing identity.
	Sign    bool
	SignKey string

	// KeepSourceObjects keeps the temporary SHA1 store on disk and reports
	// its path in Result.TempDir. It also suppresses removal of a
	// half-written target directory when the run fails.
	KeepSourceObjects bool

	// MappingFile, when non-empty, is a path to which a TSV of every
	// translated object's SHA1 → SHA256 mapping is written. Useful for
	// rewriting external systems that reference old commit hashes.
	MappingFile string

	// SkipMessageRewrite disables the inline rewrite of SHA1 hashes found
	// in commit and tag messages. Off by default (rewriting is on).
	SkipMessageRewrite bool

	// SkipOriginNotes disables the refs/notes/sha1-origin output that
	// records each translated commit's original SHA1. Off by default
	// (notes are written).
	SkipOriginNotes bool

	// Out receives human-readable status lines. Nil means os.Stderr.
	Out io.Writer
}

// Counts tallies converted objects by kind.
type Counts struct {
	Blobs   int `json:"blobs"`
	Trees   int `json:"trees"`
	Commits int `json:"commits"`
	Tags    int `json:"tags"`
}

// Result is the conversion summary, suitable for JSON output.
//
// Run fills it incrementally and returns it alongside any error, so a failed
// run may carry a partially populated Result; fields Run never reached keep
// their zero value.
type Result struct {
	// SourceURL and TargetDir echo the request.
	SourceURL string `json:"sourceUrl"`
	TargetDir string `json:"targetDir"`
	// Protocol is the protocol reported by the source ref service.
	Protocol string `json:"protocol"`
	// RefsConverted is the number of refs written to the target.
	RefsConverted int    `json:"refsConverted"`
	Counts        Counts `json:"counts"`
	// SignaturesStripped and MessageRewrites are tallies taken from the
	// translator after all refs have been translated.
	SignaturesStripped int `json:"signaturesStripped"`
	MessageRewrites    int `json:"messageRewrites"`
	// AmbiguousMessageRefs is the sorted list of ambiguous hex prefixes
	// the translator recorded while rewriting messages.
	AmbiguousMessageRefs []string `json:"ambiguousMessageRefs,omitempty"`
	// OriginNotesRef is set only when the origin-notes ref was written.
	OriginNotesRef string `json:"originNotesRef,omitempty"`
	// MappingFile is set only when the mapping TSV was written.
	MappingFile string `json:"mappingFile,omitempty"`
	// SignedTags lists the attestation tags created by --sign, including
	// the ones created before a mid-run signing failure.
	SignedTags []string `json:"signedTags,omitempty"`
	// Checks holds the --check results, one entry per verification step.
	Checks []Check `json:"checks,omitempty"`
	// TempDir is the retained SHA1 store; set only with KeepSourceObjects.
	TempDir string `json:"tempDir,omitempty"`
}
Skipped implies OK so callers that only branch on OK +// still treat it as non-fatal; callers that need a stricter signal +// (CI gating, audit logs) should branch on Skipped first. +type Check struct { + Name string `json:"name"` + OK bool `json:"ok"` + Skipped bool `json:"skipped,omitempty"` + Detail string `json:"detail,omitempty"` +} + +// previewMax caps how many items from a potentially-long list (ambiguous +// prefixes, signed tags) are inlined into a Lines() summary before +// switching to a "(N more)" suffix. +const previewMax = 5 + +// Lines satisfies the human-readable output contract used by other git-sync subcommands. +func (r Result) Lines() []string { + lines := []string{ + "sha256 bare repo: " + r.TargetDir, + fmt.Sprintf("source: %s (%s)", r.SourceURL, r.Protocol), + fmt.Sprintf("converted: %d blobs, %d trees, %d commits, %d tags", + r.Counts.Blobs, r.Counts.Trees, r.Counts.Commits, r.Counts.Tags), + fmt.Sprintf("refs written: %d", r.RefsConverted), + } + if r.SignaturesStripped > 0 { + // Mixes commit/tag signatures (GPG/SSH/X.509) and embedded + // mergetag headers — each counts as one signed artifact whose + // signature became invalid post-rewrite. + lines = append(lines, fmt.Sprintf("warning: stripped %d signature(s) / mergetag header(s); they no longer match the rewritten object content", r.SignaturesStripped)) + } + if r.MessageRewrites > 0 { + lines = append(lines, fmt.Sprintf("rewrote %d SHA1 hash reference(s) in commit/tag messages", r.MessageRewrites)) + } + if n := len(r.AmbiguousMessageRefs); n > 0 { + preview := r.AmbiguousMessageRefs + extra := 0 + if len(preview) > previewMax { + extra = len(preview) - previewMax + preview = preview[:previewMax] + } + line := fmt.Sprintf("warning: %d ambiguous SHA1 hex prefix(es) in messages left unrewritten (look up via the mapping file): %s", + n, strings.Join(preview, ", ")) + if extra > 0 { + line += fmt.Sprintf(", ... 
(%d more)", extra) + } + lines = append(lines, line) + } + if r.OriginNotesRef != "" { + lines = append(lines, fmt.Sprintf("origin notes ref: %s (use `git notes --ref=%s show ` to recover old SHA1)", + r.OriginNotesRef, strings.TrimPrefix(r.OriginNotesRef, "refs/notes/"))) + } + if r.MappingFile != "" { + lines = append(lines, "mapping written to: "+r.MappingFile) + } + if n := len(r.SignedTags); n > 0 { + preview := r.SignedTags + extra := 0 + if len(preview) > previewMax { + extra = len(preview) - previewMax + preview = preview[:previewMax] + } + line := fmt.Sprintf("signed %d branch attestation tag(s): %s", + n, strings.Join(preview, ", ")) + if extra > 0 { + line += fmt.Sprintf(", ... (%d more; full list in --json)", extra) + } + lines = append(lines, line) + } + if r.TempDir != "" { + lines = append(lines, "kept source objects: "+r.TempDir) + } + return lines +} + +// Run performs the conversion described by req. +// +//nolint:maintidx // Run is a linear orchestrator over distinct phases (fetch → discover → init → translate → refs → notes → mapping → sign → check); each phase is short and isolated. Splitting into helpers would obscure the pipeline rather than clarify it. +func Run(ctx context.Context, req Request) (Result, error) { + if req.SourceURL == "" { + return Result{}, errors.New("convert-sha256 requires --source-url") + } + if req.TargetDir == "" { + return Result{}, errors.New("convert-sha256 requires a target directory") + } + // Enforce the documented invariant: every branch and every tag is + // always converted. Otherwise the partial set could strand + // cross-branch hash references in commit and tag messages, which + // the message-rewrite pass is built to keep intact. 
+ if bad := protectedExcludePrefixes(req.ExcludeRefPrefixes); len(bad) > 0 { + return Result{}, fmt.Errorf("convert-sha256 refuses --exclude-ref-prefix values that would drop branches or tags: %s (only namespaces outside refs/heads/ and refs/tags/ may be excluded)", strings.Join(bad, ", ")) + } + out := req.Out + if out == nil { + out = os.Stderr + } + + if err := ensureEmptyTarget(req.TargetDir); err != nil { + return Result{}, err + } + + tempDir, err := os.MkdirTemp("", "git-sync-sha256-src-") + if err != nil { + return Result{}, fmt.Errorf("create temp dir: %w", err) + } + cleanupTemp := true + defer func() { + if cleanupTemp { + _ = os.RemoveAll(tempDir) + } + }() + + // cleanupTarget fires when set, wiping the SHA256 bare repo we + // initialize below. ensureEmptyTarget already verified the dir was + // empty going in, so a defensive RemoveAll on failure only ever + // removes content this run created. Without it, any error after + // PlainInit leaves config/objects/refs/HEAD behind, and the next + // retry hits ensureEmptyTarget's "not empty" refusal with no + // indication of how to recover. Suppressed by --keep-source-objects + // so users can inspect partial state. + cleanupTarget := false + defer func() { + if cleanupTarget && !req.KeepSourceObjects { + _ = os.RemoveAll(req.TargetDir) + } + }() + + // Build the result struct early so error paths can surface + // what little ran successfully. In particular, --keep-source-objects + // exists to debug failures, so cleanupTemp must flip and TempDir + // must be in the result *before* any later error return; otherwise + // the temp store gets wiped on exactly the runs that need it. 
+ res := Result{SourceURL: req.SourceURL, TargetDir: req.TargetDir} + if req.KeepSourceObjects { + cleanupTemp = false + res.TempDir = tempDir + } + + srcRepo, err := git.PlainInit(tempDir, true) + if err != nil { + return res, fmt.Errorf("init temporary SHA1 store: %w", err) + } + + // Source connection + ref discovery ----------------------------------- + // Scope is fixed: always include every branch and every tag. AllRefs + // extends to refs/notes/*, refs/pull/*, and other namespaces; + // ExcludeRefPrefixes can subtract from that under AllRefs. + planCfg := planner.PlanConfig{ + IncludeTags: true, + AllRefs: req.AllRefs, + ExcludeRefPrefixes: append([]string(nil), req.ExcludeRefPrefixes...), + } + conn, refService, sourceRefList, err := openSource(ctx, req, planCfg) + if err != nil { + return res, err + } + defer conn.Close() + refService.Verbose = req.Verbose + + sourceRefs := gitproto.RefHashMap(sourceRefList) + desired, _, err := planner.BuildDesiredRefs(sourceRefs, planCfg) + if err != nil { + return res, fmt.Errorf("build desired refs: %w", err) + } + if len(desired) == 0 { + return res, errors.New("no source refs matched the requested scope") + } + + // Refuse before any further I/O if the source carries refs that + // would collide with our side outputs. writeRefs runs before + // writeOriginNotes / signBranchTips, so without this check the + // later side-output write would silently clobber the source ref. 
+ if err := checkSideOutputCollision(desired, req.SkipOriginNotes, req.Sign); err != nil { + return res, err + } + + // Fetch into temp SHA1 store ------------------------------------------ + fmt.Fprintf(out, "fetching %d ref(s) from %s ...\n", len(desired), req.SourceURL) + gpDesired := convert.DesiredRefs(desired) + if err := refService.FetchToStore(ctx, srcRepo.Storer, conn, gpDesired, nil); err != nil && + !errors.Is(err, git.NoErrAlreadyUpToDate) { + return res, fmt.Errorf("fetch source pack: %w", err) + } + + // Discover reachable set before initing the target. Submodule + // errors surface here, so a failed run leaves the target dir + // untouched (it was only ensured-empty so far) rather than half + // converted. + rootSHA1s := make([]plumbing.Hash, 0, len(desired)) + for _, d := range desired { + rootSHA1s = append(rootSHA1s, d.SourceHash) + } + fmt.Fprintln(out, "discovering reachable objects ...") + progressActive := req.Progress && isTTY(out) + var discCounter *atomic.Int64 + var stopDisc func() + if progressActive { + c := new(atomic.Int64) + discCounter = c + stopDisc = startProgressTick(out, func() string { + return fmt.Sprintf(" discovered %d objects", c.Load()) + }) + } + reachable, err := discoverReachable(ctx, srcRepo.Storer, rootSHA1s, discCounter) + if stopDisc != nil { + stopDisc() + } + if err != nil { + return res, fmt.Errorf("discover reachable: %w", err) + } + + // Discovery succeeded — safe to materialize the SHA256 target. + dstRepo, err := git.PlainInit(req.TargetDir, true, git.WithObjectFormat(formatcfg.SHA256)) + if err != nil { + return res, fmt.Errorf("init SHA256 target at %s: %w", req.TargetDir, err) + } + // Anything that fails past here would leave the target dir + // non-empty (config + HEAD + maybe objects/refs), blocking a + // retry on ensureEmptyTarget; arm the deferred cleanup now. 
+ cleanupTarget = true + + tr, err := newTranslator(ctx, srcRepo.Storer, dstRepo.Storer, req.TargetDir, !req.SkipMessageRewrite, reachable) + if err != nil { + return res, err + } + fmt.Fprintln(out, "translating objects to sha256 ...") + var stopTr func() + if progressActive { + stopTr = startProgressTick(out, func() string { + return fmt.Sprintf(" translated %d blobs, %d trees, %d commits, %d tags", + tr.blobs.Load(), tr.trees.Load(), tr.commitsCount.Load(), tr.tags.Load()) + }) + } + for _, d := range desired { + if _, err := tr.translate(d.SourceHash); err != nil { + if stopTr != nil { + stopTr() + } + return res, fmt.Errorf("translate %s: %w", d.SourceRef, err) + } + } + if stopTr != nil { + stopTr() + } + + // Write refs --------------------------------------------------------- + refsWritten, err := writeRefs(dstRepo.Storer, desired, tr.mapping) + if err != nil { + return res, fmt.Errorf("write target refs: %w", err) + } + + // Point HEAD at a ref that actually exists in the target. PlainInit + // defaults HEAD to refs/heads/master, which often doesn't exist + // (e.g. repos using "main"), and would then fail the --check HEAD + // step. See pickHEAD for the selection order. 
+ if headRef := pickHEAD(refService.HeadTarget, desired); headRef != "" { + if err := dstRepo.Storer.SetReference(plumbing.NewSymbolicReference(plumbing.HEAD, headRef)); err != nil { + return res, fmt.Errorf("set HEAD: %w", err) + } + } + + res.Protocol = refService.Protocol + res.RefsConverted = refsWritten + res.Counts = tr.snapshotCounts() + res.SignaturesStripped = tr.signaturesStripped + res.MessageRewrites = tr.messageRewrites + if len(tr.ambiguousMessageRefs) > 0 { + amb := make([]string, 0, len(tr.ambiguousMessageRefs)) + for s := range tr.ambiguousMessageRefs { + amb = append(amb, s) + } + sort.Strings(amb) + res.AmbiguousMessageRefs = amb + } + + if !req.SkipOriginNotes && len(tr.commits) > 0 { + notesRef, err := tr.writeOriginNotes(originNotesRef) + if err != nil { + return res, fmt.Errorf("write origin notes: %w", err) + } + if err := dstRepo.Storer.SetReference(plumbing.NewHashReference(plumbing.ReferenceName(notesRef), tr.lastNotesCommit)); err != nil { + return res, fmt.Errorf("set %s: %w", notesRef, err) + } + res.OriginNotesRef = notesRef + } + + if req.MappingFile != "" { + if err := tr.writeMappingFile(req.MappingFile); err != nil { + return res, fmt.Errorf("write mapping file: %w", err) + } + res.MappingFile = req.MappingFile + } + + if req.Sign { + signed, err := signBranchTips(ctx, out, req.TargetDir, req.SignKey, req.SourceURL, desired) + // signBranchTips returns the tags it had already created + // when it failed mid-iteration. Surface that partial list + // even on error so the caller can clean up — without it, + // signed converted/* tags would be left on disk with no + // indication in either Result or the error. + res.SignedTags = signed + if err != nil { + return res, fmt.Errorf("sign: %w", err) + } + } + + if req.Check { + fmt.Fprintln(out, "verifying output ...") + // Collect the side outputs this run actually wrote so the + // refs check knows which target refs to ignore. 
Anything not + // in here is assumed to be a translated source ref. + sideOutputs := make(map[plumbing.ReferenceName]struct{}, 1+len(res.SignedTags)) + if res.OriginNotesRef != "" { + sideOutputs[plumbing.ReferenceName(res.OriginNotesRef)] = struct{}{} + } + for _, tag := range res.SignedTags { + sideOutputs[plumbing.ReferenceName(tag)] = struct{}{} + } + hasBranches := false + for _, d := range desired { + if d.TargetRef.IsBranch() { + hasBranches = true + break + } + } + res.Checks = runChecks(ctx, req.TargetDir, dstRepo, refsWritten, sideOutputs, hasBranches) + for _, c := range res.Checks { + mark := "✓" + switch { + case !c.OK: + mark = "✗" + case c.Skipped: + mark = "○" + } + fmt.Fprintf(out, " %s %s: %s\n", mark, c.Name, c.Detail) + } + for _, c := range res.Checks { + if !c.OK { + // The conversion finished; the failure is a + // post-hoc verification miss. Keep the target + // on disk so the user can inspect exactly what + // failed the check. + cleanupTarget = false + return res, fmt.Errorf("check %q failed: %s", c.Name, c.Detail) + } + } + } + + // Run completed; keep the target dir. + cleanupTarget = false + return res, nil +} + +// signBranchTips runs `git tag -s converted/ ` for every +// branch in the desired set. The converter's signing identity (whatever +// `user.signingkey` / `gpg.format` is set to in the target repo, or the +// caller-supplied signKey) attests each branch's full reachable history +// via the parent chain encoded in the tip commit's bytes. +// +// stdin/stderr are inherited so gpg/ssh-agent prompts work +// interactively. A failure short-circuits the run; tags signed before +// the failure stay in the target repo. 
+func signBranchTips(ctx context.Context, out io.Writer, targetDir, signKey, sourceURL string, desired map[plumbing.ReferenceName]planner.DesiredRef) ([]string, error) { + gitBin, err := exec.LookPath("git") + if err != nil { + return nil, fmt.Errorf("git binary required to sign: %w", err) + } + // Iterate in a deterministic order so re-runs over the same source + // produce the same sequence of tags (modulo the signature payload, + // which carries the signer's timestamp). + branchNames := make([]string, 0, len(desired)) + for name := range desired { + if name.IsBranch() { + branchNames = append(branchNames, string(name)) + } + } + sort.Strings(branchNames) + + var signed []string + for _, refName := range branchNames { + shortName := plumbing.ReferenceName(refName).Short() + tagName := strings.TrimPrefix(attestationTagPrefix, "refs/tags/") + shortName + fmt.Fprintf(out, "signing %s ...\n", "refs/tags/"+tagName) + + msg := fmt.Sprintf( + "SHA1 → SHA256 conversion attestation for %s.\n\n"+ + "Source: %s\nProduced by git-sync convert-sha256.\n", + refName, sourceURL) + args := []string{"-C", targetDir, "tag", "-s", "-m", msg} + if signKey != "" { + args = append(args, "-u", signKey) + } + args = append(args, tagName, refName) + + cmd := exec.CommandContext(ctx, gitBin, args...) + // Deliberate departure from the req.Out plumbing the rest of + // Run uses: gpg/ssh-agent and pinentry need a real TTY for + // passphrase prompts, so we inherit the parent's stdio + // directly. The consequence is that callers passing + // req.Out = io.Discard (e.g. tests) still see subprocess + // output on real stderr — that's the cost of working + // authentication. 
+ cmd.Stdin = os.Stdin + cmd.Stdout = os.Stderr // git tag -s is usually quiet on success + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + return signed, fmt.Errorf("git tag -s %s: %w", tagName, err) + } + signed = append(signed, "refs/tags/"+tagName) + } + return signed, nil +} + +// runChecks performs lightweight verification of the converted repo. +// Returns one Check per step. Callers print and/or fail-on-error based +// on these. No early return so users see the full picture even when an +// earlier check fails. +// +// sideOutputs holds the exact refs the run created on top of the +// source set (the origin-notes ref, any --sign attestation tags), so +// the refs check can omit them from the resolved/expected fraction +// without false-positive-skipping a same-named source ref. +// +// hasBranches says whether any refs/heads/* landed in the target. If +// false, this is a tags-only conversion and HEAD is left at the +// PlainInit default (refs/heads/master, which won't exist); the HEAD +// check is then a no-op rather than a guaranteed failure. +func runChecks(ctx context.Context, targetDir string, repo *git.Repository, refsExpected int, sideOutputs map[plumbing.ReferenceName]struct{}, hasBranches bool) []Check { + checks := []Check{} + + // 1. Config: extensions.objectformat = sha256. Parse the file + // section-aware so we don't false-positive on a commented line or + // a similarly-named key in another section. 
+ cfgFile, err := os.Open(filepath.Join(targetDir, "config")) + switch { + case err != nil: + checks = append(checks, Check{Name: "config", OK: false, Detail: err.Error()}) + default: + cfg := formatcfg.New() + decodeErr := formatcfg.NewDecoder(cfgFile).Decode(cfg) + _ = cfgFile.Close() + switch { + case decodeErr != nil: + checks = append(checks, Check{Name: "config", OK: false, Detail: fmt.Sprintf("parse config: %v", decodeErr)}) + case !strings.EqualFold(cfg.Section("extensions").Option("objectformat"), "sha256"): + checks = append(checks, Check{Name: "config", OK: false, Detail: "extensions.objectformat = sha256 not set"}) + default: + checks = append(checks, Check{Name: "config", OK: true, Detail: "extensions.objectformat = sha256"}) + } + } + + // 2. HEAD resolves to an existing object. Skipped on tags-only + // conversions, where the target legitimately has no branch for + // HEAD to symlink to. + switch { + case !hasBranches: + checks = append(checks, Check{Name: "HEAD", OK: true, Skipped: true, Detail: "tags-only conversion; no branch to point at"}) + default: + head, err := repo.Reference(plumbing.HEAD, true) + switch { + case err != nil: + checks = append(checks, Check{Name: "HEAD", OK: false, Detail: err.Error()}) + case head.Hash().IsZero(): + checks = append(checks, Check{Name: "HEAD", OK: false, Detail: "resolves to zero hash"}) + default: + if _, err := repo.Storer.EncodedObject(plumbing.AnyObject, head.Hash()); err != nil { + checks = append(checks, Check{Name: "HEAD", OK: false, Detail: fmt.Sprintf("%s: %v", head.Hash(), err)}) + } else { + checks = append(checks, Check{Name: "HEAD", OK: true, Detail: head.Hash().String()}) + } + } + } + + // 3. Every written ref resolves to an existing object. Skip the + // specific refs this run created as side outputs — they're + // accounted for in their own Result fields and would otherwise + // make the displayed fraction misleading. 
Skipping by exact name + // (not by prefix) avoids hiding a legitimate source ref that + // happened to share a namespace. + resolved := 0 + missing := "" + refs, err := repo.References() + if err != nil { + checks = append(checks, Check{Name: "refs", OK: false, Detail: err.Error()}) + } else { + walkErr := refs.ForEach(func(r *plumbing.Reference) error { + if r.Type() != plumbing.HashReference { + return nil + } + if _, skip := sideOutputs[r.Name()]; skip { + return nil + } + if _, err := repo.Storer.EncodedObject(plumbing.AnyObject, r.Hash()); err != nil { + if missing == "" { + missing = fmt.Sprintf("%s → %s: %v", r.Name(), r.Hash(), err) + } + return nil + } + resolved++ + return nil + }) + switch { + case walkErr != nil: + checks = append(checks, Check{Name: "refs", OK: false, Detail: walkErr.Error()}) + case missing != "": + checks = append(checks, Check{Name: "refs", OK: false, Detail: missing}) + case resolved < refsExpected: + checks = append(checks, Check{Name: "refs", OK: false, Detail: fmt.Sprintf("only %d / %d refs resolved", resolved, refsExpected)}) + default: + checks = append(checks, Check{Name: "refs", OK: true, Detail: fmt.Sprintf("%d / %d resolve to objects", resolved, refsExpected)}) + } + } + + // 4. git fsck --full (if git is on PATH). + gitBin, err := exec.LookPath("git") + if err != nil { + checks = append(checks, Check{Name: "git fsck --full", OK: true, Skipped: true, Detail: "git not in PATH"}) + return checks + } + cmd := exec.CommandContext(ctx, gitBin, "-C", targetDir, "fsck", "--full") + fsckOut, err := cmd.CombinedOutput() + switch { + case err != nil: + checks = append(checks, Check{Name: "git fsck --full", OK: false, Detail: fmt.Sprintf("%v\n%s", err, fsckOut)}) + case fsckHasError(fsckOut): + // Belt-and-braces against a hypothetical git version that prints + // "error:" / "fatal:" lines but exits zero. 
Match line prefixes + // rather than a substring so a branch or path containing "error" + // in a benign dangling/warning line doesn't trip the check. + checks = append(checks, Check{Name: "git fsck --full", OK: false, Detail: strings.TrimSpace(string(fsckOut))}) + default: + checks = append(checks, Check{Name: "git fsck --full", OK: true, Detail: "clean"}) + } + return checks +} + +// fsckHasError reports whether git-fsck output contains a line that +// signals a real problem. We match (case-insensitively) any line whose +// first token starts with "error" or "fatal" — covering "error:", +// "fatal:", and the rare "errorInX:" variants — plus the +// "missing " / "broken link" / "bad " object reports +// emitted by older git. Dangling and warning lines are intentionally +// ignored. +// +// Splits on raw newlines rather than using bufio.Scanner so a single +// very long line (some fsck reports include long paths) is not +// silently truncated at the scanner's 64 KiB default. +func fsckHasError(out []byte) bool { + for _, raw := range bytes.Split(out, []byte("\n")) { + line := strings.TrimSpace(string(raw)) + if line == "" { + continue + } + lower := strings.ToLower(line) + if strings.HasPrefix(lower, "error") || strings.HasPrefix(lower, "fatal") { + return true + } + if strings.HasPrefix(lower, "missing ") || strings.HasPrefix(lower, "broken link") || strings.HasPrefix(lower, "bad ") { + return true + } + } + return false +} + +const ( + originNotesRef = "refs/notes/sha1-origin" + attestationTagPrefix = "refs/tags/converted/" +) + +// protectedExcludePrefixes returns the subset of prefixes that, under +// planner.IsRefExcluded's string-prefix semantics, would knock out at +// least one branch or tag. A prefix matches a branch if either side +// is a string-prefix of the other against "refs/heads/" (and likewise +// for "refs/tags/"). 
// That covers:
//
//   - bare "" (excludes every ref)
//   - "refs/" or "refs/h", "refs/heads/" (whole branch namespace)
//   - "refs/heads/feature/" (some branches)
//   - "refs/tags/" and any narrower suffix
//
// Offending values come back in input order, exactly as the user typed
// them, with repeats (compared after trimming surrounding whitespace)
// reported only once — so the error message lists precisely the flag
// values to drop.
func protectedExcludePrefixes(prefixes []string) []string {
	var flagged []string
	reported := make(map[string]struct{})
	for _, original := range prefixes {
		trimmed := strings.TrimSpace(original)
		if _, already := reported[trimmed]; already {
			continue
		}
		// Overlap is symmetric: the prefix may sit inside a protected
		// namespace, or the protected namespace may sit inside the prefix.
		hitsBranches := strings.HasPrefix(trimmed, "refs/heads/") || strings.HasPrefix("refs/heads/", trimmed)
		hitsTags := strings.HasPrefix(trimmed, "refs/tags/") || strings.HasPrefix("refs/tags/", trimmed)
		if !hitsBranches && !hitsTags {
			continue
		}
		flagged = append(flagged, original)
		reported[trimmed] = struct{}{}
	}
	return flagged
}

// checkSideOutputCollision refuses the conversion when the source set
// already contains a ref name this run would later write as a side
// output. Without this guard, writeRefs would publish the source's
// value first and writeOriginNotes / signBranchTips would silently
// overwrite it — losing the source ref and hiding the conflict.
+func checkSideOutputCollision(desired map[plumbing.ReferenceName]planner.DesiredRef, skipOriginNotes, sign bool) error { + if !skipOriginNotes { + if _, conflict := desired[plumbing.ReferenceName(originNotesRef)]; conflict { + return fmt.Errorf("source already advertises %s; pass --no-origin-notes to keep that source ref, or --exclude-ref-prefix %s to drop it from the conversion", originNotesRef, originNotesRef) + } + } + if sign { + var clashes []string + for name := range desired { + if strings.HasPrefix(string(name), attestationTagPrefix) { + clashes = append(clashes, string(name)) + } + } + if len(clashes) > 0 { + sort.Strings(clashes) + return fmt.Errorf("source has %s under %s, which collides with the attestation tags --sign would create; drop --sign or rename the source tag(s)", strings.Join(clashes, ", "), attestationTagPrefix) + } + } + return nil +} + +// ensureEmptyTarget refuses to init into a non-empty directory so the user +// doesn't quietly accumulate objects into an existing repo. 
+func ensureEmptyTarget(path string) error { + entries, err := os.ReadDir(path) + if err != nil { + if os.IsNotExist(err) { + if mkErr := os.MkdirAll(path, 0o755); mkErr != nil { + return fmt.Errorf("create target dir: %w", mkErr) + } + return nil + } + return fmt.Errorf("read target dir: %w", err) + } + if len(entries) > 0 { + return fmt.Errorf("target directory %s is not empty", path) + } + return nil +} + +func openSource(ctx context.Context, req Request, planCfg planner.PlanConfig) (gitproto.Conn, *gitproto.RefService, []*plumbing.Reference, error) { + ep, err := url.Parse(req.SourceURL) + if err != nil { + return nil, nil, nil, fmt.Errorf("parse source URL: %w", err) + } + if ep.Scheme != "http" && ep.Scheme != "https" { + return nil, nil, nil, fmt.Errorf("convert-sha256 currently supports HTTP/HTTPS sources only; got %q", ep.Scheme) + } + authMethod, err := auth.Resolve(auth.Endpoint{ + Username: req.SourceAuth.Username, + Token: req.SourceAuth.Token, + BearerToken: req.SourceAuth.BearerToken, + SkipTLSVerify: req.SourceAuth.SkipTLSVerify, + }, ep) + if err != nil { + return nil, nil, nil, fmt.Errorf("resolve source auth: %w", err) + } + httpClient := &http.Client{Transport: gitproto.NewHTTPTransport(req.SourceAuth.SkipTLSVerify)} + conn := gitproto.NewHTTPConnWithClient(ep, "source", normalizeAuth(authMethod), httpClient) + conn.FollowInfoRefsRedirect = req.SourceFollowInfoRefsRedirect + + mode := string(req.ProtocolMode) + if mode == "" { + mode = string(gitsync.ProtocolAuto) + } + + refs, svc, err := gitproto.ListSourceRefs(ctx, conn, mode, planner.RefPrefixes(planCfg)) + if err != nil { + _ = conn.Close() + return nil, nil, nil, fmt.Errorf("list source refs: %w", err) + } + return conn, svc, refs, nil +} + +func normalizeAuth(m auth.Method) gitproto.AuthMethod { + if m == nil { + return nil + } + // auth.Method and gitproto.AuthMethod share the same Authorizer signature. + // Wrap so we can pass either *transporthttp.BasicAuth or *transporthttp.TokenAuth. 
+ if a, ok := m.(*transporthttp.BasicAuth); ok { + return a + } + if a, ok := m.(*transporthttp.TokenAuth); ok { + return a + } + return authAdapter{m: m} +} + +type authAdapter struct{ m auth.Method } + +func (a authAdapter) Authorizer(req *http.Request) error { + if err := a.m.Authorizer(req); err != nil { + return fmt.Errorf("authorize request: %w", err) + } + return nil +} + +// translator walks the SHA1 source store, rewrites object content with +// SHA256-mapped hashes, and writes the result as loose objects under the +// target bare repo. Loose object writing is done by hand because go-git +// v6 alpha 3's objfile.Writer hardcodes SHA1 in prepareForWrite (see +// plumbing/format/objfile/writer.go:68), which would store every SHA256 +// object at a SHA1-derived path. +type translator struct { + // ctx is checked at the top of every translate() call so a Ctrl-C + // during a million-object conversion is responsive. It is the same + // context passed to Run() and is not stored to outlive its caller. + ctx context.Context //nolint:containedctx // translate() is recursive and not directly called by Run; threading ctx through every signature is noisier than a single field used for cancellation only. + src *filesystem.Storage + objectsDir string + // reachable holds every in-scope SHA1 with its object type, built up + // front by discoverReachable, which walks tree/commit/tag dependencies + // from the desired ref tips. It is the authoritative "what's in + // scope" set: abbreviated SHA1 prefixes in commit/tag messages are + // resolved against this set so a unique match is fixed before any + // encoding starts, and so message-reference edges can be added to + // the translation DFS in topological order. + reachable map[plumbing.Hash]plumbing.ObjectType + mapping map[plumbing.Hash]plumbing.Hash + // inProgress detects cycles in the translation DFS. 
Real Git + // histories cannot form cycles (the parent/tree/tag-target edges + // are a DAG by construction, and SHA1 message-reference cycles are + // cryptographically infeasible), but a defensive guard turns + // surprising input into a clear error instead of a stack overflow. + inProgress map[plumbing.Hash]struct{} + // commits records every translated commit's old SHA1, in DFS order, + // for use by writeOriginNotes. We track separately rather than walking + // the full mapping because notes only attach meaningfully to commits. + commits []plumbing.Hash + // ambiguousMessageRefs collects every hex prefix in a commit/tag + // message that matched more than one in-scope SHA1 and was + // therefore left unrewritten. Surfaced to the user as a warning + // so they know which references to investigate via the mapping + // file. + ambiguousMessageRefs map[string]struct{} + // resolveCache memoizes resolveMessageRef results. reachable is + // frozen before translation starts, so the (prefix → matchResult) + // mapping is stable for the lifetime of the translator. + // extractMessageReferences and rewriteHashesInMessage hit + // resolveMessageRef for the same tokens, and the abbreviated-hash + // path costs O(len(reachable)) per call — caching halves that. + resolveCache map[string]resolveCacheEntry + // Live counts updated atomically so the --progress ticker goroutine + // can sample them without racing against translation. Snapshot into + // a Counts struct at the end of the run. 
+ blobs atomic.Int64 + trees atomic.Int64 + commitsCount atomic.Int64 + tags atomic.Int64 + signaturesStripped int + messageRewrites int + rewriteMessages bool + lastNotesCommit plumbing.Hash +} + +func (t *translator) snapshotCounts() Counts { + return Counts{ + Blobs: int(t.blobs.Load()), + Trees: int(t.trees.Load()), + Commits: int(t.commitsCount.Load()), + Tags: int(t.tags.Load()), + } +} + +func newTranslator(ctx context.Context, src, dst storer.Storer, targetDir string, rewriteMessages bool, reachable map[plumbing.Hash]plumbing.ObjectType) (*translator, error) { + srcFS, ok := src.(*filesystem.Storage) + if !ok { + return nil, fmt.Errorf("source storage is not filesystem-backed (%T)", src) + } + // Type-check that the target is filesystem-backed too — we write + // loose objects by hand into targetDir/objects, bypassing the + // storer, but a memory-backed dst here would silently leave the + // caller's expected destination empty. Result is discarded: the + // translator only references targetDir directly. + if _, ok := dst.(*filesystem.Storage); !ok { + return nil, fmt.Errorf("target storage is not filesystem-backed (%T)", dst) + } + if reachable == nil { + reachable = make(map[plumbing.Hash]plumbing.ObjectType) + } + return &translator{ + ctx: ctx, + src: srcFS, + objectsDir: filepath.Join(targetDir, "objects"), + reachable: reachable, + mapping: make(map[plumbing.Hash]plumbing.Hash), + inProgress: make(map[plumbing.Hash]struct{}), + ambiguousMessageRefs: make(map[string]struct{}), + resolveCache: make(map[string]resolveCacheEntry), + rewriteMessages: rewriteMessages, + }, nil +} + +// discoverReachable walks every object reachable from roots (via tree +// entries, commit tree+parent links, and tag targets) and returns a +// (SHA1 → object type) map covering the full in-scope set. 
+// +// Submodule gitlinks: any submodule entry (mode 160000) fails the run +// here, before the target bare repo is initialized — failing fast +// keeps half-converted state off disk. Rewriting the gitlink to SHA256 +// would produce a tree the upstream .gitmodules repo can never +// resolve, since it advertises only SHA1. +// +// Message-reference edges are not part of this pass; those are added +// during translation, where the partial mapping is updated as we go. +// +// If progress is non-nil, it is incremented once per object visited. +// The --progress ticker samples this counter from another goroutine. +func discoverReachable(ctx context.Context, src storer.Storer, roots []plumbing.Hash, progress *atomic.Int64) (map[plumbing.Hash]plumbing.ObjectType, error) { + srcFS, ok := src.(*filesystem.Storage) + if !ok { + return nil, fmt.Errorf("source storage is not filesystem-backed (%T)", src) + } + reachable := make(map[plumbing.Hash]plumbing.ObjectType) + + // Iterative DFS with an explicit stack. The previous recursive + // implementation walked deep linear histories (50k–100k commits + // is not unheard of) one Go stack frame deep per parent edge, + // growing the goroutine stack by tens of MiB on kernel-scale + // runs. The explicit stack keeps memory usage proportional to + // the in-flight frontier, not the longest chain. + stack := make([]plumbing.Hash, 0, len(roots)) + stack = append(stack, roots...) + for len(stack) > 0 { + // Per-object cancellation check. Discovery on a kernel-scale + // repo runs for several minutes before translate() takes + // over, so without this Ctrl-C would not interrupt the run + // until the discovery phase finished on its own. 
+ if err := ctx.Err(); err != nil { + return nil, fmt.Errorf("discover: %w", err) + } + sha1 := stack[len(stack)-1] + stack = stack[:len(stack)-1] + if _, seen := reachable[sha1]; seen { + continue + } + obj, err := srcFS.EncodedObject(plumbing.AnyObject, sha1) + if err != nil { + return nil, fmt.Errorf("discover %s: %w", sha1, err) + } + reachable[sha1] = obj.Type() + if progress != nil { + progress.Add(1) + } + switch obj.Type() { //nolint:exhaustive // OFSDelta/REFDelta/AnyObject/InvalidObject cannot reach a resolved storage. + case plumbing.BlobObject: + // No outgoing edges. + case plumbing.TreeObject: + tree := &object.Tree{} + if err := tree.Decode(obj); err != nil { + return nil, fmt.Errorf("discover decode tree %s: %w", sha1, err) + } + for _, e := range tree.Entries { + if e.Mode == filemode.Submodule { + // A submodule gitlink stores a hash that refers to a + // commit in a *different* repository — the one named + // by the matching .gitmodules URL. Even when that + // commit happens to be in our source store, the URL + // still points at an upstream SHA1 repo, so rewriting + // the gitlink to SHA256 produces a tree that fsck- + // passes but breaks `git submodule update` forever: + // the upstream advertises only SHA1 hashes. The only + // safe answer is to refuse and let the caller scope + // the offending ref out (or convert the submodule + // upstream first and re-point .gitmodules). 
+ return nil, fmt.Errorf( + "tree %s contains a submodule gitlink %q at %s; convert-sha256 cannot rewrite submodule pointers "+ + "because the linked-to repository would still advertise SHA1 hashes — "+ + "exclude refs that reference it or convert the submodule repository first", + sha1, e.Name, e.Hash) + } + stack = append(stack, e.Hash) + } + case plumbing.CommitObject: + c := &object.Commit{} + if err := c.Decode(obj); err != nil { + return nil, fmt.Errorf("discover decode commit %s: %w", sha1, err) + } + stack = append(stack, c.TreeHash) + stack = append(stack, c.ParentHashes...) + case plumbing.TagObject: + tag := &object.Tag{} + if err := tag.Decode(obj); err != nil { + return nil, fmt.Errorf("discover decode tag %s: %w", sha1, err) + } + stack = append(stack, tag.Target) + default: + return nil, fmt.Errorf("unexpected object type %v for %s during discovery", obj.Type(), sha1) + } + } + return reachable, nil +} + +// translate is intentionally recursive. Unlike discoverReachable's +// purely-structural DFS, translate's edges are dynamic: tree entries, +// commit parents, tag targets, *and* message-reference edges resolved +// against the partial mapping built so far. Converting that to an +// explicit work stack would require an "after-children" callback per +// object type and is easy to get subtly wrong (re-encoding before all +// referenced hashes are placed silently corrupts the message rewrite). +// +// Recursion depth is bounded by the longest dependency chain in the +// source DAG — in practice the longest commit-parent chain, since +// trees and tags add at most one frame each. Linux kernel history is +// O(70k) commits along its deepest single-parent path; Go's growable +// stacks comfortably absorb that (~tens of MiB). Cycle detection above +// turns any unexpected graph shape into a clear error rather than a +// stack-overflow crash. 
func (t *translator) translate(sha1 plumbing.Hash) (plumbing.Hash, error) {
	// Cheap per-object cancellation check so Ctrl-C during a long
	// conversion (kernel-scale: ~10M objects) returns promptly rather
	// than running the whole DFS to completion.
	if err := t.ctx.Err(); err != nil {
		return plumbing.ZeroHash, fmt.Errorf("translate %s: %w", sha1, err)
	}
	// Already translated: reuse the recorded SHA256.
	if newH, ok := t.mapping[sha1]; ok {
		return newH, nil
	}
	if _, busy := t.inProgress[sha1]; busy {
		// Real Git histories cannot form cycles via parent, tree, or
		// tag-target edges (those are a DAG by construction), and
		// SHA1 message-reference cycles are cryptographically
		// infeasible (each commit's hash depends on its content,
		// including any hash it embeds). A trip here would mean an
		// unexpected graph shape; surface it instead of overflowing
		// the stack.
		return plumbing.ZeroHash, fmt.Errorf("translation cycle detected at %s", sha1)
	}
	// Mark this object as on the current DFS path until we return.
	t.inProgress[sha1] = struct{}{}
	defer delete(t.inProgress, sha1)

	obj, err := t.src.EncodedObject(plumbing.AnyObject, sha1)
	if err != nil {
		return plumbing.ZeroHash, fmt.Errorf("lookup %s: %w", sha1, err)
	}
	// Dispatch on object type; each helper records the new hash in
	// t.mapping itself.
	switch obj.Type() { //nolint:exhaustive // OFSDelta/REFDelta/AnyObject/InvalidObject cannot reach a resolved storage.
	case plumbing.BlobObject:
		return t.translateBlob(sha1, obj)
	case plumbing.TreeObject:
		return t.translateTree(sha1, obj)
	case plumbing.CommitObject:
		return t.translateCommit(sha1, obj)
	case plumbing.TagObject:
		return t.translateTag(sha1, obj)
	default:
		return plumbing.ZeroHash, fmt.Errorf("unexpected object type %v for %s", obj.Type(), sha1)
	}
}

// translateBlob reads the blob's full content into memory, stores it
// unchanged as a SHA256 loose object via writeLoose, records the
// sha1 → new-hash pair in t.mapping, and bumps the blob counter.
func (t *translator) translateBlob(sha1 plumbing.Hash, src plumbing.EncodedObject) (plumbing.Hash, error) {
	r, err := src.Reader()
	if err != nil {
		return plumbing.ZeroHash, fmt.Errorf("blob reader: %w", err)
	}
	defer r.Close()
	body, err := io.ReadAll(r)
	if err != nil {
		return plumbing.ZeroHash, fmt.Errorf("blob read: %w", err)
	}
	newHash, err := t.writeLoose(plumbing.BlobObject, body)
	if err != nil {
		return plumbing.ZeroHash, fmt.Errorf("blob store: %w", err)
	}
	t.mapping[sha1] = newHash
	t.blobs.Add(1)
	return newHash, nil
}

// translateTree decodes the tree, translates every entry (recursing
// through translate) and substitutes the returned hash in place, then
// re-encodes and stores the tree. Any submodule entry aborts with an
// error. On success the mapping is recorded and the tree counter is
// incremented.
func (t *translator) translateTree(sha1 plumbing.Hash, src plumbing.EncodedObject) (plumbing.Hash, error) {
	tree := &object.Tree{}
	if err := tree.Decode(src); err != nil {
		return plumbing.ZeroHash, fmt.Errorf("decode tree %s: %w", sha1, err)
	}
	for i, entry := range tree.Entries {
		if entry.Mode == filemode.Submodule {
			// Should not be reachable: discoverReachable refuses any
			// submodule gitlink up-front. Keep this as a defensive
			// guard so the rewrite path never silently produces a
			// SHA256 tree whose gitlink points at a hash the
			// .gitmodules upstream repo cannot resolve.
			return plumbing.ZeroHash, fmt.Errorf(
				"tree %s contains submodule gitlink %q at %s; convert-sha256 refuses to rewrite submodule pointers",
				sha1, entry.Name, entry.Hash)
		}
		newH, err := t.translate(entry.Hash)
		if err != nil {
			return plumbing.ZeroHash, fmt.Errorf("tree %s entry %q: %w", sha1, entry.Name, err)
		}
		// entry is a copy of the slice element; write through the index.
		tree.Entries[i].Hash = newH
	}
	body, err := encodeBody(plumbing.TreeObject, tree.Encode)
	if err != nil {
		return plumbing.ZeroHash, fmt.Errorf("encode tree %s: %w", sha1, err)
	}
	newHash, err := t.writeLoose(plumbing.TreeObject, body)
	if err != nil {
		return plumbing.ZeroHash, fmt.Errorf("store tree %s: %w", sha1, err)
	}
	t.mapping[sha1] = newHash
	t.trees.Add(1)
	return newHash, nil
}

// translateCommit decodes the commit, translates its tree and each
// parent, optionally rewrites hashes mentioned in the message (when
// t.rewriteMessages is set), clears both signature fields, drops
// "mergetag" extra headers, and stores the re-encoded commit. The old
// SHA1 is appended to t.commits and the commit counter is incremented.
func (t *translator) translateCommit(sha1 plumbing.Hash, src plumbing.EncodedObject) (plumbing.Hash, error) {
	c := &object.Commit{}
	if err := c.Decode(src); err != nil {
		return plumbing.ZeroHash, fmt.Errorf("decode commit %s: %w", sha1, err)
	}
	newTree, err := t.translate(c.TreeHash)
	if err != nil {
		return plumbing.ZeroHash, fmt.Errorf("commit %s tree: %w", sha1, err)
	}
	c.TreeHash = newTree
	for i, p := range c.ParentHashes {
		newP, err := t.translate(p)
		if err != nil {
			return plumbing.ZeroHash, fmt.Errorf("commit %s parent %s: %w", sha1, p, err)
		}
		c.ParentHashes[i] = newP
	}
	if t.rewriteMessages {
		// Translate every in-scope SHA1 mentioned in this commit's
		// message before rewriting it. This makes the message-reference
		// edge part of the translation DFS, so the mapping contains
		// each referenced object by the time we substitute. Without
		// it, sibling-branch references (cherry-picks, etc.) would
		// only resolve when ref iteration happened to process the
		// referenced commit's branch first.
		for _, ref := range t.extractMessageReferences(c.Message) {
			if _, err := t.translate(ref); err != nil {
				return plumbing.ZeroHash, fmt.Errorf("commit %s message ref %s: %w", sha1, ref, err)
			}
		}
		if rewritten, n := t.rewriteHashesInMessage(c.Message); n > 0 {
			c.Message = rewritten
			t.messageRewrites += n
		}
	}
	// A commit can carry both Signature (SHA1 form, "gpgsig") and
	// SignatureSHA256 ("gpgsig-sha256") in a transitional dual-hash
	// repo, but they encode the same logical signature. Strip both
	// fields if present, count once.
	if c.Signature != "" || c.SignatureSHA256 != "" {
		c.Signature = ""
		c.SignatureSHA256 = ""
		t.signaturesStripped++
	}
	// "mergetag" extra headers embed a copy of a signed annotated tag with
	// its own signature. Drop them too — they reference the pre-rewrite
	// commit/tag content and cannot be re-signed here.
	if len(c.ExtraHeaders) > 0 {
		// In-place filter: filtered reuses c.ExtraHeaders' backing array.
		filtered := c.ExtraHeaders[:0]
		for _, h := range c.ExtraHeaders {
			if h.Key == "mergetag" {
				t.signaturesStripped++
				continue
			}
			filtered = append(filtered, h)
		}
		c.ExtraHeaders = filtered
	}
	body, err := encodeBody(plumbing.CommitObject, c.Encode)
	if err != nil {
		return plumbing.ZeroHash, fmt.Errorf("encode commit %s: %w", sha1, err)
	}
	newHash, err := t.writeLoose(plumbing.CommitObject, body)
	if err != nil {
		return plumbing.ZeroHash, fmt.Errorf("store commit %s: %w", sha1, err)
	}
	t.mapping[sha1] = newHash
	t.commits = append(t.commits, sha1)
	t.commitsCount.Add(1)
	return newHash, nil
}

// translateTag decodes the annotated tag, translates its target,
// optionally rewrites hashes mentioned in the message (when
// t.rewriteMessages is set), clears both signature fields, and stores
// the re-encoded tag. On success the mapping is recorded and the tag
// counter is incremented.
func (t *translator) translateTag(sha1 plumbing.Hash, src plumbing.EncodedObject) (plumbing.Hash, error) {
	tag := &object.Tag{}
	if err := tag.Decode(src); err != nil {
		return plumbing.ZeroHash, fmt.Errorf("decode tag %s: %w", sha1, err)
	}
	newTarget, err := t.translate(tag.Target)
	if err != nil {
		return plumbing.ZeroHash, fmt.Errorf("tag %s target: %w", sha1, err)
	}
	tag.Target = newTarget
	if t.rewriteMessages {
		// Same as translateCommit: translate every in-scope message
		// reference before rewriting, so cross-branch references
		// always resolve regardless of ref iteration order.
		for _, ref := range t.extractMessageReferences(tag.Message) {
			if _, err := t.translate(ref); err != nil {
				return plumbing.ZeroHash, fmt.Errorf("tag %s message ref %s: %w", sha1, ref, err)
			}
		}
		if rewritten, n := t.rewriteHashesInMessage(tag.Message); n > 0 {
			tag.Message = rewritten
			t.messageRewrites += n
		}
	}
	// Same as commits: Signature and SignatureSHA256 are two encodings
	// of the same logical signature in a transitional dual-hash repo.
	if tag.Signature != "" || tag.SignatureSHA256 != "" {
		tag.Signature = ""
		tag.SignatureSHA256 = ""
		t.signaturesStripped++
	}
	body, err := encodeBody(plumbing.TagObject, tag.Encode)
	if err != nil {
		return plumbing.ZeroHash, fmt.Errorf("encode tag %s: %w", sha1, err)
	}
	newHash, err := t.writeLoose(plumbing.TagObject, body)
	if err != nil {
		return plumbing.ZeroHash, fmt.Errorf("store tag %s: %w", sha1, err)
	}
	t.mapping[sha1] = newHash
	t.tags.Add(1)
	return newHash, nil
}

// encodeBody runs an object's go-git Encode method into a scratch
// MemoryObject and returns just the payload bytes — without the
// "<type> <size>\x00" header. writeLoose adds the SHA256-correct header.
//
// The format argument to NewMemoryObject is required by the constructor
// but unused here: we never ask the scratch object for its hash, only
// for its byte stream.
+func encodeBody(typ plumbing.ObjectType, encode func(plumbing.EncodedObject) error) ([]byte, error) { + scratch := plumbing.NewMemoryObject(plumbing.FromObjectFormat(formatcfg.SHA1)) + scratch.SetType(typ) + if err := encode(scratch); err != nil { + return nil, err + } + r, err := scratch.Reader() + if err != nil { + return nil, fmt.Errorf("scratch reader: %w", err) + } + defer r.Close() + body, err := io.ReadAll(r) + if err != nil { + return nil, fmt.Errorf("read encoded body: %w", err) + } + return body, nil +} + +// writeLoose writes a single object as a SHA256-named loose object under +// objects//. Bypasses go-git's objfile.Writer, which would hash +// with SHA1. Atomic via tempfile+rename, idempotent on duplicate hashes. +// +// Durability is not guaranteed against power loss: we do not fsync the +// loose file or its parent directory before returning. The Stat-by-name +// idempotency shortcut would then accept a torn file from a previous +// crashed run as already-written. That trade-off is intentional — +// convert-sha256 is a single-shot bulk operation (not an incremental +// sync), it processes millions of objects on kernel-scale repos where +// per-object fsync would dominate runtime, and Run wipes the target +// directory on error (see the cleanupTarget defer in Run) so the only +// supported recovery is re-running from clean state. 
+func (t *translator) writeLoose(typ plumbing.ObjectType, body []byte) (plumbing.Hash, error) { + h := sha256.New() + header := append(typ.Bytes(), ' ') + header = strconv.AppendInt(header, int64(len(body)), 10) + header = append(header, 0) + h.Write(header) + h.Write(body) + sum := h.Sum(nil) + hexSum := hex.EncodeToString(sum) + + dir := filepath.Join(t.objectsDir, hexSum[:2]) + file := filepath.Join(dir, hexSum[2:]) + + hashID, ok := plumbing.FromBytes(sum) + if !ok { + return plumbing.ZeroHash, fmt.Errorf("internal: bad sha256 sum length %d", len(sum)) + } + + if _, err := os.Stat(file); err == nil { + return hashID, nil + } + if err := os.MkdirAll(dir, 0o755); err != nil { + return plumbing.ZeroHash, fmt.Errorf("mkdir %s: %w", dir, err) + } + + var buf bytes.Buffer + // Level 1 matches git's core.looseCompression default. Loose objects + // are short-lived (gc rolls them into packs), so optimizing for write + // speed over size is the standard trade-off. + zw, err := zlib.NewWriterLevel(&buf, zlib.BestSpeed) + if err != nil { + return plumbing.ZeroHash, fmt.Errorf("zlib writer: %w", err) + } + if _, err := zw.Write(header); err != nil { + return plumbing.ZeroHash, fmt.Errorf("zlib write header: %w", err) + } + if _, err := zw.Write(body); err != nil { + return plumbing.ZeroHash, fmt.Errorf("zlib write body: %w", err) + } + if err := zw.Close(); err != nil { + return plumbing.ZeroHash, fmt.Errorf("zlib close: %w", err) + } + + tmp, err := os.CreateTemp(dir, "tmp_obj_") + if err != nil { + return plumbing.ZeroHash, fmt.Errorf("create temp object: %w", err) + } + if _, err := tmp.Write(buf.Bytes()); err != nil { + _ = tmp.Close() + _ = os.Remove(tmp.Name()) + return plumbing.ZeroHash, fmt.Errorf("write temp object: %w", err) + } + if err := tmp.Close(); err != nil { + _ = os.Remove(tmp.Name()) + return plumbing.ZeroHash, fmt.Errorf("close temp object: %w", err) + } + if err := os.Rename(tmp.Name(), file); err != nil { + _ = os.Remove(tmp.Name()) + return 
plumbing.ZeroHash, fmt.Errorf("rename %s: %w", file, err) + } + return hashID, nil +} + +// hashPattern matches hex runs that could be a git object hash. Git's +// default abbreviation is 7 chars; 40 is a full SHA1. Case-insensitive +// so messages that paste an uppercase or mixed-case hash (e.g. from +// some commit graph viewers) still resolve — the lookup canonicalizes +// to lowercase before checking the reachable set. We only rewrite a +// match if the prefix uniquely identifies a commit or tag in the +// reachable set, so false positives on incidental hex strings are +// essentially impossible (a random hex would have to collide with a +// real source SHA1). +var hashPattern = regexp.MustCompile(`(?i)\b[0-9a-f]{7,40}\b`) + +// matchResult is the 3-state outcome of resolving a hex prefix in a +// commit/tag message against the reachable set. We distinguish +// "ambiguous" from "no match" so the caller can warn the user about +// prefixes that *could* be rewritten if they were a couple of chars +// longer. +type matchResult int + +const ( + matchNone matchResult = iota + matchUnique + matchAmbiguous +) + +// rewriteHashesInMessage scans msg for short and full SHA1 hashes, +// replacing any that uniquely identify a commit or tag in t.reachable +// with the corresponding full SHA256 hex from t.mapping. Returns the +// rewritten message and the number of substitutions made. Ambiguous +// prefixes are recorded in t.ambiguousMessageRefs so the caller can +// surface a warning at the end of the run. +// +// Uniqueness is decided against t.reachable rather than t.mapping so +// that abbreviated prefixes get the same verdict during translation as +// they would after every object has been translated — the answer cannot +// flip depending on what has been processed so far. +// +// Performance: the abbreviated-hash path scans the reachable set +// linearly for each match. Fine for repos up to ~100k commits; slower +// past that. 
If this ever matters, build a sorted-prefix index over +// reachable SHA1 hex strings once and binary-search. +func (t *translator) rewriteHashesInMessage(msg string) (string, int) { + count := 0 + out := hashPattern.ReplaceAllStringFunc(msg, func(s string) string { + sha1, result := t.resolveMessageRef(s) + switch result { + case matchNone: + return s + case matchAmbiguous: + t.ambiguousMessageRefs[s] = struct{}{} + return s + case matchUnique: + newHash, ok := t.mapping[sha1] + if !ok { + // The reachable set says this SHA1 is in scope, but + // the translation DFS hasn't placed it yet. Shouldn't + // happen because translateCommit/translateTag add + // message-reference edges before encoding — leave the + // hex untouched if it somehow does. + return s + } + count++ + return newHash.String() + default: + return s + } + }) + return out, count +} + +// resolveMessageRef classifies a hex prefix against the reachable set. +// Returns matchUnique with the resolved SHA1 when exactly one commit +// or tag in scope matches; matchAmbiguous when more than one does; +// matchNone otherwise (no match, or the match is a blob/tree — those +// are filtered so incidental hex collisions on content hashes aren't +// rewritten). +// resolveCacheEntry holds a memoized (Hash, matchResult) pair from +// resolveMessageRef. Stored in t.resolveCache keyed by lowercased prefix. +type resolveCacheEntry struct { + hash plumbing.Hash + result matchResult +} + +func (t *translator) resolveMessageRef(prefix string) (plumbing.Hash, matchResult) { + // Canonicalize to lowercase: hashPattern is case-insensitive so + // the caller can match `ABCD1234` in a message, but reachable + // keys and plumbing.Hash.String() are always lowercase hex. 
+ prefix = strings.ToLower(prefix) + if cached, ok := t.resolveCache[prefix]; ok { + return cached.hash, cached.result + } + hash, result := t.resolveMessageRefUncached(prefix) + t.resolveCache[prefix] = resolveCacheEntry{hash: hash, result: result} + return hash, result +} + +func (t *translator) resolveMessageRefUncached(prefix string) (plumbing.Hash, matchResult) { + if len(prefix) == 40 { + sha1, ok := plumbing.FromHex(prefix) + if !ok { + return plumbing.ZeroHash, matchNone + } + typ, in := t.reachable[sha1] + if !in { + return plumbing.ZeroHash, matchNone + } + if typ != plumbing.CommitObject && typ != plumbing.TagObject { + return plumbing.ZeroHash, matchNone + } + return sha1, matchUnique + } + var match plumbing.Hash + matches := 0 + for sha1, typ := range t.reachable { + if typ != plumbing.CommitObject && typ != plumbing.TagObject { + continue + } + if strings.HasPrefix(sha1.String(), prefix) { + matches++ + if matches > 1 { + return plumbing.ZeroHash, matchAmbiguous + } + match = sha1 + } + } + if matches == 1 { + return match, matchUnique + } + return plumbing.ZeroHash, matchNone +} + +// extractMessageReferences returns the unique commit/tag SHA1s mentioned +// by hex prefix in msg. Used by translateCommit/translateTag to add +// message-reference edges to the translation DFS so the mapping is +// fully populated by the time the message is rewritten. Ambiguous +// prefixes generate no edge — they cannot be rewritten anyway. +func (t *translator) extractMessageReferences(msg string) []plumbing.Hash { + seen := make(map[plumbing.Hash]struct{}) + var out []plumbing.Hash + for _, match := range hashPattern.FindAllString(msg, -1) { + sha1, result := t.resolveMessageRef(match) + if result != matchUnique { + continue + } + if _, dup := seen[sha1]; dup { + continue + } + seen[sha1] = struct{}{} + out = append(out, sha1) + } + return out +} + +// notesCommitTime returns the committer/author timestamp for the +// synthetic notes wrapper commit. 
// notesCommitTime returns the committer/author timestamp used for the
// synthetic notes wrapper commit, always in UTC.
//
// When the SOURCE_DATE_EPOCH environment variable (the
// reproducible-builds convention) holds a base-10 integer, that many
// seconds since the Unix epoch is used. When it is unset, empty, or
// does not parse, the Unix epoch itself is returned, so two runs over
// identical source state always produce the same notes-ref hash.
func notesCommitTime() time.Time {
	var epochSecs int64 // stays 0 (the Unix epoch) unless the env var parses
	if value := os.Getenv("SOURCE_DATE_EPOCH"); value != "" {
		if parsed, err := strconv.ParseInt(value, 10, 64); err == nil {
			epochSecs = parsed
		}
	}
	return time.Unix(epochSecs, 0).UTC()
}
+ type entry struct { + key plumbing.Hash + blob plumbing.Hash + } + entries := make([]entry, 0, len(t.commits)) + for _, oldSHA1 := range t.commits { + newCommit, ok := t.mapping[oldSHA1] + if !ok { + continue + } + blobHash, err := t.writeLoose(plumbing.BlobObject, []byte(oldSHA1.String()+"\n")) + if err != nil { + return "", fmt.Errorf("note blob for %s: %w", oldSHA1, err) + } + entries = append(entries, entry{key: newCommit, blob: blobHash}) + } + if len(entries) == 0 { + return "", nil + } + + treeEntries := make([]object.TreeEntry, 0, len(entries)) + for _, e := range entries { + treeEntries = append(treeEntries, object.TreeEntry{ + Name: e.key.String(), + Mode: filemode.Regular, + Hash: e.blob, + }) + } + sort.Slice(treeEntries, func(i, j int) bool { + return treeEntries[i].Name < treeEntries[j].Name + }) + tree := &object.Tree{Entries: treeEntries} + treeBody, err := encodeBody(plumbing.TreeObject, tree.Encode) + if err != nil { + return "", fmt.Errorf("encode notes tree: %w", err) + } + treeHash, err := t.writeLoose(plumbing.TreeObject, treeBody) + if err != nil { + return "", fmt.Errorf("store notes tree: %w", err) + } + + // Honor SOURCE_DATE_EPOCH for reproducible builds; otherwise pin to + // the Unix epoch so the notes-ref hash is identical across runs over + // the same source state. The notes commit is bookkeeping — its + // timestamp carries no meaningful information about when the + // underlying SHA1 history was created. 
// startProgressTick starts a background goroutine that redraws a single
// status line on out every 500 ms, using the string returned by render.
// It returns a stop function that halts the goroutine, waits for it to
// exit, and then writes one final frame followed by a newline so later
// output starts on a fresh row.
//
// render is called from the background goroutine while the ticker runs
// and from the caller's goroutine inside stop (after the background
// goroutine has exited). Calling stop again is a no-op; the guard is a
// plain bool, so stop is not safe to call from several goroutines at
// once.
//
// Only intended for TTY output: each frame is prefixed with '\r\x1b[K'
// to overwrite the previous one, which looks fine on a terminal and
// ugly anywhere else. Callers gate on isTTY before calling.
func startProgressTick(out io.Writer, render func() string) func() {
	const interval = 500 * time.Millisecond

	quit := make(chan struct{})     // closed by stop to end the loop
	finished := make(chan struct{}) // closed by the loop when it exits

	redrawLoop := func() {
		defer close(finished)
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for {
			select {
			case <-ticker.C:
				fmt.Fprintf(out, "\r\x1b[K%s", render())
			case <-quit:
				return
			}
		}
	}
	go redrawLoop()

	stopped := false
	return func() {
		if stopped {
			return
		}
		stopped = true
		close(quit)
		<-finished
		// Last frame + newline so subsequent output is on a clean row.
		fmt.Fprintf(out, "\r\x1b[K%s\n", render())
	}
}
// isTTY reports whether w looks like a terminal: it must be an *os.File
// whose Stat succeeds and whose mode has the character-device bit set.
// Any other writer, or a file that cannot be stat'ed, yields false.
//
// This is a heuristic, not a true terminal probe — other character
// devices (for example /dev/null on Unix) satisfy the same check.
//
// The --progress ticker is suppressed when this returns false because
// its '\r'-style in-place updates would otherwise show up as literal
// control characters in log files and pipes.
func isTTY(w io.Writer) bool {
	if file, isFile := w.(*os.File); isFile {
		if info, err := file.Stat(); err == nil {
			return info.Mode()&os.ModeCharDevice != 0
		}
	}
	return false
}
+ closed := false + defer func() { + if !closed { + _ = f.Close() + } + }() + w := bufio.NewWriter(f) + if _, err := fmt.Fprintln(w, "# sha1\tsha256"); err != nil { + return fmt.Errorf("write mapping header: %w", err) + } + for _, p := range pairs { + if _, err := fmt.Fprintf(w, "%s\t%s\n", p.sha1, p.sha256); err != nil { + return fmt.Errorf("write mapping line: %w", err) + } + } + if err := w.Flush(); err != nil { + return fmt.Errorf("flush mapping file: %w", err) + } + if err := f.Close(); err != nil { + return fmt.Errorf("close mapping file: %w", err) + } + closed = true + return nil +} + +// pickHEAD chooses which target-side ref the bare repo's HEAD should +// symlink to. It returns "" when no suitable branch exists (e.g. a +// tags-only conversion), in which case the caller leaves HEAD at the +// PlainInit default. +// +// Selection order: +// 1. The source's advertised HEAD, if it landed in the converted set. +// Resolved via the desired entry's TargetRef so a user-supplied ref +// mapping is honored. +// 2. refs/heads/main, then refs/heads/master, if either is present in +// the converted target refs. Some HTTP v1 servers do not advertise +// HEAD, so we pattern-match on conventional defaults. +// 3. The lexicographically first refs/heads/* in the target set, for +// a deterministic fallback when neither convention is present. 
+func pickHEAD(advertised plumbing.ReferenceName, desired map[plumbing.ReferenceName]planner.DesiredRef) plumbing.ReferenceName { + if advertised != "" { + if d, ok := desired[advertised]; ok { + return d.TargetRef + } + } + branches := make(map[plumbing.ReferenceName]struct{}, len(desired)) + for _, d := range desired { + if d.TargetRef.IsBranch() { + branches[d.TargetRef] = struct{}{} + } + } + for _, candidate := range []plumbing.ReferenceName{"refs/heads/main", "refs/heads/master"} { + if _, ok := branches[candidate]; ok { + return candidate + } + } + if len(branches) == 0 { + return "" + } + names := make([]string, 0, len(branches)) + for name := range branches { + names = append(names, string(name)) + } + sort.Strings(names) + return plumbing.ReferenceName(names[0]) +} + +func writeRefs( + dst storer.Storer, + desired map[plumbing.ReferenceName]planner.DesiredRef, + mapping map[plumbing.Hash]plumbing.Hash, +) (int, error) { + written := 0 + for _, d := range desired { + newHash, ok := mapping[d.SourceHash] + if !ok { + return written, fmt.Errorf("ref %s tip %s missing from translation map", d.TargetRef, d.SourceHash) + } + if err := dst.SetReference(plumbing.NewHashReference(d.TargetRef, newHash)); err != nil { + return written, fmt.Errorf("set ref %s: %w", d.TargetRef, err) + } + written++ + } + return written, nil +} diff --git a/cmd/git-sync/internal/sha256convert/sha256convert_test.go b/cmd/git-sync/internal/sha256convert/sha256convert_test.go new file mode 100644 index 00000000..d9a0e8fc --- /dev/null +++ b/cmd/git-sync/internal/sha256convert/sha256convert_test.go @@ -0,0 +1,1464 @@ +package sha256convert + +import ( + "bytes" + "compress/zlib" + "context" + "crypto/sha256" + "encoding/hex" + "errors" + "fmt" + "io" + "net/http" + "net/http/cgi" + "net/http/httptest" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + "time" + + git "github.com/go-git/go-git/v6" + "github.com/go-git/go-git/v6/plumbing" + 
"github.com/go-git/go-git/v6/plumbing/filemode" + formatcfg "github.com/go-git/go-git/v6/plumbing/format/config" + "github.com/go-git/go-git/v6/plumbing/object" + gogitstorer "github.com/go-git/go-git/v6/plumbing/storer" + "github.com/go-git/go-git/v6/storage/filesystem" + + "entire.io/entire/git-sync/internal/planner" +) + +// TestTranslator builds a small SHA1 source repo with blobs, trees, commits, +// and an annotated tag — including signed commit/tag — then runs the +// translator and asserts both the bookkeeping counts and the on-disk +// invariant: every loose object's filename equals sha256(headered content). +// That invariant is the one go-git v6 alpha 3 gets wrong via its +// SetEncodedObject path; verifying it directly prevents regressing back +// onto the broken loose-object writer. +func TestTranslator(t *testing.T) { + root := t.TempDir() + srcDir := filepath.Join(root, "src.git") + dstDir := filepath.Join(root, "dst.git") + + srcRepo, err := git.PlainInit(srcDir, true) + if err != nil { + t.Fatalf("init SHA1 source: %v", err) + } + dstRepo, err := git.PlainInit(dstDir, true, git.WithObjectFormat(formatcfg.SHA256)) + if err != nil { + t.Fatalf("init SHA256 target: %v", err) + } + + blobHash := writeBlob(t, srcRepo.Storer, []byte("hello world\n")) + treeHash := writeTree(t, srcRepo.Storer, []object.TreeEntry{ + {Name: "README", Mode: filemode.Regular, Hash: blobHash}, + }) + + sig := object.Signature{Name: "Test", Email: "test@example.com", When: time.Unix(1700000000, 0).UTC()} + commit1 := &object.Commit{ + Author: sig, + Committer: sig, + Message: "initial\n", + TreeHash: treeHash, + Signature: "-----BEGIN PGP SIGNATURE-----\nfake sig data\n-----END PGP SIGNATURE-----", + } + c1Hash := writeObject(t, srcRepo.Storer, commit1.Encode) + + commit2 := &object.Commit{ + Author: sig, + Committer: sig, + Message: "second\n", + TreeHash: treeHash, + ParentHashes: []plumbing.Hash{c1Hash}, + } + c2Hash := writeObject(t, srcRepo.Storer, commit2.Encode) + + tag 
:= &object.Tag{ + Name: "v1", + Tagger: sig, + Message: "annotated tag\n", + TargetType: plumbing.CommitObject, + Target: c2Hash, + Signature: "-----BEGIN PGP SIGNATURE-----\nfake tag sig\n-----END PGP SIGNATURE-----", + } + tagHash := writeObject(t, srcRepo.Storer, tag.Encode) + + reachable, err := discoverReachable(t.Context(), srcRepo.Storer, []plumbing.Hash{tagHash}, nil) + if err != nil { + t.Fatalf("discoverReachable: %v", err) + } + tr, err := newTranslator(t.Context(), srcRepo.Storer, dstRepo.Storer, dstDir, false, reachable) + if err != nil { + t.Fatalf("newTranslator: %v", err) + } + newTagHash, err := tr.translate(tagHash) + if err != nil { + t.Fatalf("translate tag: %v", err) + } + + wantCounts := Counts{Blobs: 1, Trees: 1, Commits: 2, Tags: 1} + if got := tr.snapshotCounts(); got != wantCounts { + t.Errorf("counts: got %+v, want %+v", got, wantCounts) + } + if tr.signaturesStripped != 2 { + t.Errorf("signatures stripped: got %d, want 2 (commit + tag)", tr.signaturesStripped) + } + + // Idempotency: translating the same hash again must reuse the mapping + // without writing more objects or bumping counters. + startBlobs := tr.blobs.Load() + if _, err := tr.translate(tagHash); err != nil { + t.Fatalf("re-translate tag: %v", err) + } + if tr.blobs.Load() != startBlobs { + t.Errorf("re-translate increased blob count; memoization broken") + } + + // Every translated hash must point at a loose object whose filename + // equals sha256(headered content). This is the precise invariant the + // go-git bug violates — keep it as a test. + objectsDir := filepath.Join(dstDir, "objects") + verified := 0 + for _, h := range tr.mapping { + assertLooseObjectHashMatches(t, objectsDir, h) + verified++ + } + if verified == 0 { + t.Fatal("no objects in mapping; nothing was verified") + } + + // The translated tag must decode under the SHA256 target and point at + // a SHA256 commit whose tree resolves to a SHA256 tree. 
+ tagObj, err := object.GetTag(dstRepo.Storer, newTagHash) + if err != nil { + t.Fatalf("read translated tag: %v", err) + } + if tagObj.Signature != "" { + t.Errorf("translated tag still carries a signature: %q", tagObj.Signature) + } + if tagObj.Target != tr.mapping[c2Hash] { + t.Errorf("translated tag target: got %s, want %s", tagObj.Target, tr.mapping[c2Hash]) + } + + commit, err := object.GetCommit(dstRepo.Storer, tagObj.Target) + if err != nil { + t.Fatalf("read translated commit: %v", err) + } + if commit.Signature != "" { + t.Errorf("translated commit still carries a signature: %q", commit.Signature) + } + if len(commit.ParentHashes) != 1 || commit.ParentHashes[0] != tr.mapping[c1Hash] { + t.Errorf("translated commit parents: got %v, want [%s]", commit.ParentHashes, tr.mapping[c1Hash]) + } + if commit.TreeHash != tr.mapping[treeHash] { + t.Errorf("translated commit tree: got %s, want %s", commit.TreeHash, tr.mapping[treeHash]) + } +} + +// TestTranslator_RewritesMessageHashes confirms that SHA1 hash references +// in commit and tag messages — both full 40-char and short forms — are +// rewritten to the corresponding SHA256 when those SHA1s are translated +// objects in the same conversion, and that ambiguous/unknown short +// prefixes are left alone. 
+func TestTranslator_RewritesMessageHashes(t *testing.T) { + root := t.TempDir() + srcDir := filepath.Join(root, "src.git") + dstDir := filepath.Join(root, "dst.git") + + srcRepo, err := git.PlainInit(srcDir, true) + if err != nil { + t.Fatalf("init SHA1 source: %v", err) + } + dstRepo, err := git.PlainInit(dstDir, true, git.WithObjectFormat(formatcfg.SHA256)) + if err != nil { + t.Fatalf("init SHA256 target: %v", err) + } + + blobHash := writeBlob(t, srcRepo.Storer, []byte("x\n")) + treeHash := writeTree(t, srcRepo.Storer, []object.TreeEntry{ + {Name: "f", Mode: filemode.Regular, Hash: blobHash}, + }) + sig := object.Signature{Name: "Test", Email: "t@example.com", When: time.Unix(1700000000, 0).UTC()} + parent := &object.Commit{Author: sig, Committer: sig, Message: "first\n", TreeHash: treeHash} + parentSHA1 := writeObject(t, srcRepo.Storer, parent.Encode) + + // Child commit's message references the parent by full hash, by 7-char + // short prefix, and includes an unrelated 7-char hex string that should + // not match anything in the mapping. + parentHex := parentSHA1.String() + childMsg := fmt.Sprintf( + "reverts %s\nsee short %s for context\nunrelated hex 1234567 follows\n", + parentHex, parentHex[:7]) + child := &object.Commit{ + Author: sig, + Committer: sig, + Message: childMsg, + TreeHash: treeHash, + ParentHashes: []plumbing.Hash{parentSHA1}, + } + childSHA1 := writeObject(t, srcRepo.Storer, child.Encode) + + reachable, err := discoverReachable(t.Context(), srcRepo.Storer, []plumbing.Hash{childSHA1}, nil) + if err != nil { + t.Fatalf("discoverReachable: %v", err) + } + tr, err := newTranslator(t.Context(), srcRepo.Storer, dstRepo.Storer, dstDir, true, reachable) + if err != nil { + t.Fatalf("newTranslator: %v", err) + } + if _, err := tr.translate(childSHA1); err != nil { + t.Fatalf("translate child: %v", err) + } + + // 2 references should have been rewritten (full + short). The unrelated + // 7-char hex string is not in the mapping, so it stays. 
+ if tr.messageRewrites != 2 { + t.Errorf("message rewrites: got %d, want 2", tr.messageRewrites) + } + + childNew := tr.mapping[childSHA1] + parentNew := tr.mapping[parentSHA1] + gotChild, err := object.GetCommit(dstRepo.Storer, childNew) + if err != nil { + t.Fatalf("read translated child: %v", err) + } + if !strings.Contains(gotChild.Message, parentNew.String()) { + t.Errorf("child message missing full SHA256 of parent:\n%s", gotChild.Message) + } + if strings.Contains(gotChild.Message, parentHex) { + t.Errorf("child message still contains original parent SHA1:\n%s", gotChild.Message) + } + if !strings.Contains(gotChild.Message, "1234567") { + t.Errorf("unrelated short hex was wrongly substituted:\n%s", gotChild.Message) + } +} + +// TestTranslator_RewritesCrossBranchReferences is the test that proves the +// discovery-plus-topological-DFS design fixes the cross-branch limitation +// the older inline-only rewriter had. Two unrelated branches share no +// ancestry. Branch A has a single commit cA. Branch B has commit cB whose +// message references cA by both full and abbreviated SHA1. We translate B +// first, *then* A — the order under which the older code would have left +// cB's message un-rewritten because cA was not yet in the mapping when cB +// was encoded. With message-reference edges in the DFS, translating cB +// pulls cA in via t.translate, so the mapping is populated and the +// rewrite succeeds. 
+func TestTranslator_RewritesCrossBranchReferences(t *testing.T) { + root := t.TempDir() + srcDir := filepath.Join(root, "src.git") + dstDir := filepath.Join(root, "dst.git") + srcRepo := initSHA1(t, srcDir) + dstRepo := initSHA256(t, dstDir) + + blobA := writeBlob(t, srcRepo.Storer, []byte("a\n")) + treeA := writeTree(t, srcRepo.Storer, []object.TreeEntry{ + {Name: "a", Mode: filemode.Regular, Hash: blobA}, + }) + blobB := writeBlob(t, srcRepo.Storer, []byte("b\n")) + treeB := writeTree(t, srcRepo.Storer, []object.TreeEntry{ + {Name: "b", Mode: filemode.Regular, Hash: blobB}, + }) + + sig := object.Signature{Name: "Test", Email: "t@example.com", When: time.Unix(1700000000, 0).UTC()} + cA := writeObject(t, srcRepo.Storer, (&object.Commit{ + Author: sig, Committer: sig, Message: "branch A tip\n", TreeHash: treeA, + }).Encode) + // cB has no parent in common with cA — they are siblings under + // no ancestor, exactly the case where ancestor-only inline + // rewriting would have failed. + cAHex := cA.String() + cB := writeObject(t, srcRepo.Storer, (&object.Commit{ + Author: sig, + Committer: sig, + Message: fmt.Sprintf("branch B tip\n\nCherry-picked from %s\nsee short %s\n", + cAHex, cAHex[:8]), + TreeHash: treeB, + }).Encode) + + // Discovery must see both branches so the reachable set covers cA + // before cB is encoded. + reachable, err := discoverReachable(t.Context(), srcRepo.Storer, []plumbing.Hash{cB, cA}, nil) + if err != nil { + t.Fatalf("discoverReachable: %v", err) + } + tr := mustTranslator(t, srcRepo.Storer, dstRepo.Storer, dstDir, true, reachable) + // Translate B first — the order that would have left the rewrite + // stranded under the old design. 
+ if _, err := tr.translate(cB); err != nil { + t.Fatalf("translate cB: %v", err) + } + if _, err := tr.translate(cA); err != nil { + t.Fatalf("translate cA: %v", err) + } + + if tr.messageRewrites != 2 { + t.Errorf("expected 2 rewrites (full + short SHA1 of cA), got %d", tr.messageRewrites) + } + cBNew := tr.mapping[cB] + cANew := tr.mapping[cA] + if cBNew.IsZero() || cANew.IsZero() { + t.Fatalf("missing mapping entries: cB=%s cA=%s", cBNew, cANew) + } + gotB, err := object.GetCommit(dstRepo.Storer, cBNew) + if err != nil { + t.Fatalf("read cB: %v", err) + } + if !strings.Contains(gotB.Message, cANew.String()) { + t.Errorf("cB's message missing cA's SHA256:\n%s", gotB.Message) + } + if strings.Contains(gotB.Message, cAHex) { + t.Errorf("cB's message still contains cA's original SHA1:\n%s", gotB.Message) + } +} + +// TestTranslator_SkipMessageRewrite confirms that with rewriteMessages +// false, the translator leaves message content (including SHA1 hashes) +// untouched. +func TestTranslator_SkipMessageRewrite(t *testing.T) { + root := t.TempDir() + srcRepo := initSHA1(t, filepath.Join(root, "src.git")) + dstRepo := initSHA256(t, filepath.Join(root, "dst.git")) + + blob := writeBlob(t, srcRepo.Storer, []byte("x\n")) + tree := writeTree(t, srcRepo.Storer, []object.TreeEntry{{Name: "f", Mode: filemode.Regular, Hash: blob}}) + sig := object.Signature{Name: "Test", Email: "t@example.com", When: time.Unix(1, 0).UTC()} + parent := writeObject(t, srcRepo.Storer, (&object.Commit{Author: sig, Committer: sig, Message: "p\n", TreeHash: tree}).Encode) + parentHex := parent.String() + + child := &object.Commit{ + Author: sig, Committer: sig, TreeHash: tree, ParentHashes: []plumbing.Hash{parent}, + Message: "reverts " + parentHex + "\n", + } + childSHA1 := writeObject(t, srcRepo.Storer, child.Encode) + + reachable, err := discoverReachable(t.Context(), srcRepo.Storer, []plumbing.Hash{childSHA1}, nil) + if err != nil { + t.Fatalf("discoverReachable: %v", err) + } + tr := 
mustTranslator(t, srcRepo.Storer, dstRepo.Storer, filepath.Join(root, "dst.git"), false, reachable) + if _, err := tr.translate(childSHA1); err != nil { + t.Fatalf("translate: %v", err) + } + if tr.messageRewrites != 0 { + t.Errorf("expected no rewrites when disabled; got %d", tr.messageRewrites) + } + got, err := object.GetCommit(dstRepo.Storer, tr.mapping[childSHA1]) + if err != nil { + t.Fatalf("read translated child: %v", err) + } + if !strings.Contains(got.Message, parentHex) { + t.Errorf("rewrite-disabled run still mutated the message: %q", got.Message) + } +} + +// TestTranslator_WriteOriginNotes builds a small history and verifies that +// the notes tree contains one entry per translated commit and that each +// entry resolves to a blob whose content is the commit's original SHA1. +func TestTranslator_WriteOriginNotes(t *testing.T) { + root := t.TempDir() + srcRepo := initSHA1(t, filepath.Join(root, "src.git")) + dstRepo := initSHA256(t, filepath.Join(root, "dst.git")) + + blob := writeBlob(t, srcRepo.Storer, []byte("hi\n")) + tree := writeTree(t, srcRepo.Storer, []object.TreeEntry{{Name: "f", Mode: filemode.Regular, Hash: blob}}) + sig := object.Signature{Name: "Test", Email: "t@example.com", When: time.Unix(1700000000, 0).UTC()} + c1 := writeObject(t, srcRepo.Storer, (&object.Commit{Author: sig, Committer: sig, Message: "c1\n", TreeHash: tree}).Encode) + c2 := writeObject(t, srcRepo.Storer, (&object.Commit{Author: sig, Committer: sig, Message: "c2\n", TreeHash: tree, ParentHashes: []plumbing.Hash{c1}}).Encode) + + reachable, err := discoverReachable(t.Context(), srcRepo.Storer, []plumbing.Hash{c2}, nil) + if err != nil { + t.Fatalf("discoverReachable: %v", err) + } + tr := mustTranslator(t, srcRepo.Storer, dstRepo.Storer, filepath.Join(root, "dst.git"), false, reachable) + if _, err := tr.translate(c2); err != nil { + t.Fatalf("translate: %v", err) + } + + refName, err := tr.writeOriginNotes(originNotesRef) + if err != nil { + t.Fatalf("writeOriginNotes: 
%v", err) + } + if refName != originNotesRef { + t.Errorf("ref name: got %q, want %q", refName, originNotesRef) + } + notesCommit, err := object.GetCommit(dstRepo.Storer, tr.lastNotesCommit) + if err != nil { + t.Fatalf("read notes commit: %v", err) + } + notesTree, err := notesCommit.Tree() + if err != nil { + t.Fatalf("read notes tree: %v", err) + } + if len(notesTree.Entries) != 2 { + t.Fatalf("notes entries: got %d, want 2", len(notesTree.Entries)) + } + for _, mapped := range []plumbing.Hash{tr.mapping[c1], tr.mapping[c2]} { + entry, err := notesTree.FindEntry(mapped.String()) + if err != nil { + t.Fatalf("no notes entry for %s: %v", mapped, err) + } + blob, err := object.GetBlob(dstRepo.Storer, entry.Hash) + if err != nil { + t.Fatalf("read note blob: %v", err) + } + reader, err := blob.Reader() + if err != nil { + t.Fatalf("open note blob: %v", err) + } + buf, err := io.ReadAll(reader) + if err != nil { + _ = reader.Close() + t.Fatalf("read note blob: %v", err) + } + _ = reader.Close() + got := strings.TrimSpace(string(buf)) + var origSHA1 plumbing.Hash + for s, n := range tr.mapping { + if n == mapped { + origSHA1 = s + break + } + } + if got != origSHA1.String() { + t.Errorf("note for %s: got %q, want %q", mapped, got, origSHA1.String()) + } + } +} + +// TestTranslator_WriteMappingFile checks the sidecar TSV format: header +// line, sorted by SHA1, one entry per translated object. 
+func TestTranslator_WriteMappingFile(t *testing.T) { + root := t.TempDir() + srcRepo := initSHA1(t, filepath.Join(root, "src.git")) + dstRepo := initSHA256(t, filepath.Join(root, "dst.git")) + + blob := writeBlob(t, srcRepo.Storer, []byte("hi\n")) + tree := writeTree(t, srcRepo.Storer, []object.TreeEntry{{Name: "f", Mode: filemode.Regular, Hash: blob}}) + sig := object.Signature{Name: "Test", Email: "t@example.com", When: time.Unix(1700000000, 0).UTC()} + commit := writeObject(t, srcRepo.Storer, (&object.Commit{Author: sig, Committer: sig, Message: "c\n", TreeHash: tree}).Encode) + + reachable, err := discoverReachable(t.Context(), srcRepo.Storer, []plumbing.Hash{commit}, nil) + if err != nil { + t.Fatalf("discoverReachable: %v", err) + } + tr := mustTranslator(t, srcRepo.Storer, dstRepo.Storer, filepath.Join(root, "dst.git"), false, reachable) + if _, err := tr.translate(commit); err != nil { + t.Fatalf("translate: %v", err) + } + + path := filepath.Join(root, "mapping.tsv") + if err := tr.writeMappingFile(path); err != nil { + t.Fatalf("writeMappingFile: %v", err) + } + raw, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read mapping: %v", err) + } + lines := strings.Split(strings.TrimRight(string(raw), "\n"), "\n") + if !strings.HasPrefix(lines[0], "#") { + t.Errorf("first line should be a header comment, got %q", lines[0]) + } + data := lines[1:] + if len(data) != len(tr.mapping) { + t.Errorf("mapping line count: got %d, want %d", len(data), len(tr.mapping)) + } + // Sorted by SHA1. + for i := 1; i < len(data); i++ { + prev := strings.Split(data[i-1], "\t")[0] + cur := strings.Split(data[i], "\t")[0] + if prev >= cur { + t.Errorf("mapping not sorted: %q >= %q", prev, cur) + } + } + // Every translated hash present. 
+ mapped := map[string]string{} + for _, line := range data { + parts := strings.Split(line, "\t") + if len(parts) != 2 { + t.Errorf("malformed line %q", line) + continue + } + mapped[parts[0]] = parts[1] + } + for old, newH := range tr.mapping { + if mapped[old.String()] != newH.String() { + t.Errorf("missing or wrong mapping for %s: got %q, want %s", old, mapped[old.String()], newH) + } + } +} + +// TestTranslator_AmbiguousMessageRefWarning verifies that when an +// abbreviated SHA1 prefix in a commit message matches more than one +// in-scope commit, the prefix is left unrewritten and recorded in +// t.ambiguousMessageRefs so the caller can surface a warning. +// +// We can't easily force a real SHA1 prefix collision in a test, so +// we install two synthetic entries in the reachable map after the +// translator is constructed and then run rewriteHashesInMessage +// directly. This exercises the same code path the production +// pipeline takes. +func TestTranslator_AmbiguousMessageRefWarning(t *testing.T) { + root := t.TempDir() + srcRepo := initSHA1(t, filepath.Join(root, "src.git")) + dstRepo := initSHA256(t, filepath.Join(root, "dst.git")) + tr := mustTranslator(t, srcRepo.Storer, dstRepo.Storer, filepath.Join(root, "dst.git"), true, nil) + + // Two real-looking SHA1 hashes that share the prefix "deadbee". 
+ one := plumbing.NewHash("deadbee100000000000000000000000000000001") + two := plumbing.NewHash("deadbee200000000000000000000000000000002") + tr.reachable[one] = plumbing.CommitObject + tr.reachable[two] = plumbing.CommitObject + + out, count := tr.rewriteHashesInMessage("see commit deadbee for details\n") + if count != 0 { + t.Errorf("ambiguous prefix should not be rewritten; got count=%d", count) + } + if !strings.Contains(out, "deadbee") { + t.Errorf("ambiguous prefix should be left in message; got %q", out) + } + if _, recorded := tr.ambiguousMessageRefs["deadbee"]; !recorded { + t.Errorf("expected %q to be recorded in ambiguousMessageRefs, got %v", + "deadbee", tr.ambiguousMessageRefs) + } +} + +// TestTranslator_UnresolvableSubmodule confirms that a tree entry with +// Submodule mode pointing at a commit not in the source repo is +// rejected during discovery (fail-fast), before any object is written +// to the target. +func TestTranslator_UnresolvableSubmodule(t *testing.T) { + root := t.TempDir() + srcDir := filepath.Join(root, "src.git") + + srcRepo, err := git.PlainInit(srcDir, true) + if err != nil { + t.Fatalf("init SHA1 source: %v", err) + } + + blobHash := writeBlob(t, srcRepo.Storer, []byte("contents\n")) + // External-looking SHA1 — not in source. 
+ external := plumbing.NewHash("0123456789abcdef0123456789abcdef01234567") + treeHash := writeTree(t, srcRepo.Storer, []object.TreeEntry{ + {Name: "file", Mode: filemode.Regular, Hash: blobHash}, + {Name: "sub", Mode: filemode.Submodule, Hash: external}, + }) + + _, err = discoverReachable(t.Context(), srcRepo.Storer, []plumbing.Hash{treeHash}, nil) + if err == nil { + t.Fatal("expected discoverReachable to fail on unresolvable submodule, got nil") + } + if !strings.Contains(err.Error(), "submodule") { + t.Errorf("error should mention submodule; got: %v", err) + } +} + +// TestTranslator_VendoredSubmoduleStillRefused locks in the rule that +// even a submodule whose commit happens to live in the source store is +// rejected. The earlier "vendored" carve-out rewrote such gitlinks to +// SHA256, but .gitmodules still points at an upstream SHA1 repo, so +// `git submodule update` would fail in clones of the converted repo. +func TestTranslator_VendoredSubmoduleStillRefused(t *testing.T) { + root := t.TempDir() + srcDir := filepath.Join(root, "src.git") + + srcRepo, err := git.PlainInit(srcDir, true) + if err != nil { + t.Fatalf("init SHA1 source: %v", err) + } + + // Create a commit that lives in this source store, then point a + // tree's submodule gitlink at it. discoverReachable used to recurse + // into that commit ("vendored") and translate the gitlink; now it + // refuses regardless. 
+ blobHash := writeBlob(t, srcRepo.Storer, []byte("inner\n")) + innerTree := writeTree(t, srcRepo.Storer, []object.TreeEntry{ + {Name: "f", Mode: filemode.Regular, Hash: blobHash}, + }) + sig := object.Signature{Name: "Test", Email: "t@example.com", When: time.Unix(1700000000, 0).UTC()} + innerCommit := &object.Commit{Author: sig, Committer: sig, Message: "inner\n", TreeHash: innerTree} + innerSHA1 := writeObject(t, srcRepo.Storer, innerCommit.Encode) + + outerTree := writeTree(t, srcRepo.Storer, []object.TreeEntry{ + {Name: "sub", Mode: filemode.Submodule, Hash: innerSHA1}, + }) + + _, err = discoverReachable(t.Context(), srcRepo.Storer, []plumbing.Hash{outerTree}, nil) + if err == nil { + t.Fatal("expected discoverReachable to refuse vendored submodule, got nil") + } + if !strings.Contains(err.Error(), "submodule") { + t.Errorf("error should mention submodule; got: %v", err) + } +} + +// --- helpers --- + +// initSHA1 and initSHA256 are t.Fatalf-wrapping `git.PlainInit` shortcuts +// used to keep test bodies focused on the translator logic rather than +// error-handling boilerplate. 
+func initSHA1(t *testing.T, path string) *git.Repository { + t.Helper() + r, err := git.PlainInit(path, true) + if err != nil { + t.Fatalf("init SHA1 source at %s: %v", path, err) + } + return r +} + +func initSHA256(t *testing.T, path string) *git.Repository { + t.Helper() + r, err := git.PlainInit(path, true, git.WithObjectFormat(formatcfg.SHA256)) + if err != nil { + t.Fatalf("init SHA256 target at %s: %v", path, err) + } + return r +} + +func mustTranslator(t *testing.T, src, dst gogitstorer.Storer, dir string, rewrite bool, reachable map[plumbing.Hash]plumbing.ObjectType) *translator { + t.Helper() + tr, err := newTranslator(t.Context(), src, dst, dir, rewrite, reachable) + if err != nil { + t.Fatalf("newTranslator: %v", err) + } + return tr +} + +func writeBlob(t *testing.T, storer interface { + NewEncodedObject() plumbing.EncodedObject + SetEncodedObject(obj plumbing.EncodedObject) (plumbing.Hash, error) +}, content []byte) plumbing.Hash { + t.Helper() + obj := storer.NewEncodedObject() + obj.SetType(plumbing.BlobObject) + obj.SetSize(int64(len(content))) + w, err := obj.Writer() + if err != nil { + t.Fatalf("blob writer: %v", err) + } + if _, err := w.Write(content); err != nil { + t.Fatalf("blob write: %v", err) + } + if err := w.Close(); err != nil { + t.Fatalf("blob close: %v", err) + } + h, err := storer.SetEncodedObject(obj) + if err != nil { + t.Fatalf("blob store: %v", err) + } + return h +} + +func writeTree(t *testing.T, storer interface { + NewEncodedObject() plumbing.EncodedObject + SetEncodedObject(obj plumbing.EncodedObject) (plumbing.Hash, error) +}, entries []object.TreeEntry) plumbing.Hash { + t.Helper() + tree := &object.Tree{Entries: entries} + // object.Tree.Encode requires the slice to be sorted by name; tests + // pre-sort their entries, but be safe. 
+ return writeObject(t, storer, tree.Encode) +} + +func writeObject(t *testing.T, storer interface { + NewEncodedObject() plumbing.EncodedObject + SetEncodedObject(obj plumbing.EncodedObject) (plumbing.Hash, error) +}, encode func(plumbing.EncodedObject) error) plumbing.Hash { + t.Helper() + obj := storer.NewEncodedObject() + if err := encode(obj); err != nil { + t.Fatalf("encode: %v", err) + } + h, err := storer.SetEncodedObject(obj) + if err != nil { + t.Fatalf("store: %v", err) + } + return h +} + +// assertLooseObjectHashMatches reads the on-disk loose object for h, zlib- +// decompresses it, and confirms sha256(decompressed bytes) == h. The +// decompressed bytes include the " \x00" header, which is what +// git hashes — so this is a direct check on the loose writer's correctness. +func assertLooseObjectHashMatches(t *testing.T, objectsDir string, h plumbing.Hash) { + t.Helper() + hex := h.String() + if len(hex) != 64 { + t.Errorf("hash %s is not 64 hex chars (sha256)", hex) + return + } + path := filepath.Join(objectsDir, hex[:2], hex[2:]) + raw, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read %s: %v", path, err) + } + zr, err := zlib.NewReader(bytes.NewReader(raw)) + if err != nil { + t.Fatalf("zlib %s: %v", path, err) + } + defer zr.Close() + plain, err := io.ReadAll(zr) + if err != nil { + t.Fatalf("decompress %s: %v", path, err) + } + sum := sha256.Sum256(plain) + got := makeHex(sum[:]) + if got != hex { + t.Errorf("loose object %s: sha256(content) = %s; filename and content disagree", hex, got) + } +} + +func makeHex(b []byte) string { + return hex.EncodeToString(b) +} + +// --- Integration test (gated) --- + +const gitHTTPBackendEnv = "GITSYNC_E2E_SHA256_HTTP_BACKEND" + +// TestRun_GitHTTPBackend exercises the full convert-sha256 pipeline against +// a local git http-backend serving a real SHA1 source repo. Gated like the +// other end-to-end git-http-backend tests to keep the default test runs +// hermetic (no external binaries required). 
+func TestRun_GitHTTPBackend(t *testing.T) { + if os.Getenv(gitHTTPBackendEnv) == "" { + t.Skipf("set %s=1 to run the convert-sha256 git-http-backend integration test", gitHTTPBackendEnv) + } + gitBin, err := exec.LookPath("git") + if err != nil { + t.Skipf("git binary not available: %v", err) + } + + root := t.TempDir() + srcBare := filepath.Join(root, "source.git") + worktree := filepath.Join(root, "work") + dstDir := filepath.Join(root, "target.git") + + mustGit(t, root, "init", "--bare", srcBare) + mustGit(t, root, "init", "-b", "main", worktree) + mustGit(t, worktree, "config", "user.name", "convert-sha256 test") + mustGit(t, worktree, "config", "user.email", "test@example.com") + mustWrite(t, filepath.Join(worktree, "README"), "hello\n") + mustGit(t, worktree, "add", "README") + mustGit(t, worktree, "commit", "-m", "initial") + // Capture the first commit's SHA1 so the second commit's message can + // reference it (both full and abbreviated). The conversion should + // rewrite both to the new SHA256 hash. 
+ firstSHA1 := strings.TrimSpace(mustGitOutput(t, worktree, "rev-parse", "HEAD")) + mustWrite(t, filepath.Join(worktree, "second.txt"), "world\n") + mustGit(t, worktree, "add", "second.txt") + mustGit(t, worktree, "commit", "-m", + fmt.Sprintf("second\n\nreverts %s\nsee short %s", firstSHA1, firstSHA1[:7])) + mustGit(t, worktree, "tag", "-a", "v1", "-m", "first tag") + mustGit(t, worktree, "remote", "add", "origin", srcBare) + mustGit(t, worktree, "push", "origin", "HEAD:refs/heads/main") + mustGit(t, worktree, "push", "origin", "v1") + + srv := newCGIBackend(t, gitBin, root) + defer srv.Close() + + mappingPath := filepath.Join(root, "mapping.tsv") + res, err := Run(context.Background(), Request{ + SourceURL: srv.URL + "/source.git", + TargetDir: dstDir, + MappingFile: mappingPath, + Check: true, + Out: io.Discard, + }) + if err != nil { + t.Fatalf("convert-sha256 run: %v", err) + } + if res.Counts.Commits < 2 { + t.Errorf("expected at least 2 commits converted, got %+v", res.Counts) + } + if res.Counts.Tags != 1 { + t.Errorf("expected 1 tag converted, got %d", res.Counts.Tags) + } + if res.RefsConverted < 2 { + t.Errorf("expected at least 2 refs (main + v1), got %d", res.RefsConverted) + } + + // The converted repo must be self-consistent under SHA256. + fsckOut, err := exec.CommandContext(t.Context(), gitBin, "-C", dstDir, "fsck", "--full").CombinedOutput() + if err != nil { + t.Fatalf("git fsck failed: %v\n%s", err, fsckOut) + } + if strings.Contains(string(fsckOut), "error") || strings.Contains(string(fsckOut), "bad sha") { + t.Fatalf("git fsck reported errors:\n%s", fsckOut) + } + + // Sanity: extensions.objectformat is set, and git can walk the history. 
+ format := mustGitOutput(t, dstDir, "config", "extensions.objectformat") + if strings.TrimSpace(format) != "sha256" { + t.Errorf("extensions.objectformat: got %q, want %q", strings.TrimSpace(format), "sha256") + } + log := mustGitOutput(t, dstDir, "log", "--oneline", "refs/heads/main") + if !strings.Contains(log, "initial") || !strings.Contains(log, "second") { + t.Errorf("git log missing expected commit subjects:\n%s", log) + } + tagShow := mustGitOutput(t, dstDir, "cat-file", "-p", "refs/tags/v1") + if !strings.Contains(tagShow, "first tag") { + t.Errorf("annotated tag did not round-trip:\n%s", tagShow) + } + + // Message rewriting: the second commit's body referenced firstSHA1 + // twice (full + 7-char short). Both should now be SHA256 hashes. + if res.MessageRewrites != 2 { + t.Errorf("message rewrites: got %d, want 2", res.MessageRewrites) + } + secondMsg := mustGitOutput(t, dstDir, "log", "-1", "--format=%B", "refs/heads/main") + if strings.Contains(secondMsg, firstSHA1) { + t.Errorf("second commit message still contains the original SHA1:\n%s", secondMsg) + } + + // Origin notes: the ref exists, and the head commit's note resolves + // to the original SHA1 it was rewritten from. + if res.OriginNotesRef != "refs/notes/sha1-origin" { + t.Errorf("OriginNotesRef: got %q, want refs/notes/sha1-origin", res.OriginNotesRef) + } + headSHA256 := strings.TrimSpace(mustGitOutput(t, dstDir, "rev-parse", "refs/heads/main")) + note := strings.TrimSpace(mustGitOutput(t, dstDir, "notes", "--ref=sha1-origin", "show", headSHA256)) + // The note for the second (head) commit holds its pre-conversion SHA1. + headSHA1 := strings.TrimSpace(mustGitOutput(t, srcBare, "rev-parse", "refs/heads/main")) + if note != headSHA1 { + t.Errorf("origin note for head: got %q, want %q", note, headSHA1) + } + + // Mapping file: present, sorted, has at least one entry per + // translated commit/tree/blob/tag. 
+ if res.MappingFile != mappingPath { + t.Errorf("MappingFile: got %q, want %q", res.MappingFile, mappingPath) + } + mapping, err := os.ReadFile(mappingPath) + if err != nil { + t.Fatalf("read mapping file: %v", err) + } + if !strings.Contains(string(mapping), headSHA1) { + t.Errorf("mapping file missing head SHA1 %s:\n%s", headSHA1, mapping) + } + + // --check: every step should pass against a freshly-converted repo, + // including git fsck --full (available since we already need the + // git binary to drive the source side of this test). + if len(res.Checks) == 0 { + t.Fatal("expected Checks to be populated when --check is enabled") + } + for _, c := range res.Checks { + if !c.OK { + t.Errorf("check %q failed: %s", c.Name, c.Detail) + } + } + expected := map[string]bool{"config": false, "HEAD": false, "refs": false, "git fsck --full": false} + for _, c := range res.Checks { + expected[c.Name] = true + } + for name, present := range expected { + if !present { + t.Errorf("--check did not run %q step", name) + } + } +} + +// TestRun_GitHTTPBackend_Sign verifies the --sign path end-to-end. SSH +// signing is used (not GPG) because it can be set up from scratch in the +// test with just ssh-keygen, no agent required. 
+func TestRun_GitHTTPBackend_Sign(t *testing.T) { + if os.Getenv(gitHTTPBackendEnv) == "" { + t.Skipf("set %s=1 to run the convert-sha256 git-http-backend integration test", gitHTTPBackendEnv) + } + gitBin, err := exec.LookPath("git") + if err != nil { + t.Skipf("git binary not available: %v", err) + } + sshKeygenBin, err := exec.LookPath("ssh-keygen") + if err != nil { + t.Skipf("ssh-keygen not available: %v", err) + } + + root := t.TempDir() + srcBare := filepath.Join(root, "source.git") + worktree := filepath.Join(root, "work") + dstDir := filepath.Join(root, "target.git") + + mustGit(t, root, "init", "--bare", srcBare) + mustGit(t, root, "init", "-b", "main", worktree) + mustGit(t, worktree, "config", "user.name", "convert-sha256 test") + mustGit(t, worktree, "config", "user.email", "test@example.com") + mustWrite(t, filepath.Join(worktree, "README"), "hello\n") + mustGit(t, worktree, "add", "README") + mustGit(t, worktree, "commit", "-m", "initial") + mustGit(t, worktree, "remote", "add", "origin", srcBare) + mustGit(t, worktree, "push", "origin", "HEAD:refs/heads/main") + + // Generate an ephemeral ed25519 SSH key for signing. + keyPath := filepath.Join(root, "signkey") + keygen := exec.CommandContext(t.Context(), sshKeygenBin, "-q", "-t", "ed25519", "-N", "", "-f", keyPath, "-C", "test@example.com") + if out, err := keygen.CombinedOutput(); err != nil { + t.Fatalf("ssh-keygen: %v\n%s", err, out) + } + + // Write a global gitconfig that points git at SSH signing using the + // ephemeral key, and route GIT_CONFIG_GLOBAL at it so signBranchTips' + // subprocess inherits the config. 
+ globalCfg := filepath.Join(root, "global.gitconfig") + if err := os.WriteFile(globalCfg, []byte(fmt.Sprintf(` +[user] + name = Conversion Test + email = test@example.com + signingkey = %s +[gpg] + format = ssh +`, keyPath)), 0o600); err != nil { + t.Fatalf("write global gitconfig: %v", err) + } + t.Setenv("GIT_CONFIG_GLOBAL", globalCfg) + // Disable any system gitconfig so the test isn't influenced by host + // signing config. + t.Setenv("GIT_CONFIG_SYSTEM", "/dev/null") + + srv := newCGIBackend(t, gitBin, root) + defer srv.Close() + + res, err := Run(context.Background(), Request{ + SourceURL: srv.URL + "/source.git", + TargetDir: dstDir, + Sign: true, + Out: io.Discard, + }) + if err != nil { + t.Fatalf("convert-sha256 run: %v", err) + } + + wantTag := "refs/tags/converted/main" + if len(res.SignedTags) != 1 || res.SignedTags[0] != wantTag { + t.Errorf("SignedTags: got %v, want [%s]", res.SignedTags, wantTag) + } + + // The tag exists in the target and is an annotated, signed tag (the + // body contains a SSH SIGNATURE block; cat-file -p shows the tag + // object including the signature). + tagShow := mustGitOutput(t, dstDir, "cat-file", "-p", wantTag) + if !strings.Contains(tagShow, "BEGIN SSH SIGNATURE") { + t.Errorf("expected signed tag to contain an SSH SIGNATURE block:\n%s", tagShow) + } + if !strings.Contains(tagShow, "SHA1 → SHA256 conversion attestation") { + t.Errorf("expected signed tag message to contain attestation text:\n%s", tagShow) + } + + // Tag's target should be the branch tip (the SHA256 hash of the + // converted main). + mainTip := strings.TrimSpace(mustGitOutput(t, dstDir, "rev-parse", "refs/heads/main")) + tagTarget := strings.TrimSpace(mustGitOutput(t, dstDir, "rev-list", "-n", "1", wantTag)) + if tagTarget != mainTip { + t.Errorf("signed tag target: got %s, want %s (main tip)", tagTarget, mainTip) + } +} + +func mustGit(t *testing.T, dir string, args ...string) { + t.Helper() + cmd := exec.CommandContext(t.Context(), "git", args...) 
+ cmd.Dir = dir + cmd.Env = append(os.Environ(), "GIT_TERMINAL_PROMPT=0") + if out, err := cmd.CombinedOutput(); err != nil { + t.Fatalf("git %s: %v\n%s", strings.Join(args, " "), err, out) + } +} + +func mustGitOutput(t *testing.T, dir string, args ...string) string { + t.Helper() + cmd := exec.CommandContext(t.Context(), "git", args...) + cmd.Dir = dir + cmd.Env = append(os.Environ(), "GIT_TERMINAL_PROMPT=0") + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("git %s: %v\n%s", strings.Join(args, " "), err, out) + } + return string(out) +} + +func mustWrite(t *testing.T, path, content string) { + t.Helper() + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatalf("write %s: %v", path, err) + } +} + +type cgiBackend struct { + *httptest.Server +} + +func newCGIBackend(t *testing.T, gitBin, root string) *cgiBackend { + t.Helper() + handler := &cgi.Handler{ + Path: gitBin, + Args: []string{"http-backend"}, + Env: []string{ + "GIT_PROJECT_ROOT=" + root, + "GIT_HTTP_EXPORT_ALL=1", + }, + } + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + handler.ServeHTTP(w, r) + })) + return &cgiBackend{Server: srv} +} + +// Compile-time sanity: confirm the storers the translator expects are still +// the filesystem-backed type that PlainInit returns. If a future go-git +// release changes the concrete storer, the type assertion in newTranslator +// will start failing in this package's tests rather than only at runtime +// against a real repo. 
+var _ = (*filesystem.Storage)(nil) + +func TestProtectedExcludePrefixes(t *testing.T) { + tests := []struct { + name string + prefixes []string + want []string + }{ + {"nil input", nil, nil}, + {"single benign namespace", []string{"refs/pull/"}, nil}, + {"multiple benign namespaces", []string{"refs/pull/", "refs/notes/", "refs/changes/"}, nil}, + {"whole branches namespace banned", []string{"refs/heads/"}, []string{"refs/heads/"}}, + {"whole tags namespace banned", []string{"refs/tags/"}, []string{"refs/tags/"}}, + {"branch sub-namespace banned", []string{"refs/heads/feature/"}, []string{"refs/heads/feature/"}}, + {"tag sub-namespace banned", []string{"refs/tags/v1/"}, []string{"refs/tags/v1/"}}, + {"refs/ banned because it would drop everything", []string{"refs/"}, []string{"refs/"}}, + {"empty string banned (would drop every ref)", []string{""}, []string{""}}, + {"partial refs/h banned (covers refs/heads/)", []string{"refs/h"}, []string{"refs/h"}}, + {"mixed input reports only the bad ones, in order", []string{"refs/pull/", "refs/heads/", "refs/notes/", "refs/tags/v1.0"}, []string{"refs/heads/", "refs/tags/v1.0"}}, + {"duplicates collapsed", []string{"refs/heads/", "refs/heads/"}, []string{"refs/heads/"}}, + {"trims whitespace before matching", []string{" refs/heads/ "}, []string{" refs/heads/ "}}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := protectedExcludePrefixes(tt.prefixes) + if len(got) != len(tt.want) { + t.Fatalf("protectedExcludePrefixes(%v) = %v, want %v", tt.prefixes, got, tt.want) + } + for i := range got { + if got[i] != tt.want[i] { + t.Fatalf("protectedExcludePrefixes(%v)[%d] = %q, want %q", tt.prefixes, i, got[i], tt.want[i]) + } + } + }) + } +} + +func TestRun_RejectsExcludePrefixesThatDropBranchesOrTags(t *testing.T) { + // We never reach the network here — the validation fires before + // any I/O — so a non-empty target dir is the only thing the early + // path needs. 
+ dst := t.TempDir() + req := Request{ + SourceURL: "http://example.invalid/repo.git", + TargetDir: filepath.Join(dst, "out"), + ExcludeRefPrefixes: []string{"refs/pull/", "refs/heads/feature/"}, + } + _, err := Run(t.Context(), req) + if err == nil { + t.Fatalf("Run accepted --exclude-ref-prefix refs/heads/feature/, expected refusal") + } + msg := err.Error() + if !strings.Contains(msg, "refs/heads/feature/") { + t.Fatalf("error did not name the offending prefix: %v", err) + } + if !strings.Contains(msg, "exclude-ref-prefix") { + t.Fatalf("error did not mention the flag: %v", err) + } +} + +func TestCheckSideOutputCollision(t *testing.T) { + mk := func(name string) planner.DesiredRef { + ref := plumbing.ReferenceName(name) + return planner.DesiredRef{SourceRef: ref, TargetRef: ref} + } + tests := []struct { + name string + desired map[plumbing.ReferenceName]planner.DesiredRef + skipOriginNotes bool + sign bool + wantErrSubstring string + }{ + { + name: "no collisions accepted", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/heads/main": mk("refs/heads/main"), + "refs/tags/v1": mk("refs/tags/v1"), + }, + wantErrSubstring: "", + }, + { + name: "origin-notes collision refused by default", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/heads/main": mk("refs/heads/main"), + "refs/notes/sha1-origin": mk("refs/notes/sha1-origin"), + }, + wantErrSubstring: "refs/notes/sha1-origin", + }, + { + name: "origin-notes collision allowed when --no-origin-notes set", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/notes/sha1-origin": mk("refs/notes/sha1-origin"), + }, + skipOriginNotes: true, + wantErrSubstring: "", + }, + { + name: "converted-tag collision refused only when --sign", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/heads/main": mk("refs/heads/main"), + "refs/tags/converted/main": mk("refs/tags/converted/main"), + }, + sign: true, + wantErrSubstring: "refs/tags/converted/main", + }, + { + 
name: "converted-tag without --sign passes through", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/tags/converted/main": mk("refs/tags/converted/main"), + }, + sign: false, + wantErrSubstring: "", + }, + { + name: "multiple converted-tag collisions listed in sorted order", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/tags/converted/zeta": mk("refs/tags/converted/zeta"), + "refs/tags/converted/alpha": mk("refs/tags/converted/alpha"), + }, + sign: true, + wantErrSubstring: "refs/tags/converted/alpha, refs/tags/converted/zeta", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := checkSideOutputCollision(tt.desired, tt.skipOriginNotes, tt.sign) + switch { + case tt.wantErrSubstring == "" && err != nil: + t.Fatalf("unexpected error: %v", err) + case tt.wantErrSubstring != "" && err == nil: + t.Fatalf("expected error containing %q, got nil", tt.wantErrSubstring) + case tt.wantErrSubstring != "" && !strings.Contains(err.Error(), tt.wantErrSubstring): + t.Fatalf("error %q does not contain %q", err.Error(), tt.wantErrSubstring) + } + }) + } +} + +// TestDiscoverReachable_HonorsCtxCancellation confirms discovery +// returns promptly when its context is canceled before it starts, +// matching the per-object check translate() already does. 
+func TestDiscoverReachable_HonorsCtxCancellation(t *testing.T) { + root := t.TempDir() + srcDir := filepath.Join(root, "src.git") + srcRepo, err := git.PlainInit(srcDir, true) + if err != nil { + t.Fatalf("init source: %v", err) + } + blob := writeBlob(t, srcRepo.Storer, []byte("x\n")) + tree := writeTree(t, srcRepo.Storer, []object.TreeEntry{{Name: "f", Mode: filemode.Regular, Hash: blob}}) + + ctx, cancel := context.WithCancel(t.Context()) + cancel() + _, err = discoverReachable(ctx, srcRepo.Storer, []plumbing.Hash{tree}, nil) + if err == nil { + t.Fatal("expected canceled ctx to surface as error") + } + if !errors.Is(err, context.Canceled) { + t.Errorf("error should wrap context.Canceled; got %v", err) + } +} + +// TestRunChecks_FsckSkippedWhenGitMissing locks in the Skipped flag +// for the fsck check. Callers that gate on Check.OK alone now can't +// tell a real fsck pass from a skip; Skipped resolves the ambiguity. +func TestRunChecks_FsckSkippedWhenGitMissing(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git binary available; this test exercises the missing-git path via PATH override") + } + // Force LookPath("git") to fail by overriding PATH. + t.Setenv("PATH", "") + dir := t.TempDir() + repo, err := git.PlainInit(dir, true, git.WithObjectFormat(formatcfg.SHA256)) + if err != nil { + t.Fatalf("init: %v", err) + } + checks := runChecks(t.Context(), dir, repo, 0, nil, false) + var fsck Check + for _, c := range checks { + if c.Name == "git fsck --full" { + fsck = c + break + } + } + if fsck.Name == "" { + t.Fatalf("fsck check missing from output") + } + if !fsck.Skipped { + t.Errorf("fsck should be Skipped when git is missing, got %+v", fsck) + } + if !fsck.OK { + t.Errorf("Skipped implies OK; got %+v", fsck) + } +} + +// TestHashPattern_CaseInsensitive locks in the (?i) on hashPattern — +// uppercase or mixed-case SHA1 references in messages must resolve +// against the (lowercase-canonical) reachable set. 
+func TestHashPattern_CaseInsensitive(t *testing.T) { + root := t.TempDir() + srcDir := filepath.Join(root, "src.git") + dstDir := filepath.Join(root, "dst.git") + + srcRepo, err := git.PlainInit(srcDir, true) + if err != nil { + t.Fatalf("init src: %v", err) + } + dstRepo, err := git.PlainInit(dstDir, true, git.WithObjectFormat(formatcfg.SHA256)) + if err != nil { + t.Fatalf("init dst: %v", err) + } + blob := writeBlob(t, srcRepo.Storer, []byte("x\n")) + tree := writeTree(t, srcRepo.Storer, []object.TreeEntry{{Name: "f", Mode: filemode.Regular, Hash: blob}}) + sig := object.Signature{Name: "T", Email: "t@example.com", When: time.Unix(1700000000, 0).UTC()} + parent := &object.Commit{Author: sig, Committer: sig, Message: "first\n", TreeHash: tree} + parentHash := writeObject(t, srcRepo.Storer, parent.Encode) + + // Reference the parent with an UPPERCASE full hash. + upper := strings.ToUpper(parentHash.String()) + child := &object.Commit{ + Author: sig, Committer: sig, + Message: "see " + upper + " for context\n", + TreeHash: tree, + ParentHashes: []plumbing.Hash{parentHash}, + } + childHash := writeObject(t, srcRepo.Storer, child.Encode) + + reachable, err := discoverReachable(t.Context(), srcRepo.Storer, []plumbing.Hash{childHash}, nil) + if err != nil { + t.Fatalf("discover: %v", err) + } + tr, err := newTranslator(t.Context(), srcRepo.Storer, dstRepo.Storer, dstDir, true, reachable) + if err != nil { + t.Fatalf("newTranslator: %v", err) + } + newChild, err := tr.translate(childHash) + if err != nil { + t.Fatalf("translate: %v", err) + } + if tr.messageRewrites != 1 { + t.Errorf("expected 1 rewrite (case-insensitive match), got %d", tr.messageRewrites) + } + c, err := object.GetCommit(dstRepo.Storer, newChild) + if err != nil { + t.Fatalf("read translated child: %v", err) + } + if strings.Contains(c.Message, upper) { + t.Errorf("uppercase SHA1 should have been rewritten; message: %q", c.Message) + } +} + +// TestFsckHasError_HandlesLongLinesAndCase covers two 
fragility +// fixes: lines longer than bufio.Scanner's 64 KiB default must not be +// silently truncated (we use bytes.Split now), and the "error" / +// "fatal" prefix match must be case-insensitive so e.g. older or +// custom git builds emitting "ERROR:" still trip the check. +func TestFsckHasError_HandlesLongLinesAndCase(t *testing.T) { + t.Run("long line still scanned", func(t *testing.T) { + // 100 KiB of dangling-blob filler followed by an error line. + out := append(bytes.Repeat([]byte("a"), 100*1024), []byte("\nerror: bad ref\n")...) + if !fsckHasError(out) { + t.Errorf("fsckHasError should detect error line after a long preceding line") + } + }) + t.Run("uppercase ERROR matches", func(t *testing.T) { + if !fsckHasError([]byte("ERROR: corruption\n")) { + t.Errorf("fsckHasError should match uppercase ERROR") + } + }) + t.Run("fatal without colon matches", func(t *testing.T) { + if !fsckHasError([]byte("Fatal failure in pack\n")) { + t.Errorf("fsckHasError should match Fatal prefix even without colon") + } + }) + t.Run("dangling warnings are not errors", func(t *testing.T) { + if fsckHasError([]byte("dangling commit abc123\n")) { + t.Errorf("dangling lines should not trip fsckHasError") + } + }) +} + +// TestResolveMessageRef_Memoizes confirms a second call for the same +// prefix doesn't re-scan reachable. We do not have a counter on the +// scan, so we test the cache by mutating reachable between calls and +// verifying the second call returns the original result. (In real +// usage, reachable is frozen — this is just a behavioral observation +// to lock in cache effectiveness.) 
+func TestResolveMessageRef_Memoizes(t *testing.T) { + root := t.TempDir() + srcDir := filepath.Join(root, "src.git") + dstDir := filepath.Join(root, "dst.git") + srcRepo, err := git.PlainInit(srcDir, true) + if err != nil { + t.Fatalf("init src: %v", err) + } + dstRepo, err := git.PlainInit(dstDir, true, git.WithObjectFormat(formatcfg.SHA256)) + if err != nil { + t.Fatalf("init dst: %v", err) + } + reachable := map[plumbing.Hash]plumbing.ObjectType{ + plumbing.NewHash("abc1234567890abcdef1234567890abcdef12345"): plumbing.CommitObject, + } + tr, err := newTranslator(t.Context(), srcRepo.Storer, dstRepo.Storer, dstDir, true, reachable) + if err != nil { + t.Fatalf("newTranslator: %v", err) + } + + prefix := "abc12345" + h1, r1 := tr.resolveMessageRef(prefix) + // Mutate reachable; if the cache works, the next call must return + // the same answer as the first. + for k := range tr.reachable { + delete(tr.reachable, k) + } + h2, r2 := tr.resolveMessageRef(prefix) + if h1 != h2 || r1 != r2 { + t.Errorf("resolveMessageRef should return cached value; got first (%s, %v) vs second (%s, %v)", h1, r1, h2, r2) + } + if _, cached := tr.resolveCache[strings.ToLower(prefix)]; !cached { + t.Errorf("resolveCache should contain entry for %q", prefix) + } +} + +// TestRunChecks_TagOnlyConversionSkipsHEAD locks in the rule that a +// tags-only conversion does not fail --check on HEAD. PlainInit leaves +// HEAD pointing at refs/heads/master (which won't exist), and pickHEAD +// returns "" because the desired set has no branches; runChecks must +// detect that and mark HEAD as "skipped" rather than "missing". 
+func TestRunChecks_TagOnlyConversionSkipsHEAD(t *testing.T) { + dir := t.TempDir() + repo, err := git.PlainInit(dir, true, git.WithObjectFormat(formatcfg.SHA256)) + if err != nil { + t.Fatalf("init SHA256 target: %v", err) + } + + checks := runChecks(t.Context(), dir, repo, 0, nil, false) + var head Check + for _, c := range checks { + if c.Name == "HEAD" { + head = c + break + } + } + if head.Name == "" { + t.Fatalf("HEAD check missing from runChecks output") + } + if !head.OK { + t.Errorf("HEAD should be OK for tags-only conversion, got %+v", head) + } + if !head.Skipped { + t.Errorf("HEAD should be marked Skipped on tags-only conversion, got %+v", head) + } + if !strings.Contains(head.Detail, "tags-only") { + t.Errorf("HEAD detail should explain the skip reason, got %q", head.Detail) + } +} + +func TestPickHEAD(t *testing.T) { + branch := func(name string) planner.DesiredRef { + ref := plumbing.ReferenceName("refs/heads/" + name) + return planner.DesiredRef{Kind: planner.RefKindBranch, SourceRef: ref, TargetRef: ref} + } + tag := func(name string) planner.DesiredRef { + ref := plumbing.ReferenceName("refs/tags/" + name) + return planner.DesiredRef{Kind: planner.RefKindTag, SourceRef: ref, TargetRef: ref} + } + tests := []struct { + name string + advertised plumbing.ReferenceName + desired map[plumbing.ReferenceName]planner.DesiredRef + want plumbing.ReferenceName + }{ + { + name: "advertised HEAD wins when present in desired", + advertised: "refs/heads/develop", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/heads/main": branch("main"), + "refs/heads/develop": branch("develop"), + }, + want: "refs/heads/develop", + }, + { + name: "advertised HEAD respects ref mapping (target side)", + advertised: "refs/heads/source-name", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/heads/source-name": { + Kind: planner.RefKindBranch, + SourceRef: "refs/heads/source-name", + TargetRef: "refs/heads/target-name", + }, + }, + want: 
"refs/heads/target-name", + }, + { + name: "falls back to main when advertised HEAD missing", + advertised: "", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/heads/main": branch("main"), + "refs/heads/master": branch("master"), + }, + want: "refs/heads/main", + }, + { + name: "falls back to master when no main", + advertised: "", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/heads/master": branch("master"), + "refs/heads/feature": branch("feature"), + }, + want: "refs/heads/master", + }, + { + name: "falls back to first sorted branch when neither main nor master", + advertised: "", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/heads/zeta": branch("zeta"), + "refs/heads/alpha": branch("alpha"), + "refs/heads/beta": branch("beta"), + }, + want: "refs/heads/alpha", + }, + { + name: "advertised HEAD pointing outside desired falls back to convention", + advertised: "refs/heads/dropped", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/heads/main": branch("main"), + }, + want: "refs/heads/main", + }, + { + name: "tags-only conversion returns empty so HEAD stays at PlainInit default", + advertised: "", + desired: map[plumbing.ReferenceName]planner.DesiredRef{ + "refs/tags/v1.0": tag("v1.0"), + }, + want: "", + }, + { + name: "empty desired returns empty", + advertised: "", + desired: map[plumbing.ReferenceName]planner.DesiredRef{}, + want: "", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := pickHEAD(tt.advertised, tt.desired) + if got != tt.want { + t.Fatalf("pickHEAD = %q, want %q", got, tt.want) + } + }) + } +} diff --git a/cmd/git-sync/root.go b/cmd/git-sync/root.go index 651eb4d6..f0c73da5 100644 --- a/cmd/git-sync/root.go +++ b/cmd/git-sync/root.go @@ -36,6 +36,7 @@ seed an empty target (bootstrap), or inspect either side (probe, fetch).`, cmd.AddCommand(newBootstrapCmd()) cmd.AddCommand(newProbeCmd()) cmd.AddCommand(newFetchCmd()) + 
cmd.AddCommand(newConvertSHA256Cmd()) cmd.AddCommand(newVersionCmd()) return cmd diff --git a/docs/convert-sha256.md b/docs/convert-sha256.md new file mode 100644 index 00000000..3d7fa155 --- /dev/null +++ b/docs/convert-sha256.md @@ -0,0 +1,333 @@ +# SHA1 → SHA256 Conversion + +`git-sync convert-sha256` is a one-off migration command that fetches a pack +from a SHA1 HTTP source and writes a new SHA256 bare repository on disk. +Every reachable object is re-hashed under SHA256 and tree, commit, and tag +references are rewritten accordingly. The command does not push to a +remote, does not modify the source, and is meant to run once per repo. +SHA256 hashes have no relation to the original SHA1 hashes beyond a +mapping the command can optionally emit. + +## Quick Start + +```bash +git-sync convert-sha256 \ + https://github.com/source-org/source-repo.git \ + /path/to/out.git +``` + +The target directory must not exist or must be empty. The result is a bare +repository with `extensions.objectformat = sha256` and a +`refs/notes/sha1-origin` ref recording each commit's pre-conversion SHA1. + +Scope is fixed: every branch and every tag on the source is always +converted. Pass `--all-refs` to also include `refs/notes/*`, +`refs/pull/*`, and other custom namespaces; pair with +`--exclude-ref-prefix` to subtract specific namespaces (e.g. +`--exclude-ref-prefix refs/pull/` on GitHub mirrors). + +For a private source, pass the token via the environment so it isn't +exposed in `ps`: + +```bash +GITSYNC_SOURCE_TOKEN=ghp_xxx git-sync convert-sha256 \ + https://github.com/source-org/private-repo.git \ + /path/to/out.git +``` + +## What It Does + +1. Probes the source via smart HTTP and lists every in-scope ref. +2. Fetches a single self-contained pack via `upload-pack` into a + temporary on-disk SHA1 bare repo (cleaned up at the end unless + `--keep-source-objects` is passed). +3. 
Discovers every reachable object — walking trees, commits, and tags + — and records each one's SHA1 and object type. Submodule gitlinks + are checked here; unresolvable ones fail fast before any output is + written. +4. Initializes the target as a bare SHA256 repository + (`git init --object-format=sha256` equivalent). +5. Translates every reachable object in topological order via memoized + DFS: + - **Blobs**: re-hashed under SHA256; content unchanged. + - **Trees**: each entry's hash translated. + - **Commits**: `tree` and `parent` hashes translated; GPG signatures + and `mergetag` headers dropped; in-scope SHA1 references in the + message are translated first and then substituted. + - **Tags**: target hash translated; signatures dropped; message + hashes rewritten the same way. +6. Writes refs at the translated tip hashes; repoints HEAD to the + source's symbolic HEAD; builds `refs/notes/sha1-origin` (unless + `--no-origin-notes`); emits the `--write-mapping` TSV (if requested). + +## Side Outputs + +The conversion deliberately decouples SHA1 from SHA256 — the SHA256 +hashes this tool produces share nothing with the original SHA1 hashes +beyond the mappings described below. Three on-ramps help bridge the gap. + +### Inline message rewriting (default on) + +Commit and tag messages are scanned for 7-to-40-character hex runs. +When a run uniquely matches a commit or tag SHA1 in the reachable set, +it is replaced with the full SHA256 hex: + +``` +Reverts: a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0 → full SHA256 +Cherry-picked from a1b2c3d → full SHA256 +``` + +Two properties make this robust: + +- **Uniqueness is decided against the reachable set, not the in-flight + mapping.** The discovery pass enumerates every reachable SHA1 before + any encoding starts, so abbreviated prefixes get the same verdict + regardless of how far the translation has progressed.
Ambiguous + prefixes are left unrewritten and reported (warning on stderr + + `--json`'s `ambiguousMessageRefs`); look them up in the mapping file. +- **Cross-branch references resolve.** Each in-scope SHA1 mentioned in + a message is added as a dependency edge in the translation DFS, so + the referenced commit is translated before the referencing commit is + encoded. A cherry-pick from a sibling branch resolves just as + reliably as a revert of an ancestor. + +False positives are essentially impossible: a run is substituted only +if its prefix uniquely matches a commit or tag in scope. Blob and tree +hashes are excluded from the match set. Disable with +`--no-rewrite-messages` if you prefer untouched messages. + +### Origin notes ref (default on) + +`refs/notes/sha1-origin` holds, for each translated commit, the +pre-conversion SHA1 keyed by the new SHA256: + +```bash +git -C /path/to/out.git notes --ref=sha1-origin show <sha256> +# prints the original SHA1 + +git -C /path/to/out.git log --notes=sha1-origin +# shows the original SHA1 below each commit's body +``` + +Notes attach meaningfully only to commits; blobs, trees, and tags are +not represented. Disable with `--no-origin-notes`. + +### Sidecar mapping file (opt in via `--write-mapping`) + +`--write-mapping <path>` emits a TSV with one line per translated +object, sorted by SHA1: + +``` +# sha1 sha256 +00027b675386b21c4ca05316145671fb7034d251 d80415fa21bebb... +000bb155604d06f1c48fc7feb4b025d991ef3366 a23cf98db5abfa... +... +``` + +Useful for bulk rewriting external systems: feed the file to a script +that walks Jira tickets, PR bodies, deploy manifests, or any other +system that holds frozen SHA1 references. + +### Branch-tip attestation tags (opt in via `--sign`) + +`--sign` shells out to `git tag -s converted/<branch> <sha256>` for every +converted branch after the conversion completes.
Each resulting +signed annotated tag is a cryptographic attestation by the converter +that the entire reachable history of that branch — every parent, tree, +and blob — is what the converter saw at conversion time. Anyone can +verify the chain afterwards with `git verify-tag refs/tags/converted/<branch>`. + +The mechanism is the standard one: parent hashes are part of each +commit's bytes, so the tip's hash transitively commits to the whole +history. Signing the tip attests every ancestor. + +Important nuance: the signature is by the *converter*, not by the +original authors (whose own signatures are necessarily lost — see +"GPG signatures are stripped" under Sharp Edges). The attestation +chain becomes "*X attests this is the conversion they produced*" +rather than "*the original authors wrote this commit*". For internal +mirrors or single-identity repos that's a strict improvement over +unsigned-everywhere; for broad public repos it is weaker than the +pre-conversion chain. + +Signing uses the target repo's git signing config (`user.signingkey`, +`gpg.format`) by default — same as `git commit -S` or a normal +`git tag -s`. Override with `--sign-key <key-id>`, which is passed to +`git tag -s -u <key-id>`. SSH signing (`gpg.format = ssh`) and OpenPGP +both work because we shell out to `git`. + +Requires the `git` binary on `PATH`. Signing failures (no key +configured, gpg/ssh-agent unavailable, etc.) abort the run after the +conversion has already completed — the target repo is left in a +valid converted state, just without the attestation tags. Re-run +`git tag -s converted/<branch> <sha256>` manually once the signing +identity is set up.
+ +## Flags + +``` +--source-url source repository URL +--source-token source password/token (prefer env) +--source-username source basic auth username (default git) +--source-bearer-token source bearer token +--source-insecure-skip-tls-verify skip TLS verification (testing only) +--source-follow-info-refs-redirect follow /info/refs cross-host redirects +--target-dir SHA256 bare repo directory (must be empty) + +--all-refs also include refs/* outside heads/tags + (notes, pulls, custom namespaces) +--exclude-ref-prefix subtract refs by prefix; repeatable + +--protocol protocol mode (auto, v1, v2) +--write-mapping write SHA1 → SHA256 TSV to this path +--no-rewrite-messages skip inline hash rewrites in messages +--no-origin-notes skip refs/notes/sha1-origin +--check verify the output (config, HEAD, refs, git fsck) +--sign sign each branch tip via `git tag -s converted/` +--sign-key signing key id passed to `git tag -s -u ` +--keep-source-objects leave the temp SHA1 store on disk +--progress live per-phase object counts (TTY only) +--json machine-readable output +--verbose, -v verbose logging +``` + +There are no `--branch`, `--tags`, or `--map` flags: scope is fixed to +every branch and every tag on the source. + +Environment fallbacks: `GITSYNC_SOURCE_TOKEN`, `GITSYNC_SOURCE_USERNAME`, +`GITSYNC_SOURCE_BEARER_TOKEN`, `GITSYNC_SOURCE_INSECURE_SKIP_TLS_VERIFY`, +`GITSYNC_SOURCE_FOLLOW_INFO_REFS_REDIRECT`, `GITSYNC_PROTOCOL`. + +## Sharp Edges + +**GPG signatures are stripped.** A signature is bytes signed over the +commit's pre-conversion content (including the SHA1 hashes in `tree` +and `parent` lines). After rewriting, the bytes no longer match the +signature, so verification would always fail; the command drops them +and prints a count. Signed annotated tags lose their signature the +same way. 
`mergetag` headers on merge commits — which embed a signed +tag with its own signature — are removed entirely, since the embedded +tag references original SHA1s and the signature was computed over +those original bytes. + +**Submodule gitlinks must resolve in-repo.** Tree entries with mode +`160000` reference a commit in another repository, but a SHA1 hash +cannot be embedded in a SHA256 tree. The command fails fast in the +discovery pass — before the target bare repo is initialized — naming +the offending tree, entry, and hash. Convert the submodule repository +first so its commit hashes are available in SHA256. + +**Replace refs and source notes refs become detached.** +`refs/replace/<sha1>` encodes a SHA1 in the ref name, so the name +doesn't match under SHA256 and the replacement never triggers. +`refs/notes/*` trees from the source (copied under `--all-refs`) +encode the target object's hash as the entry name, so notes survive +as data but no longer attach to their original commits. Use the +tool's own `refs/notes/sha1-origin` for the inverse lookup. + +## Operational Notes + +**One-off, not incremental.** Each run produces a fresh SHA256 repo +from scratch — there is no "fetch the new SHA1 commits and append to +the existing SHA256 repo" mode. Realistic use: convert once, then +make the converted repo the new canonical store. Branch and tag +hashes are deterministic across runs against the same source state; +only `refs/notes/sha1-origin` differs because its wrapper commit +carries `time.Now()` as the committer timestamp. + +**Loose-object storage.** Every translated object is written as a +loose file under `objects/<xx>/<rest-of-hash>` — no pack file is produced. +Correct, but slow on filesystems that dislike millions of small files. +Run `git -C <target-dir> gc --aggressive` afterwards to pack the converted +repo down to a single packfile.
+ +**Memory linear in reachable object count.** Two `map[Hash]…` +structures stay live for the whole run: `reachable` (SHA1 → object +type, built by discovery) and `mapping` (SHA1 → SHA256, built by +translation). At cobra scale (~5k objects), kilobytes; at Linux kernel +scale (~16M objects), roughly 2 GB peak. + +**Discovery adds a ~1.5× decode pass.** Every reachable object is +decoded twice: once in discovery (no encoding) and once in translation +(decode + encode). The cost buys consistent uniqueness verdicts for +message rewriting and submodule fail-fast. + +**Abbreviated-prefix lookup is a linear scan.** Each abbreviated SHA1 +in a message triggers an O(reachable) scan to check uniqueness. Fine +to ~100k commits; slower past that. A sorted-prefix index would make +it O(log N), an easy optimization if someone hits the wall. + +## Verifying the Output + +Pass `--check` and the command runs four sanity checks against the +converted repo at the end of the run, printing one line each: + +``` +verifying output ... + ✓ config: extensions.objectformat = sha256 + ✓ HEAD: ffe9fff421b77f2dcc049a95b3b8ba7b9da8976dd61bcf35e9fe2d993babc470 + ✓ refs: 37 / 37 resolve to objects + ✓ git fsck --full: clean +``` + +The checks are: + +1. **config** — `extensions.objectformat = sha256` is present in + `<target-dir>/config`. +2. **HEAD** — resolves to a non-zero hash and that object exists in + the store. Reported as skipped rather than failed on a tags-only conversion, where HEAD is left at its init default. +3. **refs** — every written ref (except `refs/notes/sha1-origin`, + counted separately) resolves to an object in the store. The count + matches `RefsConverted`. +4. **git fsck --full** — the external `git` binary runs a full + integrity check. Skipped (and reported as such) when `git` isn't + on `PATH`; the conversion still succeeds. + +If any check fails the command exits non-zero. The full per-check +results are also in `--json`'s `checks` array. Without `--check` no +verification runs and the run completes as soon as the conversion +itself finishes.
+ +You can also run the checks by hand on a converted repo, whether or +not the conversion was run with `--check`: + +```bash +git -C /path/to/out.git fsck --full # zero errors expected +git -C /path/to/out.git config extensions.objectformat # prints sha256 +git -C /path/to/out.git log --oneline -5 # SHA256 hashes +git -C /path/to/out.git log --notes=sha1-origin -5 # with original SHA1 +``` + +To use the result as a working repo: + +```bash +git clone /path/to/out.git /path/to/checkout +``` + +To serve it from a host that accepts SHA256: + +```bash +git -C /path/to/out.git push --mirror <remote-url> +``` + +## Implementation Notes + +The pipeline runs in four phases (pack fetch → discovery → target init → +translation), with refs and side outputs written at the end. Submodule +errors surface in discovery, before the target repo is materialized. + +Translation is a memoized recursive DFS. Tree, parent, tag-target, and +message-reference edges are all part of the DFS, so the mapping is +populated by the time any object's bytes are encoded. A defensive +`inProgress` set guards against cycles; real Git histories can't form +them (parent/tree/tag-target edges are a DAG, and SHA1 message- +reference cycles are cryptographically infeasible), but a trip into +the guard becomes a hard error rather than a stack overflow. + +Loose object writing is done by hand rather than via go-git's +`SetEncodedObject`. The underlying `plumbing/format/objfile.Writer` +in `go-git/v6@v6.0.0-alpha.3` hardcodes SHA1 in its hasher, which +would put every translated object at a SHA1-derived path even though +the content references SHA256. A unit test recomputes `sha256` of +every loose object's decompressed content and compares against the +filename to prevent regression.
diff --git a/internal/syncer/syncer.go b/internal/syncer/syncer.go index 6f990235..92761c43 100644 --- a/internal/syncer/syncer.go +++ b/internal/syncer/syncer.go @@ -349,7 +349,7 @@ func measurementLine(m Measurement) []string { // --- Session setup --- -func newConn(raw Endpoint, label string, stats *statsCollector, httpClient *http.Client) (gitproto.Conn, error) { //nolint:ireturn // transport selection intentionally returns the shared connection interface +func newConn(raw Endpoint, label string, stats *statsCollector, httpClient *http.Client) (gitproto.Conn, error) { ep, err := transport.ParseURL(raw.URL) if err != nil { return nil, fmt.Errorf("parse endpoint: %w", err)