From a926a1a901cf015524cd076c5be2d9e7d58de527 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Fri, 16 Sep 2022 12:58:08 +0200 Subject: [PATCH 01/29] wip --- cmd/git-sg/main.go | 242 ++++++++++++++++++++++++++++++++++++++++ cmd/git-sg/main_test.go | 10 ++ 2 files changed, 252 insertions(+) create mode 100644 cmd/git-sg/main.go create mode 100644 cmd/git-sg/main_test.go diff --git a/cmd/git-sg/main.go b/cmd/git-sg/main.go new file mode 100644 index 000000000..8124fe41d --- /dev/null +++ b/cmd/git-sg/main.go @@ -0,0 +1,242 @@ +package main + +import ( + "archive/tar" + "bytes" + "fmt" + "io" + "log" + "os" + "strings" + + "github.com/go-git/go-git/v5" + "github.com/go-git/go-git/v5/plumbing/filemode" + "github.com/go-git/go-git/v5/plumbing/object" + "github.com/sourcegraph/zoekt/ignore" +) + +func do() error { + gitdir, err := getGitDir() + if err != nil { + return err + } + + // TODO PERF skip object caching since we don't need it for archive. See + // cache for filesystem.NewStorage + r, err := git.PlainOpenWithOptions(gitdir, &git.PlainOpenOptions{ + DetectDotGit: true, + }) + if err != nil { + return err + } + + head, err := r.Head() + if err != nil { + return err + } + + commit, err := r.CommitObject(head.Hash()) + if err != nil { + return err + } + + root, err := r.TreeObject(commit.TreeHash) + if err != nil { + return err + } + + return archiveWrite(io.Discard, r, root, &archiveOpts{ + Ignore: getIgnoreFilter(r, root), + SkipContent: func(hdr *tar.Header) string { + if hdr.Size > 2<<20 { + return "large file" + } + return "" + }, + }) +} + +type archiveOpts struct { + // Ignore if true will exclude path from the archive + Ignore func(path string) bool + // SkipContent if returning a non-empty string will include an entry for + // path but with no content. The PAX header SOURCEGRAPH.skip will contain + // the returned string (a reason for skipping). + SkipContent func(hdr *tar.Header) string +} + +func archiveWrite(w io.Writer, repo *git.Repository, tree *object.Tree, opts *archiveOpts) error { + tw := tar.NewWriter(w) + err := archiveWriteTree(tw, repo, tree, "", opts) + if err != nil { + return err + } + return tw.Close() +} + +func archiveWriteTree(w *tar.Writer, repo *git.Repository, tree *object.Tree, path string, opts *archiveOpts) error { + // TODO share + // 32*1024 is the same size used by io.Copy + buf := make([]byte, 32*1024) + + for _, e := range tree.Entries { + p := path + "/" + e.Name + if opts.Ignore(p) { + continue + } + switch e.Mode { + case filemode.Dir: + child, err := repo.TreeObject(e.Hash) + if err != nil { + log.Printf("failed to fetch tree object for %s %v: %v", p, e.Hash, err) + continue + } + archiveWriteTree(w, repo, child, p, opts) + + case filemode.Deprecated, filemode.Executable, filemode.Regular, filemode.Symlink: + blob, err := repo.BlobObject(e.Hash) + if err != nil { + log.Printf("failed to get blob object for %s %v: %v", p, e.Hash, err) + continue + } + + // TODO symlinks, mode, etc. Handle large Linkname + hdr := &tar.Header{ + Typeflag: tar.TypeReg, + Name: p, + Size: blob.Size, + + Format: tar.FormatPAX, // TODO ? + } + + if reason := opts.SkipContent(hdr); reason != "" { + hdr.PAXRecords = map[string]string{"SOURCEGRAPH.skipped": reason} + if err := w.WriteHeader(hdr); err != nil { + return err + } + continue + } + + r, err := blob.Reader() + if err != nil { + log.Printf("failed to read blob object for %s %v: %v", p, e.Hash, err) + continue + } + + // Heuristic: Assume file is binary if first 256 bytes contain a 0x00. + blobSample := buf[:256] + if n, err := io.ReadAtLeast(r, blobSample, 256); err != nil && err != io.EOF && err != io.ErrUnexpectedEOF { + _ = r.Close() + log.Printf("failed to read blob object for %s %v: %v", p, e.Hash, err) + continue + } else { + blobSample = blobSample[:n] + } + isBinary := bytes.IndexByte(blobSample, 0x00) >= 0 + + if err := w.WriteHeader(hdr); err != nil { + r.Close() + return err + } + + // Only skip binary files at this point so we still write the header. + if isBinary { + r.Close() + continue + } + + // We read some bytes from r already, first write those. + if _, err := w.Write(blobSample); err != nil { + r.Close() + return err + } + + // Write out the rest of r. + if _, err := io.CopyBuffer(w, r, buf); err != nil { + r.Close() + return err + } + + if err := r.Close(); err != nil { + return err + } + + case filemode.Submodule: + // TODO what do? + continue + + default: + log.Printf("WARN: unexpected filemode %+v", e) + } + } + + return nil +} + +func getIgnoreFilter(r *git.Repository, root *object.Tree) func(string) bool { + m, err := parseIgnoreFile(r, root) + if err != nil { + // likely malformed, just log and ignore + log.Printf("WARN: failed to parse sourcegraph ignore file: %v", err) + return func(_ string) bool { return false } + } + + return m.Match +} + +func parseIgnoreFile(r *git.Repository, root *object.Tree) (*ignore.Matcher, error) { + entry, err := root.FindEntry(ignore.IgnoreFile) + if isNotExist(err) { + return &ignore.Matcher{}, nil + } else if err != nil { + return nil, fmt.Errorf("failed to find %s: %w", ignore.IgnoreFile, err) + } + + if !entry.Mode.IsFile() { + return &ignore.Matcher{}, nil + } + + blob, err := r.BlobObject(entry.Hash) + if err != nil { + return nil, err + } + + reader, err := blob.Reader() + if err != nil { + return nil, err + } + defer reader.Close() + + m, err := ignore.ParseIgnoreFile(reader) + if err != nil { + return nil, err + } + + return m, nil +} + +func isNotExist(err error) bool { + if err == nil { + return false + } + // go-git does not have an interface to check for not found, and can + // returned a myraid of errors for not found depending on were along looking + // for a file it failed (object, tree, entry, etc). So strings are the best + // we can do. + return os.IsNotExist(err) || strings.Contains(err.Error(), "not found") +} + +func getGitDir() (string, error) { + dir := os.Getenv("GIT_DIR") + if dir == "" { + return os.Getwd() + } + return dir, nil +} + +func main() { + err := do() + if err != nil { + log.Fatal(err) + } +} diff --git a/cmd/git-sg/main_test.go b/cmd/git-sg/main_test.go new file mode 100644 index 000000000..d40713169 --- /dev/null +++ b/cmd/git-sg/main_test.go @@ -0,0 +1,10 @@ +package main + +import "testing" + +func TestDo(t *testing.T) { + err := do() + if err != nil { + t.Fatal(err) + } +} From e7e65e6775db170d393480d87a55283c1c2ac2c5 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Fri, 16 Sep 2022 13:20:21 +0200 Subject: [PATCH 02/29] fix test --- cmd/git-sg/main.go | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/cmd/git-sg/main.go b/cmd/git-sg/main.go index 8124fe41d..ba12bd735 100644 --- a/cmd/git-sg/main.go +++ b/cmd/git-sg/main.go @@ -109,9 +109,14 @@ func archiveWriteTree(w *tar.Writer, repo *git.Repository, tree *object.Tree, pa Format: tar.FormatPAX, // TODO ? } - if reason := opts.SkipContent(hdr); reason != "" { + skip := func(reason string) error { hdr.PAXRecords = map[string]string{"SOURCEGRAPH.skipped": reason} - if err := w.WriteHeader(hdr); err != nil { + hdr.Size = 0 + return w.WriteHeader(hdr) + } + + if reason := opts.SkipContent(hdr); reason != "" { + if err := skip(reason); err != nil { return err } continue @@ -132,28 +137,29 @@ func archiveWriteTree(w *tar.Writer, repo *git.Repository, tree *object.Tree, pa } else { blobSample = blobSample[:n] } - isBinary := bytes.IndexByte(blobSample, 0x00) >= 0 - if err := w.WriteHeader(hdr); err != nil { - r.Close() - return err + if bytes.IndexByte(blobSample, 0x00) >= 0 { + _ = r.Close() + if err := skip("binary"); err != nil { + return err + } + continue } - // Only skip binary files at this point so we still write the header. - if isBinary { - r.Close() - continue + if err := w.WriteHeader(hdr); err != nil { + _= r.Close() + return err } // We read some bytes from r already, first write those. if _, err := w.Write(blobSample); err != nil { - r.Close() + _ = r.Close() return err } // Write out the rest of r. if _, err := io.CopyBuffer(w, r, buf); err != nil { - r.Close() + _ = r.Close() return err } From ccf69b368ed0e682cda686c83187d84fbc6b6b7b Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Fri, 16 Sep 2022 15:56:04 +0200 Subject: [PATCH 03/29] same output as git archive for "tar t" --- cmd/git-sg/main.go | 33 +++++++++++++++++++++++++++------ cmd/git-sg/main_test.go | 7 +++++-- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/cmd/git-sg/main.go b/cmd/git-sg/main.go index ba12bd735..bb089b6f5 100644 --- a/cmd/git-sg/main.go +++ b/cmd/git-sg/main.go @@ -15,7 +15,7 @@ import ( "github.com/sourcegraph/zoekt/ignore" ) -func do() error { +func do(w io.Writer) error { gitdir, err := getGitDir() if err != nil { return err @@ -45,7 +45,7 @@ func do() error { return err } - return archiveWrite(io.Discard, r, root, &archiveOpts{ + return archiveWrite(w, r, root, &archiveOpts{ Ignore: getIgnoreFilter(r, root), SkipContent: func(hdr *tar.Header) string { if hdr.Size > 2<<20 { @@ -80,10 +80,17 @@ func archiveWriteTree(w *tar.Writer, repo *git.Repository, tree *object.Tree, pa buf := make([]byte, 32*1024) for _, e := range tree.Entries { - p := path + "/" + e.Name + var p string + if e.Mode == filemode.Dir { + p = path + e.Name + "/" + } else { + p = path + e.Name + } + if opts.Ignore(p) { continue } + switch e.Mode { case filemode.Dir: child, err := repo.TreeObject(e.Hash) @@ -91,7 +98,18 @@ func archiveWriteTree(w *tar.Writer, repo *git.Repository, tree *object.Tree, pa log.Printf("failed to fetch tree object for %s %v: %v", p, e.Hash, err) continue } - archiveWriteTree(w, repo, child, p, opts) + + if err := w.WriteHeader(&tar.Header{ + Typeflag: tar.TypeDir, + Name: p, + Format: tar.FormatPAX, // TODO ? + }); err != nil { + return err + } + + if err := archiveWriteTree(w, repo, child, p, opts); err != nil { + return err + } case filemode.Deprecated, filemode.Executable, filemode.Regular, filemode.Symlink: blob, err := repo.BlobObject(e.Hash) @@ -110,7 +128,7 @@ func archiveWriteTree(w *tar.Writer, repo *git.Repository, tree *object.Tree, pa } skip := func(reason string) error { - hdr.PAXRecords = map[string]string{"SOURCEGRAPH.skipped": reason} + hdr.PAXRecords = map[string]string{"SG.skip": reason} hdr.Size = 0 return w.WriteHeader(hdr) } @@ -138,6 +156,9 @@ func archiveWriteTree(w *tar.Writer, repo *git.Repository, tree *object.Tree, pa blobSample = blobSample[:n] } + // TODO instead of just binary, should we only allow utf8? utf.Valid + // works except for the fact we may be invalid utf8 at the 256 boundary + // since we cut it off. So will need to copypasta that. if bytes.IndexByte(blobSample, 0x00) >= 0 { _ = r.Close() if err := skip("binary"); err != nil { @@ -241,7 +262,7 @@ func getGitDir() (string, error) { } func main() { - err := do() + err := do(os.Stdout) if err != nil { log.Fatal(err) } diff --git a/cmd/git-sg/main_test.go b/cmd/git-sg/main_test.go index d40713169..96089460d 100644 --- a/cmd/git-sg/main_test.go +++ b/cmd/git-sg/main_test.go @@ -1,9 +1,12 @@ package main -import "testing" +import ( + "io" + "testing" +) func TestDo(t *testing.T) { - err := do() + err := do(io.Discard) if err != nil { t.Fatal(err) } From 950b26ae43846183b2f9e0167122e1854a8b8686 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Fri, 16 Sep 2022 16:21:19 +0200 Subject: [PATCH 04/29] capture state in archiveWriter struct for better readability --- cmd/git-sg/main.go | 175 ++++++++++++++++++++++++--------------------- 1 file changed, 93 insertions(+), 82 deletions(-) diff --git a/cmd/git-sg/main.go b/cmd/git-sg/main.go index bb089b6f5..aa370aebd 100644 --- a/cmd/git-sg/main.go +++ b/cmd/git-sg/main.go @@ -66,19 +66,33 @@ type archiveOpts struct { } func archiveWrite(w io.Writer, repo *git.Repository, tree *object.Tree, opts *archiveOpts) error { - tw := tar.NewWriter(w) - err := archiveWriteTree(tw, repo, tree, "", opts) + a := &archiveWriter{ + w: tar.NewWriter(w), + repo: repo, + opts: opts, + + // 32*1024 is the same size used by io.Copy + buf: make([]byte, 32*1024), + } + + err := a.writeTree(tree, "") if err != nil { + _ = a.w.Close() return err } - return tw.Close() + + return a.w.Close() } -func archiveWriteTree(w *tar.Writer, repo *git.Repository, tree *object.Tree, path string, opts *archiveOpts) error { - // TODO share - // 32*1024 is the same size used by io.Copy - buf := make([]byte, 32*1024) +type archiveWriter struct { + w *tar.Writer + repo *git.Repository + opts *archiveOpts + buf []byte +} + +func (a *archiveWriter) writeTree(tree *object.Tree, path string) error { for _, e := range tree.Entries { var p string if e.Mode == filemode.Dir { @@ -87,117 +101,114 @@ func archiveWriteTree(w *tar.Writer, repo *git.Repository, tree *object.Tree, pa p = path + e.Name } - if opts.Ignore(p) { + if a.opts.Ignore(p) { continue } switch e.Mode { case filemode.Dir: - child, err := repo.TreeObject(e.Hash) + child, err := a.repo.TreeObject(e.Hash) if err != nil { log.Printf("failed to fetch tree object for %s %v: %v", p, e.Hash, err) continue } - if err := w.WriteHeader(&tar.Header{ + if err := a.w.WriteHeader(&tar.Header{ Typeflag: tar.TypeDir, Name: p, - Format: tar.FormatPAX, // TODO ? + Format: tar.FormatPAX, // TODO ? }); err != nil { return err } - if err := archiveWriteTree(w, repo, child, p, opts); err != nil { + if err := a.writeTree(child, p); err != nil { return err } case filemode.Deprecated, filemode.Executable, filemode.Regular, filemode.Symlink: - blob, err := repo.BlobObject(e.Hash) - if err != nil { - log.Printf("failed to get blob object for %s %v: %v", p, e.Hash, err) - continue + if err := a.writeRegularTreeEntry(e, p); err != nil { + return err } - // TODO symlinks, mode, etc. Handle large Linkname - hdr := &tar.Header{ - Typeflag: tar.TypeReg, - Name: p, - Size: blob.Size, + case filemode.Submodule: + // TODO what do? + continue - Format: tar.FormatPAX, // TODO ? - } + default: + log.Printf("WARN: unexpected filemode %+v", e) + } + } - skip := func(reason string) error { - hdr.PAXRecords = map[string]string{"SG.skip": reason} - hdr.Size = 0 - return w.WriteHeader(hdr) - } + return nil +} - if reason := opts.SkipContent(hdr); reason != "" { - if err := skip(reason); err != nil { - return err - } - continue - } +func (a *archiveWriter) writeRegularTreeEntry(entry object.TreeEntry, path string) error { + blob, err := a.repo.BlobObject(entry.Hash) + if err != nil { + log.Printf("failed to get blob object for %s %v: %v", path, entry.Hash, err) + return nil + } - r, err := blob.Reader() - if err != nil { - log.Printf("failed to read blob object for %s %v: %v", p, e.Hash, err) - continue - } + // TODO symlinks, mode, etc. Handle large Linkname + hdr := &tar.Header{ + Typeflag: tar.TypeReg, + Name: path, + Size: blob.Size, - // Heuristic: Assume file is binary if first 256 bytes contain a 0x00. - blobSample := buf[:256] - if n, err := io.ReadAtLeast(r, blobSample, 256); err != nil && err != io.EOF && err != io.ErrUnexpectedEOF { - _ = r.Close() - log.Printf("failed to read blob object for %s %v: %v", p, e.Hash, err) - continue - } else { - blobSample = blobSample[:n] - } + Format: tar.FormatPAX, // TODO ? + } - // TODO instead of just binary, should we only allow utf8? utf.Valid - // works except for the fact we may be invalid utf8 at the 256 boundary - // since we cut it off. So will need to copypasta that. - if bytes.IndexByte(blobSample, 0x00) >= 0 { - _ = r.Close() - if err := skip("binary"); err != nil { - return err - } - continue - } + if reason := a.opts.SkipContent(hdr); reason != "" { + return a.writeSkipHeader(hdr, reason) + } - if err := w.WriteHeader(hdr); err != nil { - _= r.Close() - return err - } + r, err := blob.Reader() + if err != nil { + log.Printf("failed to read blob object for %s %v: %v", path, entry.Hash, err) + return nil + } - // We read some bytes from r already, first write those. - if _, err := w.Write(blobSample); err != nil { - _ = r.Close() - return err - } + // TODO confirm it is fine to call Close twice. From initial reading of + // go-git it relies on io.Pipe for readers, so this should be fine. + defer r.Close() + + // Heuristic: Assume file is binary if first 256 bytes contain a 0x00. + blobSample := a.buf[:256] + if n, err := io.ReadAtLeast(r, blobSample, 256); err != nil && err != io.EOF && err != io.ErrUnexpectedEOF { + log.Printf("failed to read blob object for %s %v: %v", path, entry.Hash, err) + return nil + } else { + blobSample = blobSample[:n] + } - // Write out the rest of r. - if _, err := io.CopyBuffer(w, r, buf); err != nil { - _ = r.Close() - return err - } + // TODO instead of just binary, should we only allow utf8? utf.Valid + // works except for the fact we may be invalid utf8 at the 256 boundary + // since we cut it off. So will need to copypasta that. + if bytes.IndexByte(blobSample, 0x00) >= 0 { + return a.writeSkipHeader(hdr, "binary") + } - if err := r.Close(); err != nil { - return err - } + if err := a.w.WriteHeader(hdr); err != nil { + return err + } - case filemode.Submodule: - // TODO what do? - continue + // We read some bytes from r already, first write those. + if _, err := a.w.Write(blobSample); err != nil { + return err + } - default: - log.Printf("WARN: unexpected filemode %+v", e) - } + // Write out the rest of r. + if _, err := io.CopyBuffer(a.w, r, a.buf); err != nil { + return err } - return nil + return r.Close() +} + +func (a *archiveWriter) writeSkipHeader(hdr *tar.Header, reason string) error { + hdr.PAXRecords = map[string]string{"SG.skip": reason} + hdr.Size = 0 // clear out size since we won't write the body + return a.w.WriteHeader(hdr) } func getIgnoreFilter(r *git.Repository, root *object.Tree) func(string) bool { From 89c6970eab139e7d8b46aabf8d0f0086d34e4ff3 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Fri, 16 Sep 2022 16:27:59 +0200 Subject: [PATCH 05/29] set mode --- cmd/git-sg/main.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmd/git-sg/main.go b/cmd/git-sg/main.go index aa370aebd..fcfb4077b 100644 --- a/cmd/git-sg/main.go +++ b/cmd/git-sg/main.go @@ -116,6 +116,7 @@ func (a *archiveWriter) writeTree(tree *object.Tree, path string) error { if err := a.w.WriteHeader(&tar.Header{ Typeflag: tar.TypeDir, Name: p, + Mode: 0777, Format: tar.FormatPAX, // TODO ? }); err != nil { return err @@ -154,6 +155,7 @@ func (a *archiveWriter) writeRegularTreeEntry(entry object.TreeEntry, path strin Typeflag: tar.TypeReg, Name: path, Size: blob.Size, + Mode: 0666, Format: tar.FormatPAX, // TODO ? } From db2d11060c39730bbbe1b60f8413df06bf23a223 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Sat, 17 Sep 2022 07:16:17 +0200 Subject: [PATCH 06/29] do not do dotgit detection since it brakes bare repos --- cmd/git-sg/main.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cmd/git-sg/main.go b/cmd/git-sg/main.go index fcfb4077b..694ba23ef 100644 --- a/cmd/git-sg/main.go +++ b/cmd/git-sg/main.go @@ -23,9 +23,7 @@ func do(w io.Writer) error { // TODO PERF skip object caching since we don't need it for archive. See // cache for filesystem.NewStorage - r, err := git.PlainOpenWithOptions(gitdir, &git.PlainOpenOptions{ - DetectDotGit: true, - }) + r, err := git.PlainOpen(gitdir) if err != nil { return err } From 91ffa5b6b55621adca76c3d0d75660eb58b8a8cd Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Mon, 19 Sep 2022 13:18:02 +0200 Subject: [PATCH 07/29] cpu_profile flag --- cmd/git-sg/main.go | 15 +++++++++++++++ cmd/git-sg/main_test.go | 1 + 2 files changed, 16 insertions(+) diff --git a/cmd/git-sg/main.go b/cmd/git-sg/main.go index 694ba23ef..9999a277f 100644 --- a/cmd/git-sg/main.go +++ b/cmd/git-sg/main.go @@ -3,10 +3,12 @@ package main import ( "archive/tar" "bytes" + "flag" "fmt" "io" "log" "os" + "runtime/pprof" "strings" "github.com/go-git/go-git/v5" @@ -273,6 +275,19 @@ func getGitDir() (string, error) { } func main() { + cpuProfile := flag.String("cpu_profile", "", "write cpu profile to `file`") + flag.Parse() + + if *cpuProfile != "" { + f, err := os.Create(*cpuProfile) + if err != nil { + log.Fatal(err) + } + defer f.Close() + pprof.StartCPUProfile(f) + defer pprof.StopCPUProfile() + } + err := do(os.Stdout) if err != nil { log.Fatal(err) diff --git a/cmd/git-sg/main_test.go b/cmd/git-sg/main_test.go index 96089460d..380ab070b 100644 --- a/cmd/git-sg/main_test.go +++ b/cmd/git-sg/main_test.go @@ -6,6 +6,7 @@ import ( ) func TestDo(t *testing.T) { + t.Setenv("GIT_DIR", "../../.git") err := do(io.Discard) if err != nil { t.Fatal(err) From ee1cc0286475eb9783135a6bd66265dcd253a549 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Mon, 19 Sep 2022 15:25:40 +0200 Subject: [PATCH 08/29] introduce manual stack for better profile output The profile output was really hard to read due to the arb nesting of calls to writeTree. This introduces a manual stack, but slightly adjusts the order of output. I'd prefer the normal DFS order to match git archive, but atleast for profiling this is good for now. --- cmd/git-sg/main.go | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/cmd/git-sg/main.go b/cmd/git-sg/main.go index 9999a277f..f2afbef2a 100644 --- a/cmd/git-sg/main.go +++ b/cmd/git-sg/main.go @@ -71,24 +71,38 @@ func archiveWrite(w io.Writer, repo *git.Repository, tree *object.Tree, opts *ar repo: repo, opts: opts, + stack: []item{{tree: tree, path: ""}}, + // 32*1024 is the same size used by io.Copy buf: make([]byte, 32*1024), } - err := a.writeTree(tree, "") - if err != nil { - _ = a.w.Close() - return err + for len(a.stack) > 0 { + item := a.stack[len(a.stack)-1] + a.stack = a.stack[:len(a.stack)-1] + + err := a.writeTree(item.tree, item.path) + if err != nil { + _ = a.w.Close() + return err + } } return a.w.Close() } +type item struct { + tree *object.Tree + path string +} + type archiveWriter struct { w *tar.Writer repo *git.Repository opts *archiveOpts + stack []item + buf []byte } @@ -122,9 +136,7 @@ func (a *archiveWriter) writeTree(tree *object.Tree, path string) error { return err } - if err := a.writeTree(child, p); err != nil { - return err - } + a.stack = append(a.stack, item{tree: child, path: p}) case filemode.Deprecated, filemode.Executable, filemode.Regular, filemode.Symlink: if err := a.writeRegularTreeEntry(e, p); err != nil { From 04081cc8850382a029d85680000944b1210d5866 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Mon, 19 Sep 2022 15:49:43 +0200 Subject: [PATCH 09/29] try out keepdescriptors --- cmd/git-sg/main.go | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/cmd/git-sg/main.go b/cmd/git-sg/main.go index f2afbef2a..1a23f8d8d 100644 --- a/cmd/git-sg/main.go +++ b/cmd/git-sg/main.go @@ -11,21 +11,17 @@ import ( "runtime/pprof" "strings" + "github.com/go-git/go-billy/v5/osfs" "github.com/go-git/go-git/v5" + "github.com/go-git/go-git/v5/plumbing/cache" "github.com/go-git/go-git/v5/plumbing/filemode" "github.com/go-git/go-git/v5/plumbing/object" + "github.com/go-git/go-git/v5/storage/filesystem" "github.com/sourcegraph/zoekt/ignore" ) func do(w io.Writer) error { - gitdir, err := getGitDir() - if err != nil { - return err - } - - // TODO PERF skip object caching since we don't need it for archive. See - // cache for filesystem.NewStorage - r, err := git.PlainOpen(gitdir) + r, err := openGitRepo() if err != nil { return err } @@ -278,12 +274,32 @@ func isNotExist(err error) bool { return os.IsNotExist(err) || strings.Contains(err.Error(), "not found") } -func getGitDir() (string, error) { +func openGitRepo() (*git.Repository, error) { dir := os.Getenv("GIT_DIR") if dir == "" { - return os.Getwd() + var err error + dir, err = os.Getwd() + if err != nil { + return nil, err + } } - return dir, nil + + fs := osfs.New(dir) + if _, err := fs.Stat(git.GitDirName); err == nil { + fs, err = fs.Chroot(git.GitDirName) + if err != nil { + return nil, err + } + } + + // TODO PERF try skip object caching since we don't need it for archive. + s := filesystem.NewStorageWithOptions(fs, cache.NewObjectLRUDefault(), filesystem.Options{ + // PERF: important, otherwise we pay the cost of opening and closing + // packfiles per object access and read. + KeepDescriptors: true, + }) + + return git.Open(s, fs) } func main() { From 6500eb19f526198674cfc14dd05294c496255dbe Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Tue, 20 Sep 2022 09:30:27 +0200 Subject: [PATCH 10/29] add memprofile --- cmd/git-sg/main.go | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/cmd/git-sg/main.go b/cmd/git-sg/main.go index 1a23f8d8d..70253a425 100644 --- a/cmd/git-sg/main.go +++ b/cmd/git-sg/main.go @@ -8,6 +8,7 @@ import ( "io" "log" "os" + "runtime" "runtime/pprof" "strings" @@ -303,16 +304,19 @@ func openGitRepo() (*git.Repository, error) { } func main() { - cpuProfile := flag.String("cpu_profile", "", "write cpu profile to `file`") + cpuprofile := flag.String("cpuprofile", "", "write cpu profile to `file`") + memprofile := flag.String("memprofile", "", "write memory profile to `file`") flag.Parse() - if *cpuProfile != "" { - f, err := os.Create(*cpuProfile) + if *cpuprofile != "" { + f, err := os.Create(*cpuprofile) if err != nil { - log.Fatal(err) + log.Fatal("could not create CPU profile: ", err) + } + defer f.Close() // error handling omitted for example + if err := pprof.StartCPUProfile(f); err != nil { + log.Fatal("could not start CPU profile: ", err) } - defer f.Close() - pprof.StartCPUProfile(f) defer pprof.StopCPUProfile() } @@ -320,4 +324,16 @@ func main() { if err != nil { log.Fatal(err) } + + if *memprofile != "" { + f, err := os.Create(*memprofile) + if err != nil { + log.Fatal("could not create memory profile: ", err) + } + defer f.Close() // error handling omitted for example + runtime.GC() // get up-to-date statistics + if err := pprof.WriteHeapProfile(f); err != nil { + log.Fatal("could not write memory profile: ", err) + } + } } From 753864c07bd6ec72defe5b2f8dff35cc967185cd Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Tue, 20 Sep 2022 09:38:45 +0200 Subject: [PATCH 11/29] optionally buffer output if GIT_SG_BUFFER is set --- cmd/git-sg/main.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cmd/git-sg/main.go b/cmd/git-sg/main.go index 70253a425..022510a8b 100644 --- a/cmd/git-sg/main.go +++ b/cmd/git-sg/main.go @@ -2,6 +2,7 @@ package main import ( "archive/tar" + "bufio" "bytes" "flag" "fmt" @@ -63,6 +64,15 @@ type archiveOpts struct { } func archiveWrite(w io.Writer, repo *git.Repository, tree *object.Tree, opts *archiveOpts) error { + // Gating this right now because I get inconsistent performance on my + // macbook. Want to test on linux and larger repos. + if os.Getenv("GIT_SG_BUFFER") != "" { + log.Println("buffering output") + bw := bufio.NewWriter(w) + defer bw.Flush() + w = bw + } + a := &archiveWriter{ w: tar.NewWriter(w), repo: repo, From cebf65b038d7697a371767cf995eb0b3f0d10d11 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Tue, 20 Sep 2022 10:11:46 +0200 Subject: [PATCH 12/29] add experimental GIT_SG_FILTER which just filters git archive This is significantly faster than using go-git. --- cmd/git-sg/filter.go | 75 ++++++++++++++++++++++++++++++++++++++++++++ cmd/git-sg/main.go | 29 ++++++++++------- 2 files changed, 93 insertions(+), 11 deletions(-) create mode 100644 cmd/git-sg/filter.go diff --git a/cmd/git-sg/filter.go b/cmd/git-sg/filter.go new file mode 100644 index 000000000..7dbc72874 --- /dev/null +++ b/cmd/git-sg/filter.go @@ -0,0 +1,75 @@ +package main + +import ( + "archive/tar" + "io" + "os/exec" + + "github.com/go-git/go-git/v5" + "github.com/go-git/go-git/v5/plumbing/object" +) + +func archiveFilter(w io.Writer, repo *git.Repository, tree *object.Tree, opts *archiveOpts) (err error) { + // 32*1024 is the same size used by io.Copy + buf := make([]byte, 32*1024) + + cmd := exec.Command("git", "archive", "--worktree-attributes", "--format=tar", tree.Hash.String(), "--") + r, err := cmd.StdoutPipe() + if err != nil { + return err + } + defer r.Close() + + tr := tar.NewReader(r) + tw := tar.NewWriter(w) + + err = cmd.Start() + if err != nil { + return err + } + + done := false + defer func() { + if done { + return + } + err2 := cmd.Process.Kill() + if err == nil { + err = err2 + } + }() + + for { + hdr, err := tr.Next() + if err == io.EOF { + break + } + if err != nil { + return err + } + + if opts.Ignore(hdr.Name) { + continue + } else if reason := opts.SkipContent(hdr); reason != "" { + hdr.Size = 0 + hdr.PAXRecords = map[string]string{"SG.skip": reason} + hdr.Format = tar.FormatPAX + if err := tw.WriteHeader(hdr); err != nil { + return err + } + continue + } + + tw.WriteHeader(hdr) + if _, err := io.CopyBuffer(tw, tr, buf); err != nil { + return err + } + } + + if err := tw.Close(); err != nil { + return err + } + + done = true + return cmd.Wait() +} diff --git a/cmd/git-sg/main.go b/cmd/git-sg/main.go index 022510a8b..2d1a0f651 100644 --- a/cmd/git-sg/main.go +++ b/cmd/git-sg/main.go @@ -43,7 +43,16 @@ func do(w io.Writer) error { return err } - return archiveWrite(w, r, root, &archiveOpts{ + // Gating this right now because I get inconsistent performance on my + // macbook. Want to test on linux and larger repos. + if os.Getenv("GIT_SG_BUFFER") != "" { + log.Println("buffering output") + bw := bufio.NewWriter(w) + defer bw.Flush() + w = bw + } + + opts := &archiveOpts{ Ignore: getIgnoreFilter(r, root), SkipContent: func(hdr *tar.Header) string { if hdr.Size > 2<<20 { @@ -51,7 +60,14 @@ func do(w io.Writer) error { } return "" }, - }) + } + + if os.Getenv("GIT_SG_FILTER") != "" { + log.Println("filtering git archive output") + return archiveFilter(w, r, root, opts) + } + + return archiveWrite(w, r, root, opts) } type archiveOpts struct { @@ -64,15 +80,6 @@ type archiveOpts struct { } func archiveWrite(w io.Writer, repo *git.Repository, tree *object.Tree, opts *archiveOpts) error { - // Gating this right now because I get inconsistent performance on my - // macbook. Want to test on linux and larger repos. - if os.Getenv("GIT_SG_BUFFER") != "" { - log.Println("buffering output") - bw := bufio.NewWriter(w) - defer bw.Flush() - w = bw - } - a := &archiveWriter{ w: tar.NewWriter(w), repo: repo, From 83325cf1c2caff52b9e0a4925fc88e239f12a276 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Wed, 21 Sep 2022 12:43:52 +0200 Subject: [PATCH 13/29] getting started on git-cat-file integration lunch time --- cmd/git-sg/catfile.go | 162 +++++++++++++++++++++++++++++++++++++ cmd/git-sg/catfile_test.go | 41 ++++++++++ 2 files changed, 203 insertions(+) create mode 100644 cmd/git-sg/catfile.go create mode 100644 cmd/git-sg/catfile_test.go diff --git a/cmd/git-sg/catfile.go b/cmd/git-sg/catfile.go new file mode 100644 index 000000000..a554b8cf8 --- /dev/null +++ b/cmd/git-sg/catfile.go @@ -0,0 +1,162 @@ +package main + +import ( + "bufio" + "bytes" + "encoding/hex" + "io" + "log" + "os" + "os/exec" + "strconv" + + "github.com/go-git/go-git/v5/plumbing" +) + +type gitCatFileBatch struct { + cmd *exec.Cmd + in *bufio.Writer + inCloser io.Closer + out *bufio.Reader + outCloser io.Closer +} + +func startGitCatFileBatch(dir string) (_ *gitCatFileBatch, err error) { + cmd := exec.Command("git", "cat-file", "--batch-command") + cmd.Dir = dir + + closeIfErr := func(closer io.Closer) { + if err != nil { + closer.Close() + } + } + + stdin, err := cmd.StdinPipe() + if err != nil { + return nil, err + } + defer closeIfErr(stdin) + + stdout, err := cmd.StdoutPipe() + if err != nil { + return nil, err + } + defer closeIfErr(stdin) + + // TODO should capture somehow and put into error + cmd.Stderr = os.Stderr + + if err := cmd.Start(); err != nil { + return nil, err + } + + return &gitCatFileBatch{ + cmd: cmd, + in: bufio.NewWriter(stdin), + inCloser: stdin, + out: bufio.NewReader(stdout), + outCloser: stdout, + }, nil +} + +type gitCatFileBatchInfo struct { + Hash plumbing.Hash + Type plumbing.ObjectType + Size int64 +} + +func (g *gitCatFileBatch) Info(ref string) (gitCatFileBatchInfo, error) { + g.in.WriteString("info ") + g.in.WriteString(ref) + g.in.WriteByte('\n') + if err := g.in.Flush(); err != nil { + g.kill() + return gitCatFileBatchInfo{}, err + } + + line, err := g.out.ReadSlice('\n') + if err != nil { + g.kill() + return gitCatFileBatchInfo{}, err + } + + info, err := parseGitCatFileBatchInfoLine(line) + if err != nil { + g.kill() + return gitCatFileBatchInfo{}, err + } + + return info, nil +} + +// parseGitCatFileBatchInfoLine parses the info line from git-cat-file. It +// expects the default format of: +// +// SP SP LF +func parseGitCatFileBatchInfoLine(line []byte) (gitCatFileBatchInfo, error) { + line = bytes.TrimRight(line, "\n") + + next := func() []byte { + i := bytes.IndexByte(line, ' ') + if i < 0 { + pre := line + line = nil + return pre + } + pre := line[:i] + line = line[i+1:] + return pre + } + + info := gitCatFileBatchInfo{} + + var err error + _, err = hex.Decode(info.Hash[:], next()) + if err != nil { + return info, err + } + + info.Type, err = plumbing.ParseObjectType(string(next())) + if err != nil { + return info, err + } + + info.Size, err = strconv.ParseInt(string(next()), 10, 64) + if err != nil { + return info, err + } + + return info, nil +} + +func (g *gitCatFileBatch) Close() (err error) { + defer func() { + if err != nil { + g.kill() + } + }() + + // This Close will tell git to shutdown + if err := g.inCloser.Close(); err != nil { + return err + } + + // Drain and check we have no output left (to detect mistakes) + if n, err := io.Copy(io.Discard, g.out); err != nil { + return err + } else if n > 0 { + log.Printf("unexpected %d bytes of remaining output when calling close", n) + } + + if err := g.outCloser.Close(); err != nil { + return err + } + + return g.cmd.Wait() +} + +func (g *gitCatFileBatch) kill() { + _ = g.cmd.Process.Kill() + _ = g.inCloser.Close() + _ = g.outCloser.Close() +} diff --git a/cmd/git-sg/catfile_test.go b/cmd/git-sg/catfile_test.go new file mode 100644 index 000000000..648a78bed --- /dev/null +++ b/cmd/git-sg/catfile_test.go @@ -0,0 +1,41 @@ +package main + +import "testing" + +func TestInfo(t *testing.T) { + p, err := startGitCatFileBatch("") + if err != nil { + t.Fatal(err) + } + defer p.Close() + + info, err := p.Info("HEAD") + if err != nil { + t.Fatal(err) + } + + t.Log(info.Hash, info.Type, info.Size) + + if err := p.Close(); err != nil { + t.Fatal(err) + } +} + +func BenchmarkInfo(b *testing.B) { + p, err := startGitCatFileBatch("") + if err != nil { + b.Fatal(err) + } + defer p.Close() + + for i := 0; i < b.N; i++ { + _, err := p.Info("HEAD") + if err != nil { + b.Fatal(err) + } + } + + if err := p.Close(); err != nil { + b.Fatal(err) + } +} From 15902a28bb798d1e84e2e64e72125d68258d6bd3 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Wed, 21 Sep 2022 13:17:39 +0200 Subject: [PATCH 14/29] add contents method for git-cat-file --- cmd/git-sg/catfile.go | 81 ++++++++++++++++++++++++++++++++++++-- cmd/git-sg/catfile_test.go | 58 ++++++++++++++++++++++++++- 2 files changed, 135 insertions(+), 4 deletions(-) diff --git a/cmd/git-sg/catfile.go b/cmd/git-sg/catfile.go index a554b8cf8..bc71dfa98 100644 --- a/cmd/git-sg/catfile.go +++ b/cmd/git-sg/catfile.go @@ -4,6 +4,7 @@ import ( "bufio" "bytes" "encoding/hex" + "fmt" "io" "log" "os" @@ -19,6 +20,11 @@ type gitCatFileBatch struct { inCloser io.Closer out *bufio.Reader outCloser io.Closer + + // readerN is the amount left to read for Read. Note: git-cat-file always + // has a trailing new line, so this will always be the size of an object + + // 1. + readerN int64 } func startGitCatFileBatch(dir string) (_ *gitCatFileBatch, err error) { @@ -74,6 +80,42 @@ func (g *gitCatFileBatch) Info(ref string) (gitCatFileBatchInfo, error) { return gitCatFileBatchInfo{}, err } + if err := g.discard(); err != nil { + g.kill() + return gitCatFileBatchInfo{}, err + } + + line, err := g.out.ReadSlice('\n') + if err != nil { + g.kill() + return gitCatFileBatchInfo{}, err + } + + info, err := parseGitCatFileBatchInfoLine(line) + if err != nil { + g.kill() + return gitCatFileBatchInfo{}, err + } + + g.readerN = 0 + + return info, nil +} + +func (g *gitCatFileBatch) Contents(ref string) (gitCatFileBatchInfo, error) { + g.in.WriteString("contents ") + g.in.WriteString(ref) + g.in.WriteByte('\n') + if err := g.in.Flush(); err != nil { + g.kill() + return gitCatFileBatchInfo{}, err + } + + if err := g.discard(); err != nil { + g.kill() + return gitCatFileBatchInfo{}, err + } + line, err := g.out.ReadSlice('\n') if err != nil { g.kill() @@ -86,16 +128,45 @@ func (g *gitCatFileBatch) Info(ref string) (gitCatFileBatchInfo, error) { return gitCatFileBatchInfo{}, err } + g.readerN = info.Size + 1 + return info, nil } +func (g *gitCatFileBatch) Read(p []byte) (n int, err error) { + // We avoid reading the final byte (a newline). That will be handled by + // discard. + if g.readerN <= 1 { + return 0, io.EOF + } + if max := g.readerN - 1; int64(len(p)) > max { + p = p[0:max] + } + n, err = g.out.Read(p) + g.readerN -= int64(n) + return +} + +// discard should be called before parsing a response to flush out any unread +// data since the last command. +func (g *gitCatFileBatch) discard() error { + if g.readerN > 0 { + n, err := g.out.Discard(int(g.readerN)) + g.readerN -= int64(n) + return err + } + return nil +} + // parseGitCatFileBatchInfoLine parses the info line from git-cat-file. It // expects the default format of: // // SP SP LF func parseGitCatFileBatchInfoLine(line []byte) (gitCatFileBatchInfo, error) { line = bytes.TrimRight(line, "\n") + origLine := line + // PERF this allocates much less than bytes.Split next := func() []byte { i := bytes.IndexByte(line, ' ') if i < 0 { @@ -113,17 +184,17 @@ func parseGitCatFileBatchInfoLine(line []byte) (gitCatFileBatchInfo, error) { var err error _, err = hex.Decode(info.Hash[:], next()) if err != nil { - return info, err + return info, fmt.Errorf("unexpected git-cat-file --batch info line %q: %w", string(origLine), err) } info.Type, err = plumbing.ParseObjectType(string(next())) if err != nil { - return info, err + return info, fmt.Errorf("unexpected git-cat-file --batch info line %q: %w", string(origLine), err) } info.Size, err = strconv.ParseInt(string(next()), 10, 64) if err != nil { - return info, err + return info, fmt.Errorf("unexpected git-cat-file --batch info line %q: %w", string(origLine), err) } return info, nil @@ -136,6 +207,10 @@ func (g *gitCatFileBatch) Close() (err error) { } }() + if err := g.discard(); err != nil { + return err + } + // This Close will tell git to shutdown if err := g.inCloser.Close(); err != nil { return err diff --git a/cmd/git-sg/catfile_test.go b/cmd/git-sg/catfile_test.go index 648a78bed..8f276d1c4 100644 --- a/cmd/git-sg/catfile_test.go +++ b/cmd/git-sg/catfile_test.go @@ -1,6 +1,12 @@ package main -import "testing" +import ( + "io" + "testing" + + "github.com/go-git/go-git/v5/plumbing" + "github.com/google/go-cmp/cmp" +) func TestInfo(t *testing.T) { p, err := startGitCatFileBatch("") @@ -21,6 +27,56 @@ func TestInfo(t *testing.T) { } } +func TestContents(t *testing.T) { + p, err := startGitCatFileBatch("") + if err != nil { + t.Fatal(err) + } + defer p.Close() + + info, err := p.Contents("HEAD") + if err != nil { + t.Fatal(err) + } + + t.Log(info.Hash, info.Type, info.Size) + + b, err := io.ReadAll(p) + if err != nil { + t.Fatal(err) + } + t.Log(string(b)) + + if len(b) != int(info.Size) { + t.Fatalf("amount read (%d) is different to object size (%d)", len(b), info.Size) + } + if info.Type != plumbing.CommitObject { + t.Fatalf("expected HEAD to be a commit, got %s", info.Type) + } + + // Now lets fetch the object again via hash and see if it stays the same. + info2, err := p.Contents(info.Hash.String()) + if err != nil { + t.Fatal(err) + } + + if d := cmp.Diff(info, info2); d != "" { + t.Fatalf("info changed (-first, +second):\n%s", d) + } + + b2, err := io.ReadAll(p) + if err != nil { + t.Fatal(err) + } + if d := cmp.Diff(b, b2); d != "" { + t.Fatalf("content changed (-first, +second):\n%s", d) + } + + if err := p.Close(); err != nil { + t.Fatal(err) + } +} + func BenchmarkInfo(b *testing.B) { p, err := startGitCatFileBatch("") if err != nil { From 0027f4172d95808000cbebf1ebd10c4ac3436941 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Wed, 21 Sep 2022 20:23:26 +0200 Subject: [PATCH 15/29] handle missing refs --- cmd/git-sg/catfile.go | 36 +++++++++++++++++++++++++++++++++--- cmd/git-sg/catfile_test.go | 20 ++++++++++++++++++++ 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/cmd/git-sg/catfile.go b/cmd/git-sg/catfile.go index bc71dfa98..a625a2a4a 100644 --- a/cmd/git-sg/catfile.go +++ b/cmd/git-sg/catfile.go @@ -4,6 +4,7 @@ import ( "bufio" "bytes" "encoding/hex" + "errors" "fmt" "io" "log" @@ -14,6 +15,9 @@ import ( "github.com/go-git/go-git/v5/plumbing" ) +// gitCatFileBatch is a wrapper around a git-cat-file --batch-command process. +// This provides an efficient means to interact with the git object store of a +// repository. type gitCatFileBatch struct { cmd *exec.Cmd in *bufio.Writer @@ -27,6 +31,23 @@ type gitCatFileBatch struct { readerN int64 } +type missingError struct { + ref string +} + +func (e *missingError) Error() string { + return e.ref + " missing" +} + +func isMissingError(err error) bool { + var e *missingError + return errors.As(err, &e) +} + +// startGitCatFileBatch returns a gitCatFileBatch for the repository at dir. +// +// Callers must ensure to call gitCatFileBatch.Close() to ensure the +// associated subprocess and file descriptors are cleaned up. func startGitCatFileBatch(dir string) (_ *gitCatFileBatch, err error) { cmd := exec.Command("git", "cat-file", "--batch-command") cmd.Dir = dir @@ -93,7 +114,9 @@ func (g *gitCatFileBatch) Info(ref string) (gitCatFileBatchInfo, error) { info, err := parseGitCatFileBatchInfoLine(line) if err != nil { - g.kill() + if !isMissingError(err) { // missingError is recoverable + g.kill() + } return gitCatFileBatchInfo{}, err } @@ -124,7 +147,9 @@ func (g *gitCatFileBatch) Contents(ref string) (gitCatFileBatchInfo, error) { info, err := parseGitCatFileBatchInfoLine(line) if err != nil { - g.kill() + if !isMissingError(err) { // missingError is recoverable + g.kill() + } return gitCatFileBatchInfo{}, err } @@ -161,11 +186,16 @@ func (g *gitCatFileBatch) discard() error { // parseGitCatFileBatchInfoLine parses the info line from git-cat-file. It // expects the default format of: // -// SP SP LF +// SP SP LF func parseGitCatFileBatchInfoLine(line []byte) (gitCatFileBatchInfo, error) { line = bytes.TrimRight(line, "\n") origLine := line + if bytes.HasSuffix(line, []byte(" missing")) { + ref := bytes.TrimSuffix(line, []byte(" missing")) + return gitCatFileBatchInfo{}, &missingError{ref: string(ref)} + } + // PERF this allocates much less than bytes.Split next := func() []byte { i := bytes.IndexByte(line, ' ') diff --git a/cmd/git-sg/catfile_test.go b/cmd/git-sg/catfile_test.go index 8f276d1c4..5a50e0fa0 100644 --- a/cmd/git-sg/catfile_test.go +++ b/cmd/git-sg/catfile_test.go @@ -22,6 +22,21 @@ func TestInfo(t *testing.T) { t.Log(info.Hash, info.Type, info.Size) + // Test that we can recover from missing + if info, err := p.Contents("sdflkjsdfDoesNOTexist"); !isMissingError(err) { + t.Fatalf("expected missing error got info=%v err=%v", info, err) + } + + // Now lets fetch the object again via hash and see if it stays the same. + info2, err := p.Info(info.Hash.String()) + if err != nil { + t.Fatal(err) + } + + if d := cmp.Diff(info, info2); d != "" { + t.Fatalf("info changed (-first, +second):\n%s", d) + } + if err := p.Close(); err != nil { t.Fatal(err) } @@ -54,6 +69,11 @@ func TestContents(t *testing.T) { t.Fatalf("expected HEAD to be a commit, got %s", info.Type) } + // Test that we can recover from missing + if info, err := p.Contents("sdflkjsdfDoesNOTexist"); !isMissingError(err) { + t.Fatalf("expected missing error got info=%v err=%v", info, err) + } + // Now lets fetch the object again via hash and see if it stays the same. info2, err := p.Contents(info.Hash.String()) if err != nil { From c95ac4f403543faed7e598ae0396f334d83548f9 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Wed, 21 Sep 2022 20:27:05 +0200 Subject: [PATCH 16/29] factor out common logic in cat-file --- cmd/git-sg/catfile.go | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/cmd/git-sg/catfile.go b/cmd/git-sg/catfile.go index a625a2a4a..136f0467b 100644 --- a/cmd/git-sg/catfile.go +++ b/cmd/git-sg/catfile.go @@ -96,28 +96,10 @@ func (g *gitCatFileBatch) Info(ref string) (gitCatFileBatchInfo, error) { g.in.WriteString("info ") g.in.WriteString(ref) g.in.WriteByte('\n') - if err := g.in.Flush(); err != nil { - g.kill() - return gitCatFileBatchInfo{}, err - } - - if err := g.discard(); err != nil { - g.kill() - return gitCatFileBatchInfo{}, err - } - line, err := g.out.ReadSlice('\n') + info, err := g.sendCommand() if err != nil { - g.kill() - return gitCatFileBatchInfo{}, err - } - - info, err := parseGitCatFileBatchInfoLine(line) - if err != nil { - if !isMissingError(err) { // missingError is recoverable - g.kill() - } - return gitCatFileBatchInfo{}, err + return info, err } g.readerN = 0 @@ -129,6 +111,18 @@ func (g *gitCatFileBatch) Contents(ref string) (gitCatFileBatchInfo, error) { g.in.WriteString("contents ") g.in.WriteString(ref) g.in.WriteByte('\n') + + info, err := g.sendCommand() + if err != nil { + return info, err + } + + g.readerN = info.Size + 1 + + return info, nil +} + +func (g *gitCatFileBatch) sendCommand() (gitCatFileBatchInfo, error) { if err := g.in.Flush(); err != nil { g.kill() return gitCatFileBatchInfo{}, err @@ -153,8 +147,6 @@ func (g *gitCatFileBatch) Contents(ref string) (gitCatFileBatchInfo, error) { return gitCatFileBatchInfo{}, err } - g.readerN = info.Size + 1 - return info, nil } From 5daae6759576fd15067e0463eaeed4fec645c859 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Wed, 21 Sep 2022 21:22:42 +0200 Subject: [PATCH 17/29] add hash native API to catfile This will avoid allocations when using it. --- cmd/git-sg/catfile.go | 42 ++++++++++++++++++++++++++++++++++++-- cmd/git-sg/catfile_test.go | 19 ++++++++++------- 2 files changed, 52 insertions(+), 9 deletions(-) diff --git a/cmd/git-sg/catfile.go b/cmd/git-sg/catfile.go index 136f0467b..13749654e 100644 --- a/cmd/git-sg/catfile.go +++ b/cmd/git-sg/catfile.go @@ -25,6 +25,9 @@ type gitCatFileBatch struct { out *bufio.Reader outCloser io.Closer + // hashBuf is encoded to for plumbing.Hash + hashBuf [20 * 2]byte + // readerN is the amount left to read for Read. Note: git-cat-file always // has a trailing new line, so this will always be the size of an object + // 1. @@ -92,7 +95,7 @@ type gitCatFileBatchInfo struct { Size int64 } -func (g *gitCatFileBatch) Info(ref string) (gitCatFileBatchInfo, error) { +func (g *gitCatFileBatch) InfoString(ref string) (gitCatFileBatchInfo, error) { g.in.WriteString("info ") g.in.WriteString(ref) g.in.WriteByte('\n') @@ -107,7 +110,22 @@ func (g *gitCatFileBatch) Info(ref string) (gitCatFileBatchInfo, error) { return info, nil } -func (g *gitCatFileBatch) Contents(ref string) (gitCatFileBatchInfo, error) { +func (g *gitCatFileBatch) Info(hash plumbing.Hash) (gitCatFileBatchInfo, error) { + g.in.WriteString("info ") + g.writeHash(hash) + g.in.WriteByte('\n') + + info, err := g.sendCommand() + if err != nil { + return info, err + } + + g.readerN = 0 + + return info, nil +} + +func (g *gitCatFileBatch) ContentsString(ref string) (gitCatFileBatchInfo, error) { g.in.WriteString("contents ") g.in.WriteString(ref) g.in.WriteByte('\n') @@ -122,6 +140,26 @@ func (g *gitCatFileBatch) Contents(ref string) (gitCatFileBatchInfo, error) { return info, nil } +func (g *gitCatFileBatch) Contents(hash plumbing.Hash) (gitCatFileBatchInfo, error) { + g.in.WriteString("contents ") + g.writeHash(hash) + g.in.WriteByte('\n') + + info, err := g.sendCommand() + if err != nil { + return info, err + } + + g.readerN = info.Size + 1 + + return info, nil +} + +func (g *gitCatFileBatch) writeHash(hash plumbing.Hash) { + hex.Encode(g.hashBuf[:], hash[:]) + g.in.Write(g.hashBuf[:]) +} + func (g *gitCatFileBatch) sendCommand() (gitCatFileBatchInfo, error) { if err := g.in.Flush(); err != nil { g.kill() diff --git a/cmd/git-sg/catfile_test.go b/cmd/git-sg/catfile_test.go index 5a50e0fa0..b6682da0d 100644 --- a/cmd/git-sg/catfile_test.go +++ b/cmd/git-sg/catfile_test.go @@ -15,7 +15,7 @@ func TestInfo(t *testing.T) { } defer p.Close() - info, err := p.Info("HEAD") + info, err := p.InfoString("HEAD") if err != nil { t.Fatal(err) } @@ -23,12 +23,12 @@ func TestInfo(t *testing.T) { t.Log(info.Hash, info.Type, info.Size) // Test that we can recover from missing - if info, err := p.Contents("sdflkjsdfDoesNOTexist"); !isMissingError(err) { + if info, err := p.InfoString("sdflkjsdfDoesNOTexist"); !isMissingError(err) { t.Fatalf("expected missing error got info=%v err=%v", info, err) } // Now lets fetch the object again via hash and see if it stays the same. - info2, err := p.Info(info.Hash.String()) + info2, err := p.Info(info.Hash) if err != nil { t.Fatal(err) } @@ -49,7 +49,7 @@ func TestContents(t *testing.T) { } defer p.Close() - info, err := p.Contents("HEAD") + info, err := p.ContentsString("HEAD") if err != nil { t.Fatal(err) } @@ -70,12 +70,12 @@ func TestContents(t *testing.T) { } // Test that we can recover from missing - if info, err := p.Contents("sdflkjsdfDoesNOTexist"); !isMissingError(err) { + if info, err := p.ContentsString("sdflkjsdfDoesNOTexist"); !isMissingError(err) { t.Fatalf("expected missing error got info=%v err=%v", info, err) } // Now lets fetch the object again via hash and see if it stays the same. - info2, err := p.Contents(info.Hash.String()) + info2, err := p.Contents(info.Hash) if err != nil { t.Fatal(err) } @@ -104,8 +104,13 @@ func BenchmarkInfo(b *testing.B) { } defer p.Close() + info, err := p.InfoString("HEAD") + if err != nil { + b.Fatal(err) + } + for i := 0; i < b.N; i++ { - _, err := p.Info("HEAD") + _, err := p.Info(info.Hash) if err != nil { b.Fatal(err) } From 801d2a484e37a014e20421c7b3370b35954b506e Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Wed, 21 Sep 2022 21:35:10 +0200 Subject: [PATCH 18/29] make archive writer based on tree entries instead of object.TreeEntry We only read the entries field, so this makes it easier to use a different impl. --- cmd/git-sg/main.go | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/cmd/git-sg/main.go b/cmd/git-sg/main.go index 2d1a0f651..d16a9a0d3 100644 --- a/cmd/git-sg/main.go +++ b/cmd/git-sg/main.go @@ -85,7 +85,7 @@ func archiveWrite(w io.Writer, repo *git.Repository, tree *object.Tree, opts *ar repo: repo, opts: opts, - stack: []item{{tree: tree, path: ""}}, + stack: []item{{entries: tree.Entries, path: ""}}, // 32*1024 is the same size used by io.Copy buf: make([]byte, 32*1024), @@ -95,7 +95,7 @@ func archiveWrite(w io.Writer, repo *git.Repository, tree *object.Tree, opts *ar item := a.stack[len(a.stack)-1] a.stack = a.stack[:len(a.stack)-1] - err := a.writeTree(item.tree, item.path) + err := a.writeTree(item.entries, item.path) if err != nil { _ = a.w.Close() return err @@ -106,22 +106,23 @@ func archiveWrite(w io.Writer, repo *git.Repository, tree *object.Tree, opts *ar } type item struct { - tree *object.Tree - path string + entries []object.TreeEntry + path string } type archiveWriter struct { w *tar.Writer - repo *git.Repository opts *archiveOpts + repo *git.Repository + stack []item buf []byte } -func (a *archiveWriter) writeTree(tree *object.Tree, path string) error { - for _, e := range tree.Entries { +func (a *archiveWriter) writeTree(entries []object.TreeEntry, path string) error { + for _, e := range entries { var p string if e.Mode == filemode.Dir { p = path + e.Name + "/" @@ -150,7 +151,7 @@ func (a *archiveWriter) writeTree(tree *object.Tree, path string) error { return err } - a.stack = append(a.stack, item{tree: child, path: p}) + a.stack = append(a.stack, item{entries: child.Entries, path: p}) case filemode.Deprecated, filemode.Executable, filemode.Regular, filemode.Symlink: if err := a.writeRegularTreeEntry(e, p); err != nil { From f7a1f9cd15f66736e80572417f9d43019dc26567 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Wed, 21 Sep 2022 21:39:01 +0200 Subject: [PATCH 19/29] move archive code into own file --- cmd/git-sg/archive.go | 182 ++++++++++++++++++++++++++++++++++++++++++ cmd/git-sg/main.go | 172 --------------------------------------- 2 files changed, 182 insertions(+), 172 deletions(-) create mode 100644 cmd/git-sg/archive.go diff --git a/cmd/git-sg/archive.go b/cmd/git-sg/archive.go new file mode 100644 index 000000000..77b2cd11c --- /dev/null +++ b/cmd/git-sg/archive.go @@ -0,0 +1,182 @@ +package main + +import ( + "archive/tar" + "bytes" + "io" + "log" + + "github.com/go-git/go-git/v5" + "github.com/go-git/go-git/v5/plumbing/filemode" + "github.com/go-git/go-git/v5/plumbing/object" +) + +type archiveOpts struct { + // Ignore if true will exclude path from the archive + Ignore func(path string) bool + // SkipContent if returning a non-empty string will include an entry for + // path but with no content. The PAX header SOURCEGRAPH.skip will contain + // the returned string (a reason for skipping). + SkipContent func(hdr *tar.Header) string +} + +func archiveWrite(w io.Writer, repo *git.Repository, tree *object.Tree, opts *archiveOpts) error { + a := &archiveWriter{ + w: tar.NewWriter(w), + repo: repo, + opts: opts, + + stack: []item{{entries: tree.Entries, path: ""}}, + + // 32*1024 is the same size used by io.Copy + buf: make([]byte, 32*1024), + } + + for len(a.stack) > 0 { + item := a.stack[len(a.stack)-1] + a.stack = a.stack[:len(a.stack)-1] + + err := a.writeTree(item.entries, item.path) + if err != nil { + _ = a.w.Close() + return err + } + } + + return a.w.Close() +} + +type item struct { + entries []object.TreeEntry + path string +} + +type archiveWriter struct { + w *tar.Writer + opts *archiveOpts + + repo *git.Repository + + stack []item + + buf []byte +} + +func (a *archiveWriter) writeTree(entries []object.TreeEntry, path string) error { + for _, e := range entries { + var p string + if e.Mode == filemode.Dir { + p = path + e.Name + "/" + } else { + p = path + e.Name + } + + if a.opts.Ignore(p) { + continue + } + + switch e.Mode { + case filemode.Dir: + child, err := a.repo.TreeObject(e.Hash) + if err != nil { + log.Printf("failed to fetch tree object for %s %v: %v", p, e.Hash, err) + continue + } + + if err := a.w.WriteHeader(&tar.Header{ + Typeflag: tar.TypeDir, + Name: p, + Mode: 0777, + Format: tar.FormatPAX, // TODO ? + }); err != nil { + return err + } + + a.stack = append(a.stack, item{entries: child.Entries, path: p}) + + case filemode.Deprecated, filemode.Executable, filemode.Regular, filemode.Symlink: + if err := a.writeRegularTreeEntry(e, p); err != nil { + return err + } + + case filemode.Submodule: + // TODO what do? + continue + + default: + log.Printf("WARN: unexpected filemode %+v", e) + } + } + + return nil +} + +func (a *archiveWriter) writeRegularTreeEntry(entry object.TreeEntry, path string) error { + blob, err := a.repo.BlobObject(entry.Hash) + if err != nil { + log.Printf("failed to get blob object for %s %v: %v", path, entry.Hash, err) + return nil + } + + // TODO symlinks, mode, etc. Handle large Linkname + hdr := &tar.Header{ + Typeflag: tar.TypeReg, + Name: path, + Size: blob.Size, + Mode: 0666, + + Format: tar.FormatPAX, // TODO ? + } + + if reason := a.opts.SkipContent(hdr); reason != "" { + return a.writeSkipHeader(hdr, reason) + } + + r, err := blob.Reader() + if err != nil { + log.Printf("failed to read blob object for %s %v: %v", path, entry.Hash, err) + return nil + } + + // TODO confirm it is fine to call Close twice. From initial reading of + // go-git it relies on io.Pipe for readers, so this should be fine. + defer r.Close() + + // Heuristic: Assume file is binary if first 256 bytes contain a 0x00. + blobSample := a.buf[:256] + if n, err := io.ReadAtLeast(r, blobSample, 256); err != nil && err != io.EOF && err != io.ErrUnexpectedEOF { + log.Printf("failed to read blob object for %s %v: %v", path, entry.Hash, err) + return nil + } else { + blobSample = blobSample[:n] + } + + // TODO instead of just binary, should we only allow utf8? utf.Valid + // works except for the fact we may be invalid utf8 at the 256 boundary + // since we cut it off. So will need to copypasta that. + if bytes.IndexByte(blobSample, 0x00) >= 0 { + return a.writeSkipHeader(hdr, "binary") + } + + if err := a.w.WriteHeader(hdr); err != nil { + return err + } + + // We read some bytes from r already, first write those. + if _, err := a.w.Write(blobSample); err != nil { + return err + } + + // Write out the rest of r. + if _, err := io.CopyBuffer(a.w, r, a.buf); err != nil { + return err + } + + return r.Close() +} + +func (a *archiveWriter) writeSkipHeader(hdr *tar.Header, reason string) error { + hdr.PAXRecords = map[string]string{"SG.skip": reason} + hdr.Size = 0 // clear out size since we won't write the body + return a.w.WriteHeader(hdr) +} diff --git a/cmd/git-sg/main.go b/cmd/git-sg/main.go index d16a9a0d3..064290025 100644 --- a/cmd/git-sg/main.go +++ b/cmd/git-sg/main.go @@ -3,7 +3,6 @@ package main import ( "archive/tar" "bufio" - "bytes" "flag" "fmt" "io" @@ -16,7 +15,6 @@ import ( "github.com/go-git/go-billy/v5/osfs" "github.com/go-git/go-git/v5" "github.com/go-git/go-git/v5/plumbing/cache" - "github.com/go-git/go-git/v5/plumbing/filemode" "github.com/go-git/go-git/v5/plumbing/object" "github.com/go-git/go-git/v5/storage/filesystem" "github.com/sourcegraph/zoekt/ignore" @@ -70,176 +68,6 @@ func do(w io.Writer) error { return archiveWrite(w, r, root, opts) } -type archiveOpts struct { - // Ignore if true will exclude path from the archive - Ignore func(path string) bool - // SkipContent if returning a non-empty string will include an entry for - // path but with no content. The PAX header SOURCEGRAPH.skip will contain - // the returned string (a reason for skipping). - SkipContent func(hdr *tar.Header) string -} - -func archiveWrite(w io.Writer, repo *git.Repository, tree *object.Tree, opts *archiveOpts) error { - a := &archiveWriter{ - w: tar.NewWriter(w), - repo: repo, - opts: opts, - - stack: []item{{entries: tree.Entries, path: ""}}, - - // 32*1024 is the same size used by io.Copy - buf: make([]byte, 32*1024), - } - - for len(a.stack) > 0 { - item := a.stack[len(a.stack)-1] - a.stack = a.stack[:len(a.stack)-1] - - err := a.writeTree(item.entries, item.path) - if err != nil { - _ = a.w.Close() - return err - } - } - - return a.w.Close() -} - -type item struct { - entries []object.TreeEntry - path string -} - -type archiveWriter struct { - w *tar.Writer - opts *archiveOpts - - repo *git.Repository - - stack []item - - buf []byte -} - -func (a *archiveWriter) writeTree(entries []object.TreeEntry, path string) error { - for _, e := range entries { - var p string - if e.Mode == filemode.Dir { - p = path + e.Name + "/" - } else { - p = path + e.Name - } - - if a.opts.Ignore(p) { - continue - } - - switch e.Mode { - case filemode.Dir: - child, err := a.repo.TreeObject(e.Hash) - if err != nil { - log.Printf("failed to fetch tree object for %s %v: %v", p, e.Hash, err) - continue - } - - if err := a.w.WriteHeader(&tar.Header{ - Typeflag: tar.TypeDir, - Name: p, - Mode: 0777, - Format: tar.FormatPAX, // TODO ? - }); err != nil { - return err - } - - a.stack = append(a.stack, item{entries: child.Entries, path: p}) - - case filemode.Deprecated, filemode.Executable, filemode.Regular, filemode.Symlink: - if err := a.writeRegularTreeEntry(e, p); err != nil { - return err - } - - case filemode.Submodule: - // TODO what do? - continue - - default: - log.Printf("WARN: unexpected filemode %+v", e) - } - } - - return nil -} - -func (a *archiveWriter) writeRegularTreeEntry(entry object.TreeEntry, path string) error { - blob, err := a.repo.BlobObject(entry.Hash) - if err != nil { - log.Printf("failed to get blob object for %s %v: %v", path, entry.Hash, err) - return nil - } - - // TODO symlinks, mode, etc. Handle large Linkname - hdr := &tar.Header{ - Typeflag: tar.TypeReg, - Name: path, - Size: blob.Size, - Mode: 0666, - - Format: tar.FormatPAX, // TODO ? - } - - if reason := a.opts.SkipContent(hdr); reason != "" { - return a.writeSkipHeader(hdr, reason) - } - - r, err := blob.Reader() - if err != nil { - log.Printf("failed to read blob object for %s %v: %v", path, entry.Hash, err) - return nil - } - - // TODO confirm it is fine to call Close twice. From initial reading of - // go-git it relies on io.Pipe for readers, so this should be fine. - defer r.Close() - - // Heuristic: Assume file is binary if first 256 bytes contain a 0x00. - blobSample := a.buf[:256] - if n, err := io.ReadAtLeast(r, blobSample, 256); err != nil && err != io.EOF && err != io.ErrUnexpectedEOF { - log.Printf("failed to read blob object for %s %v: %v", path, entry.Hash, err) - return nil - } else { - blobSample = blobSample[:n] - } - - // TODO instead of just binary, should we only allow utf8? utf.Valid - // works except for the fact we may be invalid utf8 at the 256 boundary - // since we cut it off. So will need to copypasta that. - if bytes.IndexByte(blobSample, 0x00) >= 0 { - return a.writeSkipHeader(hdr, "binary") - } - - if err := a.w.WriteHeader(hdr); err != nil { - return err - } - - // We read some bytes from r already, first write those. - if _, err := a.w.Write(blobSample); err != nil { - return err - } - - // Write out the rest of r. - if _, err := io.CopyBuffer(a.w, r, a.buf); err != nil { - return err - } - - return r.Close() -} - -func (a *archiveWriter) writeSkipHeader(hdr *tar.Header, reason string) error { - hdr.PAXRecords = map[string]string{"SG.skip": reason} - hdr.Size = 0 // clear out size since we won't write the body - return a.w.WriteHeader(hdr) -} - func getIgnoreFilter(r *git.Repository, root *object.Tree) func(string) bool { m, err := parseIgnoreFile(r, root) if err != nil { From a4cdae075f62248006cc36a7d11bad05b4394049 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Wed, 21 Sep 2022 21:59:01 +0200 Subject: [PATCH 20/29] wip interface to allow swapping out backend for archive writer --- cmd/git-sg/archive.go | 94 ++++++++++++++++++++++++++++++++++++++++--- cmd/git-sg/main.go | 2 +- 2 files changed, 89 insertions(+), 7 deletions(-) diff --git a/cmd/git-sg/archive.go b/cmd/git-sg/archive.go index 77b2cd11c..23a708179 100644 --- a/cmd/git-sg/archive.go +++ b/cmd/git-sg/archive.go @@ -7,6 +7,7 @@ import ( "log" "github.com/go-git/go-git/v5" + "github.com/go-git/go-git/v5/plumbing" "github.com/go-git/go-git/v5/plumbing/filemode" "github.com/go-git/go-git/v5/plumbing/object" ) @@ -20,7 +21,7 @@ type archiveOpts struct { SkipContent func(hdr *tar.Header) string } -func archiveWrite(w io.Writer, repo *git.Repository, tree *object.Tree, opts *archiveOpts) error { +func archiveWrite(w io.Writer, repo archiveWriterRepo, tree *object.Tree, opts *archiveOpts) error { a := &archiveWriter{ w: tar.NewWriter(w), repo: repo, @@ -51,11 +52,21 @@ type item struct { path string } +type archiveWriterBlob interface { + Size() int64 + Reader() (io.ReadCloser, error) +} + +type archiveWriterRepo interface { + TreeEntries(plumbing.Hash) ([]object.TreeEntry, error) + Blob(plumbing.Hash) (archiveWriterBlob, error) +} + type archiveWriter struct { w *tar.Writer opts *archiveOpts - repo *git.Repository + repo archiveWriterRepo stack []item @@ -77,7 +88,7 @@ func (a *archiveWriter) writeTree(entries []object.TreeEntry, path string) error switch e.Mode { case filemode.Dir: - child, err := a.repo.TreeObject(e.Hash) + child, err := a.repo.TreeEntries(e.Hash) if err != nil { log.Printf("failed to fetch tree object for %s %v: %v", p, e.Hash, err) continue @@ -92,7 +103,7 @@ func (a *archiveWriter) writeTree(entries []object.TreeEntry, path string) error return err } - a.stack = append(a.stack, item{entries: child.Entries, path: p}) + a.stack = append(a.stack, item{entries: child, path: p}) case filemode.Deprecated, filemode.Executable, filemode.Regular, filemode.Symlink: if err := a.writeRegularTreeEntry(e, p); err != nil { @@ -112,7 +123,7 @@ func (a *archiveWriter) writeTree(entries []object.TreeEntry, path string) error } func (a *archiveWriter) writeRegularTreeEntry(entry object.TreeEntry, path string) error { - blob, err := a.repo.BlobObject(entry.Hash) + blob, err := a.repo.Blob(entry.Hash) if err != nil { log.Printf("failed to get blob object for %s %v: %v", path, entry.Hash, err) return nil @@ -122,7 +133,7 @@ func (a *archiveWriter) writeRegularTreeEntry(entry object.TreeEntry, path strin hdr := &tar.Header{ Typeflag: tar.TypeReg, Name: path, - Size: blob.Size, + Size: blob.Size(), Mode: 0666, Format: tar.FormatPAX, // TODO ? @@ -180,3 +191,74 @@ func (a *archiveWriter) writeSkipHeader(hdr *tar.Header, reason string) error { hdr.Size = 0 // clear out size since we won't write the body return a.w.WriteHeader(hdr) } + +type archiveWriterBlobGoGit struct { + blob *object.Blob +} + +func (b archiveWriterBlobGoGit) Size() int64 { + return b.blob.Size +} + +func (b archiveWriterBlobGoGit) Reader() (io.ReadCloser, error) { + return b.blob.Reader() +} + +type archiveWriterRepoGoGit git.Repository + +func (repo *archiveWriterRepoGoGit) TreeEntries(hash plumbing.Hash) ([]object.TreeEntry, error) { + tree, err := (*git.Repository)(repo).TreeObject(hash) + if err != nil { + return nil, err + } + return tree.Entries, nil +} + +func (repo *archiveWriterRepoGoGit) Blob(hash plumbing.Hash) (archiveWriterBlob, error) { + blob, err := (*git.Repository)(repo).BlobObject(hash) + if err != nil { + return nil, err + } + return archiveWriterBlobGoGit{blob: blob}, nil +} + +type archiveWriterBlobCatFile struct { + catFile *gitCatFileBatch + info gitCatFileBatchInfo +} + +func (b archiveWriterBlobCatFile) Size() int64 { + return b.info.Size +} + +func (b archiveWriterBlobCatFile) Reader() (io.ReadCloser, error) { + _, err := b.catFile.Contents(b.info.Hash) + if err != nil { + return nil, err + } + return io.NopCloser(b.catFile), nil +} + +type archiveWriterRepoCatFile struct { + catFile *gitCatFileBatch +} + +func (r archiveWriterRepoCatFile) TreeEntries(hash plumbing.Hash) ([]object.TreeEntry, error) { + _, err := r.catFile.Contents(hash) + if err != nil { + return nil, err + } + return nil, nil + //return tree.Entries, nil +} + +func (r archiveWriterRepoCatFile) Blob(hash plumbing.Hash) (archiveWriterBlob, error) { + info, err := r.catFile.Info(hash) + if err != nil { + return nil, err + } + return archiveWriterBlobCatFile{ + catFile: r.catFile, + info: info, + }, nil +} diff --git a/cmd/git-sg/main.go b/cmd/git-sg/main.go index 064290025..176a6edf5 100644 --- a/cmd/git-sg/main.go +++ b/cmd/git-sg/main.go @@ -65,7 +65,7 @@ func do(w io.Writer) error { return archiveFilter(w, r, root, opts) } - return archiveWrite(w, r, root, opts) + return archiveWrite(w, (*archiveWriterRepoGoGit)(r), root, opts) } func getIgnoreFilter(r *git.Repository, root *object.Tree) func(string) bool { From ae81031e60190a9782bd2d0e9a6bd6bf0b6fa203 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Thu, 22 Sep 2022 08:31:38 +0200 Subject: [PATCH 21/29] implement TreeEntries for cat-file --- cmd/git-sg/archive.go | 62 ++++++++++++++++++++++++++++++++++++----- cmd/git-sg/main.go | 34 ++++++++++++++++------ cmd/git-sg/main_test.go | 9 ++++-- 3 files changed, 88 insertions(+), 17 deletions(-) diff --git a/cmd/git-sg/archive.go b/cmd/git-sg/archive.go index 23a708179..ae76cc8e2 100644 --- a/cmd/git-sg/archive.go +++ b/cmd/git-sg/archive.go @@ -2,9 +2,11 @@ package main import ( "archive/tar" + "bufio" "bytes" "io" "log" + "sync" "github.com/go-git/go-git/v5" "github.com/go-git/go-git/v5/plumbing" @@ -243,22 +245,68 @@ type archiveWriterRepoCatFile struct { catFile *gitCatFileBatch } -func (r archiveWriterRepoCatFile) TreeEntries(hash plumbing.Hash) ([]object.TreeEntry, error) { - _, err := r.catFile.Contents(hash) +var bufPool = sync.Pool{ + New: func() interface{} { + return bufio.NewReader(nil) + }, +} + +func (w archiveWriterRepoCatFile) TreeEntries(hash plumbing.Hash) ([]object.TreeEntry, error) { + _, err := w.catFile.Contents(hash) if err != nil { return nil, err } - return nil, nil - //return tree.Entries, nil + + var entries []object.TreeEntry + + // Copy-pasta from go-git/plumbing/object/tree.go + r := bufPool.Get().(*bufio.Reader) + defer bufPool.Put(r) + r.Reset(w.catFile) + for { + str, err := r.ReadString(' ') + if err != nil { + if err == io.EOF { + break + } + + return nil, err + } + str = str[:len(str)-1] // strip last byte (' ') + + mode, err := filemode.New(str) + if err != nil { + return nil, err + } + + name, err := r.ReadString(0) + if err != nil && err != io.EOF { + return nil, err + } + + var hash plumbing.Hash + if _, err = io.ReadFull(r, hash[:]); err != nil { + return nil, err + } + + baseName := name[:len(name)-1] + entries = append(entries, object.TreeEntry{ + Hash: hash, + Mode: mode, + Name: baseName, + }) + } + + return entries, nil } -func (r archiveWriterRepoCatFile) Blob(hash plumbing.Hash) (archiveWriterBlob, error) { - info, err := r.catFile.Info(hash) +func (w archiveWriterRepoCatFile) Blob(hash plumbing.Hash) (archiveWriterBlob, error) { + info, err := w.catFile.Info(hash) if err != nil { return nil, err } return archiveWriterBlobCatFile{ - catFile: r.catFile, + catFile: w.catFile, info: info, }, nil } diff --git a/cmd/git-sg/main.go b/cmd/git-sg/main.go index 176a6edf5..00226078b 100644 --- a/cmd/git-sg/main.go +++ b/cmd/git-sg/main.go @@ -65,7 +65,22 @@ func do(w io.Writer) error { return archiveFilter(w, r, root, opts) } - return archiveWrite(w, (*archiveWriterRepoGoGit)(r), root, opts) + var repo archiveWriterRepo = (*archiveWriterRepoGoGit)(r) + if os.Getenv("GIT_SG_CATFILE") != "" { + log.Println("using git-cat-file") + dir, err := gitDir() + if err != nil { + return err + } + catFile, err := startGitCatFileBatch(dir) + if err != nil { + return err + } + defer catFile.Close() + repo = archiveWriterRepoCatFile{catFile: catFile} + } + + return archiveWrite(w, repo, root, opts) } func getIgnoreFilter(r *git.Repository, root *object.Tree) func(string) bool { @@ -121,14 +136,17 @@ func isNotExist(err error) bool { return os.IsNotExist(err) || strings.Contains(err.Error(), "not found") } +func gitDir() (string, error) { + if dir := os.Getenv("GIT_DIR"); dir != "" { + return dir, nil + } + return os.Getwd() +} + func openGitRepo() (*git.Repository, error) { - dir := os.Getenv("GIT_DIR") - if dir == "" { - var err error - dir, err = os.Getwd() - if err != nil { - return nil, err - } + dir, err := gitDir() + if err != nil { + return nil, err } fs := osfs.New(dir) diff --git a/cmd/git-sg/main_test.go b/cmd/git-sg/main_test.go index 380ab070b..ccaa9f293 100644 --- a/cmd/git-sg/main_test.go +++ b/cmd/git-sg/main_test.go @@ -2,12 +2,17 @@ package main import ( "io" + "path/filepath" "testing" ) func TestDo(t *testing.T) { - t.Setenv("GIT_DIR", "../../.git") - err := do(io.Discard) + dir, err := filepath.Abs("../../.git") + if err != nil { + t.Fatal(err) + } + t.Setenv("GIT_DIR", dir) + err = do(io.Discard) if err != nil { t.Fatal(err) } From d707154c336a29f307245333b3108e8bc1fb1cc1 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Thu, 22 Sep 2022 09:14:42 +0200 Subject: [PATCH 22/29] test all modes --- cmd/git-sg/main_test.go | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/cmd/git-sg/main_test.go b/cmd/git-sg/main_test.go index ccaa9f293..8eb31129d 100644 --- a/cmd/git-sg/main_test.go +++ b/cmd/git-sg/main_test.go @@ -1,7 +1,6 @@ package main import ( - "io" "path/filepath" "testing" ) @@ -12,8 +11,34 @@ func TestDo(t *testing.T) { t.Fatal(err) } t.Setenv("GIT_DIR", dir) - err = do(io.Discard) - if err != nil { - t.Fatal(err) + + for _, envvar := range []string{"", "GIT_SG_BUFFER", "GIT_SG_FILTER", "GIT_SG_CATFILE"} { + name := envvar + if name == "" { + name = "default" + } + t.Run(name, func(t *testing.T) { + if envvar != "" { + t.Setenv(envvar, "1") + } + var w countingWriter + err = do(&w) + if err != nil { + t.Fatal(err) + } + t.Logf("wrote %d bytes", w.N) + if w.N == 0 { + t.Fatal("wrote no bytes") + } + }) } } + +type countingWriter struct { + N int +} + +func (w *countingWriter) Write(b []byte) (int, error) { + w.N += len(b) + return len(b), nil +} From d88c93fc98bbc18dac2d132f3b245402e1bb7bed Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Thu, 22 Sep 2022 09:22:03 +0200 Subject: [PATCH 23/29] wip lstree --- cmd/git-sg/lstree.go | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 cmd/git-sg/lstree.go diff --git a/cmd/git-sg/lstree.go b/cmd/git-sg/lstree.go new file mode 100644 index 000000000..95c6a7c40 --- /dev/null +++ b/cmd/git-sg/lstree.go @@ -0,0 +1,32 @@ +package main + +import ( + "bytes" + "fmt" + "io" + "syscall" + + "github.com/go-git/go-git/v5" + "github.com/go-git/go-git/v5/plumbing/object" +) + +func archiveLsTree(w io.Writer, repo *git.Repository, tree *object.Tree, opts *archiveOpts) (err error) { + fmt.Println(syscall.PathMax) + return nil +} + +// scanNull is a split function for bufio.Scanner that returns each item of +// text as split by the null character. It will not include the null. +func scanNull(data []byte, atEOF bool) (advance int, token []byte, err error) { + if atEOF && len(data) == 0 { + return 0, nil, nil + } + if i := bytes.IndexByte(data, 0); i >= 0 { + return i + 1, data[0:i], nil + } + if atEOF { + return len(data), data, nil + } + // Request more data. + return 0, nil, nil +} From d29b6ba2fce124b42530a272e80017e29abff2c2 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Thu, 22 Sep 2022 11:38:20 +0200 Subject: [PATCH 24/29] refactor catfile to separate out gitCatFileBatchReader --- cmd/git-sg/catfile.go | 144 ++++++++++++++++++++++++++---------------- 1 file changed, 88 insertions(+), 56 deletions(-) diff --git a/cmd/git-sg/catfile.go b/cmd/git-sg/catfile.go index 13749654e..4bd0179fe 100644 --- a/cmd/git-sg/catfile.go +++ b/cmd/git-sg/catfile.go @@ -19,19 +19,13 @@ import ( // This provides an efficient means to interact with the git object store of a // repository. type gitCatFileBatch struct { - cmd *exec.Cmd - in *bufio.Writer - inCloser io.Closer - out *bufio.Reader - outCloser io.Closer + cmd *exec.Cmd + in *bufio.Writer + inCloser io.Closer + out *gitCatFileBatchReader // hashBuf is encoded to for plumbing.Hash hashBuf [20 * 2]byte - - // readerN is the amount left to read for Read. Note: git-cat-file always - // has a trailing new line, so this will always be the size of an object + - // 1. - readerN int64 } type missingError struct { @@ -81,11 +75,10 @@ func startGitCatFileBatch(dir string) (_ *gitCatFileBatch, err error) { } return &gitCatFileBatch{ - cmd: cmd, - in: bufio.NewWriter(stdin), - inCloser: stdin, - out: bufio.NewReader(stdout), - outCloser: stdout, + cmd: cmd, + in: bufio.NewWriter(stdin), + inCloser: stdin, + out: newGitCatFileBatchReader(stdout), }, nil } @@ -99,60 +92,68 @@ func (g *gitCatFileBatch) InfoString(ref string) (gitCatFileBatchInfo, error) { g.in.WriteString("info ") g.in.WriteString(ref) g.in.WriteByte('\n') - - info, err := g.sendCommand() - if err != nil { - return info, err + if err := g.in.Flush(); err != nil { + g.kill() + return gitCatFileBatchInfo{}, err } - g.readerN = 0 - - return info, nil + info, err := g.out.Info() + if err != nil && !isMissingError(err) { // missingError is recoverable + g.kill() + } + return info, err } func (g *gitCatFileBatch) Info(hash plumbing.Hash) (gitCatFileBatchInfo, error) { g.in.WriteString("info ") g.writeHash(hash) g.in.WriteByte('\n') - - info, err := g.sendCommand() - if err != nil { - return info, err + if err := g.in.Flush(); err != nil { + g.kill() + return gitCatFileBatchInfo{}, err } - g.readerN = 0 - - return info, nil + info, err := g.out.Info() + if err != nil && !isMissingError(err) { // missingError is recoverable + g.kill() + } + return info, err } func (g *gitCatFileBatch) ContentsString(ref string) (gitCatFileBatchInfo, error) { g.in.WriteString("contents ") g.in.WriteString(ref) g.in.WriteByte('\n') - - info, err := g.sendCommand() - if err != nil { - return info, err + if err := g.in.Flush(); err != nil { + g.kill() + return gitCatFileBatchInfo{}, err } - g.readerN = info.Size + 1 - - return info, nil + info, err := g.out.Contents() + if err != nil && !isMissingError(err) { // missingError is recoverable + g.kill() + } + return info, err } func (g *gitCatFileBatch) Contents(hash plumbing.Hash) (gitCatFileBatchInfo, error) { g.in.WriteString("contents ") g.writeHash(hash) g.in.WriteByte('\n') - - info, err := g.sendCommand() - if err != nil { - return info, err + if err := g.in.Flush(); err != nil { + g.kill() + return gitCatFileBatchInfo{}, err } - g.readerN = info.Size + 1 + info, err := g.out.Contents() + if err != nil && !isMissingError(err) { // missingError is recoverable + g.kill() + } + return info, err +} - return info, nil +func (g *gitCatFileBatch) Read(b []byte) (int, error) { + return g.out.Read(b) } func (g *gitCatFileBatch) writeHash(hash plumbing.Hash) { @@ -160,35 +161,62 @@ func (g *gitCatFileBatch) writeHash(hash plumbing.Hash) { g.in.Write(g.hashBuf[:]) } -func (g *gitCatFileBatch) sendCommand() (gitCatFileBatchInfo, error) { - if err := g.in.Flush(); err != nil { - g.kill() - return gitCatFileBatchInfo{}, err +type gitCatFileBatchReader struct { + out *bufio.Reader + outCloser io.Closer + + // readerN is the amount left to read for Read. Note: git-cat-file always + // has a trailing new line, so this will always be the size of an object + + // 1. + readerN int64 +} + +func newGitCatFileBatchReader(r io.ReadCloser) *gitCatFileBatchReader { + return &gitCatFileBatchReader{ + out: bufio.NewReader(r), + outCloser: r, } +} - if err := g.discard(); err != nil { - g.kill() +func (g *gitCatFileBatchReader) Info() (gitCatFileBatchInfo, error) { + if err := g.Discard(); err != nil { + g.Close() return gitCatFileBatchInfo{}, err } line, err := g.out.ReadSlice('\n') if err != nil { - g.kill() + g.Close() return gitCatFileBatchInfo{}, err } info, err := parseGitCatFileBatchInfoLine(line) if err != nil { if !isMissingError(err) { // missingError is recoverable - g.kill() + g.Close() } return gitCatFileBatchInfo{}, err } + // Info has nothing following to read + g.readerN = 0 + + return info, nil +} + +func (g *gitCatFileBatchReader) Contents() (gitCatFileBatchInfo, error) { + info, err := g.Info() + if err != nil { + return info, err + } + + // Still have the contents to read and an extra newline + g.readerN = info.Size + 1 + return info, nil } -func (g *gitCatFileBatch) Read(p []byte) (n int, err error) { +func (g *gitCatFileBatchReader) Read(p []byte) (n int, err error) { // We avoid reading the final byte (a newline). That will be handled by // discard. if g.readerN <= 1 { @@ -202,9 +230,9 @@ func (g *gitCatFileBatch) Read(p []byte) (n int, err error) { return } -// discard should be called before parsing a response to flush out any unread +// Discard should be called before parsing a response to flush out any unread // data since the last command. -func (g *gitCatFileBatch) discard() error { +func (g *gitCatFileBatchReader) Discard() error { if g.readerN > 0 { n, err := g.out.Discard(int(g.readerN)) g.readerN -= int64(n) @@ -213,6 +241,10 @@ func (g *gitCatFileBatch) discard() error { return nil } +func (g *gitCatFileBatchReader) Close() error { + return g.outCloser.Close() +} + // parseGitCatFileBatchInfoLine parses the info line from git-cat-file. It // expects the default format of: // @@ -267,7 +299,7 @@ func (g *gitCatFileBatch) Close() (err error) { } }() - if err := g.discard(); err != nil { + if err := g.out.Discard(); err != nil { return err } @@ -283,7 +315,7 @@ func (g *gitCatFileBatch) Close() (err error) { log.Printf("unexpected %d bytes of remaining output when calling close", n) } - if err := g.outCloser.Close(); err != nil { + if err := g.out.Close(); err != nil { return err } @@ -293,5 +325,5 @@ func (g *gitCatFileBatch) Close() (err error) { func (g *gitCatFileBatch) kill() { _ = g.cmd.Process.Kill() _ = g.inCloser.Close() - _ = g.outCloser.Close() + _ = g.out.Close() } From 01adb4e51817b407674362973d7ca7329789f8b2 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Thu, 22 Sep 2022 11:38:37 +0200 Subject: [PATCH 25/29] ls-tree just writing an archive of names --- cmd/git-sg/lstree.go | 120 +++++++++++++++++++++++++++++++++++++++- cmd/git-sg/main.go | 5 ++ cmd/git-sg/main_test.go | 2 +- 3 files changed, 123 insertions(+), 4 deletions(-) diff --git a/cmd/git-sg/lstree.go b/cmd/git-sg/lstree.go index 95c6a7c40..c6e943782 100644 --- a/cmd/git-sg/lstree.go +++ b/cmd/git-sg/lstree.go @@ -1,18 +1,132 @@ package main import ( + "archive/tar" + "bufio" "bytes" "fmt" "io" - "syscall" + "log" + "os/exec" + "strconv" "github.com/go-git/go-git/v5" "github.com/go-git/go-git/v5/plumbing/object" ) func archiveLsTree(w io.Writer, repo *git.Repository, tree *object.Tree, opts *archiveOpts) (err error) { - fmt.Println(syscall.PathMax) - return nil + cmd := exec.Command("git", "ls-tree", "-r", "-l", "-t", "-z", tree.Hash.String()) + r, err := cmd.StdoutPipe() + if err != nil { + return err + } + defer r.Close() + + tw := tar.NewWriter(w) + + err = cmd.Start() + if err != nil { + return err + } + + done := false + defer func() { + if done { + return + } + err2 := cmd.Process.Kill() + if err == nil { + err = err2 + } + }() + + entries := bufio.NewScanner(r) + entries.Split(scanNull) + + for entries.Scan() { + line := entries.Bytes() + // PERF this allocates much less than bytes.Split + next := func() []byte { + i := bytes.IndexByte(line, ' ') + if i < 0 { + pre := line + line = nil + return pre + } + pre := line[:i] + line = bytes.TrimLeft(line[i+1:], " ") + return pre + } + + // %(objectmode) %(objecttype) %(objectname) %(objectsize:padded)%x09%(path) + modeRaw := next() + typ := next() + hash := next() + + _ = hash + + // remaining: %(objectsize:padded)\t%(path) + // + // size is left padded with space + line = bytes.TrimLeft(line, " ") + i := bytes.IndexByte(line, '\t') + if i < 0 { + return fmt.Errorf("malformed ls-tree entry: %q", entries.Text()) + } + sizeRaw := line[:i] + path := string(line[i+1:]) + + if opts.Ignore(path) { + continue + } + + if bytes.Equal(typ, []byte("blob")) { + mode, _ := strconv.ParseInt(string(modeRaw), 8, 64) + size, _ := strconv.ParseInt(string(sizeRaw), 10, 64) + + hdr := tar.Header{ + Typeflag: tar.TypeReg, + Name: path, + Mode: mode & 0777, + Size: size, + Format: tar.FormatPAX, // TODO ? + } + + if reason := opts.SkipContent(&hdr); reason != "" { + hdr.PAXRecords = map[string]string{"SG.skip": reason} + } + + hdr.Size = 0 + if err := tw.WriteHeader(&hdr); err != nil { + return err + } + + } else if bytes.Equal(typ, []byte("tree")) { + hdr := tar.Header{ + Typeflag: tar.TypeDir, + Name: path, + Mode: 0777, + Format: tar.FormatPAX, // TODO ? + } + if err := tw.WriteHeader(&hdr); err != nil { + return err + } + } else { + log.Printf("unexpected type on line: %q", entries.Text()) + continue + } + } + + if err := entries.Err(); err != nil { + return err + } + + if err := tw.Close(); err != nil { + return err + } + + done = true + return cmd.Wait() } // scanNull is a split function for bufio.Scanner that returns each item of diff --git a/cmd/git-sg/main.go b/cmd/git-sg/main.go index 00226078b..a5d8085da 100644 --- a/cmd/git-sg/main.go +++ b/cmd/git-sg/main.go @@ -65,6 +65,11 @@ func do(w io.Writer) error { return archiveFilter(w, r, root, opts) } + if os.Getenv("GIT_SG_LSTREE") != "" { + log.Println("using git-ls-tree") + return archiveLsTree(w, r, root, opts) + } + var repo archiveWriterRepo = (*archiveWriterRepoGoGit)(r) if os.Getenv("GIT_SG_CATFILE") != "" { log.Println("using git-cat-file") diff --git a/cmd/git-sg/main_test.go b/cmd/git-sg/main_test.go index 8eb31129d..4a0feea2b 100644 --- a/cmd/git-sg/main_test.go +++ b/cmd/git-sg/main_test.go @@ -12,7 +12,7 @@ func TestDo(t *testing.T) { } t.Setenv("GIT_DIR", dir) - for _, envvar := range []string{"", "GIT_SG_BUFFER", "GIT_SG_FILTER", "GIT_SG_CATFILE"} { + for _, envvar := range []string{"", "GIT_SG_BUFFER", "GIT_SG_FILTER", "GIT_SG_CATFILE", "GIT_SG_LSTREE"} { name := envvar if name == "" { name = "default" From d45e9276d5b36c26f28bd2032a204880fbf4bc69 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Fri, 23 Sep 2022 09:26:00 +0200 Subject: [PATCH 26/29] ls-tree implemented And its pretty much the same speed as git archive on sourcegraph repo. We only skip 7 files in that repo, which means its hard to beat the speed of git archive. --- cmd/git-sg/lstree.go | 49 ++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/cmd/git-sg/lstree.go b/cmd/git-sg/lstree.go index c6e943782..71904c9c9 100644 --- a/cmd/git-sg/lstree.go +++ b/cmd/git-sg/lstree.go @@ -15,30 +15,30 @@ import ( ) func archiveLsTree(w io.Writer, repo *git.Repository, tree *object.Tree, opts *archiveOpts) (err error) { - cmd := exec.Command("git", "ls-tree", "-r", "-l", "-t", "-z", tree.Hash.String()) - r, err := cmd.StdoutPipe() + // 32*1024 is the same size used by io.Copy + buf := make([]byte, 32*1024) + + lsTree := exec.Command("git", "ls-tree", "-r", "-l", "-t", "-z", tree.Hash.String()) + r, err := lsTree.StdoutPipe() if err != nil { return err } defer r.Close() + // TODO we are not respecting dir + catFile, err := startGitCatFileBatch("") + if err != nil { + return err + } + defer catFile.Close() + tw := tar.NewWriter(w) - err = cmd.Start() + err = lsTree.Start() if err != nil { return err } - - done := false - defer func() { - if done { - return - } - err2 := cmd.Process.Kill() - if err == nil { - err = err2 - } - }() + defer lsTree.Process.Kill() entries := bufio.NewScanner(r) entries.Split(scanNull) @@ -94,13 +94,27 @@ func archiveLsTree(w io.Writer, repo *git.Repository, tree *object.Tree, opts *a if reason := opts.SkipContent(&hdr); reason != "" { hdr.PAXRecords = map[string]string{"SG.skip": reason} + hdr.Size = 0 + if err := tw.WriteHeader(&hdr); err != nil { + return err + } + continue } - hdr.Size = 0 - if err := tw.WriteHeader(&hdr); err != nil { + if info, err := catFile.ContentsString(string(hash)); err != nil { return err + } else if info.Size != size { + return fmt.Errorf("git-cat-file returned a different size (%d) to git-ls-tree (%d) for %s", info.Size, size, path) } + if err := tw.WriteHeader(&hdr); err != nil { + return err + } + if n, err := io.CopyBuffer(tw, catFile, buf); err != nil { + return err + } else if n != size { + return fmt.Errorf("git-cat-file unmarshalled %d bytes instead of %d for %s", n, size, path) + } } else if bytes.Equal(typ, []byte("tree")) { hdr := tar.Header{ Typeflag: tar.TypeDir, @@ -125,8 +139,7 @@ func archiveLsTree(w io.Writer, repo *git.Repository, tree *object.Tree, opts *a return err } - done = true - return cmd.Wait() + return lsTree.Wait() } // scanNull is a split function for bufio.Scanner that returns each item of From 5040bd7b89f08ad6d25e9308a133d8f564125db2 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Fri, 23 Sep 2022 09:37:53 +0200 Subject: [PATCH 27/29] skip TestDo on CI if missing .git --- cmd/git-sg/main_test.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmd/git-sg/main_test.go b/cmd/git-sg/main_test.go index 4a0feea2b..9289e7ef6 100644 --- a/cmd/git-sg/main_test.go +++ b/cmd/git-sg/main_test.go @@ -1,6 +1,7 @@ package main import ( + "os" "path/filepath" "testing" ) @@ -12,6 +13,10 @@ func TestDo(t *testing.T) { } t.Setenv("GIT_DIR", dir) + if _, err := os.Stat(dir); os.Getenv("CI") != "" && os.IsNotExist(err) { + t.Skipf("skipping since on CI and this is not a git checkout: %v", err) + } + for _, envvar := range []string{"", "GIT_SG_BUFFER", "GIT_SG_FILTER", "GIT_SG_CATFILE", "GIT_SG_LSTREE"} { name := envvar if name == "" { From b07c06983c85824e52ee3e584bca8dbdc4d0be14 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Fri, 23 Sep 2022 09:52:19 +0200 Subject: [PATCH 28/29] check for .git in all tests --- cmd/git-sg/catfile_test.go | 4 ++++ cmd/git-sg/main_test.go | 26 ++++++++++++++++---------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/cmd/git-sg/catfile_test.go b/cmd/git-sg/catfile_test.go index b6682da0d..cbe9b4b98 100644 --- a/cmd/git-sg/catfile_test.go +++ b/cmd/git-sg/catfile_test.go @@ -9,6 +9,8 @@ import ( ) func TestInfo(t *testing.T) { + setGitDir(t) + p, err := startGitCatFileBatch("") if err != nil { t.Fatal(err) @@ -43,6 +45,8 @@ func TestInfo(t *testing.T) { } func TestContents(t *testing.T) { + setGitDir(t) + p, err := startGitCatFileBatch("") if err != nil { t.Fatal(err) diff --git a/cmd/git-sg/main_test.go b/cmd/git-sg/main_test.go index 9289e7ef6..d933807c3 100644 --- a/cmd/git-sg/main_test.go +++ b/cmd/git-sg/main_test.go @@ -7,15 +7,7 @@ import ( ) func TestDo(t *testing.T) { - dir, err := filepath.Abs("../../.git") - if err != nil { - t.Fatal(err) - } - t.Setenv("GIT_DIR", dir) - - if _, err := os.Stat(dir); os.Getenv("CI") != "" && os.IsNotExist(err) { - t.Skipf("skipping since on CI and this is not a git checkout: %v", err) - } + setGitDir(t) for _, envvar := range []string{"", "GIT_SG_BUFFER", "GIT_SG_FILTER", "GIT_SG_CATFILE", "GIT_SG_LSTREE"} { name := envvar @@ -27,7 +19,7 @@ func TestDo(t *testing.T) { t.Setenv(envvar, "1") } var w countingWriter - err = do(&w) + err := do(&w) if err != nil { t.Fatal(err) } @@ -47,3 +39,17 @@ func (w *countingWriter) Write(b []byte) (int, error) { w.N += len(b) return len(b), nil } + +func setGitDir(t *testing.T) { + t.Helper() + + dir, err := filepath.Abs("../../.git") + if err != nil { + t.Fatal(err) + } + t.Setenv("GIT_DIR", dir) + + if _, err := os.Stat(dir); os.Getenv("CI") != "" && os.IsNotExist(err) { + t.Skipf("skipping since on CI and this is not a git checkout: %v", err) + } +} From 615d1d93dae46a03610d70785ae3a76ca7efe02f Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Mon, 10 Oct 2022 15:25:26 +0200 Subject: [PATCH 29/29] implement archiver via git-lfs/gitobj This is our slowest implementation so far! I believe this is because gitobj has no caching between parsing packfiles so it pays the cost on each object retrieval. --- cmd/git-sg/archive.go | 10 +++++++ cmd/git-sg/gitobj.go | 61 +++++++++++++++++++++++++++++++++++++++++ cmd/git-sg/main.go | 17 ++++++++++++ cmd/git-sg/main_test.go | 2 +- go.mod | 3 +- go.sum | 2 ++ 6 files changed, 93 insertions(+), 2 deletions(-) create mode 100644 cmd/git-sg/gitobj.go diff --git a/cmd/git-sg/archive.go b/cmd/git-sg/archive.go index ae76cc8e2..3c513c26f 100644 --- a/cmd/git-sg/archive.go +++ b/cmd/git-sg/archive.go @@ -57,6 +57,7 @@ type item struct { type archiveWriterBlob interface { Size() int64 Reader() (io.ReadCloser, error) + Close() error } type archiveWriterRepo interface { @@ -130,6 +131,7 @@ func (a *archiveWriter) writeRegularTreeEntry(entry object.TreeEntry, path strin log.Printf("failed to get blob object for %s %v: %v", path, entry.Hash, err) return nil } + defer blob.Close() // TODO symlinks, mode, etc. Handle large Linkname hdr := &tar.Header{ @@ -206,6 +208,10 @@ func (b archiveWriterBlobGoGit) Reader() (io.ReadCloser, error) { return b.blob.Reader() } +func (b archiveWriterBlobGoGit) Close() error { + return nil +} + type archiveWriterRepoGoGit git.Repository func (repo *archiveWriterRepoGoGit) TreeEntries(hash plumbing.Hash) ([]object.TreeEntry, error) { @@ -241,6 +247,10 @@ func (b archiveWriterBlobCatFile) Reader() (io.ReadCloser, error) { return io.NopCloser(b.catFile), nil } +func (b archiveWriterBlobCatFile) Close() error { + return nil +} + type archiveWriterRepoCatFile struct { catFile *gitCatFileBatch } diff --git a/cmd/git-sg/gitobj.go b/cmd/git-sg/gitobj.go new file mode 100644 index 000000000..0b8b50d22 --- /dev/null +++ b/cmd/git-sg/gitobj.go @@ -0,0 +1,61 @@ +package main + +import ( + "io" + + "github.com/git-lfs/gitobj/v2" + "github.com/go-git/go-git/v5/plumbing" + "github.com/go-git/go-git/v5/plumbing/filemode" + "github.com/go-git/go-git/v5/plumbing/object" +) + +type archiveWriterBlobGitObj struct { + blob *gitobj.Blob +} + +func (b archiveWriterBlobGitObj) Size() int64 { + // TODO close if only asking size? + return b.blob.Size +} + +func (b archiveWriterBlobGitObj) Reader() (io.ReadCloser, error) { + return b, nil +} + +func (b archiveWriterBlobGitObj) Read(p []byte) (int, error) { + return b.blob.Contents.Read(p) +} + +func (b archiveWriterBlobGitObj) Close() error { + return b.blob.Close() +} + +type archiveWriterRepoGitObj struct { + db *gitobj.ObjectDatabase +} + +func (w archiveWriterRepoGitObj) TreeEntries(hash plumbing.Hash) ([]object.TreeEntry, error) { + tree, err := w.db.Tree(hash[:]) + if err != nil { + return nil, err + } + + entries := make([]object.TreeEntry, len(tree.Entries)) + for i, e := range tree.Entries { + copy(entries[i].Hash[:], e.Oid) + entries[i].Mode = filemode.FileMode(e.Filemode) + entries[i].Name = e.Name + } + + return entries, nil +} + +func (w archiveWriterRepoGitObj) Blob(hash plumbing.Hash) (archiveWriterBlob, error) { + blob, err := w.db.Blob(hash[:]) + if err != nil { + return nil, err + } + return archiveWriterBlobGitObj{ + blob: blob, + }, nil +} diff --git a/cmd/git-sg/main.go b/cmd/git-sg/main.go index a5d8085da..12d1708be 100644 --- a/cmd/git-sg/main.go +++ b/cmd/git-sg/main.go @@ -8,10 +8,12 @@ import ( "io" "log" "os" + "path/filepath" "runtime" "runtime/pprof" "strings" + "github.com/git-lfs/gitobj/v2" "github.com/go-git/go-billy/v5/osfs" "github.com/go-git/go-git/v5" "github.com/go-git/go-git/v5/plumbing/cache" @@ -83,6 +85,21 @@ func do(w io.Writer) error { } defer catFile.Close() repo = archiveWriterRepoCatFile{catFile: catFile} + } else if os.Getenv("GIT_SG_GITOBJ") != "" { + log.Println("using github.com/git-lfs/gitobj") + dir, err := gitDir() + if err != nil { + return err + } + if _, err := os.Stat(filepath.Join(dir, ".git")); err == nil { + dir = filepath.Join(dir, ".git") + } + db, err := gitobj.FromFilesystem(filepath.Join(dir, "objects"), "") + if err != nil { + return err + } + defer db.Close() + repo = archiveWriterRepoGitObj{db: db} } return archiveWrite(w, repo, root, opts) diff --git a/cmd/git-sg/main_test.go b/cmd/git-sg/main_test.go index d933807c3..3703a7822 100644 --- a/cmd/git-sg/main_test.go +++ b/cmd/git-sg/main_test.go @@ -9,7 +9,7 @@ import ( func TestDo(t *testing.T) { setGitDir(t) - for _, envvar := range []string{"", "GIT_SG_BUFFER", "GIT_SG_FILTER", "GIT_SG_CATFILE", "GIT_SG_LSTREE"} { + for _, envvar := range []string{"", "GIT_SG_BUFFER", "GIT_SG_FILTER", "GIT_SG_CATFILE", "GIT_SG_LSTREE", "GIT_SG_GITOBJ"} { name := envvar if name == "" { name = "default" diff --git a/go.mod b/go.mod index c6a5e8d51..a25ef9df6 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,9 @@ require ( github.com/bmatcuk/doublestar v1.3.4 github.com/fsnotify/fsnotify v1.5.4 github.com/gfleury/go-bitbucket-v1 v0.0.0-20220418082332-711d7d5e805f + github.com/git-lfs/gitobj/v2 v2.1.1 github.com/go-enry/go-enry/v2 v2.8.3 + github.com/go-git/go-billy/v5 v5.3.1 github.com/go-git/go-git/v5 v5.4.2 github.com/gobwas/glob v0.2.3 github.com/google/go-cmp v0.5.9 @@ -67,7 +69,6 @@ require ( github.com/getsentry/sentry-go v0.14.0 // indirect github.com/go-enry/go-oniguruma v1.2.1 // indirect github.com/go-git/gcfg v1.5.0 // indirect - github.com/go-git/go-billy/v5 v5.3.1 // indirect github.com/go-logr/logr v1.2.3 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/gogo/protobuf v1.3.2 // indirect diff --git a/go.sum b/go.sum index 27be43975..4a47f7860 100644 --- a/go.sum +++ b/go.sum @@ -200,6 +200,8 @@ github.com/gfleury/go-bitbucket-v1 v0.0.0-20220418082332-711d7d5e805f/go.mod h1: github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/gin-contrib/sse v0.0.0-20190301062529-5545eab6dad3/go.mod h1:VJ0WA2NBN22VlZ2dKZQPAPnyWw5XTlK1KymzLKsr59s= github.com/gin-gonic/gin v1.4.0/go.mod h1:OW2EZn3DO8Ln9oIKOvM++LBO+5UPHJJDH72/q/3rZdM= +github.com/git-lfs/gitobj/v2 v2.1.1 h1:tf/VU6zL1kxa3he+nf6FO/syX+LGkm6WGDsMpfuXV7Q= +github.com/git-lfs/gitobj/v2 v2.1.1/go.mod h1:q6aqxl6Uu3gWsip5GEKpw+7459F97er8COmU45ncAxw= github.com/gliderlabs/ssh v0.2.2 h1:6zsha5zo/TWhRhwqCD3+EarCAgZ2yN28ipRnGPnwkI0= github.com/gliderlabs/ssh v0.2.2/go.mod h1:U7qILu1NlMHj9FlMhZLlkCdDnU1DBEAqr0aevW3Awn0= github.com/go-check/check v0.0.0-20180628173108-788fd7840127/go.mod h1:9ES+weclKsC9YodN5RgxqK/VD9HM9JsCSh7rNhMZE98=