diff --git a/cmd/git-sg/archive.go b/cmd/git-sg/archive.go new file mode 100644 index 000000000..3c513c26f --- /dev/null +++ b/cmd/git-sg/archive.go @@ -0,0 +1,322 @@ +package main + +import ( + "archive/tar" + "bufio" + "bytes" + "io" + "log" + "sync" + + "github.com/go-git/go-git/v5" + "github.com/go-git/go-git/v5/plumbing" + "github.com/go-git/go-git/v5/plumbing/filemode" + "github.com/go-git/go-git/v5/plumbing/object" +) + +type archiveOpts struct { + // Ignore if true will exclude path from the archive + Ignore func(path string) bool + // SkipContent if returning a non-empty string will include an entry for + // path but with no content. The PAX header SOURCEGRAPH.skip will contain + // the returned string (a reason for skipping). + SkipContent func(hdr *tar.Header) string +} + +func archiveWrite(w io.Writer, repo archiveWriterRepo, tree *object.Tree, opts *archiveOpts) error { + a := &archiveWriter{ + w: tar.NewWriter(w), + repo: repo, + opts: opts, + + stack: []item{{entries: tree.Entries, path: ""}}, + + // 32*1024 is the same size used by io.Copy + buf: make([]byte, 32*1024), + } + + for len(a.stack) > 0 { + item := a.stack[len(a.stack)-1] + a.stack = a.stack[:len(a.stack)-1] + + err := a.writeTree(item.entries, item.path) + if err != nil { + _ = a.w.Close() + return err + } + } + + return a.w.Close() +} + +type item struct { + entries []object.TreeEntry + path string +} + +type archiveWriterBlob interface { + Size() int64 + Reader() (io.ReadCloser, error) + Close() error +} + +type archiveWriterRepo interface { + TreeEntries(plumbing.Hash) ([]object.TreeEntry, error) + Blob(plumbing.Hash) (archiveWriterBlob, error) +} + +type archiveWriter struct { + w *tar.Writer + opts *archiveOpts + + repo archiveWriterRepo + + stack []item + + buf []byte +} + +func (a *archiveWriter) writeTree(entries []object.TreeEntry, path string) error { + for _, e := range entries { + var p string + if e.Mode == filemode.Dir { + p = path + e.Name + "/" + } else { + p = path + 
e.Name + } + + if a.opts.Ignore(p) { + continue + } + + switch e.Mode { + case filemode.Dir: + child, err := a.repo.TreeEntries(e.Hash) + if err != nil { + log.Printf("failed to fetch tree object for %s %v: %v", p, e.Hash, err) + continue + } + + if err := a.w.WriteHeader(&tar.Header{ + Typeflag: tar.TypeDir, + Name: p, + Mode: 0777, + Format: tar.FormatPAX, // TODO ? + }); err != nil { + return err + } + + a.stack = append(a.stack, item{entries: child, path: p}) + + case filemode.Deprecated, filemode.Executable, filemode.Regular, filemode.Symlink: + if err := a.writeRegularTreeEntry(e, p); err != nil { + return err + } + + case filemode.Submodule: + // TODO what do? + continue + + default: + log.Printf("WARN: unexpected filemode %+v", e) + } + } + + return nil +} + +func (a *archiveWriter) writeRegularTreeEntry(entry object.TreeEntry, path string) error { + blob, err := a.repo.Blob(entry.Hash) + if err != nil { + log.Printf("failed to get blob object for %s %v: %v", path, entry.Hash, err) + return nil + } + defer blob.Close() + + // TODO symlinks, mode, etc. Handle large Linkname + hdr := &tar.Header{ + Typeflag: tar.TypeReg, + Name: path, + Size: blob.Size(), + Mode: 0666, + + Format: tar.FormatPAX, // TODO ? + } + + if reason := a.opts.SkipContent(hdr); reason != "" { + return a.writeSkipHeader(hdr, reason) + } + + r, err := blob.Reader() + if err != nil { + log.Printf("failed to read blob object for %s %v: %v", path, entry.Hash, err) + return nil + } + + // TODO confirm it is fine to call Close twice. From initial reading of + // go-git it relies on io.Pipe for readers, so this should be fine. + defer r.Close() + + // Heuristic: Assume file is binary if first 256 bytes contain a 0x00. 
+ blobSample := a.buf[:256] + if n, err := io.ReadAtLeast(r, blobSample, 256); err != nil && err != io.EOF && err != io.ErrUnexpectedEOF { + log.Printf("failed to read blob object for %s %v: %v", path, entry.Hash, err) + return nil + } else { + blobSample = blobSample[:n] + } + + // TODO instead of just binary, should we only allow utf8? utf.Valid + // works except for the fact we may be invalid utf8 at the 256 boundary + // since we cut it off. So will need to copypasta that. + if bytes.IndexByte(blobSample, 0x00) >= 0 { + return a.writeSkipHeader(hdr, "binary") + } + + if err := a.w.WriteHeader(hdr); err != nil { + return err + } + + // We read some bytes from r already, first write those. + if _, err := a.w.Write(blobSample); err != nil { + return err + } + + // Write out the rest of r. + if _, err := io.CopyBuffer(a.w, r, a.buf); err != nil { + return err + } + + return r.Close() +} + +func (a *archiveWriter) writeSkipHeader(hdr *tar.Header, reason string) error { + hdr.PAXRecords = map[string]string{"SG.skip": reason} + hdr.Size = 0 // clear out size since we won't write the body + return a.w.WriteHeader(hdr) +} + +type archiveWriterBlobGoGit struct { + blob *object.Blob +} + +func (b archiveWriterBlobGoGit) Size() int64 { + return b.blob.Size +} + +func (b archiveWriterBlobGoGit) Reader() (io.ReadCloser, error) { + return b.blob.Reader() +} + +func (b archiveWriterBlobGoGit) Close() error { + return nil +} + +type archiveWriterRepoGoGit git.Repository + +func (repo *archiveWriterRepoGoGit) TreeEntries(hash plumbing.Hash) ([]object.TreeEntry, error) { + tree, err := (*git.Repository)(repo).TreeObject(hash) + if err != nil { + return nil, err + } + return tree.Entries, nil +} + +func (repo *archiveWriterRepoGoGit) Blob(hash plumbing.Hash) (archiveWriterBlob, error) { + blob, err := (*git.Repository)(repo).BlobObject(hash) + if err != nil { + return nil, err + } + return archiveWriterBlobGoGit{blob: blob}, nil +} + +type archiveWriterBlobCatFile struct { + 
catFile *gitCatFileBatch + info gitCatFileBatchInfo +} + +func (b archiveWriterBlobCatFile) Size() int64 { + return b.info.Size +} + +func (b archiveWriterBlobCatFile) Reader() (io.ReadCloser, error) { + _, err := b.catFile.Contents(b.info.Hash) + if err != nil { + return nil, err + } + return io.NopCloser(b.catFile), nil +} + +func (b archiveWriterBlobCatFile) Close() error { + return nil +} + +type archiveWriterRepoCatFile struct { + catFile *gitCatFileBatch +} + +var bufPool = sync.Pool{ + New: func() interface{} { + return bufio.NewReader(nil) + }, +} + +func (w archiveWriterRepoCatFile) TreeEntries(hash plumbing.Hash) ([]object.TreeEntry, error) { + _, err := w.catFile.Contents(hash) + if err != nil { + return nil, err + } + + var entries []object.TreeEntry + + // Copy-pasta from go-git/plumbing/object/tree.go + r := bufPool.Get().(*bufio.Reader) + defer bufPool.Put(r) + r.Reset(w.catFile) + for { + str, err := r.ReadString(' ') + if err != nil { + if err == io.EOF { + break + } + + return nil, err + } + str = str[:len(str)-1] // strip last byte (' ') + + mode, err := filemode.New(str) + if err != nil { + return nil, err + } + + name, err := r.ReadString(0) + if err != nil && err != io.EOF { + return nil, err + } + + var hash plumbing.Hash + if _, err = io.ReadFull(r, hash[:]); err != nil { + return nil, err + } + + baseName := name[:len(name)-1] + entries = append(entries, object.TreeEntry{ + Hash: hash, + Mode: mode, + Name: baseName, + }) + } + + return entries, nil +} + +func (w archiveWriterRepoCatFile) Blob(hash plumbing.Hash) (archiveWriterBlob, error) { + info, err := w.catFile.Info(hash) + if err != nil { + return nil, err + } + return archiveWriterBlobCatFile{ + catFile: w.catFile, + info: info, + }, nil +} diff --git a/cmd/git-sg/catfile.go b/cmd/git-sg/catfile.go new file mode 100644 index 000000000..4bd0179fe --- /dev/null +++ b/cmd/git-sg/catfile.go @@ -0,0 +1,329 @@ +package main + +import ( + "bufio" + "bytes" + "encoding/hex" + "errors" + 
"fmt" + "io" + "log" + "os" + "os/exec" + "strconv" + + "github.com/go-git/go-git/v5/plumbing" +) + +// gitCatFileBatch is a wrapper around a git-cat-file --batch-command process. +// This provides an efficient means to interact with the git object store of a +// repository. +type gitCatFileBatch struct { + cmd *exec.Cmd + in *bufio.Writer + inCloser io.Closer + out *gitCatFileBatchReader + + // hashBuf is encoded to for plumbing.Hash + hashBuf [20 * 2]byte +} + +type missingError struct { + ref string +} + +func (e *missingError) Error() string { + return e.ref + " missing" +} + +func isMissingError(err error) bool { + var e *missingError + return errors.As(err, &e) +} + +// startGitCatFileBatch returns a gitCatFileBatch for the repository at dir. +// +// Callers must ensure to call gitCatFileBatch.Close() to ensure the +// associated subprocess and file descriptors are cleaned up. +func startGitCatFileBatch(dir string) (_ *gitCatFileBatch, err error) { + cmd := exec.Command("git", "cat-file", "--batch-command") + cmd.Dir = dir + + closeIfErr := func(closer io.Closer) { + if err != nil { + closer.Close() + } + } + + stdin, err := cmd.StdinPipe() + if err != nil { + return nil, err + } + defer closeIfErr(stdin) + + stdout, err := cmd.StdoutPipe() + if err != nil { + return nil, err + } + defer closeIfErr(stdin) + + // TODO should capture somehow and put into error + cmd.Stderr = os.Stderr + + if err := cmd.Start(); err != nil { + return nil, err + } + + return &gitCatFileBatch{ + cmd: cmd, + in: bufio.NewWriter(stdin), + inCloser: stdin, + out: newGitCatFileBatchReader(stdout), + }, nil +} + +type gitCatFileBatchInfo struct { + Hash plumbing.Hash + Type plumbing.ObjectType + Size int64 +} + +func (g *gitCatFileBatch) InfoString(ref string) (gitCatFileBatchInfo, error) { + g.in.WriteString("info ") + g.in.WriteString(ref) + g.in.WriteByte('\n') + if err := g.in.Flush(); err != nil { + g.kill() + return gitCatFileBatchInfo{}, err + } + + info, err := g.out.Info() + 
if err != nil && !isMissingError(err) { // missingError is recoverable + g.kill() + } + return info, err +} + +func (g *gitCatFileBatch) Info(hash plumbing.Hash) (gitCatFileBatchInfo, error) { + g.in.WriteString("info ") + g.writeHash(hash) + g.in.WriteByte('\n') + if err := g.in.Flush(); err != nil { + g.kill() + return gitCatFileBatchInfo{}, err + } + + info, err := g.out.Info() + if err != nil && !isMissingError(err) { // missingError is recoverable + g.kill() + } + return info, err +} + +func (g *gitCatFileBatch) ContentsString(ref string) (gitCatFileBatchInfo, error) { + g.in.WriteString("contents ") + g.in.WriteString(ref) + g.in.WriteByte('\n') + if err := g.in.Flush(); err != nil { + g.kill() + return gitCatFileBatchInfo{}, err + } + + info, err := g.out.Contents() + if err != nil && !isMissingError(err) { // missingError is recoverable + g.kill() + } + return info, err +} + +func (g *gitCatFileBatch) Contents(hash plumbing.Hash) (gitCatFileBatchInfo, error) { + g.in.WriteString("contents ") + g.writeHash(hash) + g.in.WriteByte('\n') + if err := g.in.Flush(); err != nil { + g.kill() + return gitCatFileBatchInfo{}, err + } + + info, err := g.out.Contents() + if err != nil && !isMissingError(err) { // missingError is recoverable + g.kill() + } + return info, err +} + +func (g *gitCatFileBatch) Read(b []byte) (int, error) { + return g.out.Read(b) +} + +func (g *gitCatFileBatch) writeHash(hash plumbing.Hash) { + hex.Encode(g.hashBuf[:], hash[:]) + g.in.Write(g.hashBuf[:]) +} + +type gitCatFileBatchReader struct { + out *bufio.Reader + outCloser io.Closer + + // readerN is the amount left to read for Read. Note: git-cat-file always + // has a trailing new line, so this will always be the size of an object + + // 1. 
+ readerN int64 +} + +func newGitCatFileBatchReader(r io.ReadCloser) *gitCatFileBatchReader { + return &gitCatFileBatchReader{ + out: bufio.NewReader(r), + outCloser: r, + } +} + +func (g *gitCatFileBatchReader) Info() (gitCatFileBatchInfo, error) { + if err := g.Discard(); err != nil { + g.Close() + return gitCatFileBatchInfo{}, err + } + + line, err := g.out.ReadSlice('\n') + if err != nil { + g.Close() + return gitCatFileBatchInfo{}, err + } + + info, err := parseGitCatFileBatchInfoLine(line) + if err != nil { + if !isMissingError(err) { // missingError is recoverable + g.Close() + } + return gitCatFileBatchInfo{}, err + } + + // Info has nothing following to read + g.readerN = 0 + + return info, nil +} + +func (g *gitCatFileBatchReader) Contents() (gitCatFileBatchInfo, error) { + info, err := g.Info() + if err != nil { + return info, err + } + + // Still have the contents to read and an extra newline + g.readerN = info.Size + 1 + + return info, nil +} + +func (g *gitCatFileBatchReader) Read(p []byte) (n int, err error) { + // We avoid reading the final byte (a newline). That will be handled by + // discard. + if g.readerN <= 1 { + return 0, io.EOF + } + if max := g.readerN - 1; int64(len(p)) > max { + p = p[0:max] + } + n, err = g.out.Read(p) + g.readerN -= int64(n) + return +} + +// Discard should be called before parsing a response to flush out any unread +// data since the last command. +func (g *gitCatFileBatchReader) Discard() error { + if g.readerN > 0 { + n, err := g.out.Discard(int(g.readerN)) + g.readerN -= int64(n) + return err + } + return nil +} + +func (g *gitCatFileBatchReader) Close() error { + return g.outCloser.Close() +} + +// parseGitCatFileBatchInfoLine parses the info line from git-cat-file. 
It +// expects the default format of: +// +// SP SP LF +func parseGitCatFileBatchInfoLine(line []byte) (gitCatFileBatchInfo, error) { + line = bytes.TrimRight(line, "\n") + origLine := line + + if bytes.HasSuffix(line, []byte(" missing")) { + ref := bytes.TrimSuffix(line, []byte(" missing")) + return gitCatFileBatchInfo{}, &missingError{ref: string(ref)} + } + + // PERF this allocates much less than bytes.Split + next := func() []byte { + i := bytes.IndexByte(line, ' ') + if i < 0 { + pre := line + line = nil + return pre + } + pre := line[:i] + line = line[i+1:] + return pre + } + + info := gitCatFileBatchInfo{} + + var err error + _, err = hex.Decode(info.Hash[:], next()) + if err != nil { + return info, fmt.Errorf("unexpected git-cat-file --batch info line %q: %w", string(origLine), err) + } + + info.Type, err = plumbing.ParseObjectType(string(next())) + if err != nil { + return info, fmt.Errorf("unexpected git-cat-file --batch info line %q: %w", string(origLine), err) + } + + info.Size, err = strconv.ParseInt(string(next()), 10, 64) + if err != nil { + return info, fmt.Errorf("unexpected git-cat-file --batch info line %q: %w", string(origLine), err) + } + + return info, nil +} + +func (g *gitCatFileBatch) Close() (err error) { + defer func() { + if err != nil { + g.kill() + } + }() + + if err := g.out.Discard(); err != nil { + return err + } + + // This Close will tell git to shutdown + if err := g.inCloser.Close(); err != nil { + return err + } + + // Drain and check we have no output left (to detect mistakes) + if n, err := io.Copy(io.Discard, g.out); err != nil { + return err + } else if n > 0 { + log.Printf("unexpected %d bytes of remaining output when calling close", n) + } + + if err := g.out.Close(); err != nil { + return err + } + + return g.cmd.Wait() +} + +func (g *gitCatFileBatch) kill() { + _ = g.cmd.Process.Kill() + _ = g.inCloser.Close() + _ = g.out.Close() +} diff --git a/cmd/git-sg/catfile_test.go b/cmd/git-sg/catfile_test.go new file mode 
100644 index 000000000..cbe9b4b98 --- /dev/null +++ b/cmd/git-sg/catfile_test.go @@ -0,0 +1,126 @@ +package main + +import ( + "io" + "testing" + + "github.com/go-git/go-git/v5/plumbing" + "github.com/google/go-cmp/cmp" +) + +func TestInfo(t *testing.T) { + setGitDir(t) + + p, err := startGitCatFileBatch("") + if err != nil { + t.Fatal(err) + } + defer p.Close() + + info, err := p.InfoString("HEAD") + if err != nil { + t.Fatal(err) + } + + t.Log(info.Hash, info.Type, info.Size) + + // Test that we can recover from missing + if info, err := p.InfoString("sdflkjsdfDoesNOTexist"); !isMissingError(err) { + t.Fatalf("expected missing error got info=%v err=%v", info, err) + } + + // Now lets fetch the object again via hash and see if it stays the same. + info2, err := p.Info(info.Hash) + if err != nil { + t.Fatal(err) + } + + if d := cmp.Diff(info, info2); d != "" { + t.Fatalf("info changed (-first, +second):\n%s", d) + } + + if err := p.Close(); err != nil { + t.Fatal(err) + } +} + +func TestContents(t *testing.T) { + setGitDir(t) + + p, err := startGitCatFileBatch("") + if err != nil { + t.Fatal(err) + } + defer p.Close() + + info, err := p.ContentsString("HEAD") + if err != nil { + t.Fatal(err) + } + + t.Log(info.Hash, info.Type, info.Size) + + b, err := io.ReadAll(p) + if err != nil { + t.Fatal(err) + } + t.Log(string(b)) + + if len(b) != int(info.Size) { + t.Fatalf("amount read (%d) is different to object size (%d)", len(b), info.Size) + } + if info.Type != plumbing.CommitObject { + t.Fatalf("expected HEAD to be a commit, got %s", info.Type) + } + + // Test that we can recover from missing + if info, err := p.ContentsString("sdflkjsdfDoesNOTexist"); !isMissingError(err) { + t.Fatalf("expected missing error got info=%v err=%v", info, err) + } + + // Now lets fetch the object again via hash and see if it stays the same. 
+ info2, err := p.Contents(info.Hash) + if err != nil { + t.Fatal(err) + } + + if d := cmp.Diff(info, info2); d != "" { + t.Fatalf("info changed (-first, +second):\n%s", d) + } + + b2, err := io.ReadAll(p) + if err != nil { + t.Fatal(err) + } + if d := cmp.Diff(b, b2); d != "" { + t.Fatalf("content changed (-first, +second):\n%s", d) + } + + if err := p.Close(); err != nil { + t.Fatal(err) + } +} + +func BenchmarkInfo(b *testing.B) { + p, err := startGitCatFileBatch("") + if err != nil { + b.Fatal(err) + } + defer p.Close() + + info, err := p.InfoString("HEAD") + if err != nil { + b.Fatal(err) + } + + for i := 0; i < b.N; i++ { + _, err := p.Info(info.Hash) + if err != nil { + b.Fatal(err) + } + } + + if err := p.Close(); err != nil { + b.Fatal(err) + } +} diff --git a/cmd/git-sg/filter.go b/cmd/git-sg/filter.go new file mode 100644 index 000000000..7dbc72874 --- /dev/null +++ b/cmd/git-sg/filter.go @@ -0,0 +1,75 @@ +package main + +import ( + "archive/tar" + "io" + "os/exec" + + "github.com/go-git/go-git/v5" + "github.com/go-git/go-git/v5/plumbing/object" +) + +func archiveFilter(w io.Writer, repo *git.Repository, tree *object.Tree, opts *archiveOpts) (err error) { + // 32*1024 is the same size used by io.Copy + buf := make([]byte, 32*1024) + + cmd := exec.Command("git", "archive", "--worktree-attributes", "--format=tar", tree.Hash.String(), "--") + r, err := cmd.StdoutPipe() + if err != nil { + return err + } + defer r.Close() + + tr := tar.NewReader(r) + tw := tar.NewWriter(w) + + err = cmd.Start() + if err != nil { + return err + } + + done := false + defer func() { + if done { + return + } + err2 := cmd.Process.Kill() + if err == nil { + err = err2 + } + }() + + for { + hdr, err := tr.Next() + if err == io.EOF { + break + } + if err != nil { + return err + } + + if opts.Ignore(hdr.Name) { + continue + } else if reason := opts.SkipContent(hdr); reason != "" { + hdr.Size = 0 + hdr.PAXRecords = map[string]string{"SG.skip": reason} + hdr.Format = tar.FormatPAX + 
if err := tw.WriteHeader(hdr); err != nil { + return err + } + continue + } + + tw.WriteHeader(hdr) + if _, err := io.CopyBuffer(tw, tr, buf); err != nil { + return err + } + } + + if err := tw.Close(); err != nil { + return err + } + + done = true + return cmd.Wait() +} diff --git a/cmd/git-sg/gitobj.go b/cmd/git-sg/gitobj.go new file mode 100644 index 000000000..0b8b50d22 --- /dev/null +++ b/cmd/git-sg/gitobj.go @@ -0,0 +1,61 @@ +package main + +import ( + "io" + + "github.com/git-lfs/gitobj/v2" + "github.com/go-git/go-git/v5/plumbing" + "github.com/go-git/go-git/v5/plumbing/filemode" + "github.com/go-git/go-git/v5/plumbing/object" +) + +type archiveWriterBlobGitObj struct { + blob *gitobj.Blob +} + +func (b archiveWriterBlobGitObj) Size() int64 { + // TODO close if only asking size? + return b.blob.Size +} + +func (b archiveWriterBlobGitObj) Reader() (io.ReadCloser, error) { + return b, nil +} + +func (b archiveWriterBlobGitObj) Read(p []byte) (int, error) { + return b.blob.Contents.Read(p) +} + +func (b archiveWriterBlobGitObj) Close() error { + return b.blob.Close() +} + +type archiveWriterRepoGitObj struct { + db *gitobj.ObjectDatabase +} + +func (w archiveWriterRepoGitObj) TreeEntries(hash plumbing.Hash) ([]object.TreeEntry, error) { + tree, err := w.db.Tree(hash[:]) + if err != nil { + return nil, err + } + + entries := make([]object.TreeEntry, len(tree.Entries)) + for i, e := range tree.Entries { + copy(entries[i].Hash[:], e.Oid) + entries[i].Mode = filemode.FileMode(e.Filemode) + entries[i].Name = e.Name + } + + return entries, nil +} + +func (w archiveWriterRepoGitObj) Blob(hash plumbing.Hash) (archiveWriterBlob, error) { + blob, err := w.db.Blob(hash[:]) + if err != nil { + return nil, err + } + return archiveWriterBlobGitObj{ + blob: blob, + }, nil +} diff --git a/cmd/git-sg/lstree.go b/cmd/git-sg/lstree.go new file mode 100644 index 000000000..71904c9c9 --- /dev/null +++ b/cmd/git-sg/lstree.go @@ -0,0 +1,159 @@ +package main + +import ( + 
"archive/tar" + "bufio" + "bytes" + "fmt" + "io" + "log" + "os/exec" + "strconv" + + "github.com/go-git/go-git/v5" + "github.com/go-git/go-git/v5/plumbing/object" +) + +func archiveLsTree(w io.Writer, repo *git.Repository, tree *object.Tree, opts *archiveOpts) (err error) { + // 32*1024 is the same size used by io.Copy + buf := make([]byte, 32*1024) + + lsTree := exec.Command("git", "ls-tree", "-r", "-l", "-t", "-z", tree.Hash.String()) + r, err := lsTree.StdoutPipe() + if err != nil { + return err + } + defer r.Close() + + // TODO we are not respecting dir + catFile, err := startGitCatFileBatch("") + if err != nil { + return err + } + defer catFile.Close() + + tw := tar.NewWriter(w) + + err = lsTree.Start() + if err != nil { + return err + } + defer lsTree.Process.Kill() + + entries := bufio.NewScanner(r) + entries.Split(scanNull) + + for entries.Scan() { + line := entries.Bytes() + // PERF this allocates much less than bytes.Split + next := func() []byte { + i := bytes.IndexByte(line, ' ') + if i < 0 { + pre := line + line = nil + return pre + } + pre := line[:i] + line = bytes.TrimLeft(line[i+1:], " ") + return pre + } + + // %(objectmode) %(objecttype) %(objectname) %(objectsize:padded)%x09%(path) + modeRaw := next() + typ := next() + hash := next() + + _ = hash + + // remaining: %(objectsize:padded)\t%(path) + // + // size is left padded with space + line = bytes.TrimLeft(line, " ") + i := bytes.IndexByte(line, '\t') + if i < 0 { + return fmt.Errorf("malformed ls-tree entry: %q", entries.Text()) + } + sizeRaw := line[:i] + path := string(line[i+1:]) + + if opts.Ignore(path) { + continue + } + + if bytes.Equal(typ, []byte("blob")) { + mode, _ := strconv.ParseInt(string(modeRaw), 8, 64) + size, _ := strconv.ParseInt(string(sizeRaw), 10, 64) + + hdr := tar.Header{ + Typeflag: tar.TypeReg, + Name: path, + Mode: mode & 0777, + Size: size, + Format: tar.FormatPAX, // TODO ? 
+ } + + if reason := opts.SkipContent(&hdr); reason != "" { + hdr.PAXRecords = map[string]string{"SG.skip": reason} + hdr.Size = 0 + if err := tw.WriteHeader(&hdr); err != nil { + return err + } + continue + } + + if info, err := catFile.ContentsString(string(hash)); err != nil { + return err + } else if info.Size != size { + return fmt.Errorf("git-cat-file returned a different size (%d) to git-ls-tree (%d) for %s", info.Size, size, path) + } + + if err := tw.WriteHeader(&hdr); err != nil { + return err + } + if n, err := io.CopyBuffer(tw, catFile, buf); err != nil { + return err + } else if n != size { + return fmt.Errorf("git-cat-file unmarshalled %d bytes instead of %d for %s", n, size, path) + } + } else if bytes.Equal(typ, []byte("tree")) { + hdr := tar.Header{ + Typeflag: tar.TypeDir, + Name: path, + Mode: 0777, + Format: tar.FormatPAX, // TODO ? + } + if err := tw.WriteHeader(&hdr); err != nil { + return err + } + } else { + log.Printf("unexpected type on line: %q", entries.Text()) + continue + } + } + + if err := entries.Err(); err != nil { + return err + } + + if err := tw.Close(); err != nil { + return err + } + + return lsTree.Wait() +} + +// scanNull is a split function for bufio.Scanner that returns each item of +// text as split by the null character. It will not include the null. +func scanNull(data []byte, atEOF bool) (advance int, token []byte, err error) { + if atEOF && len(data) == 0 { + return 0, nil, nil + } + if i := bytes.IndexByte(data, 0); i >= 0 { + return i + 1, data[0:i], nil + } + if atEOF { + return len(data), data, nil + } + // Request more data. 
+ return 0, nil, nil +} diff --git a/cmd/git-sg/main.go b/cmd/git-sg/main.go new file mode 100644 index 000000000..12d1708be --- /dev/null +++ b/cmd/git-sg/main.go @@ -0,0 +1,225 @@ +package main + +import ( + "archive/tar" + "bufio" + "flag" + "fmt" + "io" + "log" + "os" + "path/filepath" + "runtime" + "runtime/pprof" + "strings" + + "github.com/git-lfs/gitobj/v2" + "github.com/go-git/go-billy/v5/osfs" + "github.com/go-git/go-git/v5" + "github.com/go-git/go-git/v5/plumbing/cache" + "github.com/go-git/go-git/v5/plumbing/object" + "github.com/go-git/go-git/v5/storage/filesystem" + "github.com/sourcegraph/zoekt/ignore" +) + +func do(w io.Writer) error { + r, err := openGitRepo() + if err != nil { + return err + } + + head, err := r.Head() + if err != nil { + return err + } + + commit, err := r.CommitObject(head.Hash()) + if err != nil { + return err + } + + root, err := r.TreeObject(commit.TreeHash) + if err != nil { + return err + } + + // Gating this right now because I get inconsistent performance on my + // macbook. Want to test on linux and larger repos. 
+ if os.Getenv("GIT_SG_BUFFER") != "" { + log.Println("buffering output") + bw := bufio.NewWriter(w) + defer bw.Flush() + w = bw + } + + opts := &archiveOpts{ + Ignore: getIgnoreFilter(r, root), + SkipContent: func(hdr *tar.Header) string { + if hdr.Size > 2<<20 { + return "large file" + } + return "" + }, + } + + if os.Getenv("GIT_SG_FILTER") != "" { + log.Println("filtering git archive output") + return archiveFilter(w, r, root, opts) + } + + if os.Getenv("GIT_SG_LSTREE") != "" { + log.Println("using git-ls-tree") + return archiveLsTree(w, r, root, opts) + } + + var repo archiveWriterRepo = (*archiveWriterRepoGoGit)(r) + if os.Getenv("GIT_SG_CATFILE") != "" { + log.Println("using git-cat-file") + dir, err := gitDir() + if err != nil { + return err + } + catFile, err := startGitCatFileBatch(dir) + if err != nil { + return err + } + defer catFile.Close() + repo = archiveWriterRepoCatFile{catFile: catFile} + } else if os.Getenv("GIT_SG_GITOBJ") != "" { + log.Println("using github.com/git-lfs/gitobj") + dir, err := gitDir() + if err != nil { + return err + } + if _, err := os.Stat(filepath.Join(dir, ".git")); err == nil { + dir = filepath.Join(dir, ".git") + } + db, err := gitobj.FromFilesystem(filepath.Join(dir, "objects"), "") + if err != nil { + return err + } + defer db.Close() + repo = archiveWriterRepoGitObj{db: db} + } + + return archiveWrite(w, repo, root, opts) +} + +func getIgnoreFilter(r *git.Repository, root *object.Tree) func(string) bool { + m, err := parseIgnoreFile(r, root) + if err != nil { + // likely malformed, just log and ignore + log.Printf("WARN: failed to parse sourcegraph ignore file: %v", err) + return func(_ string) bool { return false } + } + + return m.Match +} + +func parseIgnoreFile(r *git.Repository, root *object.Tree) (*ignore.Matcher, error) { + entry, err := root.FindEntry(ignore.IgnoreFile) + if isNotExist(err) { + return &ignore.Matcher{}, nil + } else if err != nil { + return nil, fmt.Errorf("failed to find %s: %w", 
// isNotExist reports whether err indicates that something does not exist. A
// nil error is never a not-exist error.
func isNotExist(err error) bool {
	switch {
	case err == nil:
		return false
	case os.IsNotExist(err):
		return true
	default:
		// go-git does not have an interface to check for not found, and can
		// return a myriad of errors for not found depending on where along
		// looking for a file it failed (object, tree, entry, etc). So strings
		// are the best we can do.
		return strings.Contains(err.Error(), "not found")
	}
}
+ KeepDescriptors: true, + }) + + return git.Open(s, fs) +} + +func main() { + cpuprofile := flag.String("cpuprofile", "", "write cpu profile to `file`") + memprofile := flag.String("memprofile", "", "write memory profile to `file`") + flag.Parse() + + if *cpuprofile != "" { + f, err := os.Create(*cpuprofile) + if err != nil { + log.Fatal("could not create CPU profile: ", err) + } + defer f.Close() // error handling omitted for example + if err := pprof.StartCPUProfile(f); err != nil { + log.Fatal("could not start CPU profile: ", err) + } + defer pprof.StopCPUProfile() + } + + err := do(os.Stdout) + if err != nil { + log.Fatal(err) + } + + if *memprofile != "" { + f, err := os.Create(*memprofile) + if err != nil { + log.Fatal("could not create memory profile: ", err) + } + defer f.Close() // error handling omitted for example + runtime.GC() // get up-to-date statistics + if err := pprof.WriteHeapProfile(f); err != nil { + log.Fatal("could not write memory profile: ", err) + } + } +} diff --git a/cmd/git-sg/main_test.go b/cmd/git-sg/main_test.go new file mode 100644 index 000000000..3703a7822 --- /dev/null +++ b/cmd/git-sg/main_test.go @@ -0,0 +1,55 @@ +package main + +import ( + "os" + "path/filepath" + "testing" +) + +func TestDo(t *testing.T) { + setGitDir(t) + + for _, envvar := range []string{"", "GIT_SG_BUFFER", "GIT_SG_FILTER", "GIT_SG_CATFILE", "GIT_SG_LSTREE", "GIT_SG_GITOBJ"} { + name := envvar + if name == "" { + name = "default" + } + t.Run(name, func(t *testing.T) { + if envvar != "" { + t.Setenv(envvar, "1") + } + var w countingWriter + err := do(&w) + if err != nil { + t.Fatal(err) + } + t.Logf("wrote %d bytes", w.N) + if w.N == 0 { + t.Fatal("wrote no bytes") + } + }) + } +} + +type countingWriter struct { + N int +} + +func (w *countingWriter) Write(b []byte) (int, error) { + w.N += len(b) + return len(b), nil +} + +func setGitDir(t *testing.T) { + t.Helper() + + dir, err := filepath.Abs("../../.git") + if err != nil { + t.Fatal(err) + } + 
t.Setenv("GIT_DIR", dir) + + if _, err := os.Stat(dir); os.Getenv("CI") != "" && os.IsNotExist(err) { + t.Skipf("skipping since on CI and this is not a git checkout: %v", err) + } +} diff --git a/go.mod b/go.mod index c6a5e8d51..a25ef9df6 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,9 @@ require ( github.com/bmatcuk/doublestar v1.3.4 github.com/fsnotify/fsnotify v1.5.4 github.com/gfleury/go-bitbucket-v1 v0.0.0-20220418082332-711d7d5e805f + github.com/git-lfs/gitobj/v2 v2.1.1 github.com/go-enry/go-enry/v2 v2.8.3 + github.com/go-git/go-billy/v5 v5.3.1 github.com/go-git/go-git/v5 v5.4.2 github.com/gobwas/glob v0.2.3 github.com/google/go-cmp v0.5.9 @@ -67,7 +69,6 @@ require ( github.com/getsentry/sentry-go v0.14.0 // indirect github.com/go-enry/go-oniguruma v1.2.1 // indirect github.com/go-git/gcfg v1.5.0 // indirect - github.com/go-git/go-billy/v5 v5.3.1 // indirect github.com/go-logr/logr v1.2.3 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/gogo/protobuf v1.3.2 // indirect diff --git a/go.sum b/go.sum index 27be43975..4a47f7860 100644 --- a/go.sum +++ b/go.sum @@ -200,6 +200,8 @@ github.com/gfleury/go-bitbucket-v1 v0.0.0-20220418082332-711d7d5e805f/go.mod h1: github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/gin-contrib/sse v0.0.0-20190301062529-5545eab6dad3/go.mod h1:VJ0WA2NBN22VlZ2dKZQPAPnyWw5XTlK1KymzLKsr59s= github.com/gin-gonic/gin v1.4.0/go.mod h1:OW2EZn3DO8Ln9oIKOvM++LBO+5UPHJJDH72/q/3rZdM= +github.com/git-lfs/gitobj/v2 v2.1.1 h1:tf/VU6zL1kxa3he+nf6FO/syX+LGkm6WGDsMpfuXV7Q= +github.com/git-lfs/gitobj/v2 v2.1.1/go.mod h1:q6aqxl6Uu3gWsip5GEKpw+7459F97er8COmU45ncAxw= github.com/gliderlabs/ssh v0.2.2 h1:6zsha5zo/TWhRhwqCD3+EarCAgZ2yN28ipRnGPnwkI0= github.com/gliderlabs/ssh v0.2.2/go.mod h1:U7qILu1NlMHj9FlMhZLlkCdDnU1DBEAqr0aevW3Awn0= github.com/go-check/check v0.0.0-20180628173108-788fd7840127/go.mod h1:9ES+weclKsC9YodN5RgxqK/VD9HM9JsCSh7rNhMZE98=