Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor iterate git tree to reduce memory usage #31076

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
13 changes: 6 additions & 7 deletions modules/actions/workflows.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,17 +55,16 @@ func ListWorkflows(commit *git.Commit) (git.Entries, error) {
return nil, err
}

entries, err := tree.ListEntriesRecursiveFast()
if err != nil {
return nil, err
}

ret := make(git.Entries, 0, len(entries))
for _, entry := range entries {
ret := make(git.Entries, 0, 5)
if err := tree.IterateEntriesRecursive(func(entry *git.TreeEntry) error {
if strings.HasSuffix(entry.Name(), ".yml") || strings.HasSuffix(entry.Name(), ".yaml") {
ret = append(ret, entry)
}
return nil
}, nil); err != nil {
return nil, err
}

return ret, nil
}

Expand Down
23 changes: 16 additions & 7 deletions modules/git/parse_nogogit.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,15 @@ func ParseTreeEntries(data []byte) ([]*TreeEntry, error) {
var sepSpace = []byte{' '}

func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) {
var err error
entries := make([]*TreeEntry, 0, bytes.Count(data, []byte{'\n'})+1)
return entries, iterateTreeEntries(data, ptree, func(entry *TreeEntry) error {
entries = append(entries, entry)
return nil
})
}

func iterateTreeEntries(data []byte, ptree *Tree, f func(entry *TreeEntry) error) error {
var err error
for pos := 0; pos < len(data); {
// expect line to be of the form:
// <mode> <type> <sha> <space-padded-size>\t<filename>
Expand All @@ -39,7 +46,7 @@ func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) {
line := data[pos:posEnd]
posTab := bytes.IndexByte(line, '\t')
if posTab == -1 {
return nil, fmt.Errorf("invalid ls-tree output (no tab): %q", line)
return fmt.Errorf("invalid ls-tree output (no tab): %q", line)
}

entry := new(TreeEntry)
Expand Down Expand Up @@ -69,27 +76,29 @@ func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) {
case "040000", "040755": // git uses 040000 for tree object, but some users may get 040755 for unknown reasons
entry.entryMode = EntryModeTree
default:
return nil, fmt.Errorf("unknown type: %v", string(entryMode))
return fmt.Errorf("unknown type: %v", string(entryMode))
}

entry.ID, err = NewIDFromString(string(entryObjectID))
if err != nil {
return nil, fmt.Errorf("invalid ls-tree output (invalid object id): %q, err: %w", line, err)
return fmt.Errorf("invalid ls-tree output (invalid object id): %q, err: %w", line, err)
}

if len(entryName) > 0 && entryName[0] == '"' {
entry.name, err = strconv.Unquote(string(entryName))
if err != nil {
return nil, fmt.Errorf("invalid ls-tree output (invalid name): %q, err: %w", line, err)
return fmt.Errorf("invalid ls-tree output (invalid name): %q, err: %w", line, err)
}
} else {
entry.name = string(entryName)
}

pos = posEnd + 1
entries = append(entries, entry)
if err := f(entry); err != nil {
return err
}
}
return entries, nil
return nil
}

func catBatchParseTreeEntries(objectFormat ObjectFormat, ptree *Tree, rd *bufio.Reader, sz int64) ([]*TreeEntry, error) {
Expand Down
42 changes: 17 additions & 25 deletions modules/git/repo_language_stats_nogogit.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,6 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err

tree := commit.Tree

entries, err := tree.ListEntriesRecursiveWithSize()
if err != nil {
return nil, err
}

checker, deferable := repo.CheckAttributeReader(commitID)
defer deferable()

Expand All @@ -77,18 +72,12 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
firstExcludedLanguage := ""
firstExcludedLanguageSize := int64(0)

for _, f := range entries {
select {
case <-repo.Ctx.Done():
return sizes, repo.Ctx.Err()
default:
}

if err := tree.IterateEntriesRecursive(func(f *TreeEntry) error {
contentBuf.Reset()
content = contentBuf.Bytes()

if f.Size() == 0 {
continue
return nil
}

isVendored := optional.None[bool]()
Expand All @@ -101,22 +90,22 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
if err == nil {
isVendored = AttributeToBool(attrs, AttributeLinguistVendored)
if isVendored.ValueOrDefault(false) {
continue
return nil
}

isGenerated = AttributeToBool(attrs, AttributeLinguistGenerated)
if isGenerated.ValueOrDefault(false) {
continue
return nil
}

isDocumentation = AttributeToBool(attrs, AttributeLinguistDocumentation)
if isDocumentation.ValueOrDefault(false) {
continue
return nil
}

isDetectable = AttributeToBool(attrs, AttributeLinguistDetectable)
if !isDetectable.ValueOrDefault(true) {
continue
return nil
}

hasLanguage := TryReadLanguageAttribute(attrs)
Expand All @@ -131,7 +120,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err

// this language will always be added to the size
sizes[language] += f.Size()
continue
return nil
}
}
}
Expand All @@ -140,19 +129,19 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
enry.IsDotFile(f.Name()) ||
(!isDocumentation.Has() && enry.IsDocumentation(f.Name())) ||
enry.IsConfiguration(f.Name()) {
continue
return nil
}

// If content can not be read or file is too big just do detection by filename

if f.Size() <= bigFileSize {
if err := writeID(f.ID.String()); err != nil {
return nil, err
return err
}
_, _, size, err := ReadBatchLine(batchReader)
if err != nil {
log.Debug("Error reading blob: %s Err: %v", f.ID.String(), err)
return nil, err
return err
}

sizeToRead := size
Expand All @@ -164,22 +153,22 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err

_, err = contentBuf.ReadFrom(io.LimitReader(batchReader, sizeToRead))
if err != nil {
return nil, err
return err
}
content = contentBuf.Bytes()
if err := DiscardFull(batchReader, discard); err != nil {
return nil, err
return err
}
}
if !isGenerated.Has() && enry.IsGenerated(f.Name(), content) {
continue
return nil
}

// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
// - eg. do the all the detection tests using filename first before reading content.
language := analyze.GetCodeLanguage(f.Name(), content)
if language == "" {
continue
return nil
}

// group languages, such as Pug -> HTML; SCSS -> CSS
Expand All @@ -200,6 +189,9 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
firstExcludedLanguage = language
firstExcludedLanguageSize += f.Size()
}
return nil
}, TrustedCmdArgs{"--long"}); err != nil { // add --long to get size
return sizes, err
}

// If there are no included languages add the first excluded language
Expand Down
30 changes: 22 additions & 8 deletions modules/git/tree_gogit.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,31 @@

// ListEntriesRecursiveWithSize returns all entries of current tree recursively including all subtrees
func (t *Tree) ListEntriesRecursiveWithSize() (Entries, error) {
var entries []*TreeEntry
if err := t.IterateEntriesRecursive(func(entry *TreeEntry) error {

Check failure on line 62 in modules/git/tree_gogit.go

View workflow job for this annotation

GitHub Actions / test-sqlite

not enough arguments in call to t.IterateEntriesRecursive

Check failure on line 62 in modules/git/tree_gogit.go

View workflow job for this annotation

GitHub Actions / test-unit

not enough arguments in call to t.IterateEntriesRecursive

Check failure on line 62 in modules/git/tree_gogit.go

View workflow job for this annotation

GitHub Actions / backend

not enough arguments in call to t.IterateEntriesRecursive
entries = append(entries, convertedEntry)

Check failure on line 63 in modules/git/tree_gogit.go

View workflow job for this annotation

GitHub Actions / test-sqlite

undefined: convertedEntry

Check failure on line 63 in modules/git/tree_gogit.go

View workflow job for this annotation

GitHub Actions / test-unit

undefined: convertedEntry

Check failure on line 63 in modules/git/tree_gogit.go

View workflow job for this annotation

GitHub Actions / backend

undefined: convertedEntry
return nil
}); err != nil {
return nil, err
}
return entries, nil
}

// ListEntriesRecursiveFast is the alias of ListEntriesRecursiveWithSize for the gogit version
func (t *Tree) ListEntriesRecursiveFast() (Entries, error) {
return t.ListEntriesRecursiveWithSize()
}

// IterateEntriesRecursive returns iterate entries of current tree recursively including all subtrees
// extraArgs could be "-l" to get the size, which is slower
func (t *Tree) IterateEntriesRecursive(f func(entry *TreeEntry) error, extraArgs TrustedCmdArgs) error {
if t.gogitTree == nil {
err := t.loadTreeObject()
if err != nil {
return nil, err

Check failure on line 82 in modules/git/tree_gogit.go

View workflow job for this annotation

GitHub Actions / test-sqlite

too many return values

Check failure on line 82 in modules/git/tree_gogit.go

View workflow job for this annotation

GitHub Actions / test-unit

too many return values

Check failure on line 82 in modules/git/tree_gogit.go

View workflow job for this annotation

GitHub Actions / backend

too many return values
}
}

var entries []*TreeEntry
seen := map[plumbing.Hash]bool{}
walker := object.NewTreeWalker(t.gogitTree, true, seen)
for {
Expand All @@ -74,7 +91,7 @@
break
}
if err != nil {
return nil, err

Check failure on line 94 in modules/git/tree_gogit.go

View workflow job for this annotation

GitHub Actions / test-sqlite

too many return values

Check failure on line 94 in modules/git/tree_gogit.go

View workflow job for this annotation

GitHub Actions / test-unit

too many return values

Check failure on line 94 in modules/git/tree_gogit.go

View workflow job for this annotation

GitHub Actions / backend

too many return values
}
if seen[entry.Hash] {
continue
Expand All @@ -86,13 +103,10 @@
ptree: t,
fullName: fullName,
}
entries = append(entries, convertedEntry)
if err := f(convertedEntry); err != nil {
return nil, err

Check failure on line 107 in modules/git/tree_gogit.go

View workflow job for this annotation

GitHub Actions / test-sqlite

too many return values

Check failure on line 107 in modules/git/tree_gogit.go

View workflow job for this annotation

GitHub Actions / test-unit

too many return values

Check failure on line 107 in modules/git/tree_gogit.go

View workflow job for this annotation

GitHub Actions / backend

too many return values
}
}

return entries, nil
}

// ListEntriesRecursiveFast is the alias of ListEntriesRecursiveWithSize for the gogit version
func (t *Tree) ListEntriesRecursiveFast() (Entries, error) {
return t.ListEntriesRecursiveWithSize()
return nil
}
70 changes: 57 additions & 13 deletions modules/git/tree_nogogit.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
package git

import (
"bufio"
"io"
"strings"
)
Expand Down Expand Up @@ -96,21 +97,17 @@ func (t *Tree) listEntriesRecursive(extraArgs TrustedCmdArgs) (Entries, error) {
return t.entriesRecursive, nil
}

stdout, _, runErr := NewCommand(t.repo.Ctx, "ls-tree", "-t", "-r").
AddArguments(extraArgs...).
AddDynamicArguments(t.ID.String()).
RunStdBytes(&RunOpts{Dir: t.repo.Path})
if runErr != nil {
return nil, runErr
}

var err error
t.entriesRecursive, err = parseTreeEntries(stdout, t)
if err == nil {
t.entriesRecursiveParsed = true
t.entriesRecursive = make([]*TreeEntry, 0)
if err := t.IterateEntriesRecursive(func(entry *TreeEntry) error {
t.entriesRecursive = append(t.entriesRecursive, entry)
return nil
}, extraArgs); err != nil {
t.entriesRecursive = nil
return nil, err
}

return t.entriesRecursive, err
t.entriesRecursiveParsed = true
return t.entriesRecursive, nil
}

// ListEntriesRecursiveFast returns all entries of current tree recursively including all subtrees, no size
Expand All @@ -122,3 +119,50 @@ func (t *Tree) ListEntriesRecursiveFast() (Entries, error) {
func (t *Tree) ListEntriesRecursiveWithSize() (Entries, error) {
return t.listEntriesRecursive(TrustedCmdArgs{"--long"})
}

// IterateEntriesRecursive returns iterate entries of current tree recursively including all subtrees
// extraArgs could be "-l" to get the size, which is slower
func (t *Tree) IterateEntriesRecursive(f func(entry *TreeEntry) error, extraArgs TrustedCmdArgs) error {
reader, writer := io.Pipe()
done := make(chan error)

go func(t *Tree, done chan error, writer *io.PipeWriter) {
runErr := NewCommand(t.repo.Ctx, "ls-tree", "-t", "-r").
AddArguments(extraArgs...).
AddDynamicArguments(t.ID.String()).
Run(&RunOpts{
Dir: t.repo.Path,
Stdout: writer,
})

_ = writer.Close()

done <- runErr
}(t, done, writer)

scanner := bufio.NewScanner(reader)
for scanner.Scan() {
if err := scanner.Err(); err != nil {
return err
}

data := scanner.Bytes()
if err := iterateTreeEntries(data, t, func(entry *TreeEntry) error {
if err := f(entry); err != nil {
return err
}

select {
case <-t.repo.Ctx.Done():
return t.repo.Ctx.Err()
case runErr := <-done:
return runErr
default:
return nil
}
}); err != nil {
return err
}
}
return nil
}
Loading