Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/aws s3 collect multiple profiles #174

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmd/exporter/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ type Config struct {
ProjectID string
Providers struct {
AWS struct {
Profile string
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only thing I'm confused about now is why even have this at all? It's not used anywhere as far as I can tell -

func selectProvider(cfg *config.Config) (provider.Provider, error) {
	switch cfg.Provider {
	case "aws":
		return aws.New(&aws.Config{
			Region:         cfg.Providers.AWS.Region,
			Profiles:       strings.Split(cfg.Providers.AWS.Profiles.String(), ","),
			ScrapeInterval: cfg.Collector.ScrapeInterval,
			Services:       strings.Split(cfg.Providers.AWS.Services.String(), ","),
		})

	case "gcp":
		return google.New(&google.Config{
			ProjectId:       cfg.ProjectID,
			Region:          cfg.Providers.GCP.Region,
			Projects:        cfg.Providers.GCP.Projects.String(),
			DefaultDiscount: cfg.Providers.GCP.DefaultGCSDiscount,
			ScrapeInterval:  cfg.Collector.ScrapeInterval,
			Services:        strings.Split(cfg.Providers.GCP.Services.String(), ","),
		})

	default:
		return nil, fmt.Errorf("unknown provider")
	}
}

Profiles StringSliceFlag
Region string
Services StringSliceFlag
Expand Down
6 changes: 4 additions & 2 deletions cmd/exporter/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ import (

func providerFlags(fs *flag.FlagSet, cfg *config.Config) {
flag.StringVar(&cfg.Provider, "provider", "aws", "aws or gcp")
fs.Var(&cfg.Providers.AWS.Profiles, "aws.profile", "AWS profile(s).")
fs.StringVar(&cfg.Providers.AWS.Profile, "aws.profile", "", "AWS Profile to authenticate with.")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ditto to comment above ^

This doesn't seem to be used at all

fs.Var(&cfg.Providers.AWS.Profiles, "aws.profiles", "AWS Profiles to collect resources from.")
// TODO: RENAME THIS TO JUST PROJECTS
fs.Var(&cfg.Providers.GCP.Projects, "gcp.bucket-projects", "GCP project(s).")
fs.Var(&cfg.Providers.AWS.Services, "aws.services", "AWS service(s).")
Expand All @@ -51,7 +52,8 @@ func selectProvider(cfg *config.Config) (provider.Provider, error) {
case "aws":
return aws.New(&aws.Config{
Region: cfg.Providers.AWS.Region,
Profile: cfg.Providers.AWS.Profiles.String(),
Profile: cfg.Providers.AWS.Profile,
Profiles: strings.Split(cfg.Providers.AWS.Profiles.String(), ","),
ScrapeInterval: cfg.Collector.ScrapeInterval,
Services: strings.Split(cfg.Providers.AWS.Services.String(), ","),
})
Expand Down
46 changes: 27 additions & 19 deletions pkg/aws/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ import (
"time"

awsconfig "github.com/aws/aws-sdk-go-v2/config"
"github.com/aws/aws-sdk-go-v2/service/costexplorer"
awscostexplorer "github.com/aws/aws-sdk-go-v2/service/costexplorer"

"github.com/grafana/cloudcost-exporter/pkg/aws/costexplorer"

Comment on lines +12 to +14
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit GoIsm: remove the empty lines (not the one in line 9, tho).

"github.com/prometheus/client_golang/prometheus"

cloudcost_exporter "github.com/grafana/cloudcost-exporter"
Expand All @@ -20,6 +23,7 @@ type Config struct {
Services []string
Region string
Profile string
Profiles []string
ScrapeInterval time.Duration
}

Expand Down Expand Up @@ -98,26 +102,30 @@ func New(config *Config) (*AWS, error) {
for _, service := range services {
switch service {
case "S3":
// There are two scenarios:
// 1. Running locally, the user must pass in a region and profile to use
// 2. Running within an EC2 instance and the region and profile can be derived
// I'm going to use the AWS SDK to handle this for me. If the user has provided a region and profile, it will use that.
// If not, it will use the EC2 instance metadata service to determine the region and credentials.
// This is the same logic that the AWS CLI uses, so it should be fine.
options := []func(*awsconfig.LoadOptions) error{awsconfig.WithEC2IMDSRegion()}
if config.Region != "" {
options = append(options, awsconfig.WithRegion(config.Region))
}
if config.Profile != "" {
options = append(options, awsconfig.WithSharedConfigProfile(config.Profile))
}
ac, err := awsconfig.LoadDefaultConfig(context.Background(), options...)
if err != nil {
return nil, err
var clients []costexplorer.CostExplorer
for _, profile := range config.Profiles {
// There are two scenarios:
// 1. Running locally, the user must pass in a region and profile to use
// 2. Running within an EC2 instance and the region and profile can be derived
// I'm going to use the AWS SDK to handle this for me. If the user has provided a region and profile, it will use that.
// If not, it will use the EC2 instance metadata service to determine the region and credentials.
// This is the same logic that the AWS CLI uses, so it should be fine.
options := []func(*awsconfig.LoadOptions) error{awsconfig.WithEC2IMDSRegion()}
if config.Region != "" {
options = append(options, awsconfig.WithRegion(config.Region))
}
if config.Profile != "" {
options = append(options, awsconfig.WithSharedConfigProfile(profile))
}
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure what's going on here. You check here that the single -profile flag has some value, and if it does, then forget about that value and use the one from the iteration 🤔

Basically the -profile value never gets used, and in fact, I'd argue than adding this flag adds confusion to the user, as now they have -profile and -profiles. It seems to me that the single case is the same as -profiles with just one value.

What am I missing?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pretty on point question. Mainly, naming is hard 😅

It looks like we're not using the -profile flag in dev or production today:

 spec:
  ...
      containers:
        - args:
            - --provider=aws
            - --aws.services=s3

I think the initial intent of the check you see is when doing local development to figure out which profile we should target. In dev/production that should be handled by:

options := []func(*awsconfig.LoadOptions) error{awsconfig.WithEC2IMDSRegion()}

Let me remove the -profile flag as it's not used anymore. I'll build a local image and deploy this into development and make sure it still works 😬

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agreed here, it looks like this might've been a misstep. It now appears that we can't even use the flags separately, they must be used together, as specifying -aws.profile will not get used unless there's a profile from -aws.profiles as well, in which case it'll use the value(s) from aws.profiles, but unless you supply aws.profile, it'll never use any config values 😕

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if config.Profile != "" {
options = append(options, awsconfig.WithSharedConfigProfile(profile))
}
options = append(options, awsconfig.WithSharedConfigProfile(profile))

I'd suggest something like this, perhaps with a conditional flag to look if aws-profile is defined first rather than a loop through Profiles?

Or better yet, I'd just eliminate the aws.profiles flag entirely and allow aws.profile to be specified multiple times OR eliminate aws.profile and allow aws.profiles to be one or more comma-seperated items.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've opted to stick with -aws.profiles as a flag, as I think it makes it clear that multiple values can be passed in. @inkel @logyball I've pushed a commit that removes the reference, and appreciate the thoughts/suggestions!

cloudcost-exporter when deployed infers the profile and region from this line:

				options := []func(*awsconfig.LoadOptions) error{awsconfig.WithEC2IMDSRegion()}

I need to be a bit mindful when rolling this change out. If the -profiles isn't set, I've added logic to return an error. So I'll need to likely couple rolling out the wave with adding the -aws.profiles flag to the environment.

ac, err := awsconfig.LoadDefaultConfig(context.Background(), options...)
if err != nil {
return nil, err
}
client := awscostexplorer.NewFromConfig(ac)
clients = append(clients, client)
}

client := costexplorer.NewFromConfig(ac)
collector, err := s3.New(config.ScrapeInterval, client)
collector, err := s3.New(config.ScrapeInterval, clients)
if err != nil {
return nil, fmt.Errorf("error creating s3 collector: %w", err)
}
Expand Down
39 changes: 22 additions & 17 deletions pkg/aws/s3/s3.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ func NewMetrics() Metrics {
// Collector is the AWS implementation of the Collector interface
// It is responsible for registering and collecting metrics
type Collector struct {
client costexplorer.CostExplorer
clients []costexplorer.CostExplorer
interval time.Duration
nextScrape time.Time
metrics Metrics
Expand All @@ -145,9 +145,9 @@ func (c *Collector) Collect(ch chan<- prometheus.Metric) error {
}

// New creates a new Collector with a client and scrape interval defined.
func New(scrapeInterval time.Duration, client costexplorer.CostExplorer) (*Collector, error) {
func New(scrapeInterval time.Duration, clients []costexplorer.CostExplorer) (*Collector, error) {
return &Collector{
client: client,
clients: clients,
interval: scrapeInterval,
// Initially Set nextScrape to the current time minus the scrape interval so that the first scrape will run immediately
nextScrape: time.Now().Add(-scrapeInterval),
Expand All @@ -171,24 +171,29 @@ func (c *Collector) Register(registry provider.Registry) error {
return nil
}

// Collect is the function that will be called by the Prometheus client anytime a scrape is performed.
// CollectMetrics Collect is the function that will be called by the Prometheus client anytime a scrape is performed.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note, I have not tested this at length, but anecdotally it has improved the performance of collecting multiple profiles:

// CollectMetrics Collect is the function that will be called by the Prometheus client anytime a scrape is performed.
func (c *Collector) CollectMetrics(ch chan<- prometheus.Metric) float64 {
	c.m.Lock()
	defer c.m.Unlock()
	now := time.Now()
	// :fire: Checking scrape interval is to _mitigate_ expensive API calls to the cost explorer API

	if c.billingData == nil || now.After(c.nextScrape) {
		eg := new(errgroup.Group)
		outputsLock := sync.Mutex{}
		var billingOutputs []*awscostexplorer.GetCostAndUsageOutput
		for _, client := range c.clients {
			eg.Go(func() error {
				endDate := time.Now().AddDate(0, 0, -1)
				// Current assumption is that we're going to pull 30 days worth of billing data
				startDate := endDate.AddDate(0, 0, -30)
				billingData, err := getBillingData(client, startDate, endDate, c.metrics)
				if err != nil {
					return err
				}
				outputsLock.Lock()
				billingOutputs = append(billingOutputs, billingData...)
				outputsLock.Unlock()
				return nil
			})
		}

		err := eg.Wait()
		if err != nil {
			log.Printf("Error getting billing data: %v\n", err)
			return 0
		}
		c.nextScrape = time.Now().Add(c.interval)
		c.metrics.NextScrapeGauge.Set(float64(c.nextScrape.Unix()))
		c.billingData = parseBillingData(billingOutputs)
	}

	exportMetrics(c.billingData, c.metrics)
	return 1.0
}

I can't speak to the validity of the results, but running it this way seemed to speed things up....

func (c *Collector) CollectMetrics(ch chan<- prometheus.Metric) float64 {
c.m.Lock()
defer c.m.Unlock()
now := time.Now()
// :fire: Checking scrape interval is to _mitigate_ expensive API calls to the cost explorer API

if c.billingData == nil || now.After(c.nextScrape) {
endDate := time.Now().AddDate(0, 0, -1)
// Current assumption is that we're going to pull 30 days worth of billing data
startDate := endDate.AddDate(0, 0, -30)
billingData, err := getBillingData(c.client, startDate, endDate, c.metrics)
if err != nil {
log.Printf("Error getting billing data: %v\n", err)
return 0
var billingOutputs []*awscostexplorer.GetCostAndUsageOutput
for _, client := range c.clients {
endDate := time.Now().AddDate(0, 0, -1)
// Current assumption is that we're going to pull 30 days worth of billing data
startDate := endDate.AddDate(0, 0, -30)
billingData, err := getBillingData(client, startDate, endDate, c.metrics)
if err != nil {
log.Printf("Error getting billing data: %v\n", err)
return 0
}
billingOutputs = append(billingOutputs, billingData...)
c.nextScrape = time.Now().Add(c.interval)
c.metrics.NextScrapeGauge.Set(float64(c.nextScrape.Unix()))
}
c.billingData = billingData
c.nextScrape = time.Now().Add(c.interval)
c.metrics.NextScrapeGauge.Set(float64(c.nextScrape.Unix()))
c.billingData = parseBillingData(billingOutputs)
}

exportMetrics(c.billingData, c.metrics)
Expand Down Expand Up @@ -278,7 +283,7 @@ func (s *BillingData) AddMetricGroup(region string, component string, group type

// getBillingData is responsible for making the API call to the AWS Cost Explorer API and parsing the response
// into a S3BillingData struct
func getBillingData(client costexplorer.CostExplorer, startDate time.Time, endDate time.Time, m Metrics) (*BillingData, error) {
func getBillingData(client costexplorer.CostExplorer, startDate time.Time, endDate time.Time, m Metrics) ([]*awscostexplorer.GetCostAndUsageOutput, error) {
log.Printf("Getting billing data for %s to %s\n", startDate.Format("2006-01-02"), endDate.Format("2006-01-02"))
input := &awscostexplorer.GetCostAndUsageInput{
TimePeriod: &types.DateInterval{
Expand Down Expand Up @@ -309,7 +314,7 @@ func getBillingData(client costexplorer.CostExplorer, startDate time.Time, endDa
if err != nil {
log.Printf("Error getting cost and usage: %v\n", err)
m.RequestErrorsCount.Inc()
return &BillingData{}, err
return nil, err
}
outputs = append(outputs, output)
if output.NextPageToken == nil {
Expand All @@ -318,7 +323,7 @@ func getBillingData(client costexplorer.CostExplorer, startDate time.Time, endDa
input.NextPageToken = output.NextPageToken
}

return parseBillingData(outputs), nil
return outputs, nil
}

// parseBillingData takes the output from the AWS Cost Explorer API and parses it into a S3BillingData struct
Expand Down
10 changes: 6 additions & 4 deletions pkg/aws/s3/s3_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"go.uber.org/mock/gomock"

mockcostexplorer "github.com/grafana/cloudcost-exporter/mocks/pkg/aws/costexplorer"
"github.com/grafana/cloudcost-exporter/pkg/aws/costexplorer"
mock_provider "github.com/grafana/cloudcost-exporter/pkg/provider/mocks"
)

Expand Down Expand Up @@ -180,7 +181,7 @@ func TestNewCollector(t *testing.T) {
t.Run(name, func(t *testing.T) {
c := mockcostexplorer.NewCostExplorer(t)

got, err := New(tt.args.interval, c)
got, err := New(tt.args.interval, []costexplorer.CostExplorer{c})
if tt.error {
require.Error(t, err)
return
Expand Down Expand Up @@ -600,9 +601,10 @@ cloudcost_aws_s3_storage_by_location_usd_per_gibyte_hour{class="StandardStorage"
}

c := &Collector{
client: ce,
clients: []costexplorer.CostExplorer{ce},
nextScrape: tc.nextScrape,
metrics: NewMetrics(),
//profiles: []string{"testing"},
}
up := c.CollectMetrics(nil)
require.Equal(t, tc.expectedResponse, up)
Expand Down Expand Up @@ -674,7 +676,7 @@ func TestCollector_MultipleCalls(t *testing.T) {
Return(&awscostexplorer.GetCostAndUsageOutput{}, nil)

c := &Collector{
client: ce,
clients: []costexplorer.CostExplorer{ce},
metrics: NewMetrics(),
interval: 1 * time.Hour,
}
Expand Down Expand Up @@ -739,7 +741,7 @@ func TestCollector_MultipleCalls(t *testing.T) {
Times(goroutines * collectCalls)

c := &Collector{
client: ce,
clients: []costexplorer.CostExplorer{ce},
metrics: NewMetrics(),
}

Expand Down
Loading