Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add: AWS s3 helper to download files from s3 #64

Merged
merged 7 commits into from
Jan 15, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
200 changes: 200 additions & 0 deletions aws/s3.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
package aws

import (
"context"
"io"
"net/url"
"os"
"path/filepath"
"regexp"
"strings"

"github.com/skit-ai/vcore/errors"
"github.com/skit-ai/vcore/log/slog"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/s3"
"github.com/aws/aws-sdk-go/service/s3/s3manager"
)

const (
	// vpceUrlPattern is the regular expression source used to recognise S3
	// hostnames that go through a VPC interface endpoint (VPCE).
	// The pattern is anchored at the start of the host only.
	vpceUrlPattern = `^((.+)\.)?` + // maybe a bucket name
		`(bucket|accesspoint|control)\.vpce-[-a-z0-9]+\.` + // VPC endpoint DNS name
		`s3[.-]` + // S3 service name
		`(([-a-z0-9]+)\.)?` + // region name, optional for us-east-1
		`vpce\.` +
		`(amazonaws\.com|c2s\.ic\.gov|sc2s\.sgov\.gov)`

	// Submatch indices into a vpceUrlPattern match.
	vpceUrlPatternHostIdx   = 0 // the whole matched host
	vpceUrlPatternBucketIdx = 2 // bucket name without the trailing dot; empty when absent
	vpceUrlPatternRegionIdx = 5 // region name without the trailing dot; empty when absent

	// nonVpceUrlPattern is the regular expression source used to recognise
	// public S3 endpoint hostnames.
	// The pattern is anchored at the start of the host only.
	nonVpceUrlPattern = `^((.+)\.)?` + // maybe a bucket name
		`s3[.-](website[-.])?(accelerate\.)?(dualstack[-.])?` + // S3 service name with optional features
		`(([-a-z0-9]+)\.)?` + // region name, optional for us-east-1
		`(amazonaws\.com|c2s\.ic\.gov|sc2s\.sgov\.gov)`

	// Submatch indices into a nonVpceUrlPattern match.
	nonVpceUrlPatternBucketIdx = 2 // bucket name without the trailing dot; empty when absent
	nonVpceUrlPatternRegionIdx = 7 // region name without the trailing dot; empty when absent
)

// vpceUrlRegex is vpceUrlPattern compiled once at package initialisation.
var vpceUrlRegex = regexp.MustCompile(vpceUrlPattern)

// nonVpceUrlRegex is nonVpceUrlPattern compiled once at package initialisation.
var nonVpceUrlRegex = regexp.MustCompile(nonVpceUrlPattern)

// S3URL is the result of parsing an S3 object URL: it carries the pieces
// needed to address the object.
type S3URL struct {
	// IsPathStyle reports whether the bucket name was absent from the
	// hostname, so that it had to be read from the URL path instead.
	IsPathStyle bool
	// EndPoint is the matched hostname for VPC interface endpoint URLs and
	// is left empty for public S3 endpoints.
	EndPoint string
	// Bucket is the name of the bucket.
	Bucket string
	// Key is the object key inside the bucket; it may be empty.
	Key string
	// Region is the region the URL refers to.
	Region string
}

// DownloadFile fetches the object addressed by u.Bucket and u.Key and writes
// it into w.
//
// A new AWS session is created on every call, configured with u.Region and
// u.EndPoint. Session-creation and download failures are returned wrapped;
// on success the number of bytes downloaded is logged at debug level.
func (u S3URL) DownloadFile(ctx context.Context, w io.WriterAt) error {
	cfg := &aws.Config{
		Region:   aws.String(u.Region), // region where the bucket is located
		Endpoint: aws.String(u.EndPoint),
	}

	sess, sessErr := session.NewSession(cfg)
	if sessErr != nil {
		return errors.NewError("Error creating session", sessErr, false)
	}

	object := &s3.GetObjectInput{
		Bucket: aws.String(u.Bucket),
		Key:    aws.String(u.Key),
	}

	written, dlErr := s3manager.NewDownloader(sess).DownloadWithContext(ctx, w, object)
	if dlErr != nil {
		return errors.NewError("Error downloading file", dlErr, false)
	}

	slog.Debug("Downloaded file", "size", written)
	return nil
}

// ParseAmazonS3URL parses an HTTP/HTTPS URL for an S3 resource and returns an
// S3URL object.
//
// S3 URLs come in two flavors: virtual hosted-style URLs and path-style URLs.
// Virtual hosted-style URLs have the bucket name as the first component of the
// hostname, e.g.
//
//	https://mybucket.s3.us-east-1.amazonaws.com/a/b/c
//
// Path-style URLs have the bucket name as the first component of the path, e.g.
//
//	https://s3.us-east-1.amazonaws.com/mybucket/a/b/c
//
// The bucket and key are taken from the decoded URL path. When the host
// carries no region, or carries the "external-1" alias, the region is
// reported as "us-east-1".
//
// An error is returned when s3URL is nil or when its host is not recognised
// as an S3 endpoint.
func ParseAmazonS3URL(s3URL *url.URL) (S3URL, error) {
	if s3URL == nil {
		return S3URL{}, errors.NewError("nil S3 URL", nil, false)
	}

	output, err := parseBucketAndRegionFromHost(s3URL.Host)
	if err != nil {
		return S3URL{}, errors.NewError("parsing host failed", err, false)
	}

	// No bucket in the hostname means the bucket is the first path segment.
	output.IsPathStyle = output.Bucket == ""

	// Strip only a real leading slash, so that a path without one does not
	// lose its first character.
	path := strings.TrimPrefix(s3URL.Path, "/")

	if output.IsPathStyle {
		// "bucket", "bucket/" and "bucket/key" all split at the first slash;
		// an empty path leaves both bucket and key empty.
		output.Bucket, output.Key, _ = strings.Cut(path, "/")
	} else {
		// Bucket name came from the host, so the whole path is the object key.
		output.Key = path
	}

	// S3 URLs in us-east-1 may omit the region or use the "external-1" alias.
	if output.Region == "" || strings.EqualFold(output.Region, "external-1") {
		output.Region = "us-east-1"
	}

	return output, nil
}

// parseBucketAndRegionFromHost extracts the bucket and region encoded in an
// S3 hostname.
//
// The VPC interface endpoint pattern is tried first; on a match the matched
// host is also recorded as EndPoint. Otherwise the public endpoint pattern is
// tried, and EndPoint is left empty. Bucket and Region are empty strings when
// the corresponding optional part is missing from the host. An error is
// returned when neither pattern matches.
func parseBucketAndRegionFromHost(host string) (S3URL, error) {
	if m := vpceUrlRegex.FindStringSubmatch(host); len(m) > vpceUrlPatternBucketIdx && len(m) > vpceUrlPatternRegionIdx {
		return S3URL{
			EndPoint: m[vpceUrlPatternHostIdx],
			Bucket:   m[vpceUrlPatternBucketIdx],
			Region:   m[vpceUrlPatternRegionIdx],
		}, nil
	}

	// Bounds are checked against the non-VPCE indices, which are the ones
	// actually used to index this match.
	if m := nonVpceUrlRegex.FindStringSubmatch(host); len(m) > nonVpceUrlPatternBucketIdx && len(m) > nonVpceUrlPatternRegionIdx {
		return S3URL{
			Bucket: m[nonVpceUrlPatternBucketIdx],
			Region: m[nonVpceUrlPatternRegionIdx],
		}, nil
	}

	return S3URL{}, errors.NewError("failed to match URL", nil, false)
}

// DownloadFileFromS3 takes an S3 URL and a filePath, downloads the file from s3 and stores it in the filePath.
func DownloadFileFromS3(ctx context.Context, downloadURL, filePath string) error {
parsedURL, err := url.Parse(downloadURL)
if err != nil {
return err
sreeram-narayanan marked this conversation as resolved.
Show resolved Hide resolved
}

// Parse s3 URL to extract region, key and bucket.
s3URL, err := ParseAmazonS3URL(parsedURL)
if err != nil {
return errors.NewError("Failed to parse URL", err, false)
}

// Create file path
err = os.MkdirAll(filepath.Dir(filePath), os.ModePerm)
if err != nil {
return errors.NewError("Unable to create directory", err, false)
}

// Create a local file to write to
f, err := os.Create(filePath)
Dismissed Show dismissed Hide dismissed
if err != nil {
return errors.NewError("Error creating file", err, false)
}

defer func() {
// Ensure file is closed even if an error occurs
if f != nil {
f.Close()
Dismissed Show dismissed Hide dismissed
}
}()

return s3URL.DownloadFile(ctx, f)
}
10 changes: 5 additions & 5 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ go 1.19

require (
github.com/Vernacular-ai/gorm v1.11.3
github.com/aws/aws-sdk-go v1.44.153
github.com/aws/aws-sdk-go v1.49.15
github.com/getsentry/sentry-go v0.15.0
github.com/go-kit/log v0.2.1
github.com/google/go-cmp v0.5.9
Expand Down Expand Up @@ -94,11 +94,11 @@ require (
go.opentelemetry.io/proto/otlp v0.19.0 // indirect
go.uber.org/atomic v1.10.0 // indirect
go.uber.org/multierr v1.8.0 // indirect
golang.org/x/crypto v0.3.0 // indirect
golang.org/x/net v0.3.0 // indirect
golang.org/x/crypto v0.17.0 // indirect
golang.org/x/net v0.17.0 // indirect
golang.org/x/oauth2 v0.2.0 // indirect
golang.org/x/sys v0.3.0 // indirect
golang.org/x/text v0.5.0 // indirect
golang.org/x/sys v0.15.0 // indirect
golang.org/x/text v0.14.0 // indirect
golang.org/x/time v0.3.0 // indirect
golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect
google.golang.org/api v0.103.0 // indirect
Expand Down
16 changes: 16 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,8 @@ github.com/armon/go-radix v1.0.0/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgI
github.com/aws/aws-sdk-go v1.44.122/go.mod h1:y4AeaBuwd2Lk+GepC1E9v0qOiTws0MIWAX4oIKwKHZo=
github.com/aws/aws-sdk-go v1.44.153 h1:KfN5URb9O/Fk48xHrAinrPV2DzPcLa0cd9yo1ax5KGg=
github.com/aws/aws-sdk-go v1.44.153/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI=
github.com/aws/aws-sdk-go v1.49.15 h1:aH9bSV4kL4ziH0AMtuYbukGIVebXddXBL0cKZ1zj15k=
github.com/aws/aws-sdk-go v1.49.15/go.mod h1:LF8svs817+Nz+DmiMQKTO3ubZ/6IaTpq3TjupRn3Eqk=
github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
Expand Down Expand Up @@ -687,6 +689,10 @@ golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5y
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.3.0 h1:a06MkbcxBrEFc0w0QIZWXrH/9cCX6KJyWbBOIwAn+7A=
golang.org/x/crypto v0.3.0/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4=
golang.org/x/crypto v0.14.0 h1:wBqGXzWJW6m1XrIKlAH0Hs1JJ7+9KBwnIO8v66Q9cHc=
golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4=
golang.org/x/crypto v0.17.0 h1:r8bRNjWL3GshPW3gkd+RpvzWrZAwPS49OmTGZ/uhM4k=
golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8=
Expand Down Expand Up @@ -777,6 +783,8 @@ golang.org/x/net v0.0.0-20221014081412-f15817d10f9b/go.mod h1:YDH+HFinaLZZlnHAfS
golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco=
golang.org/x/net v0.3.0 h1:VWL6FNY2bEEmsGVKabSlHu5Irp34xmMRoqb/9lF9lxk=
golang.org/x/net v0.3.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE=
golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
Expand Down Expand Up @@ -894,6 +902,10 @@ golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.3.0 h1:w8ZOecv6NaNa/zC8944JTU3vz4u6Lagfk4RPQxv92NQ=
golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE=
golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc=
golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
Expand All @@ -909,6 +921,10 @@ golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.5.0 h1:OLmvp0KP+FVG99Ct/qFiL/Fhk4zp4QQnZ7b2U+5piUM=
golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
Expand Down
Loading