diff --git a/aws/s3.go b/aws/s3.go
new file mode 100644
index 0000000..62a289b
--- /dev/null
+++ b/aws/s3.go
@@ -0,0 +1,201 @@
+package aws
+
+import (
+	"context"
+	"io"
+	"net/url"
+	"os"
+	"path/filepath"
+	"regexp"
+	"strings"
+
+	"github.com/skit-ai/vcore/errors"
+	"github.com/skit-ai/vcore/log/slog"
+
+	"github.com/aws/aws-sdk-go/aws"
+	"github.com/aws/aws-sdk-go/aws/session"
+	"github.com/aws/aws-sdk-go/service/s3"
+	"github.com/aws/aws-sdk-go/service/s3/s3manager"
+)
+
+const (
+	// vpceURLPattern matches the host of an S3 URL that goes through a VPC
+	// interface endpoint (VPCE). The pattern is anchored at the start of the
+	// host only; nothing is required to follow the final domain suffix.
+	// Source - https://github.com/aws/amazon-ssm-agent/blob/mainline/agent/s3util/s3uri.go
+	vpceURLPattern = "^((.+)\\.)?" + // maybe a bucket name
+		"(bucket|accesspoint|control)\\.vpce-[-a-z0-9]+\\." + // VPC endpoint DNS name
+		"s3[.-]" + // S3 service name
+		"(([-a-z0-9]+)\\.)?" + // region name, optional for us-east-1
+		"vpce\\." +
+		"(amazonaws\\.com|c2s\\.ic\\.gov|sc2s\\.sgov\\.gov)"
+	// Submatch indexes into vpceURLPattern: 0 is the whole match, 2 is the
+	// bucket name without its trailing dot and 5 is the region name. The
+	// bucket and region groups are optional and are empty when absent.
+	vpceURLPatternHostIdx   = 0
+	vpceURLPatternBucketIdx = 2
+	vpceURLPatternRegionIdx = 5
+
+	// nonVpceURLPattern matches the host of an S3 URL on a public S3
+	// endpoint. Like vpceURLPattern it is anchored at the start only.
+	nonVpceURLPattern = "^((.+)\\.)?" + // maybe a bucket name
+		"s3[.-](website[-.])?(accelerate\\.)?(dualstack[-.])?" + // S3 service name with optional features
+		"(([-a-z0-9]+)\\.)?" + // region name, optional for us-east-1
+		"(amazonaws\\.com|c2s\\.ic\\.gov|sc2s\\.sgov\\.gov)"
+	// Submatch indexes into nonVpceURLPattern: 2 is the bucket name without
+	// its trailing dot and 7 is the region name; both may be empty.
+	nonVpceURLPatternBucketIdx = 2
+	nonVpceURLPatternRegionIdx = 7
+)
+
+// The host patterns are compiled once at package initialisation;
+// regexp.MustCompile panics if a pattern is invalid.
+var (
+	vpceUrlRegex    = regexp.MustCompile(vpceURLPattern)
+	nonVpceUrlRegex = regexp.MustCompile(nonVpceURLPattern)
+)
+
+// S3URL holds the pieces of an S3 URL that are needed to download the object
+// it points to.
+type S3URL struct {
+	// IsPathStyle reports whether the bucket was taken from the URL path
+	// because the host did not carry one.
+	IsPathStyle bool
+	// EndPoint is the custom endpoint host to send requests to. It is only
+	// populated for VPC endpoint (VPCE) URLs and is empty otherwise.
+	EndPoint string
+	// Bucket is the bucket name; it may be empty when the URL names none.
+	Bucket string
+	// Key is the object key; it is empty when the URL addresses only a bucket.
+	Key string
+	// Region is the bucket's region as derived from the host.
+	Region string
+}
+
+// DownloadFile downloads the object identified by u.Bucket and u.Key from S3
+// and writes it into w. The AWS session is created with u.Region and
+// u.EndPoint. It returns an error when the session cannot be created or the
+// download fails.
+func (u S3URL) DownloadFile(ctx context.Context, w io.WriterAt) error {
+	// The session targets the region and endpoint recorded on the S3URL.
+	cfg := &aws.Config{
+		Region:   aws.String(u.Region),
+		Endpoint: aws.String(u.EndPoint),
+	}
+
+	awsSession, sessErr := session.NewSession(cfg)
+	if sessErr != nil {
+		return errors.NewError("Error creating session", sessErr, false)
+	}
+
+	// Describe the object to fetch, then let the download manager write it
+	// into w; ctx allows the caller to cancel the transfer.
+	object := &s3.GetObjectInput{
+		Bucket: aws.String(u.Bucket),
+		Key:    aws.String(u.Key),
+	}
+
+	written, dlErr := s3manager.NewDownloader(awsSession).DownloadWithContext(ctx, w, object)
+	if dlErr != nil {
+		return errors.NewError("Error downloading file", dlErr, false)
+	}
+
+	slog.Debug("Downloaded file", "size", written)
+	return nil
+}
+
+// ParseAmazonS3URL parses an HTTP/HTTPS URL for an S3 resource and returns an
+// S3URL object.
+//
+// S3 URLs come in two flavors: virtual hosted-style URLs and path-style URLs.
+// Virtual hosted-style URLs have the bucket name as the first component of the
+// hostname, e.g.
+//
+//	https://mybucket.s3.us-east-1.amazonaws.com/a/b/c
+//
+// Path-style URLs have the bucket name as the first component of the path, e.g.
+// +// https://s3.us-east-1.amazonaws.com/mybucket/a/b/c +func ParseAmazonS3URL(s3URL *url.URL) (S3URL, error) { + output, err := parseBucketAndRegionFromHost(s3URL.Host) + if err != nil { + return S3URL{}, errors.NewError("parsing host failed", err, false) + } + + output.IsPathStyle = output.Bucket == "" + + path := s3URL.Path + + if output.IsPathStyle { + // no bucket name in the authority, parse it from the path + output.IsPathStyle = true + + // grab the encoded path so we don't run afoul of '/'s in the bucket name + if path == "/" || path == "" { + } else { + path = path[1:] + index := strings.Index(path, "/") + if index == -1 { + // https://s3.amazonaws.com/bucket + output.Bucket = path + output.Key = "" + } else if index == (len(path) - 1) { + // https://s3.amazonaws.com/bucket/ + output.Bucket = strings.TrimRight(path, "/") + output.Key = "" + } else { + // https://s3.amazonaws.com/bucket/key + output.Bucket = path[:index] + output.Key = path[index+1:] + } + } + } else { + // bucket name in the host, path is the object key + if path == "/" || path == "" { + output.Key = "" + } else { + output.Key = path[1:] + } + } + + if strings.EqualFold(output.Region, "external-1") { + output.Region = "us-east-1" + } else if output.Region == "" { + // s3 bucket URL in us-east-1 doesn't include region + output.Region = "us-east-1" + } + + return output, nil +} + +func parseBucketAndRegionFromHost(host string) (S3URL, error) { + result := vpceUrlRegex.FindStringSubmatch(host) + if result != nil && len(result) > vpceURLPatternBucketIdx && len(result) > vpceURLPatternRegionIdx { + return S3URL{ + EndPoint: result[vpceURLPatternHostIdx], + Bucket: result[vpceURLPatternBucketIdx], + Region: result[vpceURLPatternRegionIdx], + }, nil + } else { + result = nonVpceUrlRegex.FindStringSubmatch(host) + if result != nil && len(result) > vpceURLPatternBucketIdx && len(result) > vpceURLPatternRegionIdx { + return S3URL{ + Bucket: result[nonVpceURLPatternBucketIdx], + Region: 
result[nonVpceURLPatternRegionIdx], + }, nil + } else { + return S3URL{}, errors.NewError("failed to match URL", nil, false) + } + } +} + +// DownloadFileFromS3 takes an S3 URL and a filePath, downloads the file from s3 and stores it in the filePath. +func DownloadFileFromS3(ctx context.Context, downloadURL, filePath string) error { + parsedURL, err := url.Parse(downloadURL) + if err != nil { + return errors.NewError("Failed to parse URL", err, false) + } + + // Parse s3 URL to extract region, key and bucket. + s3URL, err := ParseAmazonS3URL(parsedURL) + if err != nil { + return errors.NewError("Failed to parse URL as s3 URL", err, false) + } + + // Create file path + err = os.MkdirAll(filepath.Dir(filePath), os.ModePerm) + if err != nil { + return errors.NewError("Unable to create directory", err, false) + } + + // Create a local file to write to + f, err := os.Create(filePath) + if err != nil { + return errors.NewError("Error creating file", err, false) + } + + defer func() { + // Ensure file is closed even if an error occurs + if f != nil { + f.Close() + } + }() + + return s3URL.DownloadFile(ctx, f) +} diff --git a/go.mod b/go.mod index ce86d4b..3e9dd2c 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.19 require ( github.com/Vernacular-ai/gorm v1.11.3 - github.com/aws/aws-sdk-go v1.44.153 + github.com/aws/aws-sdk-go v1.49.15 github.com/getsentry/sentry-go v0.15.0 github.com/go-kit/log v0.2.1 github.com/google/go-cmp v0.5.9 @@ -94,11 +94,11 @@ require ( go.opentelemetry.io/proto/otlp v0.19.0 // indirect go.uber.org/atomic v1.10.0 // indirect go.uber.org/multierr v1.8.0 // indirect - golang.org/x/crypto v0.3.0 // indirect - golang.org/x/net v0.3.0 // indirect + golang.org/x/crypto v0.17.0 // indirect + golang.org/x/net v0.17.0 // indirect golang.org/x/oauth2 v0.2.0 // indirect - golang.org/x/sys v0.3.0 // indirect - golang.org/x/text v0.5.0 // indirect + golang.org/x/sys v0.15.0 // indirect + golang.org/x/text v0.14.0 // indirect golang.org/x/time v0.3.0 // 
indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect google.golang.org/api v0.103.0 // indirect diff --git a/go.sum b/go.sum index 8d55842..b7e5c68 100644 --- a/go.sum +++ b/go.sum @@ -210,6 +210,8 @@ github.com/armon/go-radix v1.0.0/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgI github.com/aws/aws-sdk-go v1.44.122/go.mod h1:y4AeaBuwd2Lk+GepC1E9v0qOiTws0MIWAX4oIKwKHZo= github.com/aws/aws-sdk-go v1.44.153 h1:KfN5URb9O/Fk48xHrAinrPV2DzPcLa0cd9yo1ax5KGg= github.com/aws/aws-sdk-go v1.44.153/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= +github.com/aws/aws-sdk-go v1.49.15 h1:aH9bSV4kL4ziH0AMtuYbukGIVebXddXBL0cKZ1zj15k= +github.com/aws/aws-sdk-go v1.49.15/go.mod h1:LF8svs817+Nz+DmiMQKTO3ubZ/6IaTpq3TjupRn3Eqk= github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= @@ -687,6 +689,10 @@ golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5y golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.3.0 h1:a06MkbcxBrEFc0w0QIZWXrH/9cCX6KJyWbBOIwAn+7A= golang.org/x/crypto v0.3.0/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= +golang.org/x/crypto v0.14.0 h1:wBqGXzWJW6m1XrIKlAH0Hs1JJ7+9KBwnIO8v66Q9cHc= +golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4= +golang.org/x/crypto v0.17.0 h1:r8bRNjWL3GshPW3gkd+RpvzWrZAwPS49OmTGZ/uhM4k= +golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod 
h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -777,6 +783,8 @@ golang.org/x/net v0.0.0-20221014081412-f15817d10f9b/go.mod h1:YDH+HFinaLZZlnHAfS golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= golang.org/x/net v0.3.0 h1:VWL6FNY2bEEmsGVKabSlHu5Irp34xmMRoqb/9lF9lxk= golang.org/x/net v0.3.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE= +golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= +golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -894,6 +902,10 @@ golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.3.0 h1:w8ZOecv6NaNa/zC8944JTU3vz4u6Lagfk4RPQxv92NQ= golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= +golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= +golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= @@ -909,6 +921,10 @@ golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 
golang.org/x/text v0.5.0 h1:OLmvp0KP+FVG99Ct/qFiL/Fhk4zp4QQnZ7b2U+5piUM= golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=