Skip to content

Commit

Permalink
[POA-2516] Change the approach to capture parsing errors, perform the…
Browse files Browse the repository at this point in the history
… same post attempt at parsing and fallback decompressions
  • Loading branch information
shreys7 committed Dec 11, 2024
1 parent 57ed2fb commit 37d3c7b
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 66 deletions.
136 changes: 84 additions & 52 deletions learn/parse_http.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ import (
"github.com/google/uuid"
"github.com/pkg/errors"
"github.com/postmanlabs/postman-insights-agent/printer"
"github.com/postmanlabs/postman-insights-agent/telemetry"
"golang.org/x/text/encoding/ianaindex"
"golang.org/x/text/transform"
"gopkg.in/yaml.v2"
Expand Down Expand Up @@ -145,14 +144,12 @@ func ParseHTTP(elem akinet.ParsedNetworkContent) (*PartialWitness, error) {
}

if err != nil {
// Just log an error instead of returning an error so users can see the
// other parts of the endpoint in the spec rather than an empty spec.
// https://app.clubhouse.io/akita-software/story/1898/juan-s-payload-problem
telemetry.RateLimitError("unparsable body", err)
printer.Debugf("skipping unparsable body: %v\n", err)
} else if bodyData != nil {
datas = append(datas, bodyData)
// When the body is unparsable even after attempting fallback decompressions,
// we will try to capture the body as a string and indicate parsing error in the body meta
bodyData = captureUnparsableBody(rawBody, contentType, statusCode)
}

datas = append(datas, bodyData)
}

method := &pb.Method{Id: UnassignedHTTPID(), Meta: methodMeta}
Expand Down Expand Up @@ -311,46 +308,18 @@ func parseBody(contentType string, bodyStream io.Reader, statusCode int) (*pb.Da
// TODO: XML parsing
// TODO: application/json-seq (RFC 7466)?
// TODO: more text/* types
var parseBodyDataAs pb.HTTPBody_ContentType
switch mediaType {
case "application/json":
parseBodyDataAs = pb.HTTPBody_JSON
case "application/x-www-form-urlencoded":
parseBodyDataAs = pb.HTTPBody_FORM_URL_ENCODED
case "application/yaml", "application/x-yaml", "text/yaml", "text/x-yaml":
parseBodyDataAs = pb.HTTPBody_YAML
case "application/octet-stream":
parseBodyDataAs = pb.HTTPBody_OCTET_STREAM
case "text/plain", "text/csv":
parseBodyDataAs = pb.HTTPBody_TEXT_PLAIN
case "text/html":
parseBodyDataAs = pb.HTTPBody_TEXT_HTML
default:
// Handle custom JSON-encoded media types.
if strings.HasSuffix(mediaType, "+json") {
parseBodyDataAs = pb.HTTPBody_JSON
} else {
parseBodyDataAs = pb.HTTPBody_OTHER
}
}
parseBodyDataAs := getContentTypeFromMediaType(mediaType)

var bodyData *pb.Data

// Create a buffer to store the entire body
bodyBytes, err := io.ReadAll(bodyStream)
if err != nil {
return nil, errors.Wrap(err, "failed to read body")
}
bodyReader := bytes.NewReader(bodyBytes)

// Handle unstructured types, but use this local value to signal
// errors so we can do the check just once
var blobErr error = nil

// Interpret as []byte
handleAsBlob := func() {
// Grab a small sample
body, err := limitedBufferBody(bodyReader, SmallBodySample)
body, err := limitedBufferBody(bodyStream, SmallBodySample)
if err != nil {
blobErr = err
return
Expand All @@ -360,7 +329,7 @@ func parseBody(contentType string, bodyStream io.Reader, statusCode int) (*pb.Da
// Interpret as string, optionally attempt to parse into another type
handleAsString := func(interpret spec_util.InterpretStrings) {
// Grab a small sample
body, err := limitedBufferBody(bodyReader, SmallBodySample)
body, err := limitedBufferBody(bodyStream, SmallBodySample)
if err != nil {
blobErr = err
return
Expand All @@ -369,14 +338,11 @@ func parseBody(contentType string, bodyStream io.Reader, statusCode int) (*pb.Da
}

// Parse body.
parsingError := false
switch parseBodyDataAs {
case pb.HTTPBody_JSON:
jsonReader := bytes.NewBuffer(bodyBytes)
bodyData, err = parseHTTPBodyJSON(jsonReader)
bodyData, err = parseHTTPBodyJSON(bodyStream)
if err != nil {
handleAsString(spec_util.NO_INTERPRET_STRINGS) // Fallback to parsing and persisting the body as string
parsingError = true
return nil, errors.Wrapf(err, "could not parse JSON body")
}
case pb.HTTPBody_FORM_URL_ENCODED:
body, err := limitedBufferBody(bodyStream, MaxBufferedBody)
Expand Down Expand Up @@ -408,11 +374,9 @@ func parseBody(contentType string, bodyStream io.Reader, statusCode int) (*pb.Da
// to be smart about re-interpreting values.
bodyData = parseElem(m, spec_util.INTERPRET_STRINGS)
case pb.HTTPBody_YAML:
yamlReader := bytes.NewBuffer(bodyBytes)
bodyData, err = parseHTTPBodyYAML(yamlReader)
bodyData, err = parseHTTPBodyYAML(bodyStream)
if err != nil {
handleAsString(spec_util.INTERPRET_STRINGS) // Fallback to parsing and persisting the body as string
parsingError = true
return nil, errors.Wrapf(err, "could not parse YAML body")
}
case pb.HTTPBody_OCTET_STREAM:
handleAsBlob()
Expand Down Expand Up @@ -444,10 +408,6 @@ func parseBody(contentType string, bodyStream io.Reader, statusCode int) (*pb.Da
OtherType: mediaType,
}

if parsingError {
bodyMeta.Errors = pb.HTTPBody_PARSING_ERROR
}

httpMeta := &pb.HTTPMeta{
Location: &pb.HTTPMeta_Body{
Body: bodyMeta,
Expand All @@ -459,6 +419,78 @@ func parseBody(contentType string, bodyStream io.Reader, statusCode int) (*pb.Da
return bodyData, nil
}

// captureUnparsableBody builds a fallback representation of a body that could
// not be parsed. The raw bytes are captured as a string primitive and the
// body metadata is flagged with pb.HTTPBody_PARSING_ERROR.
//
// body is the raw body content, contentType is the Content-Type header value
// (parameters allowed), and statusCode is recorded as the response code in
// the HTTP metadata.
//
// It returns nil when the body is empty, when it is larger than
// MaxBufferedBody, or when it cannot be read.
func captureUnparsableBody(body memview.MemView, contentType string, statusCode int) *pb.Data {
	// Nothing to capture for an empty body; oversized bodies are skipped
	// rather than buffered in full.
	if size := body.Len(); size == 0 || size > MaxBufferedBody {
		return nil
	}

	// Read the whole body into memory.
	bodyBytes, err := ioutil.ReadAll(body.CreateReader())
	if err != nil {
		// Capturing is best effort: a read failure drops the body.
		return nil
	}

	// Defensive re-check on the bytes actually read.
	if len(bodyBytes) > MaxBufferedBody {
		return nil
	}

	// The parse error is deliberately ignored: when the header is malformed
	// the media type comes back empty and is classified as OTHER below.
	mediaType, _, _ := mime.ParseMediaType(contentType)

	// Record the bare media type (no parameters) in OtherType so that it
	// matches what parseBody records for successfully parsed bodies. Keep the
	// raw header value only when no media type could be extracted from it.
	otherType := mediaType
	if otherType == "" {
		otherType = contentType
	}

	// Categorize the body as a string.
	return &pb.Data{
		Value: newDataPrimitive(categorizeStringToPrimitive(string(bodyBytes))),
		Meta: newDataMetaHTTPMeta(&pb.HTTPMeta{
			Location: &pb.HTTPMeta_Body{
				Body: &pb.HTTPBody{
					ContentType: getContentTypeFromMediaType(mediaType),
					OtherType:   otherType,
					Errors:      pb.HTTPBody_PARSING_ERROR,
				},
			},
			ResponseCode: int32(statusCode),
		}),
	}
}

// getContentTypeFromMediaType maps a media type string onto the
// pb.HTTPBody_ContentType used to decide how a body is interpreted.
//
// The input is first passed through mime.ParseMediaType, so a full
// Content-Type header value is accepted as well as a bare media type.
// Anything that is not explicitly recognized, and does not end in "+json",
// is reported as pb.HTTPBody_OTHER.
func getContentTypeFromMediaType(mediaType string) pb.HTTPBody_ContentType {
	// The parse error is ignored on purpose; whatever media type comes back
	// (possibly empty) simply goes through the lookup below.
	parsed, _, _ := mime.ParseMediaType(mediaType)

	switch parsed {
	case "application/json":
		return pb.HTTPBody_JSON
	case "application/x-www-form-urlencoded":
		return pb.HTTPBody_FORM_URL_ENCODED
	case "application/yaml", "application/x-yaml", "text/yaml", "text/x-yaml":
		return pb.HTTPBody_YAML
	case "application/octet-stream":
		return pb.HTTPBody_OCTET_STREAM
	case "text/plain", "text/csv":
		return pb.HTTPBody_TEXT_PLAIN
	case "text/html":
		return pb.HTTPBody_TEXT_HTML
	}

	// Handle custom JSON-encoded media types.
	if strings.HasSuffix(parsed, "+json") {
		return pb.HTTPBody_JSON
	}
	return pb.HTTPBody_OTHER
}

func parseMultipartBody(multipartType string, boundary string, bodyStream io.Reader, statusCode int) (*pb.Data, error) {
fields := map[string]*pb.Data{}
r := multipart.NewReader(bodyStream, boundary)
Expand Down
36 changes: 24 additions & 12 deletions learn/parse_http_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,17 @@ func newTestBodySpecContentType(contentType string, statusCode int) *as.Data {
}

func newTestBodySpecFromStruct(statusCode int, contentType as.HTTPBody_ContentType, originalContentType string, s map[string]*as.Data) *as.Data {
return newTestBodySpecFromData(statusCode, contentType, originalContentType, dataFromStruct(s))
return newTestBodySpecFromData(statusCode, contentType, originalContentType, dataFromStruct(s), nil)
}

func newTestBodySpecFromData(statusCode int, contentType as.HTTPBody_ContentType, originalContentType string, d *as.Data) *as.Data {
d.Meta = newBodyDataMeta(statusCode, contentType, originalContentType)
func newTestBodySpecFromData(
statusCode int,
contentType as.HTTPBody_ContentType,
originalContentType string,
d *as.Data,
bodyError *as.HTTPBody_Errors,
) *as.Data {
d.Meta = newBodyDataMeta(statusCode, contentType, originalContentType, bodyError)
return d
}

Expand All @@ -99,7 +105,7 @@ func newTestMultipartFormData(statusCode int) *as.Data {
Value: &as.Data_Struct{
Struct: &as.Struct{
Fields: map[string]*as.Data{
"field1": newTestBodySpecFromData(statusCode, as.HTTPBody_TEXT_PLAIN, "text/plain", f1),
"field1": newTestBodySpecFromData(statusCode, as.HTTPBody_TEXT_PLAIN, "text/plain", f1, nil),
"field2": newTestBodySpecFromStruct(statusCode, as.HTTPBody_JSON, "application/json", map[string]*as.Data{
"foo": dataFromPrimitive(spec_util.NewPrimitiveString("bar")),
"baz": dataFromPrimitive(spec_util.NewPrimitiveInt64(123)),
Expand Down Expand Up @@ -356,6 +362,7 @@ func TestParseHTTPRequest(t *testing.T) {
as.HTTPBody_OCTET_STREAM,
"application/octet-stream",
dataFromPrimitive(spec_util.NewPrimitiveBytes([]byte("prince is a good boy"))),
nil,
),
},
UnknownHTTPMethodMeta(),
Expand All @@ -366,10 +373,10 @@ func TestParseHTTPRequest(t *testing.T) {
testContent: newTestHTTPResponse(
200,
[]byte(`
prince:
- bread
- eat
`),
prince:
- bread
- eat
`),
"application/x-yaml",
map[string][]string{},
[]*http.Cookie{},
Expand Down Expand Up @@ -468,10 +475,8 @@ prince:
),
},
&parseTest{
// Log error and skip the body if we can't parse it, instead of aborting
// the whole endpoint.
// https://app.clubhouse.io/akita-software/story/1898/juan-s-payload-problem
name: "skip body if unable to parse",
// Capture the unparsable body and indicate a parsing error in body metadata
name: "capture stringified body if unable to parse",
testContent: newTestHTTPResponse(
200,
[]byte("I am not JSON"),
Expand All @@ -485,6 +490,13 @@ prince:
nil,
[]*as.Data{
newDataHeader("X-Charming-Level", 200, spec_util.NewPrimitiveString("extreme"), false),
newTestBodySpecFromData(
200,
as.HTTPBody_JSON,
"application/json",
dataFromPrimitive(spec_util.NewPrimitiveString("I am not JSON")),
as.HTTPBody_PARSING_ERROR.Enum(),
),
},
UnknownHTTPMethodMeta(),
),
Expand Down
15 changes: 13 additions & 2 deletions learn/util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,13 @@ func dataFromPrimitive(p *pb.Primitive) *pb.Data {
return &pb.Data{Value: &pb.Data_Primitive{Primitive: p}}
}

func newBodyDataMeta(responseCode int, contentType pb.HTTPBody_ContentType, originalContentType string) *pb.DataMeta {
return newDataMeta(&pb.HTTPMeta{
func newBodyDataMeta(
responseCode int,
contentType pb.HTTPBody_ContentType,
originalContentType string,
bodyError *as.HTTPBody_Errors,
) *pb.DataMeta {
dataMeta := newDataMeta(&pb.HTTPMeta{
Location: &pb.HTTPMeta_Body{
Body: &pb.HTTPBody{
ContentType: contentType,
Expand All @@ -201,6 +206,12 @@ func newBodyDataMeta(responseCode int, contentType pb.HTTPBody_ContentType, orig
},
ResponseCode: int32(responseCode),
})

if bodyError != nil {
dataMeta.GetHttp().GetBody().Errors = *bodyError
}

return dataMeta
}

func annotateIfSensitiveForTest(sensitive bool, prim *pb.Primitive) *pb.Primitive {
Expand Down

0 comments on commit 37d3c7b

Please sign in to comment.