Skip to content

Commit

Permalink
parse partial flows to mark from metadata.json
Browse files Browse the repository at this point in the history
Summary:
This is part 2 of ensuring SAPP automatically marks partial flows when a run's metadata asks it to.

Now that we construct the dataclass from the analysis output, the definition was moved there (partially to avoid buck target cycles).

Reviewed By: fahndrich

Differential Revision: D61798431

fbshipit-source-id: eea05b00824ffb4b5df3484ec4471990ec441f55
  • Loading branch information
Sinan Cepel authored and facebook-github-bot committed Aug 28, 2024
1 parent 7d7e7c1 commit 3dd2e50
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 45 deletions.
73 changes: 73 additions & 0 deletions sapp/analysis_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,51 @@
METADATA_GLOB = "*metadata.json"


@dataclass
class PartialFlowToMark:
"""
This is a specification of a partial flow that the user wishes us to mark.
`partial_issue_code` and `full_issue_code` are self-descriptive.
`full_issue_transform` should be the name of the transform we're looking
to find in the full issue, and mark matching partial flows.
`feature` is the schema of the feature to add. `has-{feature}` and
`{feature}:{issue_instance_id}` will be the resulting features.
If `is_prefix_flow` is set to True, it means the partial
issue is a prefix of the full issue. Otherwise, we assume that the partial
issue is meant to be a suffix of the full issue. If `is_prefix_flow` is true,
it means that the transform we're searching for in the larger flow is the sink
of the partial flow. Otherwise, the transform is interpreted as the source.
`is_prefix_flow` has implications how we search for transforms:
For a prefix flow, if we find a transform in the postcondition trace of the
larger issue, we will mark the frame where the transform is applied locally
as a frame to add a breadcrumb for. If the transform is found in the
precondition, we'll mark the initial postcondition frames.
For a suffix flow, we'll flip the logic:
- If the transform is found in a postcondition trace, the larger
issue's initial precondition frames.
- If the transform's in the precondition trace, the source frame for
the matching precondition callee will be marked.
The reason for the marking the opposite initial frames from where we started from
is that the transform frame will *not* appear during the search from the larger
trace. Marking the other side's root frame allows us to detect the same set of flows
without doing a complex traversal ourselves.
"""

partial_issue_code: int
full_issue_code: int
full_issue_transform: str
is_prefix_flow: bool
feature: str


@dataclass
class Metadata:
# Used to relativize paths in the results
Expand All @@ -33,6 +78,9 @@ class Metadata:
rules: Dict[int, Any] = dataclasses.field(default_factory=dict)
class_type_intervals_filenames: List[str] = dataclasses.field(default_factory=list)
category_coverage: Dict[str, Any] = dataclasses.field(default_factory=dict)
partial_flows_to_mark: list[PartialFlowToMark] = dataclasses.field(
default_factory=list
)

def merge(self, o: "Metadata") -> "Metadata":
return Metadata(
Expand All @@ -47,6 +95,7 @@ def merge(self, o: "Metadata") -> "Metadata":
class_type_intervals_filenames=self.class_type_intervals_filenames
+ o.class_type_intervals_filenames,
category_coverage=self.category_coverage, # should all be the same
partial_flows_to_mark=self.partial_flows_to_mark + o.partial_flows_to_mark,
)


Expand Down Expand Up @@ -148,6 +197,9 @@ def from_directories(cls, directories: List[str]) -> "AnalysisOutput":
class_type_intervals_filenames = _get_remapped_filename(
metadata, "class_type_intervals_filename", directory
)
partial_flows_to_mark = _parse_partial_flows_to_mark(
metadata, "partial_flows"
)
this_metadata = Metadata(
analysis_tool_version=metadata["version"],
commit_hash=metadata.get("commit"),
Expand All @@ -159,6 +211,7 @@ def from_directories(cls, directories: List[str]) -> "AnalysisOutput":
rules=rules,
class_type_intervals_filenames=class_type_intervals_filenames,
category_coverage=metadata.get("category_coverage", []),
partial_flows_to_mark=partial_flows_to_mark,
)
if not main_metadata:
main_metadata = this_metadata
Expand Down Expand Up @@ -202,6 +255,7 @@ def from_directory(cls, directory: str) -> "AnalysisOutput":
class_type_intervals_filenames = _get_remapped_filename(
metadata, "class_type_intervals_filename", directory
)
partial_flows_to_mark = _parse_partial_flows_to_mark(metadata, "partial_flows")
return cls(
directory=directory,
filename_specs=filename_specs,
Expand All @@ -217,6 +271,7 @@ def from_directory(cls, directory: str) -> "AnalysisOutput":
rules=rules,
class_type_intervals_filenames=class_type_intervals_filenames,
category_coverage=metadata.get("category_coverage", []),
partial_flows_to_mark=partial_flows_to_mark,
),
)

Expand Down Expand Up @@ -287,3 +342,21 @@ def _get_remapped_filename(
return [filename]
else:
return []


def _parse_partial_flows_to_mark(
metadata_json: dict[str, Any], key: str
) -> list[PartialFlowToMark]:
parsed = []
partial_flows_to_mark = metadata_json.get(key, [])
for partial_flow in partial_flows_to_mark:
parsed.append(
PartialFlowToMark(
full_issue_code=partial_flow["full_issue_code"],
partial_issue_code=partial_flow["partial_issue_code"],
full_issue_transform=partial_flow["full_issue_transform"],
is_prefix_flow=partial_flow["is_prefix_flow"],
feature=partial_flow["feature"],
)
)
return parsed
47 changes: 2 additions & 45 deletions sapp/pipeline/mark_partial_flows.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from dataclasses import dataclass
from typing import Iterable

from ..analysis_output import PartialFlowToMark

from ..models import IssueInstance, SharedTextKind, TraceFrame, TraceKind
from ..trace_graph import TraceGraph
from . import PipelineStep, SourceLocation, Summary
Expand All @@ -22,51 +24,6 @@ def __init__(self) -> None:
self.shared_texts: set[int] = set()


@dataclass
class PartialFlowToMark:
"""
This is a specification of a partial flow that the user wishes us to mark.
`partial_issue_code` and `full_issue_code` are self-descriptive.
`full_issue_transform` should be the name of the transform we're looking
to find in the full issue, and mark matching partial flows.
`feature` is the schema of the feature to add. `has-{feature}` and
`{feature}:{issue_instance_id}` will be the resulting features.
If `is_prefix_flow` is set to True, it means the partial
issue is a prefix of the full issue. Otherwise, we assume that the partial
issue is meant to be a suffix of the full issue. If `is_prefix_flow` is true,
it means that the transform we're searching for in the larger flow is the sink
of the partial flow. Otherwise, the transform is interpreted as the source.
`is_prefix_flow` has implications how we search for transforms:
For a prefix flow, if we find a transform in the postcondition trace of the
larger issue, we will mark the frame where the transform is applied locally
as a frame to add a breadcrumb for. If the transform is found in the
precondition, we'll mark the initial postcondition frames.
For a suffix flow, we'll flip the logic:
- If the transform is found in a postcondition trace, the larger
issue's initial precondition frames.
- If the transform's in the precondition trace, the source frame for
the matching precondition callee will be marked.
The reason for the marking the opposite initial frames from where we started from
is that the transform frame will *not* appear during the search from the larger
trace. Marking the other side's root frame allows us to detect the same set of flows
without doing a complex traversal ourselves.
"""

partial_issue_code: int
full_issue_code: int
full_issue_transform: str
is_prefix_flow: bool
feature: str


# A frame key is an issue-code-agnostic identifier for frames we're looking
# to associate between a longer flow and a partial flow.
# The dataclass is frozen to ensure we can use these as dict keys.
Expand Down

0 comments on commit 3dd2e50

Please sign in to comment.