Feature/cached disasm #556

Open: dannyp303 wants to merge 33 commits into master from dannyp303:feature/cached_disasm

Commits (33):
12f595e - add initial pyghidra, test fails to load (Dec 4, 2024)
4fc4c66 - working bb/cb unpacker (Dec 9, 2024)
4a91519 - Passes like 70% of tests (Dec 14, 2024)
cb3c3c1 - add gitattributes (Dec 14, 2024)
b171a11 - Update pyghidra code. CodeRegion, ComplexBlock, BasicBlock unpacking (Dec 15, 2024)
11b1bf2 - working tests with bb unpacker (Dec 17, 2024)
d3a6133 - oops, test lines included (Dec 18, 2024)
e64f116 - add cached disassembler with simple json format (Dec 19, 2024)
37fdfa8 - added generation script for pyghidra and updated cached unpacker to n… (Dec 19, 2024)
99a0881 - keep analysis dict in memory (Dec 20, 2024)
5ad4d10 - dependency inject cache store (Dec 20, 2024)
a9df7ab - Fix indentation error (Dec 20, 2024)
4e069c4 - Update cache analysis store to store cache for all unpacked programs (Dec 20, 2024)
8339413 - get rid of lookup to work (Dec 20, 2024)
826f061 - Update pyghidra generator to cli tool, create cached analysis tests u… (Jan 2, 2025)
f084fb9 - added some metadata with ghidra pie fix and hash checking, all tests … (Jan 2, 2025)
2ba15dd - lint (Jan 2, 2025)
6bbc9b5 - update executable segment flag in cache file (Jan 3, 2025)
381410f - Merge branch 'master' of github.com:dannyp303/ofrak into feature/cach… (Jan 3, 2025)
b4ce27c - lint (Jan 3, 2025)
95dc9a7 - fix lfs text file issues (Jan 6, 2025)
a03cac3 - add cached decompilation and tests, renamed ofrak_cached to ofrak_cac… (Jan 8, 2025)
48d1b72 - lint (Jan 8, 2025)
6eb7230 - somehow had extra whitespace line (Jan 8, 2025)
940decc - changelog (Jan 8, 2025)
ada9a02 - Merge branch 'master' into feature/cached_disasm (dannyp303, Jan 8, 2025)
4bc241e - change ofrak_cached_diassembly name in lfs attributes (Jan 8, 2025)
ead41cd - add lfs files (Jan 8, 2025)
679f844 - Merge branch 'feature/cached_disasm' of github.com:dannyp303/ofrak in… (Jan 8, 2025)
a6c1147 - update the pyghidra component to use the cached impl under the hood (Jan 9, 2025)
42a2819 - Merge branch 'master' into feature/cached_disasm (dannyp303, Jan 9, 2025)
afa2762 - EOF lint (Jan 9, 2025)
78b06d4 - update pyghidra component decomp component (Jan 9, 2025)

Changes from all commits:

6 changes: 5 additions & 1 deletion .gitattributes
@@ -1,7 +1,8 @@
docs/assets/* filter=lfs diff=lfs merge=lfs -text
disassemblers/ofrak_ghidra/ofrak_ghidra_test/assets/* filter=lfs diff=lfs merge=lfs -text
ofrak_patch_maker/technical_docs/vbcc.pdf filter=lfs diff=lfs merge=lfs -text
ofrak_core/test_ofrak/components/assets/* filter=lfs diff=lfs merge=lfs -text
ofrak_core/test_ofrak/components/assets/**/* filter=lfs diff=lfs merge=lfs -text
ofrak_core/test_ofrak/components/assets/cache/Makefile !filter !diff !merge text
ofrak_core/test_ofrak/components/assets/README.md !filter !diff !merge text
ofrak_core/test_ofrak/components/assets/kernel_address_space_build.sh !filter !diff !merge text
ofrak_core/test_ofrak/components/assets/string_test.c !filter !diff !merge text
@@ -11,6 +12,9 @@ ofrak_core/test_ofrak/components/assets/elf/* filter=lfs diff=lfs merge=lfs -text
ofrak_core/test_ofrak/components/assets/elf/edge-cases/* filter=lfs diff=lfs merge=lfs -text
frontend/public/themes/**/* filter=lfs diff=lfs merge=lfs -text
disassemblers/ofrak_angr/ofrak_angr_test/assets/* filter=lfs diff=lfs merge=lfs -text
disassemblers/ofrak_pyghidra/ofrak_pyghidra_test/assets/**/* filter=lfs diff=lfs merge=lfs -text
disassemblers/ofrak_cached_disassembly/ofrak_cached_disassembly_test/assets/**/* filter=lfs diff=lfs merge=lfs -text
disassemblers/ofrak_cached_disassembly/ofrak_cached_disassembly_test/assets/Makefile !filter !diff !merge text
ofrak_core/pytest_ofrak/elf/assets/* filter=lfs diff=lfs merge=lfs -text
ofrak_core/pytest_ofrak/elf/assets/*.c !filter !diff !merge text
ofrak_core/pytest_ofrak/elf/assets/Makefile !filter !diff !merge text
2 changes: 2 additions & 0 deletions .gitignore
@@ -19,3 +19,5 @@ ofrak_core/ofrak/core/entropy/entropy_c.cpython*
ofrak_core/ofrak/gui/public
ofrak_core/build
ofrak_core/ofrak/license/license.json
disassemblers/ofrak_cached_disassembly/ofrak_cached_disassembly_test/assets/*_ghidra
ofrak_core/test_ofrak/components/assets/*_ghidra
Empty file (filename not rendered in this view).
426 changes: 426 additions & 0 deletions disassemblers/ofrak_cached_disassembly/LICENSE

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions disassemblers/ofrak_cached_disassembly/Makefile
@@ -0,0 +1,12 @@
PYTHON=python3
PIP=pip3

install:
	$(PIP) install .

develop:
	$(PIP) install -e .[test]

test:
	$(PYTHON) -m pytest --cov=ofrak_cached_disassembly --cov-report=term-missing ofrak_cached_disassembly_test
	fun-coverage --cov-fail-under=100
New file: ofrak_cached_disassembly/components/cached_disassembly.py (path inferred from the CachedAnalysisStore import in the components file below)
@@ -0,0 +1,34 @@
from ofrak.core import *
import json


class CachedAnalysisStore:
    """In-memory store mapping resource IDs to a cached analysis dict and its ProgramAttributes."""

    def __init__(self):
        self.analysis = dict()
        self.program_attributes: Optional[ProgramAttributes] = None

    def store_analysis(self, resource_id: bytes, analysis: Union[Dict, str]):
        # Accepts either a pre-loaded analysis dict or a path to a JSON cache file
if isinstance(analysis, str):
with open(analysis) as fh:
analysis = json.load(fh)
if resource_id not in self.analysis.keys():
self.analysis[resource_id] = dict()
self.analysis[resource_id]["analysis"] = analysis

def store_program_attributes(self, resource_id: bytes, program_attributes: ProgramAttributes):
if resource_id not in self.analysis.keys():
self.analysis[resource_id] = dict()
self.analysis[resource_id]["program_attributes"] = program_attributes

def delete_id_from_store(self, resource_id: bytes):
if resource_id in self.analysis:
del self.analysis[resource_id]

def get_analysis(self, resource_id: bytes) -> Dict[str, Any]:
return self.analysis[resource_id]["analysis"]

def get_program_attributes(self, resource_id: bytes) -> ProgramAttributes:
return self.analysis[resource_id]["program_attributes"]

def id_exists(self, resource_id: bytes) -> bool:
return resource_id in self.analysis.keys()
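
A minimal usage sketch of the store above, outside of OFRAK's dependency injection (the resource ID and cache filename are hypothetical placeholders):

from ofrak_cached_disassembly.components.cached_disassembly import CachedAnalysisStore

store = CachedAnalysisStore()
# store_analysis accepts either a pre-loaded dict or a path to a JSON cache file
store.store_analysis(b"example-resource-id", "fib.json")
if store.id_exists(b"example-resource-id"):
    analysis = store.get_analysis(b"example-resource-id")
    print(analysis["metadata"]["backend"])  # e.g. "ghidra"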
New file: the cached analysis components module (full path not rendered in this view)
@@ -0,0 +1,276 @@
from ofrak.core import *
import hashlib
from ofrak.core.code_region import CodeRegion
from ofrak.core.complex_block import ComplexBlock
from ofrak.service.component_locator_i import (
ComponentLocatorInterface,
)
from ofrak.core.decompilation import (
    DecompilationAnalysis,
    DecompilationAnalyzer,
)
from ofrak_cached_disassembly.components.cached_disassembly import CachedAnalysisStore

_GHIDRA_AUTO_LOADABLE_FORMATS = [Elf, Ihex, Pe]


@dataclass
class CachedAnalysis(ResourceView):
pass


@dataclass
class CachedAnalysisAnalyzerConfig(ComponentConfig):
filename: str
force: Optional[bool] = False


class CachedAnalysisAnalyzer(Analyzer[CachedAnalysisAnalyzerConfig, CachedAnalysis]):
id = b"CachedAnalysisAnalyzer"
targets = (CachedAnalysis,)
outputs = (CachedAnalysis,)

def __init__(
self,
resource_factory: ResourceFactory,
data_service: DataServiceInterface,
resource_service: ResourceServiceInterface,
analysis_store: CachedAnalysisStore,
):
super().__init__(resource_factory, data_service, resource_service)
self.analysis_store = analysis_store

async def analyze(self, resource: Resource, config: CachedAnalysisAnalyzerConfig):
await resource.identify()
if not resource.has_tag(Program) and not resource.has_attributes(ProgramAttributes):
raise AttributeError(
f"The reource with ID {resource.get_id()} is not an analyzable program format and does not have ProgramAttributes set."
)
await resource.unpack() # Must unpack ELF to get program attributes
program_attributes = await resource.analyze(ProgramAttributes)
self.analysis_store.store_analysis(resource.get_id(), config.filename)
if not config.force:
if not await self.verify_cache_file(resource):
raise ValueError(
"MD5 recorded in cache file does not match the hash of the requested resource, use the force config option to use this cache file anyway."
)
self.analysis_store.store_program_attributes(resource.get_id(), program_attributes)
cached_analysis_view = CachedAnalysis()
resource.add_view(cached_analysis_view)
await resource.save()
return cached_analysis_view

async def verify_cache_file(self, resource: Resource):
data = await resource.get_data()
md5_hash = hashlib.md5(data)
return (
md5_hash.digest().hex()
== self.analysis_store.get_analysis(resource.get_id())["metadata"]["hash"]
)
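
For context, a hedged sketch of how this analyzer might be invoked from an OFRAK script. The binary path ("fib") and cache path ("fib.json") are hypothetical, and the discover() call assumes this package registers its components the same way other OFRAK disassembler backends do:

from ofrak import OFRAK, OFRAKContext

import ofrak_cached_disassembly

# CachedAnalysis, CachedAnalysisAnalyzer, and CachedAnalysisAnalyzerConfig are
# the classes defined above; their import is omitted because the module path
# is not rendered in this diff.


async def main(ofrak_context: OFRAKContext):
    root = await ofrak_context.create_root_resource_from_file("fib")  # hypothetical binary
    root.add_tag(CachedAnalysis)
    await root.save()
    await root.run(
        CachedAnalysisAnalyzer,
        CachedAnalysisAnalyzerConfig(filename="fib.json"),  # cache generated offline
    )
    await root.unpack_recursively()  # the unpackers below now read from the cache


if __name__ == "__main__":
    ofrak = OFRAK()
    ofrak.discover(ofrak_cached_disassembly)  # assumption: package exposes a discoverable module
    ofrak.run(main)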


class CachedProgramUnpacker(Unpacker[None]):
targets = (CachedAnalysis,)
outputs = (CodeRegion,)

def __init__(
self,
resource_factory: ResourceFactory,
data_service: DataServiceInterface,
resource_service: ResourceServiceInterface,
analysis_store: CachedAnalysisStore,
component_locator: ComponentLocatorInterface,
):
super().__init__(resource_factory, data_service, resource_service, component_locator)
self.analysis_store = analysis_store

async def unpack(self, resource: Resource, config: None):
analysis = self.analysis_store.get_analysis(resource.get_id())
for key, mem_region in analysis.items():
if key.startswith("seg"):
await resource.create_child_from_view(
CodeRegion(
virtual_address=mem_region["virtual_address"], size=mem_region["size"]
)
)


class CachedCodeRegionModifier(Modifier[None]):
targets = (CodeRegion,)

def __init__(
self,
resource_factory: ResourceFactory,
data_service: DataServiceInterface,
resource_service: ResourceServiceInterface,
analysis_store: CachedAnalysisStore,
):
super().__init__(resource_factory, data_service, resource_service)
self.analysis_store = analysis_store

async def modify(self, resource: Resource, config: None):
program_r = await resource.get_only_ancestor(ResourceFilter.with_tags(Program))
analysis = self.analysis_store.get_analysis(program_r.get_id())
ofrak_code_regions = await program_r.get_descendants_as_view(
v_type=CodeRegion, r_filter=ResourceFilter(tags=[CodeRegion])
)
backend_code_regions: List[CodeRegion] = []
for key, mem_region in analysis.items():
if key.startswith("seg") and mem_region["executable"]:
backend_code_regions.append(
CodeRegion(
virtual_address=mem_region["virtual_address"], size=mem_region["size"]
)
)

ofrak_code_regions = sorted(ofrak_code_regions, key=lambda cr: cr.virtual_address)
backend_code_regions = sorted(backend_code_regions, key=lambda cr: cr.virtual_address)

if len(ofrak_code_regions) > 0:
code_region = await resource.view_as(CodeRegion)
relative_va = code_region.virtual_address - ofrak_code_regions[0].virtual_address

for backend_cr in backend_code_regions:
backend_relative_va = (
backend_cr.virtual_address - backend_code_regions[0].virtual_address
)
if backend_relative_va == relative_va and backend_cr.size == code_region.size:
code_region.resource.add_view(
backend_cr
) # TODO: https://github.com/redballoonsecurity/ofrak/issues/537
await resource.save()
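
The relative-VA matching above compensates for differing load bases between OFRAK and the cached backend: if OFRAK places the first code region at 0x1000 while the backend used a base of 0x401000 (e.g. Ghidra's default image base for PIE binaries, per the "ghidra pie fix" commit in this PR), an OFRAK region at 0x1200 matches the backend region at 0x401200, since both sit at offset 0x200 from their respective first regions.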


class CachedCodeRegionUnpacker(CodeRegionUnpacker):
def __init__(
self,
resource_factory: ResourceFactory,
data_service: DataServiceInterface,
resource_service: ResourceServiceInterface,
analysis_store: CachedAnalysisStore,
component_locator: ComponentLocatorInterface,
):
super().__init__(resource_factory, data_service, resource_service, component_locator)
self.analysis_store = analysis_store

async def unpack(self, resource: Resource, config: None):
program_r = await resource.get_only_ancestor(ResourceFilter.with_tags(Program))
analysis = self.analysis_store.get_analysis(program_r.get_id())
if analysis["metadata"]["backend"] == "ghidra":
await resource.run(CachedCodeRegionModifier)
code_region_view = await resource.view_as(CodeRegion)
func_keys = analysis[f"seg_{code_region_view.virtual_address}"]["children"]
for func_key in func_keys:
complex_block = analysis[func_key]
cb = ComplexBlock(
virtual_address=complex_block["virtual_address"],
size=complex_block["size"],
name=complex_block["name"],
)
await code_region_view.create_child_region(cb)


class CachedComplexBlockUnpacker(ComplexBlockUnpacker):
def __init__(
self,
resource_factory: ResourceFactory,
data_service: DataServiceInterface,
resource_service: ResourceServiceInterface,
analysis_store: CachedAnalysisStore,
component_locator: ComponentLocatorInterface,
):
super().__init__(resource_factory, data_service, resource_service, component_locator)
self.analysis_store = analysis_store

async def unpack(self, resource: Resource, config: None):
program_r = await resource.get_only_ancestor(ResourceFilter.with_tags(Program))
analysis = self.analysis_store.get_analysis(program_r.get_id())
program_attributes = self.analysis_store.get_program_attributes(program_r.get_id())

cb_view = await resource.view_as(ComplexBlock)
child_keys = analysis[f"func_{cb_view.virtual_address}"]["children"]
        for child_key in child_keys:
            if child_key.startswith("bb"):
                basic_block = analysis[child_key]
mode = InstructionSetMode.NONE
if basic_block["mode"] == "thumb":
mode = InstructionSetMode.THUMB
elif basic_block["mode"] == "vle":
mode = InstructionSetMode.VLE
bb = BasicBlock(
virtual_address=basic_block["virtual_address"],
size=basic_block["size"],
mode=mode,
is_exit_point=basic_block["is_exit_point"],
exit_vaddr=basic_block["exit_vaddr"],
)
await cb_view.create_child_region(bb)
elif children.startswith("dw"):
data_word = analysis[children]
fmt_string = (
program_attributes.endianness.get_struct_flag() + data_word["format_string"]
)
dw = DataWord(
virtual_address=data_word["virtual_address"],
size=data_word["size"],
format_string=fmt_string,
xrefs_to=tuple(data_word["xrefs_to"]),
)
await cb_view.create_child_region(dw)


class CachedBasicBlockUnpacker(BasicBlockUnpacker):
def __init__(
self,
resource_factory: ResourceFactory,
data_service: DataServiceInterface,
resource_service: ResourceServiceInterface,
analysis_store: CachedAnalysisStore,
component_locator: ComponentLocatorInterface,
):
super().__init__(resource_factory, data_service, resource_service, component_locator)
self.analysis_store = analysis_store

async def unpack(self, resource: Resource, config: None):
program_r = await resource.get_only_ancestor(ResourceFilter.with_tags(Program))
analysis = self.analysis_store.get_analysis(program_r.get_id())

bb_view = await resource.view_as(BasicBlock)
child_keys = analysis[f"bb_{bb_view.virtual_address}"]["children"]
        for child_key in child_keys:
            instruction = analysis[child_key]
mode = InstructionSetMode.NONE
if instruction["mode"] == "thumb":
mode = InstructionSetMode.THUMB
elif instruction["mode"] == "vle":
mode = InstructionSetMode.VLE
instr = Instruction(
virtual_address=instruction["virtual_address"],
size=instruction["size"],
disassembly=f"{instruction['mnemonic']} {instruction['operands']}",
mnemonic=instruction["mnemonic"],
operands=instruction["operands"],
mode=mode,
)
await bb_view.create_child_region(instr)


class CachedDecompilationAnalyzer(DecompilationAnalyzer):
def __init__(
self,
resource_factory: ResourceFactory,
data_service: DataServiceInterface,
resource_service: ResourceServiceInterface,
analysis_store: CachedAnalysisStore,
):
super().__init__(resource_factory, data_service, resource_service)
self.analysis_store = analysis_store

async def analyze(self, resource: Resource, config: None) -> DecompilationAnalysis:
        # Fetch the decompilation for this function from the cached analysis
program_r = await resource.get_only_ancestor(ResourceFilter.with_tags(Program))
analysis = self.analysis_store.get_analysis(program_r.get_id())
complex_block = await resource.view_as(ComplexBlock)
decomp = analysis[f"func_{complex_block.virtual_address}"]["decompilation"]
return DecompilationAnalysis(decomp)
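
Read together, the unpackers above imply a flat JSON cache schema: entries keyed by "seg_", "func_", "bb_", and "dw_" prefixes plus a decimal virtual address, each listing its children's keys, alongside a "metadata" entry. A hedged reconstruction with illustrative values (only the fields these components actually read are shown; the instruction key prefix is never checked by the code, so "instr_" is a guess):

example_cache = {
    "metadata": {
        "hash": "d41d8cd98f00b204e9800998ecf8427e",  # md5 hexdigest of the target binary
        "backend": "ghidra",
    },
    "seg_4096": {  # code region
        "virtual_address": 4096,
        "size": 512,
        "executable": True,
        "children": ["func_4096"],
    },
    "func_4096": {  # complex block (function)
        "virtual_address": 4096,
        "size": 64,
        "name": "main",
        "children": ["bb_4096", "dw_4156"],
        "decompilation": "int main(void) { ... }",
    },
    "bb_4096": {  # basic block
        "virtual_address": 4096,
        "size": 56,
        "mode": "none",  # "thumb" and "vle" are the only modes the unpackers special-case
        "is_exit_point": True,
        "exit_vaddr": 4152,
        "children": ["instr_4096"],
    },
    "instr_4096": {  # instruction
        "virtual_address": 4096,
        "size": 4,
        "mnemonic": "mov",
        "operands": "r0, #0",
        "mode": "none",
    },
    "dw_4156": {  # data word; the endianness struct flag is prepended at unpack time
        "virtual_address": 4156,
        "size": 4,
        "format_string": "I",
        "xrefs_to": [4096],
    },
}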
New file: disassemblers/ofrak_cached_disassembly/ofrak_cached_disassembly_test/assets/Makefile (path per the .gitattributes entry above)
@@ -0,0 +1,13 @@
pyghidra: clean fib-pyghidra fib-thumb-pyghidra hello.x64.elf-pyghidra

fib-pyghidra:
	python3 -m ofrak_pyghidra analyze -i fib -o fib.json

fib-thumb-pyghidra:
	python3 -m ofrak_pyghidra analyze -i fib_thumb -o fib_thumb.json

hello.x64.elf-pyghidra:
	python3 -m ofrak_pyghidra analyze -i hello.x64.elf -o hello.x64.elf.json -d

clean:
	rm -f fib.json fib_thumb.json hello.x64.elf.json
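
The -d flag on the hello.x64.elf target presumably tells the ofrak_pyghidra CLI to include decompilation output in the generated cache, which would populate the "decompilation" field consumed by CachedDecompilationAnalyzer above.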
(8 Git LFS binary asset files not shown)