From 174965f75faa4eb18e8de8caf5bfb1511d60d75e Mon Sep 17 00:00:00 2001 From: Robin David Date: Tue, 12 Mar 2024 13:43:32 +0100 Subject: [PATCH 1/3] update pypcode to use version >2.0.0 --- bindings/python/quokka/backends/pypcode.py | 162 ++++---------------- pyproject.toml | 6 +- tests/python/tests/backends/test_pypcode.py | 8 +- 3 files changed, 39 insertions(+), 137 deletions(-) diff --git a/bindings/python/quokka/backends/pypcode.py b/bindings/python/quokka/backends/pypcode.py index bd3fc21..19542de 100644 --- a/bindings/python/quokka/backends/pypcode.py +++ b/bindings/python/quokka/backends/pypcode.py @@ -60,6 +60,7 @@ def get_pypcode_context( Arguments: arch: Quokka program architecture + endian: Architecture endianness Raises: PypcodeError: if the conversion for arch is not found @@ -74,7 +75,7 @@ def get_pypcode_context( quokka.analysis.ArchARM64: "AARCH64:LE:64:v8A", quokka.analysis.ArchARMThumb: "ARM:LE:32:v8T", quokka.analysis.ArchMIPS: "MIPS:LE:32:default", - quokka.analysis.ArchMIPS: "MIPS:LE:64:default", + quokka.analysis.ArchMIPS64: "MIPS:LE:64:default", quokka.analysis.ArchPPC: "PowerPC:LE:32:default", quokka.analysis.ArchPPC64: "PowerPC:LE:64:default", } @@ -93,105 +94,6 @@ def get_pypcode_context( return pypcode.Context(pcode_arch) -def equality(self: pypcode.ContextObj, other: Any) -> bool: - """Check if two pypcode objets are the same - - We use monkey patching to attach the equality method to other classes and rely on - __slots__ to check which fields to check. - - Arguments: - self: First object - other: Other variable - - Returns: - Boolean for equality - """ - return isinstance(other, self.__class__) and all( - getattr(other, attr) == getattr(self, attr) - for attr in self.__slots__ - if attr != "cobj" - ) - - -def object_hash(obj: pypcode.ContextObj) -> int: - """Create a hash value for a pypcode object - - This allows to create set of values. - - Arguments: - obj: Object to hash - - Returns: - An integer for the hash - """ - - assert isinstance(obj, pypcode.ContextObj) - return sum(hash(getattr(obj, attr)) for attr in obj.__slots__ if attr != "cobj") - - -pypcode.Varnode.__eq__ = equality -pypcode.Varnode.__hash__ = object_hash - -pypcode.AddrSpace.__eq__ = equality -pypcode.AddrSpace.__hash__ = object_hash - -pypcode.PcodeOp.__eq__ = equality -pypcode.PcodeOp.__hash__ = object_hash - - -def combine_instructions( - block: quokka.Block, translated_instructions: Sequence[pypcode.Translation] -) -> List[pypcode.PcodeOp]: - """Combine instructions between the Quokka and PyPcode - - Some instruction are split between IDA and Ghidra, so we have to account for it. - A problem for example is the support of prefixes (such LOCK) which are decoded as 2 - instructions by Ghidra (wrong) but only 1 by IDA (correct). - - Arguments: - block: Quokka block - translated_instructions: Translated instructions by Pypcode - - Raises - PypcodeError: if the combination doesn't work - - Returns: - A list of Pypcode statements - """ - pcode_instructions: List[pypcode.PcodeOp] = [] - translated_instructions = iter(translated_instructions) - - instruction: quokka.Instruction - for instruction in block.instructions: - instruction._pcode_insts = [] - remaining_size: int = instruction.size - while remaining_size > 0: - try: - pcode_inst: pypcode.Translation = next(translated_instructions) - except StopIteration as exc: - logger.error( - "Disassembly discrepancy between Pypcode / IDA: missing inst" - ) - raise quokka.PypcodeError( - f"Decoding error for block at 0x{block.start:x}" - ) from exc - - remaining_size -= pcode_inst.length - instruction._pcode_insts.extend(pcode_inst.ops) - - if remaining_size < 0: - logger.error( - "Disassembly discrepancy between Pypcode / IDA: sizes mismatch" - ) - raise quokka.PypcodeError( - f"Decoding error for block at 0x{block.start:x}" - ) - - pcode_instructions.extend(list(pcode_inst.ops)) - - return pcode_instructions - - def update_pypcode_context(program: quokka.Program, is_thumb: bool) -> pypcode.Context: """Return an appropriate pypcode context for the decoding @@ -246,19 +148,22 @@ def pypcode_decode_block(block: quokka.Block) -> List[pypcode.PcodeOp]: block.program, first_instruction.thumb ) - # Translate - translation = context.translate( - code=block.bytes, - base=block.start, - max_inst=0, - ) - - if translation.error: - logger.error(translation.error.explain) - raise quokka.PypcodeError(f"Decoding error for block at 0x{block.start:x}") + try: + # Translate + translation = context.translate( + block.bytes, # buf + block.start, # base_address + 0, # max_bytes + 0, # max_instructions + ) + return translation.ops - pcode_instructions = combine_instructions(block, translation.instructions) - return pcode_instructions + except pypcode.BadDataError as e: + logger.error(e) + raise quokka.PypcodeError(f"Decoding error for block at 0x{block.start:x} (BadDataError)") + except pypcode.UnimplError as e: + logger.error(e) + raise quokka.PypcodeError(f"Decoding error for block at 0x{block.start:x} (UnimplError)") def pypcode_decode_instruction( @@ -268,7 +173,7 @@ def pypcode_decode_instruction( This will return the list of Pcode operations done for the instruction. Note that a (binary) instruction is expected to have several pcode instructions - associated. + associated. When decoding a single instruction IMARK instructions are excluded! Arguments: inst: Instruction to translate @@ -281,22 +186,19 @@ def pypcode_decode_instruction( """ context: pypcode.Context = update_pypcode_context(inst.program, inst.thumb) - translation = context.translate( - code=inst.bytes, - base=inst.address, - max_inst=1, - ) - - if not translation.error: - - instructions = translation.instructions - if len(instructions) > 1: - logger.warning("Mismatch of instruction size IDA/Pypcode") - - instructions = list( - itertools.chain.from_iterable(inst.ops for inst in instructions) + try: + translation = context.translate( + inst.bytes, # buf + inst.address, # base_address + 0, # max_bytes + 1, # max_instructions ) - return instructions - logger.error(translation.error.explain) - raise quokka.PypcodeError("Unable to decode instruction") + return [x for x in translation.ops if x.opcode != pypcode.OpCode.IMARK] + + except pypcode.BadDataError as e: + logger.error(e) + raise quokka.PypcodeError(f"Unable to decode instruction (BadDataError)") + except pypcode.UnimplError as e: + logger.error(e) + raise quokka.PypcodeError(f"Unable to decode instruction (UnimplError)") diff --git a/pyproject.toml b/pyproject.toml index d03dd2a..4b44a94 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,9 +24,9 @@ test = [ "pytest-mock", "pytest-cov", "coverage[toml]", - "pypcode>=1.1.1", + "pypcode>=1.1.2", ] -pypcode = ["pypcode>=1.1.1"] +pypcode = ["pypcode>=1.1.2"] doc = [ "mkdocs", "mkdocs-material", @@ -45,7 +45,7 @@ dev = [ "mypy", "mypy-protobuf", "nox", - "pypcode>=1.1.1", + "pypcode>=1.1.2", ] [tool.setuptools] diff --git a/tests/python/tests/backends/test_pypcode.py b/tests/python/tests/backends/test_pypcode.py index c520005..4107d34 100644 --- a/tests/python/tests/backends/test_pypcode.py +++ b/tests/python/tests/backends/test_pypcode.py @@ -8,16 +8,16 @@ def test_pypcode_context(): context = pypcode_backend.get_pypcode_context(quokka.analysis.ArchX86) - assert context.lang.id == "x86:LE:32:default" + assert context.language.id == "x86:LE:32:default" context = pypcode_backend.get_pypcode_context(quokka.analysis.ArchX64) - assert context.lang.id == "x86:LE:64:default" + assert context.language.id == "x86:LE:64:default" context = pypcode_backend.get_pypcode_context(quokka.analysis.ArchARM64) - assert context.lang.id == "AARCH64:LE:64:v8A" + assert context.language.id == "AARCH64:LE:64:v8A" context = pypcode_backend.get_pypcode_context(quokka.analysis.ArchARM) - assert context.lang.id == "ARM:LE:32:v8" + assert context.language.id == "ARM:LE:32:v8" with pytest.raises(quokka.PypcodeError): pypcode_backend.get_pypcode_context(quokka.analysis.QuokkaArch) From c338ab727c626db833251319686d52628378d4a5 Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Thu, 14 Mar 2024 14:31:25 +0100 Subject: [PATCH 2/3] Update dependency of pypcode >= 2.0.0 --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4b44a94..66e0c7a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,9 +24,9 @@ test = [ "pytest-mock", "pytest-cov", "coverage[toml]", - "pypcode>=1.1.2", + "pypcode>=2.0.0", ] -pypcode = ["pypcode>=1.1.2"] +pypcode = ["pypcode>=2.0.0"] doc = [ "mkdocs", "mkdocs-material", @@ -45,7 +45,7 @@ dev = [ "mypy", "mypy-protobuf", "nox", - "pypcode>=1.1.2", + "pypcode>=2.0.0", ] [tool.setuptools] From a155da2e65f87ff3dad02630f1545cb110d0269b Mon Sep 17 00:00:00 2001 From: Robin David Date: Sat, 16 Mar 2024 13:30:11 +0100 Subject: [PATCH 3/3] cache pcode from blocks --- bindings/python/quokka/block.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/python/quokka/block.py b/bindings/python/quokka/block.py index bcb5e64..49196ef 100644 --- a/bindings/python/quokka/block.py +++ b/bindings/python/quokka/block.py @@ -243,7 +243,7 @@ def bytes(self) -> bytes: return block_bytes - @property + @cached_property def pcode_insts(self) -> List[pypcode.PcodeOp]: """Generate PCode instructions for the block