Skip to content

Commit

Permalink
Multiple bug fixes and little improvements
Browse files Browse the repository at this point in the history
CORE: Try to use `cdifflib` instead of Python's standard `difflib` when possible to get some performance gains.
BUG: High addresses in operands could cause Python's sqlite3 module to crash when inserting into the database.
ML: Try to use the Ridge classifier as just another method to get a similarity ratio in `check_ratio`.
ML: Simplifications of the supervised learning based experimental engine.
CONFIG: Added parameter `COMMIT_AFTER_EACH_GUI_UPDATE` to force committing.
CONFIG: Added parameter `EXPORTING_COMPILATION_UNITS` to enable/disable exporting them (with some huge databases it might take even hours!).
CONFIG: Added parameters handling SQLite pragmas `SQLITE_JOURNAL_MODE` and `SQLITE_PRAGMA_SYNCHRONOUS`.
CONFIG: Added parameter `SHOW_IMPORT_WARNINGS` to enable/disable showing warnings when some important but optional Python packages aren't found.
BUG: Be sure to delete orphaned comments when importing pseudo-code comments.
BUG: The workaround for "max non-trivial tinfo_t count has been reached" was wrong. Now, the Hex-Rays functions cache is cleared every 10,000 rows.
GUI: Display the progress when exporting a large number of compilation units.
BUG: Inserting the link between functions and compilation units was terribly-utterly-horribly wrong.
VULN: Add pattern "UNC" to potentially detect vulnerabilities fixed in Windows components involving UNC paths.
EXTRAS: Added independent IDA plugin `extras/diaphora_local.py` to be able to diff functions inside the current binary.
BUG: Do a commit after all functions are exported so that, if IDA crashes for any reason or bug, Diaphora can properly recover from the error with all the functions already exported.
  • Loading branch information
joxeankoret committed Feb 22, 2024
1 parent f9b28d5 commit 477dd87
Show file tree
Hide file tree
Showing 8 changed files with 604 additions and 103 deletions.
100 changes: 77 additions & 23 deletions diaphora.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,22 @@
from io import StringIO
from threading import Lock
from multiprocessing import cpu_count
from difflib import SequenceMatcher, unified_diff

import diaphora_config as config
import diaphora_heuristics

try:
from cdifflib import CSequenceMatcher as SequenceMatcher
HAS_CDIFFLIB = True
except ImportError:
HAS_CDIFFLIB = False
if config.SHOW_IMPORT_WARNINGS:
print("WARNING: Python library 'cdifflib' not found. Installing it will significantly improve text diffing performance.")
print("INFO: Alternatively, you can silence this warning by changing the value of SHOW_IMPORT_WARNINGS in diaphora_config.py.")
from difflib import SequenceMatcher

from difflib import unified_diff

import ml.model
from ml.model import ML_ENABLED, train, predict, get_model_name

Expand Down Expand Up @@ -709,6 +720,10 @@ def save_instructions_to_database(self, cur, bb_data, func_id):
cls=CBytesEncoder,
)
)
elif isinstance(instruction_property, int):
if instruction_property > 0x8000000000000000:
instruction_property = str(instruction_property)
instruction_properties.append(instruction_property)
else:
instruction_properties.append(instruction_property)

Expand Down Expand Up @@ -1104,7 +1119,7 @@ def save_function(self, props):
insert_args.append([func_id, str(caller), "caller"])

for callee in callees:
insert_args.append([func_id, str(callee), "callee"])
insert_args.append([func_id, str(callee), "callee"])
cur.executemany(sql, insert_args)

# Phase 3: Insert the constants of the function
Expand Down Expand Up @@ -1880,20 +1895,28 @@ def check_ratio(self, main_d, diff_d):
self.ratios_cache[key] = 1.0
return 1.0

r = max(v1, v2, v3, v4, v5)
v6 = 0.0
if ML_ENABLED and self.machine_learning:
v6 = self.get_ml_ratio(main_d, diff_d)

values_set = set([v1, v2, v3, v4, v5, v6])
r = max(values_set)
if r == 1.0 and md1 != md2:
# We cannot assign a 1.0 ratio if both MD indices are different, that's an
# error
r = 0
for v in [v1, v2, v3, v4, v5]:
for v in values_set:
if v != 1.0 and v > r:
r = v

if r < 1.0:
score = self.deep_ratio(main_d, diff_d, r)
if r + score < 1.0:
r += score
else:
r = 0.99

debug_refresh(f"self.ratios_cache[{main_d['name']}-{diff_d['name']}] = {r}")
self.ratios_cache[key] = r
return r

Expand Down Expand Up @@ -2868,6 +2891,45 @@ def add_multimatches_to_chooser(self, multi, ignore_list, dones):

return ignore_list, dones

def get_ml_ratio(self, main_d, diff_d):
    """
    Return the similarity ratio predicted by the ML model for two functions.

    The `functions` row of each function is looked up by address in the
    `main` and `diff` databases and both rows are handed to `predict()`. A
    prediction greater than or equal to `config.ML_MIN_PREDICTION_RATIO` is
    logged, added to `self.ml_chooser` and returned as is.

    @param main_d: Dictionary describing the function in the main database;
                   the keys "ea", "name" and "nodes" are read.
    @param diff_d: Dictionary describing the function in the diff database;
                   the same keys are read.
    @return: The predicted ratio when it reaches the configured minimum.
             Otherwise 0.0 (machine learning disabled, one of the rows is
             missing, or the prediction is below the minimum).
    """
    # Without a usable model there is nothing to predict, so skip both lookups.
    if not (ML_ENABLED and self.machine_learning):
        return 0.0

    ea1 = int(main_d["ea"])
    ea2 = int(diff_d["ea"])

    sql = "select * from {db}.functions where address = ?"
    cur = self.db_cursor()
    try:
        cur.execute(sql.format(db="main"), (str(ea1),))
        main_row = cur.fetchone()

        cur.execute(sql.format(db="diff"), (str(ea2),))
        diff_row = cur.fetchone()

        # fetchone() yields None when no row matches (standard DB-API
        # behaviour); in that case there is nothing to feed to the model.
        if main_row is None or diff_row is None:
            return 0.0

        ml_ratio = predict(main_row, diff_row)
        if ml_ratio >= config.ML_MIN_PREDICTION_RATIO:
            log(f"ML ratio {ml_ratio} for {main_d['name']} - {diff_d['name']}")

            vfname1 = main_d["name"]
            vfname2 = diff_d["name"]
            nodes1 = main_d["nodes"]
            nodes2 = diff_d["nodes"]
            desc = f"ML {get_model_name()}"

            # Record the pair in the ML chooser so it can be reviewed later.
            tmp_item = CChooser.Item(
                ea1, vfname1, ea2, vfname2, desc, ml_ratio, nodes1, nodes2
            )
            self.ml_chooser.add_item(tmp_item)
            return ml_ratio
    finally:
        cur.close()

    # Predictions below the configured minimum are not considered a match.
    return 0.0

def deep_ratio(self, main_d, diff_d, ratio):
"""
Try to get a score to add to the value returned by `check_ratio()` so less
Expand Down Expand Up @@ -2940,25 +3002,6 @@ def deep_ratio(self, main_d, diff_d, ratio):
set_result = set1.intersection(set2)
if len(set_result) > 0:
score += len(set_result) * 0.0005

ml_add = False
if ML_ENABLED and self.machine_learning:
ml_ratio = predict(main_row, diff_row, ratio)
if ml_ratio > 0:
debug_refresh(f"ML ratio {ml_ratio} for {main_d['name']} - {diff_d['name']}")
score += config.ML_DEEP_RATIO_ADDED_SCORE
ml_add = True

if ml_add:
vfname1 = main_d["name"]
vfname2 = diff_d["name"]
nodes1 = main_d["nodes"]
nodes2 = diff_d["nodes"]
desc = f"ML {get_model_name()}"

tmp_item = CChooser.Item(ea1, vfname1, ea2, vfname2, desc, ratio, nodes1, nodes2)
self.ml_chooser.add_item(tmp_item)

finally:
cur.close()

Expand Down Expand Up @@ -3626,6 +3669,17 @@ def train_local_model(self):
debug_refresh("[i] Machine learning module enabled.")
train(self, self.all_matches)

def get_callers_callees(self, db_name, func_id):
    """
    Fetch every `callgraph` row stored for the given function.

    @param db_name: Name of the database (schema) to query; it is interpolated
                    into the SQL statement as the table prefix.
    @param func_id: Value matched against the `func_id` column.
    @return: A list with all the matching rows (empty when there are none).
    """
    cursor = self.db_cursor()
    try:
        query = "select * from {db}.callgraph where func_id = ?".format(db=db_name)
        cursor.execute(query, (func_id,))
        return list(cursor.fetchall())
    finally:
        # The cursor is always released, even if the query fails.
        cursor.close()

def diff(self, db):
"""
Diff the current two databases (main and diff).
Expand Down
33 changes: 30 additions & 3 deletions diaphora_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,12 @@
# Number of rows that must be inserted to commit the transaction
EXPORTING_FUNCTIONS_TO_COMMIT = 5000

# Every time the GUI export dialog is updated a commit is issued. This is useful
# whenever we are facing long export times with known IDA bugs that might cause
# it to fail at an unknown moment and we want to recover from errors. You might
# want to set it to False if you're looking for small performance wins.
COMMIT_AFTER_EACH_GUI_UPDATE = True

# The minimum number of functions in a database to, by default, disable running
# slow queries.
MIN_FUNCTIONS_TO_DISABLE_SLOW = 4001
Expand All @@ -73,13 +79,23 @@
# Block size to use to generate fuzzy hashes for pseudo-codes with DeepToad
FUZZY_HASHING_BLOCK_SIZE = 512

# Use it to disable finding compilation units. In some rare cases, there are too
# many compilation units and Diaphora might take a very long time to find them.
EXPORTING_COMPILATION_UNITS = True

################################################################################
# Default SQL related configuration options
# Default SQL and SQLite related configuration options

# Diaphora won't process more than the given value of rows (per heuristic)
SQL_MAX_PROCESSED_ROWS = 1000000
# SQL queries will timeout after the given number of seconds
SQL_TIMEOUT_LIMIT = 60 * 5
# Set this to DELETE, TRUNCATE, PERSIST, MEMORY, WAL, OFF, or None to use the
# default value.
SQLITE_JOURNAL_MODE = "MEMORY"
# Set this to 0/OFF, 1/NORMAL, 2/FULL, 3/EXTRA, or None to use the default
# value.
SQLITE_PRAGMA_SYNCHRONOUS = "1"

################################################################################
# Heuristics related configuration options
Expand Down Expand Up @@ -189,12 +205,23 @@

# What is the minimum ratio required for a match to be considered for usage to
# train a local model?
ML_MATCHES_MIN_RATIO = 0.5
ML_MATCHES_MIN_RATIO = 0.6
ML_MIN_PREDICTION_RATIO = 0.72

# What value should be added to the final similarity ratio when the specialized
# classifier (trained with known good and bad results found for the current two
# binaries being compared) finds what it thinks is a good match.
ML_DEEP_RATIO_ADDED_SCORE = 0.04
ML_DEEP_RATIO_ADDED_SCORE = 0.1

# Show a chooser with all the matches that the classifier thinks are good ones?
ML_DEBUG_SHOW_MATCHES = True

#-------------------------------------------------------------------------------
# Some imports improve performance or add features to Diaphora but aren't 100%
# required. Diaphora will warn the reverser when these libraries fail to be
# imported. Set this directive to False to silence this warning.
SHOW_IMPORT_WARNINGS = True

#-------------------------------------------------------------------------------
# Workarounds for IDA bugs
DIAPHORA_WORKAROUND_MAX_TINFO_T = True
52 changes: 38 additions & 14 deletions diaphora_ida.py
Original file line number Diff line number Diff line change
Expand Up @@ -1152,9 +1152,16 @@ def recalculate_primes(self):
return callgraph_primes, callgraph_all_primes

def commit_and_start_transaction(self):
self.db.commit()
self.db.execute("PRAGMA synchronous = OFF")
self.db.execute("PRAGMA journal_mode = MEMORY")
try:
self.db.execute("commit")
except sqlite3.OperationalError as e:
# Ignore the "cannot commit - no transaction active" error
pass

if config.SQLITE_PRAGMA_SYNCHRONOUS is not None:
self.db.execute(f"PRAGMA synchronous = {config.SQLITE_PRAGMA_SYNCHRONOUS}")
if config.SQLITE_JOURNAL_MODE is not None:
self.db.execute(f"PRAGMA journal_mode = {config.SQLITE_JOURNAL_MODE}")
self.db.execute("BEGIN transaction")

def do_export(self, crashed_before=False):
Expand Down Expand Up @@ -1185,10 +1192,12 @@ def do_export(self, crashed_before=False):
self._funcs_cache = {}
for func in func_list:
if user_cancelled():
raise Exception("Canceled.")
raise Exception("Cancelled.")

i += 1
if (total_funcs >= 100) and i % (int(total_funcs / 100)) == 0 or i == 1:
if config.COMMIT_AFTER_EACH_GUI_UPDATE:
self.commit_and_start_transaction()
line = "Exported %d function(s) out of %d total.\nElapsed %d:%02d:%02d second(s), remaining time ~%d:%02d:%02d"
elapsed = time.monotonic() - t
remaining = (elapsed / i) * (total_funcs - i)
Expand Down Expand Up @@ -1231,6 +1240,7 @@ def do_export(self, crashed_before=False):
if i % (total_funcs / 10) == 0:
self.commit_and_start_transaction()

self.commit_and_start_transaction()
md5sum = GetInputFileMD5()
self.save_callgraph(
str(callgraph_primes), json.dumps(callgraph_all_primes), md5sum
Expand All @@ -1240,7 +1250,9 @@ def do_export(self, crashed_before=False):
self.export_til()
except:
log(f"Error reading type libraries: {str(sys.exc_info()[1])}")
self.save_compilation_units()

if config.EXPORTING_COMPILATION_UNITS:
self.save_compilation_units()

log_refresh("Creating indices...")
self.create_indices()
Expand Down Expand Up @@ -1898,6 +1910,7 @@ def import_instruction(self, ins_data1, ins_data2):

comment = mcmt
cfunc.set_user_cmt(tl, comment)
cfunc.del_orphan_cmts()
cfunc.save_user_cmts()

tmp_ea = None
Expand Down Expand Up @@ -2342,8 +2355,9 @@ def decompile_and_get(self, ea):
#
# max non-trivial tinfo_t count has been reached
#
if os.getenv("DIAPHORA_WORKAROUND_MAX_TINFO_T") is not None:
idaapi.clear_cached_cfuncs()
if config.DIAPHORA_WORKAROUND_MAX_TINFO_T:
if len(self._funcs_cache) % 10000 == 0:
idaapi.clear_cached_cfuncs()

decompiler_plugin = os.getenv("DIAPHORA_DECOMPILER_PLUGIN")
if decompiler_plugin is None:
Expand Down Expand Up @@ -3225,7 +3239,9 @@ def get_modules_using_lfa(self):
return new_modules

def save_compilation_units(self):
log_refresh("Finding compilation units...")
lfa_modules = self.get_modules_using_lfa()
log_refresh("Saving compilation units...")

sql1 = """insert into compilation_units (name, start_ea, end_ea)
values (?, ?, ?)"""
Expand All @@ -3241,7 +3257,12 @@ def save_compilation_units(self):
cur = self.db_cursor()
try:
dones = set()
for module in lfa_modules:
total = len(lfa_modules)
checkpoint = int(total / 10)
for i, module in enumerate(lfa_modules):
if i > 0 and checkpoint > 0 and i % checkpoint == 0:
log_refresh(f"Processing compilation unit {i} out of {total}...")

module_name = None
if module["name"] != "":
module_name = module["name"]
Expand All @@ -3250,12 +3271,15 @@ def save_compilation_units(self):
cur.execute(sql1, vals)
cu_id = cur.lastrowid

for values in self._funcs_cache.values():
func_id = values[0]
if func_id not in dones:
dones.add(func_id)
cur.execute(sql2, (cu_id, func_id))
cur.execute(sql4, (module_name, func_id))
for func in self._funcs_cache:
item = self._funcs_cache[func]
func = int(func)
if func >= module["start"] and func <= module["end"]:
func_id = item[0]
if func_id not in dones:
dones.add(func_id)
cur.execute(sql2, (cu_id, func_id))
cur.execute(sql4, (module_name, func_id))

cur.execute(
sql3,
Expand Down
13 changes: 13 additions & 0 deletions extras/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Diaphora local

This is a pure Python IDA plugin to diff the pseudo-code and assembly of functions inside the current binary, instead of diffing functions across different binaries.

# Installation

Simply copy this script into the directory `$IDA_DIR/plugins`.

# Usage

Place the cursor inside a function in IDA, press Ctrl + Shift + D, and choose the function to diff against the current one. Two choosers (windows) will then open showing the differences at the pseudo-code and assembly levels.


Loading

0 comments on commit 477dd87

Please sign in to comment.