Support for a specialized trained ML model in Diaphora
ML: Dropped support for training local models, as they were not working properly at all.
BUG: HEUR: Added the field 'bytes_hash' to the '100% equal' heuristic, as it was ignoring some minimal changes (issue #313).
BUG: HEUR: Always check if there are differences, even for structurally 100% equal databases (issue #313).
joxeankoret committed Sep 17, 2024
1 parent 15b281e commit 7ff7058
Showing 6 changed files with 246 additions and 401 deletions.
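
In the new design there is no on-the-fly training: diffing loads a single pre-trained classifier from disk with joblib and asks it to label candidate function pairs. A minimal sketch of that flow, assuming a scikit-learn-style classifier; pair_features() is a hypothetical stand-in for the real feature builder, ml.basic_engine.get_model_comparison_data():

    import joblib

    # Load the pre-trained model once; the real path comes from
    # config.ML_TRAINED_MODEL in diaphora_config.py.
    clf = joblib.load("ml/diaphora-amalgamation-model.pkl")

    def pair_features(row):
      # Hypothetical stand-in: collapse the paired columns (nodes1/nodes2,
      # edges1/edges2, ...) into one numeric vector of min/max ratios.
      keys = ("nodes", "edges", "indegree", "outdegree", "cc", "loops")
      return [[min(row[k + "1"], row[k + "2"]) / max(row[k + "1"], row[k + "2"], 1)
               for k in keys]]

    row = {"nodes1": 12, "nodes2": 12, "edges1": 17, "edges2": 16,
           "indegree1": 3, "indegree2": 3, "outdegree1": 5, "outdegree2": 5,
           "cc1": 7, "cc2": 6, "loops1": 1, "loops2": 1}
    if clf.predict(pair_features(row))[0] == 1:
      print("the model considers this pair a match")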
diaphora.py (83 additions & 63 deletions)
@@ -50,8 +50,8 @@

 from difflib import unified_diff

-import ml.model
-from ml.model import ML_AVAILABLE, train, predict, get_model_name, int_compare_ratio, is_fitted
+import ml
+from ml.basic_engine import get_model_comparison_data, ML_AVAILABLE

 from diaphora_heuristics import (
   HEURISTICS,
@@ -87,7 +87,7 @@
 except ImportError:
   IS_IDA = False

-importlib.reload(ml.model)
+importlib.reload(ml.basic_engine)
 importlib.reload(config)
 importlib.reload(schema)
 importlib.reload(jk_threads)
@@ -392,11 +392,9 @@ def __init__(self, db_name, chooser=CChooser):
     self.slow_heuristics = self.get_value_for(
       "slow_heuristics", config.DIFFING_ENABLE_SLOW_HEURISTICS
     )
-    self.train_local_model = self.get_value_for(
-      "train_local_model", config.ML_TRAIN_LOCAL_MODEL
+    self.use_trained_model = self.get_value_for(
+      "use_trained_model", config.ML_USE_TRAINED_MODEL
     )
-    if self.train_local_model:
-      log("Machine Learning module available")
     self.exclude_library_thunk = self.get_value_for(
       "exclude_library_thunk", config.EXPORTING_EXCLUDE_LIBRARY_THUNK
     )
@@ -427,6 +425,8 @@ def __init__(self, db_name, chooser=CChooser):
     # How much do call graphs from both binaries differ?
     self.percent = 0

+    self.classifier = None
+
     ####################################################################
     # LIMITS
     #
@@ -1572,8 +1572,8 @@ def find_equal_matches(self):
     self.total_functions1 = rows[0]["total"]
     self.total_functions2 = rows[1]["total"]

-    fields = "id, address, mangled_function, nodes, edges, size"
-    sql = f"""select address ea, mangled_function, nodes
+    fields = "id, address, mangled_function, nodes, edges, size, bytes_hash"
+    sql = f"""select address ea, mangled_function, nodes, bytes_hash
                 from (select {fields}
                         from functions
                       intersect
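
The bytes_hash addition closes the gap described in issue #313: two functions that are structurally identical (same nodes, edges, size and so on) used to intersect as "100% equal" even when their raw bytes differed slightly. A toy illustration of the idea, assuming bytes_hash is a digest over the function's raw bytes (md5 here is only illustrative):

    import hashlib

    # Identical x86-64 stubs except for one immediate operand:
    f1 = bytes.fromhex("554889e5b8010000005dc3")  # mov eax, 1
    f2 = bytes.fromhex("554889e5b8020000005dc3")  # mov eax, 2

    # Same size and identical flow graphs, but the digests differ, so the
    # intersect over bytes_hash no longer pairs them as 100% equal.
    assert hashlib.md5(f1).hexdigest() != hashlib.md5(f2).hexdigest()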
@@ -1906,6 +1906,7 @@ def check_ratio(self, main_d, diff_d):
       return 1.0

     values_set = set([v1, v2, v3, v4, v5])
+
     r = max(values_set)
     if r == 1.0 and md1 != md2:
       # We cannot assign a 1.0 ratio if both MD indices are different, that's an
@@ -2734,7 +2735,7 @@ def search_just_stripped_binaries(self):
       )
       log_refresh(f"Finding via {repr(heur)}")

-      self.add_matches_from_query(sql, "best")
+      self.add_matches_from_query_ratio(sql, "best", "partial")
       ret = True
     finally:
       cur.close()
@@ -2903,48 +2904,6 @@ def add_multimatches_to_chooser(self, multi, ignore_list, dones):

     return ignore_list, dones

-  def get_ml_ratio(self, main_d, diff_d):
-    ea1 = int(main_d["ea"])
-    ea2 = int(diff_d["ea"])
-
-    ml_ratio = 0.0
-
-    cur = self.db_cursor()
-    sql = "select * from {db}.functions where address = ?"
-    try:
-      cur.execute(sql.format(db="main"), (str(ea1),))
-      main_row = cur.fetchone()
-
-      cur.execute(sql.format(db="diff"), (str(ea2),))
-      diff_row = cur.fetchone()
-
-      ml_add = False
-      ml_ratio = 0
-      if ML_AVAILABLE and self.train_local_model and is_fitted:
-        if min(main_row["nodes"], diff_row["nodes"]) > 3:
-          ml_ratio = int_compare_ratio(main_row["nodes"], diff_row["nodes"])
-          if ml_ratio >= config.ML_MIN_PREDICTION_RATIO:
-            ml_ratio = predict(main_row, diff_row)
-            if ml_ratio >= config.ML_MIN_PREDICTION_RATIO:
-              log(f"ML ratio {ml_ratio} for {main_d['name']} - {diff_d['name']}")
-              ml_add = True
-            else:
-              ml_ratio = 0.0
-
-      if ml_add:
-        vfname1 = main_d["name"]
-        vfname2 = diff_d["name"]
-        nodes1 = main_d["nodes"]
-        nodes2 = diff_d["nodes"]
-        desc = f"ML {get_model_name()}"
-
-        tmp_item = CChooser.Item(ea1, vfname1, ea2, vfname2, desc, ml_ratio, nodes1, nodes2)
-        self.ml_chooser.add_item(tmp_item)
-    finally:
-      cur.close()
-
-    return ml_ratio
-
   def deep_ratio(self, main_d, diff_d, ratio):
     """
     Try to get a score to add to the value returned by `check_ratio()` so less
@@ -3018,13 +2977,18 @@ def deep_ratio(self, main_d, diff_d, ratio):
         else:
           tmp = config.INCREASE_RATIO_PER_CONSTANT_MATCH
         score += len(set_result) * tmp

-      if score > 0.1:
-        log(f"CONSTANTS: 0x%08x 0x%08x {score} %d constants matched" % (ea1, ea2, len(set_result)))

-      if ML_AVAILABLE and self.train_local_model:
-        tmp = self.get_ml_ratio(main_d, diff_d)
-        score += 0.01
+      if self.classifier is not None:
+        if self.get_model_ratio(main_d, diff_d) == 1:
+          score += config.ML_TRAINED_MODEL_MATCH_SCORE
+          vfname1 = main_d["name"]
+          vfname2 = diff_d["name"]
+          nodes1 = main_d["nodes"]
+          nodes2 = diff_d["nodes"]
+          desc = f"ML {self.classifier}"

+          tmp_item = CChooser.Item(ea1, vfname1, ea2, vfname2, desc, ratio + score, nodes1, nodes2)
+          self.ml_chooser.add_item(tmp_item)
     finally:
       cur.close()

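The net effect of the new block: when the trained classifier agrees with a candidate pair, the pair gains a fixed boost and is also recorded in the ML chooser. A worked example of the arithmetic, ignoring any constants-based score accumulated above and taking ML_TRAINED_MODEL_MATCH_SCORE = 0.15 from the config change further below:

    ratio, score = 0.78, 0.0  # structural ratio from check_ratio()
    score += 0.15             # get_model_ratio() returned 1 for this pair
    final = ratio + score     # 0.93, the ratio stored in the ML chooser item
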
@@ -3687,10 +3651,66 @@ def find_related_matches(self, iteration):
       if main_row["constants_count"] > 0 and diff_row["constants_count"] > 0:
         self.find_related_constants(main_row, diff_row)

-  def do_train_local_model(self):
-    if ML_AVAILABLE and self.train_local_model:
-      debug_refresh("[i] Machine learning module enabled.")
-      train(self, self.all_matches)
+  def get_model_ratio(self, main_d, diff_d):
+    SELECT_FIELDS = """f.name name1,
+              f.nodes nodes1,
+              f.edges edges1,
+              f.indegree indegree1,
+              f.outdegree outdegree1,
+              f.cyclomatic_complexity cc1,
+              f.primes_value primes_value1,
+              f.clean_pseudo clean_pseudo1,
+              f.pseudocode_primes pseudocode_primes1,
+              f.strongly_connected strongly_connected1,
+              f.strongly_connected_spp strongly_connected_spp1,
+              f.loops loops1,
+              f.constants constants1,
+              f.source_file source_file1,
+              df.name name2,
+              df.nodes nodes2,
+              df.edges edges2,
+              df.indegree indegree2,
+              df.outdegree outdegree2,
+              df.cyclomatic_complexity cc2,
+              df.primes_value primes_value2,
+              df.clean_pseudo clean_pseudo2,
+              df.pseudocode_primes pseudocode_primes2,
+              df.strongly_connected strongly_connected2,
+              df.strongly_connected_spp strongly_connected_spp2,
+              df.loops loops2,
+              df.constants constants2,
+              df.source_file source_file2,
+              f.id id1,
+              df.id id2,
+              f.address ea1,
+              df.address ea2 """
+
+    sql = f"""select {SELECT_FIELDS}
+                from main.functions f,
+                     diff.functions df
+               where f.address = ?
+                 and df.address = ? """
+    cur = self.db_cursor()
+
+    ret = 0
+    try:
+      cur.execute(sql, (main_d["ea"], diff_d["ea"]))
+      row = cur.fetchone()
+      d = dict(row)
+      cmp_data = get_model_comparison_data(dict(row), self.is_same_processor)
+      ret = self.classifier.predict(cmp_data)[0]
+      if ret == 1:
+        debug_refresh(f"ML model predicted {ret} for {main_d['name']} {diff_d['name']}")
+    finally:
+      cur.close()
+
+    return ret
+
+  def apply_machine_learning(self):
+    if ML_AVAILABLE and self.use_trained_model:
+      import joblib
+      self.classifier = joblib.load(config.ML_TRAINED_MODEL)
+      log(f"Using ML classifier {self.classifier}")

   def get_callers_callees(self, db_name, func_id):
     cur = self.db_cursor()
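
get_model_ratio() builds one row pairing the columns of a function from each side, then hands a dict of it to the feature builder. The attach-two-databases query pattern it relies on can be reproduced standalone; a sketch with a trimmed toy schema and made-up file names and addresses:

    import sqlite3

    conn = sqlite3.connect("main.sqlite")  # a Diaphora-exported database
    conn.row_factory = sqlite3.Row         # so the row converts to a dict
    conn.execute("attach database 'diff.sqlite' as diff")

    # One row, columns suffixed 1/2 per side, the shape the feature
    # builder consumes.
    sql = """select f.nodes nodes1, f.edges edges1,
                    df.nodes nodes2, df.edges edges2
               from main.functions f, diff.functions df
              where f.address = ? and df.address = ?"""
    row = conn.execute(sql, ("4199136", "4200448")).fetchone()
    features = dict(row)  # e.g. {'nodes1': 12, 'edges1': 17, 'nodes2': 12, ...}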
@@ -3771,7 +3791,7 @@ def diff(self, db):
       log_refresh("Finding partial matches")
       self.find_partial_matches()

-      self.do_train_local_model()
+      self.apply_machine_learning()

       if self.unreliable:
         # Find using likely unreliable methods modified functions
diaphora_config.py (5 additions & 18 deletions)

@@ -201,25 +201,12 @@


 #-------------------------------------------------------------------------------
-# Diaphora can try to train using Ridge regression a classifier specific for the
-# current set of binaries using matches labelled as "Best" or "Partial" in order
-# to try to learn what is a good match specifically for the two binaries being
-# compared. This approach seems to work when there are a lot of initial matches,
-# and seems to cause a lot of false positives when there aren't enough good
-# initial matches. This configuration directive is used to enable/disable this
-# experimental feature.
-ML_TRAIN_LOCAL_MODEL = False
+# Diaphora can use a local model, enable this configuration directive to use it.
+ML_USE_TRAINED_MODEL = True

-# What is the minimum ratio required for a match to be considered for usage to
-# train a local model?
-ML_MATCHES_MIN_RATIO = 0.7
-ML_MIN_PREDICTION_RATIO = 0.75

-# What value should be added to the final similarity ratio when the specialized
-# classifier (trained with known good and bad results found for the current two
-# binaries being compared) finds what it thinks is a good match.
-ML_DEEP_RATIO_ADDED_SCORE = 0.1
+# Model trained with a decision tree classifier: fast and accurate enough.
+ML_TRAINED_MODEL = os.path.join(CONFIGURATION_DIRECTORY, "ml/diaphora-amalgamation-model.pkl")
+# The value added to the similarity ratio for a positive match using the model.
+ML_TRAINED_MODEL_MATCH_SCORE = 0.15

 # Show a chooser with all the matches that the classifier think are good ones?
 ML_DEBUG_SHOW_MATCHES = True
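
ML_TRAINED_MODEL points at a pickle of, per the comment above, a decision tree classifier. Producing a compatible file is, in outline, standard scikit-learn plus joblib; a sketch assuming training vectors shaped like get_model_comparison_data() output (the actual training pipeline is not part of this commit):

    import joblib
    from sklearn.tree import DecisionTreeClassifier

    # Toy training set: one comparison vector per function pair, labelled
    # 1 for a known good match and 0 for a known bad one.
    X = [[1.00, 1.00, 0.95, 1.00],
         [0.92, 0.88, 0.90, 1.00],
         [0.20, 0.35, 0.10, 0.00],
         [0.15, 0.10, 0.05, 1.00]]
    y = [1, 1, 0, 0]

    clf = DecisionTreeClassifier(random_state=0).fit(X, y)
    joblib.dump(clf, "diaphora-amalgamation-model.pkl")  # file name per the config above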
diaphora_ida.py (1 addition & 9 deletions)

@@ -733,8 +733,7 @@ def __init__(self):
 <Use speed ups:{rExperimental}##Use tricks to speed ups some of the most common diffing tasks>
 <#Enable this option to ignore sub_* names for the 'Same name' heuristic.#Ignore automatically generated names:{rIgnoreSubNames}>
 <#Enable this option to ignore all function names for the 'Same name' heuristic.#Ignore all function names:{rIgnoreAllNames}>
-<#Enable this option to use the Machine Learning engine and generate a dataset with known good and bad results specific to the 2 binaries being compared.#Train a specialized local classifier (experimental ML support):{rMachineLearning}>
-<#Enable this option to use the Machine Learning engine with an already trained model.#Use the model $DIAPHORA_DIR/ml/clf.pkl:{rUseTrainedModel}>{cGroup1}>
+<#Enable this option to use the Machine Learning engine with an already trained model specified in diaphora_config.py!ML_TRAINED_MODEL.#Use an already trained model:{rUseTrainedModel}>{cGroup1}>
 Project specific rules:
 <#Select the project specific Python script rules#Python script:{iProjectSpecificRules}>
@@ -764,7 +763,6 @@ def __init__(self):
         "rExperimental",
         "rIgnoreSubNames",
         "rIgnoreAllNames",
-        "rMachineLearning",
         "rUseTrainedModel"
       )
     ),
@@ -790,7 +788,6 @@ def set_options(self, opts):
     self.rExcludeLibraryThunk.checked = opts.exclude_library_thunk
     self.rUnreliable.checked = opts.unreliable
     self.rSlowHeuristics.checked = opts.slow
-    self.rMachineLearning.checked = opts.train_local_model
     self.rUseTrainedModel.checked = opts.use_trained_model
     self.rRelaxRatio.checked = opts.relax
     self.rExperimental.checked = opts.experimental
@@ -814,7 +811,6 @@ def get_options(self):
       exclude_library_thunk=self.rExcludeLibraryThunk.checked,
       unreliable=self.rUnreliable.checked,
       slow=self.rSlowHeuristics.checked,
-      train_local_model=self.rMachineLearning.checked,
       use_trained_model=self.rUseTrainedModel.checked,
       relax=self.rRelaxRatio.checked,
       experimental=self.rExperimental.checked,
@@ -3662,7 +3658,6 @@ def _diff_or_export(use_ui, **options):
   bd.exclude_library_thunk = opts.exclude_library_thunk
   bd.unreliable = opts.unreliable
   bd.slow_heuristics = opts.slow
-  bd.train_local_model = opts.train_local_model
   bd.use_trained_model = opts.use_trained_model
   bd.relaxed_ratio = opts.relax
   bd.experimental = opts.experimental
@@ -3754,9 +3749,6 @@ def __init__(self, **kwargs):
     self.slow = kwargs.get(
       "slow", total_functions <= config.MIN_FUNCTIONS_TO_DISABLE_SLOW
     )
-    self.train_local_model = kwargs.get(
-      "train_local_model", config.ML_TRAIN_LOCAL_MODEL
-    )
     self.use_trained_model = kwargs.get(
       "use_trained_model", config.ML_USE_TRAINED_MODEL
     )
[Diffs for the remaining three changed files are not shown.]