Skip to content

Commit

Permalink
Small ML engine required changes
Browse files Browse the repository at this point in the history
ML: Simplified the support for training local models.
HEUR: Added specific `INCREASE_RATIO_XXX` values for when multiple constants match between two functions.
  • Loading branch information
joxeankoret committed Aug 29, 2024
1 parent 38414a3 commit ef44333
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 45 deletions.
48 changes: 27 additions & 21 deletions diaphora.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
from difflib import unified_diff

import ml.model
from ml.model import ML_ENABLED, train, predict, get_model_name, int_compare_ratio
from ml.model import ML_AVAILABLE, train, predict, get_model_name, int_compare_ratio, is_fitted

from diaphora_heuristics import (
HEURISTICS,
Expand Down Expand Up @@ -392,9 +392,11 @@ def __init__(self, db_name, chooser=CChooser):
self.slow_heuristics = self.get_value_for(
"slow_heuristics", config.DIFFING_ENABLE_SLOW_HEURISTICS
)
self.machine_learning = self.get_value_for(
"machine_learning", config.ML_TRAIN_LOCAL_MODEL
self.train_local_model = self.get_value_for(
"train_local_model", config.ML_TRAIN_LOCAL_MODEL
)
if self.train_local_model:
log("Machine Learning module available")
self.exclude_library_thunk = self.get_value_for(
"exclude_library_thunk", config.EXPORTING_EXCLUDE_LIBRARY_THUNK
)
Expand Down Expand Up @@ -1903,11 +1905,7 @@ def check_ratio(self, main_d, diff_d):
self.ratios_cache[key] = 1.0
return 1.0

v6 = 0.0
if ML_ENABLED and self.machine_learning:
v6 = self.get_ml_ratio(main_d, diff_d)

values_set = set([v1, v2, v3, v4, v5, v6])
values_set = set([v1, v2, v3, v4, v5])
r = max(values_set)
if r == 1.0 and md1 != md2:
# We cannot assign a 1.0 ratio if both MD indices are different, that's an
Expand Down Expand Up @@ -2049,7 +2047,7 @@ def add_matches_internal(
t = time.monotonic()
while self.continue_getting_sql_rows(i):
if time.monotonic() - t > self.timeout or cur_thread.timeout:
log_refresh(f"Timeout with heuristic '{cur_thread.name}'")
log(f"Timeout with heuristic '{cur_thread.name}'")
raise SystemExit()

i += 1
Expand Down Expand Up @@ -2922,7 +2920,7 @@ def get_ml_ratio(self, main_d, diff_d):

ml_add = False
ml_ratio = 0
if ML_ENABLED and self.machine_learning:
if ML_AVAILABLE and self.train_local_model and is_fitted:
if min(main_row["nodes"], diff_row["nodes"]) > 3:
ml_ratio = int_compare_ratio(main_row["nodes"], diff_row["nodes"])
if ml_ratio >= config.ML_MIN_PREDICTION_RATIO:
Expand Down Expand Up @@ -3011,14 +3009,22 @@ def deep_ratio(self, main_d, diff_d, ratio):
score += 0.001

if main_row["constants"] != "[]":
if main_row["constants"] == diff_row["constants"]:
score += 0.003
else:
set1 = set(json.loads(main_row["constants"]))
set2 = set(json.loads(diff_row["constants"]))
set_result = set1.intersection(set2)
if len(set_result) > 0:
score += len(set_result) * 0.001
set1 = set(json.loads(main_row["constants"]))
set2 = set(json.loads(diff_row["constants"]))
set_result = set1.intersection(set2)
if len(set_result) > 0:
if self.is_same_processor:
tmp = config.INCREASE_RATIO_PER_CONSTANT_MATCH_SAME_CPU
else:
tmp = config.INCREASE_RATIO_PER_CONSTANT_MATCH
score += len(set_result) * tmp

if score > 0.1:
log(f"CONSTANTS: 0x%08x 0x%08x {score} %d constants matched" % (ea1, ea2, len(set_result)))

if ML_AVAILABLE and self.train_local_model:
tmp = self.get_ml_ratio(main_d, diff_d)
score += 0.01
finally:
cur.close()

Expand Down Expand Up @@ -3681,8 +3687,8 @@ def find_related_matches(self, iteration):
if main_row["constants_count"] > 0 and diff_row["constants_count"] > 0:
self.find_related_constants(main_row, diff_row)

def train_local_model(self):
if ML_ENABLED and self.machine_learning:
def do_train_local_model(self):
if ML_AVAILABLE and self.train_local_model:
debug_refresh("[i] Machine learning module enabled.")
train(self, self.all_matches)

Expand Down Expand Up @@ -3765,7 +3771,7 @@ def diff(self, db):
log_refresh("Finding partial matches")
self.find_partial_matches()

self.train_local_model()
self.do_train_local_model()

if self.unreliable:
# Find using likely unreliable methods modified functions
Expand Down
8 changes: 8 additions & 0 deletions diaphora_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,13 @@
# can relax the minimum ratio needed to consider a match good or bad.
DEFAULT_TRUSTED_PARTIAL_RATIO = 0.3

# Every constant that is matched between a pair of functions adds a small value
# to the total generated ratio. As a small hack, the increment is slightly
# larger when the CPU targets differ, which helps to match functions that were
# previously missed (or that were given too low a ratio).
INCREASE_RATIO_PER_CONSTANT_MATCH_SAME_CPU = 0.006
INCREASE_RATIO_PER_CONSTANT_MATCH = 0.008

# Regular expressions used to clean-up the pseudo-code and assembly dumps in
# order to get better comparison ratios.
CLEANING_CMP_REPS = ["loc_", "j_nullsub_", "nullsub_", "j_sub_", "sub_",
Expand Down Expand Up @@ -202,6 +209,7 @@
# initial matches. This configuration directive is used to enable/disable this
# experimental feature.
ML_TRAIN_LOCAL_MODEL = False
ML_USE_TRAINED_MODEL = True

# What is the minimum ratio required for a match to be considered for usage to
# train a local model?
Expand Down
22 changes: 15 additions & 7 deletions diaphora_ida.py
Original file line number Diff line number Diff line change
Expand Up @@ -733,7 +733,8 @@ def __init__(self):
<Use speed ups:{rExperimental}##Use tricks to speed ups some of the most common diffing tasks>
<#Enable this option to ignore sub_* names for the 'Same name' heuristic.#Ignore automatically generated names:{rIgnoreSubNames}>
<#Enable this option to ignore all function names for the 'Same name' heuristic.#Ignore all function names:{rIgnoreAllNames}>
<#Enable this option to use the Machine Learning engine and generate a dataset with known good and bad results specific to the 2 binaries being compared.#Train a specialized classifier (experimental ML support):{rMachineLearning}>{cGroup1}>
<#Enable this option to use the Machine Learning engine and generate a dataset with known good and bad results specific to the 2 binaries being compared.#Train a specialized local classifier (experimental ML support):{rMachineLearning}>
<#Enable this option to use the Machine Learning engine with an already trained model.#Use the model $DIAPHORA_DIR/ml/clf.pkl:{rUseTrainedModel}>{cGroup1}>
Project specific rules:
<#Select the project specific Python script rules#Python script:{iProjectSpecificRules}>
Expand Down Expand Up @@ -763,7 +764,8 @@ def __init__(self):
"rExperimental",
"rIgnoreSubNames",
"rIgnoreAllNames",
"rMachineLearning"
"rMachineLearning",
"rUseTrainedModel"
)
),
"iProjectSpecificRules": Form.FileInput(
Expand All @@ -788,7 +790,8 @@ def set_options(self, opts):
self.rExcludeLibraryThunk.checked = opts.exclude_library_thunk
self.rUnreliable.checked = opts.unreliable
self.rSlowHeuristics.checked = opts.slow
self.rMachineLearning.checked = opts.machine_learning
self.rMachineLearning.checked = opts.train_local_model
self.rUseTrainedModel.checked = opts.use_trained_model
self.rRelaxRatio.checked = opts.relax
self.rExperimental.checked = opts.experimental
self.iMinEA.value = opts.min_ea
Expand All @@ -811,7 +814,8 @@ def get_options(self):
exclude_library_thunk=self.rExcludeLibraryThunk.checked,
unreliable=self.rUnreliable.checked,
slow=self.rSlowHeuristics.checked,
machine_learning=self.rMachineLearning.checked,
train_local_model=self.rMachineLearning.checked,
use_trained_model=self.rUseTrainedModel.checked,
relax=self.rRelaxRatio.checked,
experimental=self.rExperimental.checked,
min_ea=self.iMinEA.value,
Expand Down Expand Up @@ -3658,7 +3662,8 @@ def _diff_or_export(use_ui, **options):
bd.exclude_library_thunk = opts.exclude_library_thunk
bd.unreliable = opts.unreliable
bd.slow_heuristics = opts.slow
bd.machine_learning = opts.machine_learning
bd.train_local_model = opts.train_local_model
bd.use_trained_model = opts.use_trained_model
bd.relaxed_ratio = opts.relax
bd.experimental = opts.experimental
bd.min_ea = opts.min_ea
Expand Down Expand Up @@ -3749,8 +3754,11 @@ def __init__(self, **kwargs):
self.slow = kwargs.get(
"slow", total_functions <= config.MIN_FUNCTIONS_TO_DISABLE_SLOW
)
self.machine_learning = kwargs.get(
"machine_learning", config.ML_TRAIN_LOCAL_MODEL
self.train_local_model = kwargs.get(
"train_local_model", config.ML_TRAIN_LOCAL_MODEL
)
self.use_trained_model = kwargs.get(
"use_trained_model", config.ML_USE_TRAINED_MODEL
)
self.experimental = kwargs.get(
"experimental", config.DIFFING_ENABLE_EXPERIMENTAL
Expand Down
40 changes: 23 additions & 17 deletions ml/model.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python3

"""
Diaphora, a diffing plugin for IDA
Diaphora, a binary diffing tool
Copyright (c) 2015-2024, Joxean Koret
This program is free software: you can redistribute it and/or modify
Expand All @@ -18,8 +18,8 @@
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""

__all__ = ["ML_ENABLED", "ml_model", "train", "predict", "get_model_name",
"int_compare_ratio"]
__all__ = ["ML_AVAILABLE", "ml_model", "train", "predict", "get_model_name",
"int_compare_ratio", "is_fitted"]

import sys
import json
Expand All @@ -36,13 +36,14 @@
try:
import numpy as np

from sklearn.linear_model import RidgeClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.tree import DecisionTreeClassifier

ML_ENABLED = True
import joblib

ML_AVAILABLE = True
except ImportError:
print("Both numpy and Scikit Learn are needed to use local models.")
ML_ENABLED = False
print("Scikit Learn, numpy and joblib python libraries are required to use ML models.")
ML_AVAILABLE = False

sys.path.append(".")
sys.path.append("..")
Expand Down Expand Up @@ -115,7 +116,7 @@ def count_callers_callees(db_name : str, func_id : int):
return callers, callees

#-------------------------------------------------------------------------------
def compare_rows(row1 : list, row2 : list) -> List[float]:
def compare_rows(row1 : list, row2 : list, check_calls : bool = True) -> List[float]:
"""
Compare two function rows and calculate a similarity ratio for it.
"""
Expand Down Expand Up @@ -154,19 +155,19 @@ def compare_rows(row1 : list, row2 : list) -> List[float]:
else:
scores.append(value1 == value2)


main_callers, main_callees = count_callers_callees("main", row1["id"])
diff_callers, diff_callees = count_callers_callees("diff", row2["id"])
scores.append(int_compare_ratio(main_callers, diff_callees))
scores.append(int_compare_ratio(diff_callers, diff_callers))
if check_calls:
main_callers, main_callees = count_callers_callees("main", row1["id"])
diff_callers, diff_callees = count_callers_callees("diff", row2["id"])
scores.append(int_compare_ratio(main_callers, diff_callees))
scores.append(int_compare_ratio(diff_callers, diff_callers))

return scores

#-------------------------------------------------------------------------------
class CClassifier:
def __init__(self, diaphora_obj : object):
self.diaphora = diaphora_obj
self.clf = RidgeClassifier()
self.clf = DecisionTreeClassifier()
self.matches = []
self.fitted = False

Expand Down Expand Up @@ -271,8 +272,8 @@ def train(self, matches : list):
def predict(self, row : dict) -> float:
ret = 0.0
if self.fitted:
d = self.clf.decision_function(row)[0]
ret = np.exp(d) / (1 + np.exp(d))
d = self.clf.predict(row)
ret = d[0]
return ret

#-------------------------------------------------------------------------------
Expand All @@ -281,6 +282,11 @@ def train(diaphora_obj : object, matches : list):
ml_model = CClassifier(diaphora_obj)
ml_model.train(matches)

#-------------------------------------------------------------------------------
def is_fitted() -> bool:
  """
  Tell whether the module-level classifier has been trained.

  The global `ml_model` is only assigned a classifier by `train()`. This can
  be called before any training has happened, so a missing or empty model is
  reported as "not fitted" instead of raising an exception.

  Returns True only when a model object exists and its `fitted` flag is set.
  """
  # Look the name up dynamically so an unset global cannot raise NameError.
  model = globals().get("ml_model")
  # getattr with a default also covers `ml_model = None` (nothing trained yet).
  return bool(getattr(model, "fitted", False))

#-------------------------------------------------------------------------------
def predict(main_row : dict, diff_row : dict) -> float:
global ml_model
Expand Down

0 comments on commit ef44333

Please sign in to comment.