Small required changes to the ML engine

ML: Simplified the support for training local models. HEUR: Added specific `INCREASE_RATIO_XXX` values for when multiple constants match between two functions.
joxeankoret · Aug 29, 2024 · ef44333 · ef44333
1 parent 38414a3
commit ef44333
Show file tree

Hide file tree

Showing 4 changed files with 73 additions and 45 deletions.
diff --git a/diaphora.py b/diaphora.py
@@ -51,7 +51,7 @@
 from difflib import unified_diff
 
 import ml.model
-from ml.model import ML_ENABLED, train, predict, get_model_name, int_compare_ratio
+from ml.model import ML_AVAILABLE, train, predict, get_model_name, int_compare_ratio, is_fitted
 
 from diaphora_heuristics import (
   HEURISTICS,
@@ -392,9 +392,11 @@ def __init__(self, db_name, chooser=CChooser):
     self.slow_heuristics = self.get_value_for(
       "slow_heuristics", config.DIFFING_ENABLE_SLOW_HEURISTICS
     )
-    self.machine_learning = self.get_value_for(
-      "machine_learning", config.ML_TRAIN_LOCAL_MODEL
+    self.train_local_model = self.get_value_for(
+      "train_local_model", config.ML_TRAIN_LOCAL_MODEL
     )
+    if self.train_local_model:
+      log("Machine Learning module available")
     self.exclude_library_thunk = self.get_value_for(
       "exclude_library_thunk", config.EXPORTING_EXCLUDE_LIBRARY_THUNK
     )
@@ -1903,11 +1905,7 @@ def check_ratio(self, main_d, diff_d):
         self.ratios_cache[key] = 1.0
         return 1.0
 
-    v6 = 0.0
-    if ML_ENABLED and self.machine_learning:
-      v6 = self.get_ml_ratio(main_d, diff_d)
-
-    values_set = set([v1, v2, v3, v4, v5, v6])
+    values_set = set([v1, v2, v3, v4, v5])
     r = max(values_set)
     if r == 1.0 and md1 != md2:
       # We cannot assign a 1.0 ratio if both MD indices are different, that's an
@@ -2049,7 +2047,7 @@ def add_matches_internal(
     t = time.monotonic()
     while self.continue_getting_sql_rows(i):
       if time.monotonic() - t > self.timeout or cur_thread.timeout:
-        log_refresh(f"Timeout with heuristic '{cur_thread.name}'")
+        log(f"Timeout with heuristic '{cur_thread.name}'")
         raise SystemExit()
 
       i += 1
@@ -2922,7 +2920,7 @@ def get_ml_ratio(self, main_d, diff_d):
 
       ml_add = False
       ml_ratio = 0
-      if ML_ENABLED and self.machine_learning:
+      if ML_AVAILABLE and self.train_local_model and is_fitted:
         if min(main_row["nodes"], diff_row["nodes"]) > 3:
           ml_ratio = int_compare_ratio(main_row["nodes"], diff_row["nodes"])
           if ml_ratio >= config.ML_MIN_PREDICTION_RATIO:
@@ -3011,14 +3009,22 @@ def deep_ratio(self, main_d, diff_d, ratio):
         score += 0.001
 
       if main_row["constants"] != "[]":
-        if main_row["constants"] == diff_row["constants"]:
-          score += 0.003
-        else:
-          set1 = set(json.loads(main_row["constants"]))
-          set2 = set(json.loads(diff_row["constants"]))
-          set_result = set1.intersection(set2)
-          if len(set_result) > 0:
-            score += len(set_result) * 0.001
+        set1 = set(json.loads(main_row["constants"]))
+        set2 = set(json.loads(diff_row["constants"]))
+        set_result = set1.intersection(set2)
+        if len(set_result) > 0:
+          if self.is_same_processor:
+            tmp = config.INCREASE_RATIO_PER_CONSTANT_MATCH_SAME_CPU
+          else:
+            tmp = config.INCREASE_RATIO_PER_CONSTANT_MATCH
+          score += len(set_result) * tmp
+
+        if score > 0.1:
+          log(f"CONSTANTS: 0x%08x 0x%08x {score} %d constants matched" % (ea1, ea2, len(set_result)))
+
+      if ML_AVAILABLE and self.train_local_model:
+        tmp = self.get_ml_ratio(main_d, diff_d)
+        score += 0.01
     finally:
       cur.close()
 
@@ -3681,8 +3687,8 @@ def find_related_matches(self, iteration):
         if main_row["constants_count"] > 0 and diff_row["constants_count"] > 0:
           self.find_related_constants(main_row, diff_row)
 
-  def train_local_model(self):
-    if ML_ENABLED and self.machine_learning:
+  def do_train_local_model(self):
+    if ML_AVAILABLE and self.train_local_model:
       debug_refresh("[i] Machine learning module enabled.")
       train(self, self.all_matches)
 
@@ -3765,7 +3771,7 @@ def diff(self, db):
           log_refresh("Finding partial matches")
           self.find_partial_matches()
 
-          self.train_local_model()
+          self.do_train_local_model()
 
           if self.unreliable:
             # Find using likely unreliable methods modified functions

diff --git a/diaphora_config.py b/diaphora_config.py
@@ -140,6 +140,13 @@
 # can relax the minimum ratio needed to consider a match good or bad.
 DEFAULT_TRUSTED_PARTIAL_RATIO = 0.3
 
+# Every constant that is matched for a pair of functions adds a small value
+# to the total generated ratio. As a little hack, the increment is slightly
+# larger when the CPU targets differ, which helps to match multiple functions
+# that were missed before (or that generated too low a ratio).
+INCREASE_RATIO_PER_CONSTANT_MATCH_SAME_CPU = 0.006
+INCREASE_RATIO_PER_CONSTANT_MATCH = 0.008
+
 # Regular expressions used to clean-up the pseudo-code and assembly dumps in
 # order to get better comparison ratios.
 CLEANING_CMP_REPS = ["loc_", "j_nullsub_", "nullsub_", "j_sub_", "sub_",
@@ -202,6 +209,7 @@
 # initial matches. This configuration directive is used to enable/disable this
 # experimental feature.
 ML_TRAIN_LOCAL_MODEL = False
+ML_USE_TRAINED_MODEL = True
 
 # What is the minimum ratio required for a match to be considered for usage to
 # train a local model?

diff --git a/diaphora_ida.py b/diaphora_ida.py
@@ -733,7 +733,8 @@ def __init__(self):
   <Use speed ups:{rExperimental}##Use tricks to speed ups some of the most common diffing tasks>
   <#Enable this option to ignore sub_* names for the 'Same name' heuristic.#Ignore automatically generated names:{rIgnoreSubNames}>
   <#Enable this option to ignore all function names for the 'Same name' heuristic.#Ignore all function names:{rIgnoreAllNames}>
-  <#Enable this option to use the Machine Learning engine and generate a dataset with known good and bad results specific to the 2 binaries being compared.#Train a specialized classifier (experimental ML support):{rMachineLearning}>{cGroup1}>
+  <#Enable this option to use the Machine Learning engine and generate a dataset with known good and bad results specific to the 2 binaries being compared.#Train a specialized local classifier (experimental ML support):{rMachineLearning}>
+  <#Enable this option to use the Machine Learning engine with an already trained model.#Use the model $DIAPHORA_DIR/ml/clf.pkl:{rUseTrainedModel}>{cGroup1}>
 
   Project specific rules:
   <#Select the project specific Python script rules#Python script:{iProjectSpecificRules}>
@@ -763,7 +764,8 @@ def __init__(self):
           "rExperimental",
           "rIgnoreSubNames",
           "rIgnoreAllNames",
-          "rMachineLearning"
+          "rMachineLearning",
+          "rUseTrainedModel"
         )
       ),
       "iProjectSpecificRules": Form.FileInput(
@@ -788,7 +790,8 @@ def set_options(self, opts):
     self.rExcludeLibraryThunk.checked = opts.exclude_library_thunk
     self.rUnreliable.checked = opts.unreliable
     self.rSlowHeuristics.checked = opts.slow
-    self.rMachineLearning.checked = opts.machine_learning
+    self.rMachineLearning.checked = opts.train_local_model
+    self.rUseTrainedModel.checked = opts.use_trained_model
     self.rRelaxRatio.checked = opts.relax
     self.rExperimental.checked = opts.experimental
     self.iMinEA.value = opts.min_ea
@@ -811,7 +814,8 @@ def get_options(self):
       exclude_library_thunk=self.rExcludeLibraryThunk.checked,
       unreliable=self.rUnreliable.checked,
       slow=self.rSlowHeuristics.checked,
-      machine_learning=self.rMachineLearning.checked,
+      train_local_model=self.rMachineLearning.checked,
+      use_trained_model=self.rUseTrainedModel.checked,
       relax=self.rRelaxRatio.checked,
       experimental=self.rExperimental.checked,
       min_ea=self.iMinEA.value,
@@ -3658,7 +3662,8 @@ def _diff_or_export(use_ui, **options):
     bd.exclude_library_thunk = opts.exclude_library_thunk
     bd.unreliable = opts.unreliable
     bd.slow_heuristics = opts.slow
-    bd.machine_learning = opts.machine_learning
+    bd.train_local_model = opts.train_local_model
+    bd.use_trained_model = opts.use_trained_model
     bd.relaxed_ratio = opts.relax
     bd.experimental = opts.experimental
     bd.min_ea = opts.min_ea
@@ -3749,8 +3754,11 @@ def __init__(self, **kwargs):
     self.slow = kwargs.get(
       "slow", total_functions <= config.MIN_FUNCTIONS_TO_DISABLE_SLOW
     )
-    self.machine_learning = kwargs.get(
-      "machine_learning", config.ML_TRAIN_LOCAL_MODEL
+    self.train_local_model = kwargs.get(
+      "train_local_model", config.ML_TRAIN_LOCAL_MODEL
+    )
+    self.use_trained_model = kwargs.get(
+      "use_trained_model", config.ML_USE_TRAINED_MODEL
     )
     self.experimental = kwargs.get(
       "experimental", config.DIFFING_ENABLE_EXPERIMENTAL

diff --git a/ml/model.py b/ml/model.py
@@ -1,7 +1,7 @@
 #!/usr/bin/python3
 
 """
-Diaphora, a diffing plugin for IDA
+Diaphora, a binary diffing tool
 Copyright (c) 2015-2024, Joxean Koret
 
 This program is free software: you can redistribute it and/or modify
@@ -18,8 +18,8 @@
 along with this program.  If not, see <https://www.gnu.org/licenses/>.
 """
 
-__all__ = ["ML_ENABLED", "ml_model", "train", "predict", "get_model_name",
-  "int_compare_ratio"]
+__all__ = ["ML_AVAILABLE", "ml_model", "train", "predict", "get_model_name",
+  "int_compare_ratio", "is_fitted"]
 
 import sys
 import json
@@ -36,13 +36,14 @@
 try:
   import numpy as np
 
-  from sklearn.linear_model import RidgeClassifier
-  from sklearn.calibration import CalibratedClassifierCV
+  from sklearn.tree import DecisionTreeClassifier
 
-  ML_ENABLED = True
+  import joblib
+
+  ML_AVAILABLE = True
 except ImportError:
-  print("Both numpy and Scikit Learn are needed to use local models.")
-  ML_ENABLED = False
+  print("Scikit Learn, numpy and joblib python libraries are required to use ML models.")
+  ML_AVAILABLE = False
 
 sys.path.append(".")
 sys.path.append("..")
@@ -115,7 +116,7 @@ def count_callers_callees(db_name : str, func_id : int):
   return callers, callees
 
 #-------------------------------------------------------------------------------
-def compare_rows(row1 : list, row2 : list) -> List[float]:
+def compare_rows(row1 : list, row2 : list, check_calls : bool = True) -> List[float]:
   """
   Compare two function rows and calculate a similarity ratio for it.
   """
@@ -154,19 +155,19 @@ def compare_rows(row1 : list, row2 : list) -> List[float]:
     else:
       scores.append(value1 == value2)
 
-
-  main_callers, main_callees = count_callers_callees("main", row1["id"])
-  diff_callers, diff_callees = count_callers_callees("diff", row2["id"])
-  scores.append(int_compare_ratio(main_callers, diff_callees))
-  scores.append(int_compare_ratio(diff_callers, diff_callers))
+  if check_calls:
+    main_callers, main_callees = count_callers_callees("main", row1["id"])
+    diff_callers, diff_callees = count_callers_callees("diff", row2["id"])
+    scores.append(int_compare_ratio(main_callers, diff_callees))
+    scores.append(int_compare_ratio(diff_callers, diff_callers))
 
   return scores
 
 #-------------------------------------------------------------------------------
 class CClassifier:
   def __init__(self, diaphora_obj : object):
     self.diaphora = diaphora_obj
-    self.clf = RidgeClassifier()
+    self.clf = DecisionTreeClassifier()
     self.matches = []
     self.fitted = False
 
@@ -271,8 +272,8 @@ def train(self, matches : list):
   def predict(self, row : dict) -> float:
     ret = 0.0
     if self.fitted:
-      d = self.clf.decision_function(row)[0]
-      ret = np.exp(d) / (1 + np.exp(d))
+      d = self.clf.predict(row)
+      ret = d[0]
     return ret
 
 #-------------------------------------------------------------------------------
@@ -281,6 +282,11 @@ def train(diaphora_obj : object, matches : list):
   ml_model = CClassifier(diaphora_obj)
   ml_model.train(matches)
 
+#-------------------------------------------------------------------------------
+def is_fitted() -> bool:
+  global ml_model
+  return ml_model.fitted
+
 #-------------------------------------------------------------------------------
 def predict(main_row : dict, diff_row : dict) -> float:
   global ml_model