Support for a specialized trained ML model in Diaphora
ML: Dropped support for training local models, as they were not working properly at all.
BUG: HEUR: Added the field 'bytes_hash' to the '100% equal' heuristic, as it was ignoring some minimal changes (issue #313).
BUG: HEUR: Always check if there are differences, even for structurally 100% equal databases (issue #313).
joxeankoret committed Sep 17, 2024
1 parent 15b281e commit 7ff7058
Showing 6 changed files with 246 additions and 401 deletions.
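
In the new design there is no on-the-fly training: diffing loads a single pre-trained classifier from disk with joblib and asks it to label candidate function pairs. A minimal sketch of that flow, assuming a scikit-learn-style classifier; pair_features() is a hypothetical stand-in for the real feature builder, ml.basic_engine.get_model_comparison_data():

    import joblib

    # Load the pre-trained model once; the real path comes from
    # config.ML_TRAINED_MODEL in diaphora_config.py.
    clf = joblib.load("ml/diaphora-amalgamation-model.pkl")

    def pair_features(row):
      # Hypothetical stand-in: collapse the paired columns (nodes1/nodes2,
      # edges1/edges2, ...) into one numeric vector of min/max ratios.
      keys = ("nodes", "edges", "indegree", "outdegree", "cc", "loops")
      return [[min(row[k + "1"], row[k + "2"]) / max(row[k + "1"], row[k + "2"], 1)
               for k in keys]]

    row = {"nodes1": 12, "nodes2": 12, "edges1": 17, "edges2": 16,
           "indegree1": 3, "indegree2": 3, "outdegree1": 5, "outdegree2": 5,
           "cc1": 7, "cc2": 6, "loops1": 1, "loops2": 1}
    if clf.predict(pair_features(row))[0] == 1:
      print("the model considers this pair a match")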
diaphora.py (83 additions & 63 deletions)
@@ -50,8 +50,8 @@

 from difflib import unified_diff

-import ml.model
-from ml.model import ML_AVAILABLE, train, predict, get_model_name, int_compare_ratio, is_fitted
+import ml
+from ml.basic_engine import get_model_comparison_data, ML_AVAILABLE

 from diaphora_heuristics import (
   HEURISTICS,
@@ -87,7 +87,7 @@
 except ImportError:
   IS_IDA = False

-importlib.reload(ml.model)
+importlib.reload(ml.basic_engine)
 importlib.reload(config)
 importlib.reload(schema)
 importlib.reload(jk_threads)
@@ -392,11 +392,9 @@ def __init__(self, db_name, chooser=CChooser):
     self.slow_heuristics = self.get_value_for(
       "slow_heuristics", config.DIFFING_ENABLE_SLOW_HEURISTICS
     )
-    self.train_local_model = self.get_value_for(
-      "train_local_model", config.ML_TRAIN_LOCAL_MODEL
+    self.use_trained_model = self.get_value_for(
+      "use_trained_model", config.ML_USE_TRAINED_MODEL
     )
-    if self.train_local_model:
-      log("Machine Learning module available")
     self.exclude_library_thunk = self.get_value_for(
       "exclude_library_thunk", config.EXPORTING_EXCLUDE_LIBRARY_THUNK
     )
@@ -427,6 +425,8 @@ def __init__(self, db_name, chooser=CChooser):
     # How much do call graphs from both binaries differ?
     self.percent = 0

+    self.classifier = None
+
     ####################################################################
     # LIMITS
     #
@@ -1572,8 +1572,8 @@ def find_equal_matches(self):
     self.total_functions1 = rows[0]["total"]
     self.total_functions2 = rows[1]["total"]

-    fields = "id, address, mangled_function, nodes, edges, size"
-    sql = f"""select address ea, mangled_function, nodes
+    fields = "id, address, mangled_function, nodes, edges, size, bytes_hash"
+    sql = f"""select address ea, mangled_function, nodes, bytes_hash
                 from (select {fields}
                         from functions
                       intersect
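
The bytes_hash addition closes the gap described in issue #313: two functions that are structurally identical (same nodes, edges, size and so on) used to intersect as "100% equal" even when their raw bytes differed slightly. A toy illustration of the idea, assuming bytes_hash is a digest over the function's raw bytes (md5 here is only illustrative):

    import hashlib

    # Identical x86-64 stubs except for one immediate operand:
    f1 = bytes.fromhex("554889e5b8010000005dc3")  # mov eax, 1
    f2 = bytes.fromhex("554889e5b8020000005dc3")  # mov eax, 2

    # Same size and identical flow graphs, but the digests differ, so the
    # intersect over bytes_hash no longer pairs them as 100% equal.
    assert hashlib.md5(f1).hexdigest() != hashlib.md5(f2).hexdigest()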
@@ -1906,6 +1906,7 @@ def check_ratio(self, main_d, diff_d):
       return 1.0

     values_set = set([v1, v2, v3, v4, v5])
+
     r = max(values_set)
     if r == 1.0 and md1 != md2:
       # We cannot assign a 1.0 ratio if both MD indices are different, that's an
@@ -2734,7 +2735,7 @@ def search_just_stripped_binaries(self):
       )
       log_refresh(f"Finding via {repr(heur)}")

-      self.add_matches_from_query(sql, "best")
+      self.add_matches_from_query_ratio(sql, "best", "partial")
       ret = True
     finally:
       cur.close()
@@ -2903,48 +2904,6 @@ def add_multimatches_to_chooser(self, multi, ignore_list, dones):

     return ignore_list, dones

-  def get_ml_ratio(self, main_d, diff_d):
-    ea1 = int(main_d["ea"])
-    ea2 = int(diff_d["ea"])
-
-    ml_ratio = 0.0
-
-    cur = self.db_cursor()
-    sql = "select * from {db}.functions where address = ?"
-    try:
-      cur.execute(sql.format(db="main"), (str(ea1),))
-      main_row = cur.fetchone()
-
-      cur.execute(sql.format(db="diff"), (str(ea2),))
-      diff_row = cur.fetchone()
-
-      ml_add = False
-      ml_ratio = 0
-      if ML_AVAILABLE and self.train_local_model and is_fitted:
-        if min(main_row["nodes"], diff_row["nodes"]) > 3:
-          ml_ratio = int_compare_ratio(main_row["nodes"], diff_row["nodes"])
-          if ml_ratio >= config.ML_MIN_PREDICTION_RATIO:
-            ml_ratio = predict(main_row, diff_row)
-            if ml_ratio >= config.ML_MIN_PREDICTION_RATIO:
-              log(f"ML ratio {ml_ratio} for {main_d['name']} - {diff_d['name']}")
-              ml_add = True
-            else:
-              ml_ratio = 0.0
-
-      if ml_add:
-        vfname1 = main_d["name"]
-        vfname2 = diff_d["name"]
-        nodes1 = main_d["nodes"]
-        nodes2 = diff_d["nodes"]
-        desc = f"ML {get_model_name()}"
-
-        tmp_item = CChooser.Item(ea1, vfname1, ea2, vfname2, desc, ml_ratio, nodes1, nodes2)
-        self.ml_chooser.add_item(tmp_item)
-    finally:
-      cur.close()
-
-    return ml_ratio
-
   def deep_ratio(self, main_d, diff_d, ratio):
     """
     Try to get a score to add to the value returned by `check_ratio()` so less
@@ -3018,13 +2977,18 @@ def deep_ratio(self, main_d, diff_d, ratio):
         else:
           tmp = config.INCREASE_RATIO_PER_CONSTANT_MATCH
         score += len(set_result) * tmp

-      if score > 0.1:
-        log(f"CONSTANTS: 0x%08x 0x%08x {score} %d constants matched" % (ea1, ea2, len(set_result)))

-      if ML_AVAILABLE and self.train_local_model:
-        tmp = self.get_ml_ratio(main_d, diff_d)
-        score += 0.01
+      if self.classifier is not None:
+        if self.get_model_ratio(main_d, diff_d) == 1:
+          score += config.ML_TRAINED_MODEL_MATCH_SCORE
+          vfname1 = main_d["name"]
+          vfname2 = diff_d["name"]
+          nodes1 = main_d["nodes"]
+          nodes2 = diff_d["nodes"]
+          desc = f"ML {self.classifier}"

+          tmp_item = CChooser.Item(ea1, vfname1, ea2, vfname2, desc, ratio + score, nodes1, nodes2)
+          self.ml_chooser.add_item(tmp_item)
     finally:
       cur.close()

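The net effect of the new block: when the trained classifier agrees with a candidate pair, the pair gains a fixed boost and is also recorded in the ML chooser. A worked example of the arithmetic, ignoring any constants-based score accumulated above and taking ML_TRAINED_MODEL_MATCH_SCORE = 0.15 from the config change further below:

    ratio, score = 0.78, 0.0  # structural ratio from check_ratio()
    score += 0.15             # get_model_ratio() returned 1 for this pair
    final = ratio + score     # 0.93, the ratio stored in the ML chooser item
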
@@ -3687,10 +3651,66 @@ def find_related_matches(self, iteration):
       if main_row["constants_count"] > 0 and diff_row["constants_count"] > 0:
         self.find_related_constants(main_row, diff_row)

-  def do_train_local_model(self):
-    if ML_AVAILABLE and self.train_local_model:
-      debug_refresh("[i] Machine learning module enabled.")
-      train(self, self.all_matches)
+  def get_model_ratio(self, main_d, diff_d):
+    SELECT_FIELDS = """f.name name1,
+              f.nodes nodes1,
+              f.edges edges1,
+              f.indegree indegree1,
+              f.outdegree outdegree1,
+              f.cyclomatic_complexity cc1,
+              f.primes_value primes_value1,
+              f.clean_pseudo clean_pseudo1,
+              f.pseudocode_primes pseudocode_primes1,
+              f.strongly_connected strongly_connected1,
+              f.strongly_connected_spp strongly_connected_spp1,
+              f.loops loops1,
+              f.constants constants1,
+              f.source_file source_file1,
+              df.name name2,
+              df.nodes nodes2,
+              df.edges edges2,
+              df.indegree indegree2,
+              df.outdegree outdegree2,
+              df.cyclomatic_complexity cc2,
+              df.primes_value primes_value2,
+              df.clean_pseudo clean_pseudo2,
+              df.pseudocode_primes pseudocode_primes2,
+              df.strongly_connected strongly_connected2,
+              df.strongly_connected_spp strongly_connected_spp2,
+              df.loops loops2,
+              df.constants constants2,
+              df.source_file source_file2,
+              f.id id1,
+              df.id id2,
+              f.address ea1,
+              df.address ea2 """
+
+    sql = f"""select {SELECT_FIELDS}
+                from main.functions f,
+                     diff.functions df
+               where f.address = ?
+                 and df.address = ? """
+    cur = self.db_cursor()
+
+    ret = 0
+    try:
+      cur.execute(sql, (main_d["ea"], diff_d["ea"]))
+      row = cur.fetchone()
+      d = dict(row)
+      cmp_data = get_model_comparison_data(dict(row), self.is_same_processor)
+      ret = self.classifier.predict(cmp_data)[0]
+      if ret == 1:
+        debug_refresh(f"ML model predicted {ret} for {main_d['name']} {diff_d['name']}")
+    finally:
+      cur.close()
+
+    return ret
+
+  def apply_machine_learning(self):
+    if ML_AVAILABLE and self.use_trained_model:
+      import joblib
+      self.classifier = joblib.load(config.ML_TRAINED_MODEL)
+      log(f"Using ML classifier {self.classifier}")

   def get_callers_callees(self, db_name, func_id):
     cur = self.db_cursor()
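
get_model_ratio() builds one row pairing the columns of a function from each side, then hands a dict of it to the feature builder. The attach-two-databases query pattern it relies on can be reproduced standalone; a sketch with a trimmed toy schema and made-up file names and addresses:

    import sqlite3

    conn = sqlite3.connect("main.sqlite")  # a Diaphora-exported database
    conn.row_factory = sqlite3.Row         # so the row converts to a dict
    conn.execute("attach database 'diff.sqlite' as diff")

    # One row, columns suffixed 1/2 per side, the shape the feature
    # builder consumes.
    sql = """select f.nodes nodes1, f.edges edges1,
                    df.nodes nodes2, df.edges edges2
               from main.functions f, diff.functions df
              where f.address = ? and df.address = ?"""
    row = conn.execute(sql, ("4199136", "4200448")).fetchone()
    features = dict(row)  # e.g. {'nodes1': 12, 'edges1': 17, 'nodes2': 12, ...}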
@@ -3771,7 +3791,7 @@ def diff(self, db):
       log_refresh("Finding partial matches")
       self.find_partial_matches()

-      self.do_train_local_model()
+      self.apply_machine_learning()

       if self.unreliable:
         # Find using likely unreliable methods modified functions
diaphora_config.py (5 additions & 18 deletions)

@@ -201,25 +201,12 @@


 #-------------------------------------------------------------------------------
-# Diaphora can try to train using Ridge regression a classifier specific for the
-# current set of binaries using matches labelled as "Best" or "Partial" in order
-# to try to learn what is a good match specifically for the two binaries being
-# compared. This approach seems to work when there are a lot of initial matches,
-# and seems to cause a lot of false positives when there aren't enough good
-# initial matches. This configuration directive is used to enable/disable this
-# experimental feature.
-ML_TRAIN_LOCAL_MODEL = False
+# Diaphora can use a local model, enable this configuration directive to use it.
+ML_USE_TRAINED_MODEL = True

-# What is the minimum ratio required for a match to be considered for usage to
-# train a local model?
-ML_MATCHES_MIN_RATIO = 0.7
-ML_MIN_PREDICTION_RATIO = 0.75

-# What value should be added to the final similarity ratio when the specialized
-# classifier (trained with known good and bad results found for the current two
-# binaries being compared) finds what it thinks is a good match.
-ML_DEEP_RATIO_ADDED_SCORE = 0.1
+# Model trained with a decision tree classifier: fast and accurate enough.
+ML_TRAINED_MODEL = os.path.join(CONFIGURATION_DIRECTORY, "ml/diaphora-amalgamation-model.pkl")
+# The value added to the similarity ratio for a positive match using the model.
+ML_TRAINED_MODEL_MATCH_SCORE = 0.15

 # Show a chooser with all the matches that the classifier think are good ones?
 ML_DEBUG_SHOW_MATCHES = True
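
ML_TRAINED_MODEL points at a pickle of, per the comment above, a decision tree classifier. Producing a compatible file is, in outline, standard scikit-learn plus joblib; a sketch assuming training vectors shaped like get_model_comparison_data() output (the actual training pipeline is not part of this commit):

    import joblib
    from sklearn.tree import DecisionTreeClassifier

    # Toy training set: one comparison vector per function pair, labelled
    # 1 for a known good match and 0 for a known bad one.
    X = [[1.00, 1.00, 0.95, 1.00],
         [0.92, 0.88, 0.90, 1.00],
         [0.20, 0.35, 0.10, 0.00],
         [0.15, 0.10, 0.05, 1.00]]
    y = [1, 1, 0, 0]

    clf = DecisionTreeClassifier(random_state=0).fit(X, y)
    joblib.dump(clf, "diaphora-amalgamation-model.pkl")  # file name per the config above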
diaphora_ida.py (1 addition & 9 deletions)

@@ -733,8 +733,7 @@ def __init__(self):
 <Use speed ups:{rExperimental}##Use tricks to speed ups some of the most common diffing tasks>
 <#Enable this option to ignore sub_* names for the 'Same name' heuristic.#Ignore automatically generated names:{rIgnoreSubNames}>
 <#Enable this option to ignore all function names for the 'Same name' heuristic.#Ignore all function names:{rIgnoreAllNames}>
-<#Enable this option to use the Machine Learning engine and generate a dataset with known good and bad results specific to the 2 binaries being compared.#Train a specialized local classifier (experimental ML support):{rMachineLearning}>
-<#Enable this option to use the Machine Learning engine with an already trained model.#Use the model $DIAPHORA_DIR/ml/clf.pkl:{rUseTrainedModel}>{cGroup1}>
+<#Enable this option to use the Machine Learning engine with an already trained model specified in diaphora_config.py!ML_TRAINED_MODEL.#Use an already trained model:{rUseTrainedModel}>{cGroup1}>
 Project specific rules:
 <#Select the project specific Python script rules#Python script:{iProjectSpecificRules}>
@@ -764,7 +763,6 @@ def __init__(self):
         "rExperimental",
         "rIgnoreSubNames",
         "rIgnoreAllNames",
-        "rMachineLearning",
         "rUseTrainedModel"
       )
     ),
@@ -790,7 +788,6 @@ def set_options(self, opts):
     self.rExcludeLibraryThunk.checked = opts.exclude_library_thunk
     self.rUnreliable.checked = opts.unreliable
     self.rSlowHeuristics.checked = opts.slow
-    self.rMachineLearning.checked = opts.train_local_model
     self.rUseTrainedModel.checked = opts.use_trained_model
     self.rRelaxRatio.checked = opts.relax
     self.rExperimental.checked = opts.experimental
@@ -814,7 +811,6 @@ def get_options(self):
       exclude_library_thunk=self.rExcludeLibraryThunk.checked,
       unreliable=self.rUnreliable.checked,
       slow=self.rSlowHeuristics.checked,
-      train_local_model=self.rMachineLearning.checked,
       use_trained_model=self.rUseTrainedModel.checked,
       relax=self.rRelaxRatio.checked,
       experimental=self.rExperimental.checked,
@@ -3662,7 +3658,6 @@ def _diff_or_export(use_ui, **options):
   bd.exclude_library_thunk = opts.exclude_library_thunk
   bd.unreliable = opts.unreliable
   bd.slow_heuristics = opts.slow
-  bd.train_local_model = opts.train_local_model
   bd.use_trained_model = opts.use_trained_model
   bd.relaxed_ratio = opts.relax
   bd.experimental = opts.experimental
@@ -3754,9 +3749,6 @@ def __init__(self, **kwargs):
     self.slow = kwargs.get(
       "slow", total_functions <= config.MIN_FUNCTIONS_TO_DISABLE_SLOW
     )
-    self.train_local_model = kwargs.get(
-      "train_local_model", config.ML_TRAIN_LOCAL_MODEL
-    )
     self.use_trained_model = kwargs.get(
       "use_trained_model", config.ML_USE_TRAINED_MODEL
     )
[Diffs for the remaining three changed files are not shown.]