Multiple bug fixes and little improvements

CORE: Try to use `cdifflib` instead of Python's standard `difflib` when possible to get some performance gains. BUG: High addresses in operands could cause the Python's sqlite3 module to crash when inserting into the database. ML: Try to use the Ridge classifier as just another method to get a similarity ratio in `check_ratio`. ML: Simplifications of the supervised learning based experimental engine. CONFIG: Added parameter `COMMIT_AFTER_EACH_GUI_UPDATE` to force committing. CONFIG: Added parameter `EXPORTING_COMPILATION_UNITS` to enable/disable exporting them (with some huge databases it might take even hours!). CONFIG: Added parameters handling SQLite pragmas `SQLITE_JOURNAL_MODE` and `SQLITE_PRAGMA_SYNCHRONOUS`. CONFIG: Added parameter `SHOW_IMPORT_WARNINGS` to enable/disable showing warnings when some important but optional Python packages aren't found. BUG: Be sure to delete orphaned comments when importing pseudo-code comments. BUG: The workaround for "max non-trivial tinfo_t count has been reached" was wrong. Now, the Hex-Rays functions cache is cleared every 10,000 rows. GUI: Display the progress when exporting a large number of compilation units. BUG: Inserting the link between functions and compilation units was terribly-utterly-horribly wrong. VULN: Add pattern "UNC" to potentially detect vulnerabilities fixed in Windows components involving UNC paths. EXTRAS: Added independent IDA plugin `extras/diaphora_local.py` to be able to diff functions inside the current binary. BUG: Do a commit after all functions are exported so, in case IDA crashes for a reason/bug, Diaphora can properly recover from errors and have all the functions already exported there.
joxeankoret · Feb 22, 2024 · 477dd87 · 477dd87
1 parent f9b28d5
commit 477dd87
Show file tree

Hide file tree

Showing 8 changed files with 604 additions and 103 deletions.
diff --git a/diaphora.py b/diaphora.py
@@ -35,11 +35,22 @@
 from io import StringIO
 from threading import Lock
 from multiprocessing import cpu_count
-from difflib import SequenceMatcher, unified_diff
 
 import diaphora_config as config
 import diaphora_heuristics
 
+try:
+  from cdifflib import CSequenceMatcher as SequenceMatcher
+  HAS_CDIFFLIB = True
+except ImportError:
+  HAS_CDIFFLIB = False
+  if config.SHOW_IMPORT_WARNINGS:
+    print("WARNING: Python library 'cdifflib' not found. Installing it will significantly improve text diffing performance.")
+    print("INFO: Alternatively, you can silence this warning by changing the value of SHOW_IMPORT_WARNINGS in diaphora_config.py.")
+  from difflib import SequenceMatcher
+
+from difflib import unified_diff
+
 import ml.model
 from ml.model import ML_ENABLED, train, predict, get_model_name
 
@@ -709,6 +720,10 @@ def save_instructions_to_database(self, cur, bb_data, func_id):
                 cls=CBytesEncoder,
               )
             )
+          elif isinstance(instruction_property, int):
+            if instruction_property > 0x8000000000000000:
+              instruction_property = str(instruction_property)
+            instruction_properties.append(instruction_property)
           else:
             instruction_properties.append(instruction_property)
 
@@ -1104,7 +1119,7 @@ def save_function(self, props):
         insert_args.append([func_id, str(caller), "caller"])
 
       for callee in callees:
-        insert_args.append([func_id, str(callee), "callee"])      
+        insert_args.append([func_id, str(callee), "callee"])
       cur.executemany(sql, insert_args)
 
       # Phase 3: Insert the constants of the function
@@ -1880,20 +1895,28 @@ def check_ratio(self, main_d, diff_d):
         self.ratios_cache[key] = 1.0
         return 1.0
 
-    r = max(v1, v2, v3, v4, v5)
+    v6 = 0.0
+    if ML_ENABLED and self.machine_learning:
+      v6 = self.get_ml_ratio(main_d, diff_d)
+
+    values_set = set([v1, v2, v3, v4, v5, v6])
+    r = max(values_set)
     if r == 1.0 and md1 != md2:
       # We cannot assign a 1.0 ratio if both MD indices are different, that's an
       # error
       r = 0
-      for v in [v1, v2, v3, v4, v5]:
+      for v in values_set:
         if v != 1.0 and v > r:
           r = v
 
     if r < 1.0:
       score = self.deep_ratio(main_d, diff_d, r)
       if r + score < 1.0:
         r += score
+      else:
+        r = 0.99
 
+    debug_refresh(f"self.ratios_cache[{main_d['name']}-{diff_d['name']}] = {r}")
     self.ratios_cache[key] = r
     return r
 
@@ -2868,6 +2891,45 @@ def add_multimatches_to_chooser(self, multi, ignore_list, dones):
 
     return ignore_list, dones
 
+  def get_ml_ratio(self, main_d, diff_d):
+    ea1 = int(main_d["ea"])
+    ea2 = int(diff_d["ea"])
+
+    ml_ratio = 0.0
+
+    cur = self.db_cursor()
+    sql = "select * from {db}.functions where address = ?"
+    try:
+      cur.execute(sql.format(db="main"), (str(ea1),))
+      main_row = cur.fetchone()
+
+      cur.execute(sql.format(db="diff"), (str(ea2),))
+      diff_row = cur.fetchone()
+
+      ml_add = False
+      ml_ratio = 0
+      if ML_ENABLED and self.machine_learning:
+        ml_ratio = predict(main_row, diff_row)
+        if ml_ratio >= config.ML_MIN_PREDICTION_RATIO:
+          log(f"ML ratio {ml_ratio} for {main_d['name']} - {diff_d['name']}")
+          ml_add = True
+        else:
+          ml_ratio = 0.0
+
+      if ml_add:
+        vfname1 = main_d["name"]
+        vfname2 = diff_d["name"]
+        nodes1 = main_d["nodes"]
+        nodes2 = diff_d["nodes"]
+        desc = f"ML {get_model_name()}"
+
+        tmp_item = CChooser.Item(ea1, vfname1, ea2, vfname2, desc, ml_ratio, nodes1, nodes2)
+        self.ml_chooser.add_item(tmp_item)
+    finally:
+      cur.close()
+
+    return ml_ratio
+
   def deep_ratio(self, main_d, diff_d, ratio):
     """
     Try to get a score to add to the value returned by `check_ratio()` so less
@@ -2940,25 +3002,6 @@ def deep_ratio(self, main_d, diff_d, ratio):
           set_result = set1.intersection(set2)
           if len(set_result) > 0:
             score += len(set_result) * 0.0005
-
-      ml_add = False
-      if ML_ENABLED and self.machine_learning:
-        ml_ratio = predict(main_row, diff_row, ratio)
-        if ml_ratio > 0:
-          debug_refresh(f"ML ratio {ml_ratio} for {main_d['name']} - {diff_d['name']}")
-          score += config.ML_DEEP_RATIO_ADDED_SCORE
-          ml_add = True
-
-      if ml_add:
-        vfname1 = main_d["name"]
-        vfname2 = diff_d["name"]
-        nodes1 = main_d["nodes"]
-        nodes2 = diff_d["nodes"]
-        desc = f"ML {get_model_name()}"
-
-        tmp_item = CChooser.Item(ea1, vfname1, ea2, vfname2, desc, ratio, nodes1, nodes2)
-        self.ml_chooser.add_item(tmp_item)
-
     finally:
       cur.close()
 
@@ -3626,6 +3669,17 @@ def train_local_model(self):
       debug_refresh("[i] Machine learning module enabled.")
       train(self, self.all_matches)
 
+  def get_callers_callees(self, db_name, func_id):
+    cur = self.db_cursor()
+    rows = []
+    try:
+      sql = "select * from {db}.callgraph where func_id = ?"
+      cur.execute(sql.format(db=db_name), (func_id,))
+      rows = list(cur.fetchall())
+    finally:
+      cur.close()
+    return rows
+
   def diff(self, db):
     """
     Diff the current two databases (main and diff).

diff --git a/diaphora_config.py b/diaphora_config.py
@@ -60,6 +60,12 @@
 # Number of rows that must be inserted to commit the transaction
 EXPORTING_FUNCTIONS_TO_COMMIT = 5000
 
+# Issue a commit every time the GUI export dialog is updated. This is useful
+# when facing long export times with known IDA bugs that might cause it to fail
+# at an unknown moment, as it lets us recover from errors. You might want to
+# set it to False if you're looking for small performance wins.
+COMMIT_AFTER_EACH_GUI_UPDATE = True
+
 # The minimum number of functions in a database to, by default, disable running
 # slow queries.
 MIN_FUNCTIONS_TO_DISABLE_SLOW = 4001
@@ -73,13 +79,23 @@
 # Block size to use to generate fuzzy hashes for pseudo-codes with DeepToad
 FUZZY_HASHING_BLOCK_SIZE = 512
 
+# Set it to False to disable finding and exporting compilation units. In some
+# rare cases, there are too many compilation units and Diaphora might take very
+# long to find them.
+EXPORTING_COMPILATION_UNITS = True
+
 ################################################################################
-# Default SQL related configuration options
+# Default SQL and SQLite related configuration options
 
 # Diaphora won't process more than the given value of rows (per heuristic)
 SQL_MAX_PROCESSED_ROWS = 1000000
 # SQL queries will timeout after the given number of seconds
 SQL_TIMEOUT_LIMIT = 60 * 5
+# Set this to DELETE, TRUNCATE, PERSIST, MEMORY, WAL, OFF, or None to use the
+# default value.
+SQLITE_JOURNAL_MODE = "MEMORY"
+# Set this to 0/OFF, 1/NORMAL, 2/FULL, 3/EXTRA, or None to use the default
+# value.
+SQLITE_PRAGMA_SYNCHRONOUS = "1"
 
 ################################################################################
 # Heuristics related configuration options
@@ -189,12 +205,23 @@
 
 # What is the minimum ratio required for a match to be considered for usage to
 # train a local model?
-ML_MATCHES_MIN_RATIO = 0.5
+ML_MATCHES_MIN_RATIO = 0.6
+ML_MIN_PREDICTION_RATIO = 0.72
 
 # What value should be added to the final similarity ratio when the specialized
 # classifier (trained with known good and bad results found for the current two
 # binaries being compared) finds what it thinks is a good match.
-ML_DEEP_RATIO_ADDED_SCORE = 0.04
+ML_DEEP_RATIO_ADDED_SCORE = 0.1
 
 # Show a chooser with all the matches that the classifier thinks are good ones?
 ML_DEBUG_SHOW_MATCHES = True
+
+#-------------------------------------------------------------------------------
+# Some imports improve performance or add features to Diaphora but aren't
+# strictly required. Diaphora will warn the reverser when these libraries fail
+# to be imported. Set this directive to False to silence these warnings.
+SHOW_IMPORT_WARNINGS = True
+
+#-------------------------------------------------------------------------------
+# Workarounds for IDA bugs
+DIAPHORA_WORKAROUND_MAX_TINFO_T = True
diff --git a/diaphora_ida.py b/diaphora_ida.py
@@ -1152,9 +1152,16 @@ def recalculate_primes(self):
     return callgraph_primes, callgraph_all_primes
 
   def commit_and_start_transaction(self):
-    self.db.commit()
-    self.db.execute("PRAGMA synchronous = OFF")
-    self.db.execute("PRAGMA journal_mode = MEMORY")
+    try:
+      self.db.execute("commit")
+    except sqlite3.OperationalError as e:
+      # Ignore the "cannot commit - no transaction active" error
+      pass
+
+    if config.SQLITE_PRAGMA_SYNCHRONOUS is not None:
+      self.db.execute(f"PRAGMA synchronous = {config.SQLITE_PRAGMA_SYNCHRONOUS}")
+    if config.SQLITE_JOURNAL_MODE is not None:
+      self.db.execute(f"PRAGMA journal_mode = {config.SQLITE_JOURNAL_MODE}")
     self.db.execute("BEGIN transaction")
 
   def do_export(self, crashed_before=False):
@@ -1185,10 +1192,12 @@ def do_export(self, crashed_before=False):
     self._funcs_cache = {}
     for func in func_list:
       if user_cancelled():
-        raise Exception("Canceled.")
+        raise Exception("Cancelled.")
 
       i += 1
       if (total_funcs >= 100) and i % (int(total_funcs / 100)) == 0 or i == 1:
+        if config.COMMIT_AFTER_EACH_GUI_UPDATE:
+          self.commit_and_start_transaction()
         line = "Exported %d function(s) out of %d total.\nElapsed %d:%02d:%02d second(s), remaining time ~%d:%02d:%02d"
         elapsed = time.monotonic() - t
         remaining = (elapsed / i) * (total_funcs - i)
@@ -1231,6 +1240,7 @@ def do_export(self, crashed_before=False):
         if i % (total_funcs / 10) == 0:
           self.commit_and_start_transaction()
 
+    self.commit_and_start_transaction()
     md5sum = GetInputFileMD5()
     self.save_callgraph(
       str(callgraph_primes), json.dumps(callgraph_all_primes), md5sum
@@ -1240,7 +1250,9 @@ def do_export(self, crashed_before=False):
       self.export_til()
     except:
       log(f"Error reading type libraries: {str(sys.exc_info()[1])}")
-    self.save_compilation_units()
+
+    if config.EXPORTING_COMPILATION_UNITS:
+      self.save_compilation_units()
 
     log_refresh("Creating indices...")
     self.create_indices()
@@ -1898,6 +1910,7 @@ def import_instruction(self, ins_data1, ins_data2):
 
         comment = mcmt
         cfunc.set_user_cmt(tl, comment)
+        cfunc.del_orphan_cmts()
         cfunc.save_user_cmts()
 
     tmp_ea = None
@@ -2342,8 +2355,9 @@ def decompile_and_get(self, ea):
     #
     # max non-trivial tinfo_t count has been reached
     #
-    if os.getenv("DIAPHORA_WORKAROUND_MAX_TINFO_T") is not None:
-      idaapi.clear_cached_cfuncs()
+    if config.DIAPHORA_WORKAROUND_MAX_TINFO_T:
+      if len(self._funcs_cache) % 10000 == 0:
+        idaapi.clear_cached_cfuncs()
 
     decompiler_plugin = os.getenv("DIAPHORA_DECOMPILER_PLUGIN")
     if decompiler_plugin is None:
@@ -3225,7 +3239,9 @@ def get_modules_using_lfa(self):
     return new_modules
 
   def save_compilation_units(self):
+    log_refresh("Finding compilation units...")
     lfa_modules = self.get_modules_using_lfa()
+    log_refresh("Saving compilation units...")
 
     sql1 = """insert into compilation_units (name, start_ea, end_ea)
                   values (?, ?, ?)"""
@@ -3241,7 +3257,12 @@ def save_compilation_units(self):
     cur = self.db_cursor()
     try:
       dones = set()
-      for module in lfa_modules:
+      total = len(lfa_modules)
+      checkpoint = int(total / 10)
+      for i, module in enumerate(lfa_modules):
+        if i > 0 and checkpoint > 0 and i % checkpoint == 0:
+          log_refresh(f"Processing compilation unit {i} out of {total}...")
+
         module_name = None
         if module["name"] != "":
           module_name = module["name"]
@@ -3250,12 +3271,15 @@ def save_compilation_units(self):
         cur.execute(sql1, vals)
         cu_id = cur.lastrowid
 
-        for values in self._funcs_cache.values():
-          func_id = values[0]
-          if func_id not in dones:
-            dones.add(func_id)
-            cur.execute(sql2, (cu_id, func_id))
-            cur.execute(sql4, (module_name, func_id))
+        for func in self._funcs_cache:
+          item = self._funcs_cache[func]
+          func = int(func)
+          if func >= module["start"] and func <= module["end"]:
+            func_id = item[0]
+            if func_id not in dones:
+              dones.add(func_id)
+              cur.execute(sql2, (cu_id, func_id))
+              cur.execute(sql4, (module_name, func_id))
 
         cur.execute(
           sql3,

diff --git a/extras/README.md b/extras/README.md
@@ -0,0 +1,13 @@
+# Diaphora local
+
+This is a pure Python IDA plugin to diff pseudo-codes and assembly for functions inside the current binary, instead of diffing functions in different binaries.
+
+# Installation
+
+Simply copy the script `diaphora_local.py` into the directory `$IDA_DIR/plugins`.
+
+# Usage
+
+In IDA, put the cursor inside a function and press Ctrl + Shift + D. Then choose the function to diff against the current one: two choosers (windows) will open showing the differences at the pseudo-code and assembly levels.
+
+