Fixes to the current beta

ML: The current local model causes false positives with small functions and with pairs of functions that differ hugely in their number of basic blocks. Diaphora will ignore such matches. CORE: Increase the added similarity score in `deep_ratio` when constants (like strings or cryptographic constants) match. HEUR: Remove the unreliable flag from heuristics "Pseudo-code fuzzy AST hash" and "Loop Count". VULN: Do not use difflib.ndiff as it's terribly slow; instead use difflib.unified_diff.
joxeankoret · Feb 23, 2024 · 3ad0686 · 3ad0686
1 parent d74d687
commit 3ad0686
Show file tree

Hide file tree

Showing 4 changed files with 28 additions and 18 deletions.
diff --git a/diaphora.py b/diaphora.py
@@ -52,7 +52,7 @@
 from difflib import unified_diff
 
 import ml.model
-from ml.model import ML_ENABLED, train, predict, get_model_name
+from ml.model import ML_ENABLED, train, predict, get_model_name, int_compare_ratio
 
 from diaphora_heuristics import (
   HEURISTICS,
@@ -2909,12 +2909,15 @@ def get_ml_ratio(self, main_d, diff_d):
       ml_add = False
       ml_ratio = 0
       if ML_ENABLED and self.machine_learning:
-        ml_ratio = predict(main_row, diff_row)
-        if ml_ratio >= config.ML_MIN_PREDICTION_RATIO:
-          log(f"ML ratio {ml_ratio} for {main_d['name']} - {diff_d['name']}")
-          ml_add = True
-        else:
-          ml_ratio = 0.0
+        if min(main_row["nodes"], diff_row["nodes"]) > 3:
+          ml_ratio = int_compare_ratio(main_row["nodes"], diff_row["nodes"])
+          if ml_ratio >= config.ML_MIN_PREDICTION_RATIO:
+            ml_ratio = predict(main_row, diff_row)
+            if ml_ratio >= config.ML_MIN_PREDICTION_RATIO:
+              log(f"ML ratio {ml_ratio} for {main_d['name']} - {diff_d['name']}")
+              ml_add = True
+            else:
+              ml_ratio = 0.0
 
       if ml_add:
         vfname1 = main_d["name"]
@@ -3001,7 +3004,7 @@ def deep_ratio(self, main_d, diff_d, ratio):
           set2 = set(json.loads(diff_row["constants"]))
           set_result = set1.intersection(set2)
           if len(set_result) > 0:
-            score += len(set_result) * 0.0005
+            score += len(set_result) * 0.001
     finally:
       cur.close()
 

diff --git a/diaphora_heuristics.py b/diaphora_heuristics.py
@@ -834,7 +834,7 @@ def get_query_fields(heur, quote=True):
         %POSTFIX%
       order by f.source_file = df.source_file""",
   "min": 0.35,
-  "flags":[HEUR_FLAG_UNRELIABLE]
+  "flags":[]
 })
 
 NAME = "Partial pseudo-code fuzzy hash (normal)"
@@ -992,7 +992,7 @@ def get_query_fields(heur, quote=True):
         %POSTFIX%
       order by f.source_file = df.source_file""",
   "min":0.49,
-  "flags":[HEUR_FLAG_SLOW, HEUR_FLAG_UNRELIABLE]
+  "flags":[HEUR_FLAG_SLOW]
 })
 
 NAME = "Same graph"

diff --git a/ml/model.py b/ml/model.py
@@ -18,7 +18,8 @@
 along with this program.  If not, see <https://www.gnu.org/licenses/>.
 """
 
-__all__ = ["ML_ENABLED", "ml_model", "train", "predict", "get_model_name"]
+__all__ = ["ML_ENABLED", "ml_model", "train", "predict", "get_model_name",
+  "int_compare_ratio"]
 
 import sys
 import json
@@ -227,6 +228,10 @@ def train_local_model(self) -> bool:
         final = features1 + features2 + comparisons
         final = convert2numbers(final)
 
+        bbratio = int_compare_ratio(row1["nodes"], row2["nodes"])
+        if bbratio <= ML_MATCHES_MIN_RATIO and bbratio < ratio:
+          ratio = bbratio
+
         x = np.array(final)
 
         # The ratio could be the actual ratio we calculate, but we want to train

diff --git a/scripts/patch_diff_vulns.py b/scripts/patch_diff_vulns.py
@@ -8,7 +8,7 @@
 Public domain
 """
 
-from difflib import ndiff
+from difflib import unified_diff
 
 from diaphora import CChooser, log
 
@@ -136,7 +136,7 @@ def find_vulns_using_assembly(self, func1, func2, ratio):
     if asm1 is None or asm2 is None:
       return results
 
-    lines = ndiff(asm1.split("\n"), asm2.split("\n"))
+    lines = unified_diff(asm1.split("\n"), asm2.split("\n"))
     lines = list(lines)
     added = None
     removed = None
@@ -147,14 +147,16 @@ def find_vulns_using_assembly(self, func1, func2, ratio):
       # Only consider removed/added lines (which also means modified lines)
       if c in ["-", "+"]:
         if c == "+":
-          added = line
+          added = line[1:]
         elif c == "-":
-          removed = line
+          if line[1:].endswith(":"):
+            continue
+          removed = line[1:]
 
         if added is not None and removed is not None:
           # Check the list of known signed <-> unsigned instructions
-          mnem1 = added.split(" ")[1].lower()
-          mnem2 = removed.split(" ")[1].lower()
+          mnem1 = added.split(" ")[0].lower()
+          mnem2 = removed.split(" ")[0].lower()
           if mnem1 in SIGNED_UNSIGNED_LIST:
             if SIGNED_UNSIGNED_LIST[mnem1] == mnem2:
               found = True
@@ -183,7 +185,7 @@ def find_vulns_using_pseudocode(self, func1, func2, ratio):
     if pseudo1 is None or pseudo2 is None:
       return results
 
-    lines = ndiff(pseudo1.split("\n"), pseudo2.split("\n"))
+    lines = unified_diff(pseudo1.split("\n"), pseudo2.split("\n"))
     for line in lines:
       c = line[0]
       # Only consider removed/added lines (which also means modified lines)