Skip to content

Commit

Permalink
Fixes to the current beta
Browse files Browse the repository at this point in the history
ML: The current local model causes false positives with small functions and with pairs of functions whose number of basic blocks differs greatly. Diaphora will ignore such matches.
CORE: Increase the added similarity score in `deep_ratio` when constants (like strings or cryptographic constants) match.
HEUR: Remove the unreliable flag from heuristics "Pseudo-code fuzzy AST hash" and "Loop Count".
VULN: Do not use difflib.ndiff as it's terribly slow; instead use difflib.unified_diff.
  • Loading branch information
joxeankoret committed Feb 23, 2024
1 parent d74d687 commit 3ad0686
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 18 deletions.
19 changes: 11 additions & 8 deletions diaphora.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
from difflib import unified_diff

import ml.model
from ml.model import ML_ENABLED, train, predict, get_model_name
from ml.model import ML_ENABLED, train, predict, get_model_name, int_compare_ratio

from diaphora_heuristics import (
HEURISTICS,
Expand Down Expand Up @@ -2909,12 +2909,15 @@ def get_ml_ratio(self, main_d, diff_d):
ml_add = False
ml_ratio = 0
if ML_ENABLED and self.machine_learning:
ml_ratio = predict(main_row, diff_row)
if ml_ratio >= config.ML_MIN_PREDICTION_RATIO:
log(f"ML ratio {ml_ratio} for {main_d['name']} - {diff_d['name']}")
ml_add = True
else:
ml_ratio = 0.0
if min(main_row["nodes"], diff_row["nodes"]) > 3:
ml_ratio = int_compare_ratio(main_row["nodes"], diff_row["nodes"])
if ml_ratio >= config.ML_MIN_PREDICTION_RATIO:
ml_ratio = predict(main_row, diff_row)
if ml_ratio >= config.ML_MIN_PREDICTION_RATIO:
log(f"ML ratio {ml_ratio} for {main_d['name']} - {diff_d['name']}")
ml_add = True
else:
ml_ratio = 0.0

if ml_add:
vfname1 = main_d["name"]
Expand Down Expand Up @@ -3001,7 +3004,7 @@ def deep_ratio(self, main_d, diff_d, ratio):
set2 = set(json.loads(diff_row["constants"]))
set_result = set1.intersection(set2)
if len(set_result) > 0:
score += len(set_result) * 0.0005
score += len(set_result) * 0.001
finally:
cur.close()

Expand Down
4 changes: 2 additions & 2 deletions diaphora_heuristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -834,7 +834,7 @@ def get_query_fields(heur, quote=True):
%POSTFIX%
order by f.source_file = df.source_file""",
"min": 0.35,
"flags":[HEUR_FLAG_UNRELIABLE]
"flags":[]
})

NAME = "Partial pseudo-code fuzzy hash (normal)"
Expand Down Expand Up @@ -992,7 +992,7 @@ def get_query_fields(heur, quote=True):
%POSTFIX%
order by f.source_file = df.source_file""",
"min":0.49,
"flags":[HEUR_FLAG_SLOW, HEUR_FLAG_UNRELIABLE]
"flags":[HEUR_FLAG_SLOW]
})

NAME = "Same graph"
Expand Down
7 changes: 6 additions & 1 deletion ml/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""

__all__ = ["ML_ENABLED", "ml_model", "train", "predict", "get_model_name"]
__all__ = ["ML_ENABLED", "ml_model", "train", "predict", "get_model_name",
"int_compare_ratio"]

import sys
import json
Expand Down Expand Up @@ -227,6 +228,10 @@ def train_local_model(self) -> bool:
final = features1 + features2 + comparisons
final = convert2numbers(final)

bbratio = int_compare_ratio(row1["nodes"], row2["nodes"])
if bbratio <= ML_MATCHES_MIN_RATIO and bbratio < ratio:
ratio = bbratio

x = np.array(final)

# The ratio could be the actual ratio we calculate, but we want to train
Expand Down
16 changes: 9 additions & 7 deletions scripts/patch_diff_vulns.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
Public domain
"""

from difflib import ndiff
from difflib import unified_diff

from diaphora import CChooser, log

Expand Down Expand Up @@ -136,7 +136,7 @@ def find_vulns_using_assembly(self, func1, func2, ratio):
if asm1 is None or asm2 is None:
return results

lines = ndiff(asm1.split("\n"), asm2.split("\n"))
lines = unified_diff(asm1.split("\n"), asm2.split("\n"))
lines = list(lines)
added = None
removed = None
Expand All @@ -147,14 +147,16 @@ def find_vulns_using_assembly(self, func1, func2, ratio):
# Only consider removed/added lines (which also means modified lines)
if c in ["-", "+"]:
if c == "+":
added = line
added = line[1:]
elif c == "-":
removed = line
if line[1:].endswith(":"):
continue
removed = line[1:]

if added is not None and removed is not None:
# Check the list of known signed <-> unsigned instructions
mnem1 = added.split(" ")[1].lower()
mnem2 = removed.split(" ")[1].lower()
mnem1 = added.split(" ")[0].lower()
mnem2 = removed.split(" ")[0].lower()
if mnem1 in SIGNED_UNSIGNED_LIST:
if SIGNED_UNSIGNED_LIST[mnem1] == mnem2:
found = True
Expand Down Expand Up @@ -183,7 +185,7 @@ def find_vulns_using_pseudocode(self, func1, func2, ratio):
if pseudo1 is None or pseudo2 is None:
return results

lines = ndiff(pseudo1.split("\n"), pseudo2.split("\n"))
lines = unified_diff(pseudo1.split("\n"), pseudo2.split("\n"))
for line in lines:
c = line[0]
# Only consider removed/added lines (which also means modified lines)
Expand Down

0 comments on commit 3ad0686

Please sign in to comment.