Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Guy/hashes be hashes #309

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
6 changes: 4 additions & 2 deletions idaplugin/rematch/collectors/vectors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
from .assembly_hash import AssemblyHashVector
from .mnemonic_hash import MnemonicHashVector
from .mnemonic_hist import MnemonicHistVector

from .fnv_hash import FnvHashVector
from .apidom_hash import ApiDomintorHashVector

__all__ = ["Vector", "IdentityHashVector", "NameHashVector",
"AssemblyHashVector", "MnemonicHashVector", "MnemonicHistVector"]
"AssemblyHashVector", "MnemonicHashVector", "MnemonicHistVector",
"FnvHashVector", "ApiDomintorHashVector", ]
41 changes: 41 additions & 0 deletions idaplugin/rematch/collectors/vectors/apidom_hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from ida_gdl import FlowChart
from ida_idp import is_call_insn
from idaapi import get_func

from . import vector
from collections import defaultdict


class ApiDomintorHashVector(vector.Vector):
    """Vector of the mnemonic runs that lead up to each call instruction.

    For every basic block of a function, the instructions are split into
    runs, each run ending with (and including) a call instruction.
    """
    type = 'apidom_hash'
    type_version = 0

    @classmethod
    def data(cls, offset):
        """Collect, per basic block, the mnemonic runs that end in a call.

        Args:
            offset: an address inside the function to process.

        Returns:
            A ``defaultdict(list)`` keyed by basic block start address. Each
            value is a list of mnemonic lists; every inner list ends with
            the mnemonic of a call instruction. Blocks that contain no call
            have no entry, and instructions following the last call of a
            block are not recorded. ``None`` is returned when ``offset`` is
            not inside a function.
        """
        # GetMnem / NextHead are idc functions; they were previously
        # referenced as bare (undefined) names. Imported locally so this
        # block does not depend on the module's import list.
        import idc

        func = get_func(offset)
        if func is None:
            return None

        bbcall = defaultdict(list)

        # iterate over the function's basic blocks
        for blck in FlowChart(func):
            start = blck.startEA
            end = blck.endEA

            # TODO: find a decent way to resolve imports (probably a helper
            # function rather than inlining it here) so runs can be tied to
            # the API being called instead of just the call mnemonic.

            bbinsn = []
            curr_ea = start
            while curr_ea < end:
                bbinsn.append(idc.GetMnem(curr_ea))

                # a call closes the current run and starts a new one
                if is_call_insn(curr_ea):
                    bbcall[start].append(bbinsn)
                    bbinsn = []

                curr_ea = idc.NextHead(curr_ea)

        return bbcall
55 changes: 55 additions & 0 deletions idaplugin/rematch/collectors/vectors/fnv_hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import idautils
import idaapi
import idc


from . import vector


class FnvHashVector(vector.Vector):
    """Vector holding an FNV-1a style hash of a function's bytes.

    The first byte of every item in the function is always hashed; the
    remaining bytes of an item are hashed only when the item has no code or
    data references.
    """
    FNV1_64A_PRIME = 0x100000001b3
    FNV1_32A_PRIME = 0x01000193
    FNV1_32A_INIT = 0x811c9dc5
    FNV1_32A_SIZE = 2**32
    FNV1_64A_INIT = 0xcbf29ce484222325
    FNV1_64A_SIZE = 2**64
    type = 'fnv_hash'
    type_version = 0

    @classmethod
    def fnv_64a(cls, data, val=None):
        """Fold one value into a 64-bit FNV-1a hash.

        Args:
            data: integer (byte) to mix in.
            val: running hash to continue from; ``None`` starts a new hash
                from ``FNV1_64A_INIT``.

        Returns:
            The updated hash, reduced modulo ``FNV1_64A_SIZE``.
        """
        if val is None:
            val = cls.FNV1_64A_INIT
        val = val ^ data
        return (val * cls.FNV1_64A_PRIME) % cls.FNV1_64A_SIZE

    @classmethod
    def fnv_32a(cls, data, val=None):
        """Fold one value into a 32-bit FNV-1a hash.

        Args:
            data: integer (byte) to mix in.
            val: running hash to continue from; ``None`` starts a new hash
                from ``FNV1_32A_INIT``.

        Returns:
            The updated hash, reduced modulo ``FNV1_32A_SIZE``.
        """
        if val is None:
            val = cls.FNV1_32A_INIT
        val = val ^ data
        return (val * cls.FNV1_32A_PRIME) % cls.FNV1_32A_SIZE

    @classmethod
    def data(cls, offset):
        """Hash the function containing ``offset``.

        Args:
            offset: an address inside the function to hash.

        Returns:
            The integer hash, or ``None`` when the function has fewer than
            three items.
        """
        bitness = idaapi.get_inf_structure()

        # Assuming there is no 128-bit architecture yet. 16-bit databases
        # are hashed with the 32-bit variant as well.
        # NOTE(review): confirm the info structure exposes is_64(); the
        # method may be spelled is_64bit() in the targeted IDA version.
        if bitness.is_64():
            fnv_fn = cls.fnv_64a
        else:
            fnv_fn = cls.fnv_32a

        # walk the function items once and reuse the list below
        items = list(idautils.FuncItems(offset))
        if len(items) < 3:
            return None

        # a single running hash is carried across all items; None makes the
        # first fold start from the INIT constant
        h = None
        for ea in items:
            h = fnv_fn(idc.Byte(ea), h)

            # CodeRefsFrom / DataRefsFrom return iterators, which are always
            # truthy; pull one element to learn whether any reference exists
            # (comparing against None so a reference to address 0 counts).
            # NOTE(review): flow=True may also report the ordinary
            # fall-through to the next instruction - confirm that is wanted.
            has_refs = (
                next(iter(idautils.CodeRefsFrom(ea, True)), None) is not None
                or next(iter(idautils.DataRefsFrom(ea)), None) is not None)
            if has_refs:
                continue

            for i in range(ea + 1, ea + idc.ItemSize(ea)):
                h = fnv_fn(idc.Byte(i), h)

        return h
30 changes: 30 additions & 0 deletions idaplugin/rematch/collectors/vectors/mdindx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from collections import defaultdict

import ida_gdl
import idaapi

from . import vector


class MDIndexVector(vector.Vector):
    """Vector of per-basic-block in/out degrees of a function's flow chart.

    NOTE(review): the original implementation was unfinished (it returned
    nothing); this computes only the edge counts, not a full MD-Index.
    """
    type = 'MDIndex_Hash'
    type_version = 0

    @classmethod
    def data(cls, offset):
        """Count incoming and outgoing edges of every basic block.

        Args:
            offset: an address inside the function to process.

        Returns:
            A dict mapping each basic block's start address to
            ``{'in': <incoming edge count>, 'out': <outgoing edge count>}``,
            or ``None`` when ``offset`` is not inside a function.
        """
        # we're assuming offset is actually a function which has boundaries,
        # this assumption is reasonable as we assume the underlying framework
        # (IDA, Binja, r2), iterates only over functions.
        fn = idaapi.get_func(offset)
        if fn is None:
            return None

        # Keyed by start address rather than by block object: block objects
        # are assumed not to compare equal across iterations - TODO confirm.
        bbset = defaultdict(lambda: {'in': 0, 'out': 0})

        for bb in ida_gdl.FlowChart(fn):
            # touching the key guarantees an entry even for a block with no
            # edges at all
            node = bbset[bb.startEA]

            # iterating (instead of indexing the last successor) is safe for
            # blocks that have no successors, e.g. returning blocks
            for succ in bb.succs():
                node['out'] += 1
                bbset[succ.startEA]['in'] += 1

        return dict(bbset)
3 changes: 2 additions & 1 deletion idaplugin/rematch/instances/function.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,6 @@ def __init__(self, *args, **kwargs):
self.vectors |= {collectors.vectors.IdentityHashVector,
collectors.vectors.AssemblyHashVector,
collectors.vectors.MnemonicHashVector,
collectors.vectors.MnemonicHistVector}
collectors.vectors.MnemonicHistVector,
collectors.vectors.FnvHashVector, }
self.annotations |= {collectors.annotations.AssemblyAnnotation}
9 changes: 7 additions & 2 deletions server/collab/matchers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,15 @@
from .mnemonic_hash import MnemonicHashMatcher
from .name_hash import NameHashMatcher
from .mnemonic_hist import MnemonicHistogramMatcher
from .fnv_hash import FnvHashMatcher
from .apidom_hash import ApiDominatorMatcher
from .fuzzy_matcher import FuzzyHashMatcher


matchers_list = [IdentityHashMatcher, NameHashMatcher, AssemblyHashMatcher,
MnemonicHashMatcher, MnemonicHistogramMatcher]
MnemonicHashMatcher, MnemonicHistogramMatcher,
FnvHashMatcher, ApiDominatorMatcher, FuzzyHashMatcher, ]

__all__ = ['IdentityHashMatcher', 'AssemblyHashMatcher', 'MnemonicHashMatcher',
'NameHashMatcher', 'MnemonicHistogramMatcher', 'matchers_list']
'NameHashMatcher', 'MnemonicHistogramMatcher', 'matchers_list',
'FnvHashMatcher', 'ApiDominatorMatcher', 'FuzzyHashMatcher', ]
7 changes: 7 additions & 0 deletions server/collab/matchers/apidom_hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from . import fuzzy_matcher


class ApiDominatorMatcher(fuzzy_matcher.FuzzyHashMatcher):
    """Matcher bound to the 'apidom_hash' vector and match types.

    Only the identifying attributes are set here; the matching logic is
    inherited from FuzzyHashMatcher.
    """
    matcher_name = 'API Call Dominator Hash'
    match_type = 'apidom_hash'
    vector_type = 'apidom_hash'
7 changes: 7 additions & 0 deletions server/collab/matchers/fnv_hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from . import fuzzy_matcher


class FnvHashMatcher(fuzzy_matcher.FuzzyHashMatcher):
    """Matcher bound to the 'fnv_hash' vector and match types.

    Only the identifying attributes are set here; the matching logic is
    inherited from FuzzyHashMatcher.
    """
    matcher_name = 'FNV Hash'
    match_type = 'fnv_hash'
    vector_type = 'fnv_hash'
40 changes: 40 additions & 0 deletions server/collab/matchers/fuzzy_matcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import itertools
import json
from operator import xor as xorred_fn

import numpy as np
import sklearn as skl
import sklearn.metrics # noqa flake8 importing as a different name
import sklearn.feature_extraction # noqa flake8 importing as a different name

from . import matcher


class FuzzyHashMatcher(matcher.Matcher):
    """Matcher scoring every source/target vector pair by pairwise distance.

    Scores are normalised against the largest observed distance, so the
    closest pairs get scores near 100 and the farthest pair gets 0.
    """

    @classmethod
    def match(cls, source, target):
        """Yield a similarity score for every (source, target) vector pair.

        Args:
            source: queryset of source vectors exposing ``instance_id`` and
                ``data`` columns.
            target: queryset of target vectors with the same columns.

        Yields:
            ``(source_instance_id, target_instance_id, score)`` tuples.
            Nothing is yielded when either side is empty.

        NOTE(review): assumes each ``data`` value is a JSON-encoded mapping,
        as DictVectorizer requires - TODO confirm against the vector
        producers.
        """
        source_rows = list(source.values_list('instance_id', 'data'))
        target_rows = list(target.values_list('instance_id', 'data'))

        # nothing to compare; also avoids unpacking an empty zip() below
        if not source_rows or not target_rows:
            return

        # transpose the row tuples into parallel id / data columns
        source_instance_ids, source_data = zip(*source_rows)
        target_instance_ids, target_data = zip(*target_rows)

        source_list = [json.loads(d) for d in source_data]
        target_list = [json.loads(d) for d in target_data]

        # fit the feature space on the source side and project the target
        # side onto the same features
        dictvect = skl.feature_extraction.DictVectorizer()
        source_matrix = dictvect.fit_transform(source_list)
        target_matrix = dictvect.transform(target_list)

        # NOTE(review): a callable metric must return a scalar distance for
        # two rows; confirm operator.xor satisfies that for these matrices.
        distance_matrix = skl.metrics.pairwise_distances(source_matrix,
                                                         target_matrix,
                                                         metric=xorred_fn)

        max_distance = distance_matrix.max()
        if max_distance:
            score_matrix = (1 - (distance_matrix / max_distance)) * 100
        else:
            # every pair is at distance zero: all are perfect matches, and
            # dividing by max_distance would divide by zero
            score_matrix = np.full(distance_matrix.shape, 100.0)

        for source_i, target_i in np.ndindex(*distance_matrix.shape):
            source_instance_id = source_instance_ids[source_i]
            target_instance_id = target_instance_ids[target_i]

            score = score_matrix[source_i][target_i]
            yield (source_instance_id, target_instance_id, score)
3 changes: 1 addition & 2 deletions server/collab/matchers/hist_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@

import numpy as np
import sklearn as skl
import sklearn.metrics # noqa flake8 importing as a different name
import sklearn.preprocessing # noqa flake8 importing as a different name
import sklearn.metrics
import sklearn.feature_extraction # noqa flake8 importing as a different name

from . import matcher
Expand Down