diff --git a/src/scalib/metrics/benchmark.py b/src/scalib/metrics/benchmark.py
new file mode 100644
index 00000000..fb21035a
--- /dev/null
+++ b/src/scalib/metrics/benchmark.py
@@ -0,0 +1,147 @@
+import numpy as np
+from kNNMutualInformation import kNNInformationEstimator
+from scipy.special import logsumexp, binom
+from scipy.integrate import quad
+from scipy.stats import norm
+import matplotlib.pyplot as plt
+
+
+"""
+    MI ESTIMATION
+    Numerical estimation of the mutual information between an n-bit secret and its HW + AWGN(sigma) leakage
+"""
+
+# Integration bounds
+lb = -np.inf
+ub = np.inf
+# Absolute and relative error tolerated in the numerical integration
+epsabs = 10**-3
+epsrel = 10**-3
+# Maximum number of subintervals in the adaptive integration method
+limit = 100
+
+
+def integrand(y, sigma, n):
+    b = np.array([binom(n, w) for w in range(n + 1)])
+    I = 0
+    for u in range(n + 1):
+        a = np.array([(y - u) ** 2 - (y - w) ** 2 for w in range(n + 1)]) / (
+            2 * sigma**2
+        )
+        I -= norm.pdf(y - u, scale=sigma) * logsumexp(a, b=b) * binom(n, u) * (2**-n)
+    return I
+
+
+def integral(sigma, n):
+    QUAD = quad(
+        integrand, lb, ub, args=(sigma, n), limit=limit, epsabs=epsabs, epsrel=epsrel
+    )
+    return QUAD[0]
+
+
+def mi_hw_awgn(sigma, n):
+    return n + integral(sigma, n) / np.log(2)
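+
+
+# Note on the ground-truth value plotted by the benchmark: with W = HW(X) the Hamming
+# weight of the uniform n-bit secret X and Y = W + N(0, sigma^2), the integrand equals
+# -sum_u Pr[W = u] * pdf(y - u) * ln(2^n * p(y) / pdf(y - u)), so
+# integral(sigma, n) = h(Y) - h(Y | W) - n * ln(2) in nats.  Since X -> W -> Y and W is
+# a deterministic function of X, I(X; Y) = I(W; Y), hence mi_hw_awgn returns
+# n + integral / ln(2) = I(X; Y) in bits.  The logsumexp form avoids underflow of the
+# Gaussian pdfs at large |y - w|.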
+
+
+def hw_vec(arr):
+    # Vectorized Hamming weight (SWAR popcount)
+    t = arr.dtype.type
+    mask = t(-1)
+    s55 = t(0x5555555555555555 & mask)  # Add more digits for 128-bit support
+    s33 = t(0x3333333333333333 & mask)
+    s0F = t(0x0F0F0F0F0F0F0F0F & mask)
+    s01 = t(0x0101010101010101 & mask)
+    arr = arr - ((arr >> 1) & s55)
+    arr = (arr & s33) + ((arr >> 2) & s33)
+    arr = (arr + (arr >> 4)) & s0F
+    return (arr * s01) >> (8 * (arr.itemsize - 1))
+
+
+def gen_sample(M, sigma, n_sample):
+    X = np.random.randint(low=0, high=M, size=n_sample)
+    Y = hw_vec(X) + sigma * np.random.randn(n_sample)
+    Y = np.reshape(Y, (n_sample, 1))
+    return X, Y
+
+
+def multiple_estimations(
+    n=8, sigma=2, ensemble_k=[3, 5, 10, 15], n_sample=10**4, repeat=100, dummy_dim=0
+):
+    M = 1 << n
+    KSG_pred = np.zeros(repeat)
+    KSG_pred_ensemble = np.zeros((repeat, len(ensemble_k)))
+    for i in range(repeat):
+        X, Y = gen_sample(M, sigma, n_sample)
+        Pure_noise = np.random.randn(n_sample, dummy_dim).reshape((n_sample, dummy_dim))
+        Y_with_dummy_dim = np.concatenate([Y, Pure_noise], axis=1)
+        KSG = kNNInformationEstimator(M, X, Y_with_dummy_dim)
+        prediction, ensemble_pred = KSG.ensemble_predict(ensemble_k, p=2)
+        KSG_pred[i] = prediction
+        KSG_pred_ensemble[i] = ensemble_pred
+    return KSG_pred, KSG_pred_ensemble
+
+
+n = 8
+n_sigma = 15
+repeat = 50
+n_sample = 5 * 10**4
+dummy_dim = 0
+Sigma = np.geomspace(5 * 10**-2, 5 * 10**2, n_sigma)
+
+ensemble_k = [
+    3,
+    5,
+    7,
+    10,
+]
+c = ["blue", "purple", "green", "cyan"]
+
+MI_true = np.array([mi_hw_awgn(sigma, n) for sigma in Sigma])
+
+MI_predictions = np.zeros((n_sigma, repeat))
+MI_predictions_ensemble = np.zeros((n_sigma, repeat, len(ensemble_k)))
+
+for i in range(n_sigma):
+    MI_predictions[i], MI_predictions_ensemble[i] = multiple_estimations(
+        n,
+        Sigma[i],
+        ensemble_k=ensemble_k,
+        n_sample=n_sample,
+        repeat=repeat,
+        dummy_dim=dummy_dim,
+    )
+
+
+# Nudge exact-zero estimates to EPSILON so they still appear on the log-scale plot
+EPSILON = 10**-15
+MI_predictions = np.where(MI_predictions == 0, EPSILON, MI_predictions)
+MI_predictions_ensemble = np.where(
+    MI_predictions_ensemble == 0, EPSILON, MI_predictions_ensemble
+)
+
+for r in range(repeat):
+    plt.scatter(Sigma, MI_predictions[:, r], color="red", alpha=0.5)
+
+for j in range(len(ensemble_k)):
+    k = ensemble_k[j]
+    plt.plot(
+        Sigma,
+        np.median(MI_predictions_ensemble[:, :, j], axis=1),
+        label=f"k = {k}",
+        color=c[j],
+    )
+
+plt.plot(
+    Sigma,
+    np.median(MI_predictions, axis=1),
+    color="red",
+    label=f"KSG median over k={ensemble_k}",
+)
+
+
+plt.plot(Sigma, MI_true, color="black", label="Ground Truth")
+
+plt.title(f"Benchmark on Hamming Weight Model + AWGN Noise, Nsample = {n_sample}")
+plt.xlabel(r"$\sigma$")
+plt.ylabel(r"$\hat{I}(X;Y)$")
+plt.semilogx(base=10)
+plt.semilogy(base=10)
+plt.grid()
+plt.legend()
+plt.show()
diff --git a/src/scalib/metrics/kNNMutualInformation.py b/src/scalib/metrics/kNNMutualInformation.py
new file mode 100644
index 00000000..cbd1a838
--- /dev/null
+++ b/src/scalib/metrics/kNNMutualInformation.py
@@ -0,0 +1,112 @@
+import numpy as np
+
+from scipy.special import digamma
+from scipy.spatial import cKDTree
+
+
+class kNNInformationEstimator:
+    r"""Mutual information estimator for discrete X and continuous Y.
+
+    Based on "Mutual Information between Discrete and Continuous Data Sets" by Brian C. Ross.
+    """
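+
+    # Both predict() and ensemble_predict() evaluate Ross's estimator: for sample i
+    # with discrete label x_i, let d_i be the distance to its k-th nearest neighbour
+    # among the N_{x_i} samples sharing that label, and let m_i be the number of
+    # samples of any label within distance d_i of sample i (the sample itself
+    # excluded).  The estimate, in nats before the base conversion, is
+    #
+    #     I(X; Y) ~ psi(N) + psi(k) - <psi(N_x)> - <psi(m)>
+    #
+    # where psi is the digamma function and <.> averages over all N samples.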
+
+    def __init__(self, M: int, X: np.ndarray[np.uint32], Y: np.ndarray[float]):
+
+        self.Nsample = len(X)
+        self.M = M
+        self.sample_per_class = np.zeros(M, dtype=np.uint32)
+
+        self.main_tree = cKDTree(Y, leafsize=8)
+        self.list_class_trees = []
+        for i in range(self.M):
+            self.sample_per_class[i] = np.sum(X == i)
+            self.list_class_trees.append(cKDTree(Y[X == i], leafsize=8))
+
+        self.max_k = np.min(self.sample_per_class) - 1
+
+    def predict(self, k, p=2, base=2):
+
+        if k > self.max_k:
+            raise ValueError(
+                f"Invalid input: with these samples k can be at most {self.max_k}, but k = {k} was requested"
+            )
+
+        digamma_neigh = 0
+        for num_class in range(self.M):
+            d, _ = self.list_class_trees[num_class].query(
+                self.list_class_trees[num_class].data, k + 1, p=p, workers=1, eps=0
+            )
+
+            # The query point itself must not be counted as one of its neighbours
+            num_neigh = (
+                self.main_tree.query_ball_point(
+                    self.list_class_trees[num_class].data,
+                    d[:, k],
+                    return_length=True,
+                    workers=1,
+                    eps=0,
+                )
+                - 1
+            )
+            digamma_neigh += np.sum(digamma(num_neigh))
+        digamma_neigh /= self.Nsample
+
+        mi_kNN = (
+            digamma(self.Nsample)
+            + digamma(k)
+            - np.sum(self.sample_per_class * digamma(self.sample_per_class))
+            / self.Nsample
+            - digamma_neigh
+        )
+        mi_kNN /= np.log(base)
+
+        return np.maximum(0, mi_kNN)
+
+    def ensemble_predict(self, ensemble_k, p=2, base=2):
+        ensemble_k = np.asarray(ensemble_k)
+        max_k_ensemble = np.max(ensemble_k)
+
+        if max_k_ensemble > self.max_k:
+            raise ValueError(
+                f"Invalid input: with these samples k can be at most {self.max_k}, but k = {max_k_ensemble} was requested"
+            )
+
+        digamma_neigh = np.zeros(len(ensemble_k))
+        for num_class in range(self.M):
+            d, _ = self.list_class_trees[num_class].query(
+                self.list_class_trees[num_class].data,
+                max_k_ensemble + 1,
+                p=p,
+                workers=1,
+                eps=0,
+            )
+
+            for i in range(len(ensemble_k)):
+                k = ensemble_k[i]
+                # The query point itself must not be counted as one of its neighbours
+                num_neigh = (
+                    self.main_tree.query_ball_point(
+                        self.list_class_trees[num_class].data,
+                        d[:, k],
+                        return_length=True,
+                        workers=1,
+                        eps=0,
+                    )
+                    - 1
+                )
+                digamma_neigh[i] += np.sum(digamma(num_neigh))
+        digamma_neigh /= self.Nsample
+
+        mi_kNNs = (
+            digamma(self.Nsample)
+            + digamma(ensemble_k)
+            - np.sum(self.sample_per_class * digamma(self.sample_per_class))
+            / self.Nsample
+            - digamma_neigh
+        )
+        mi_kNNs = mi_kNNs / np.log(base)
+        mi_kNNs = np.maximum(0, mi_kNNs)
+
+        # Clamping each per-k estimate at 0 before aggregating increases the MSE but
+        # decreases the risk of underestimating the mutual information.
+        # Alternatively, should the median of the unclamped estimates be returned?
+        return np.median(mi_kNNs), mi_kNNs
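+
+
+# Minimal usage sketch (illustrative values only): estimate I(X;Y) for a uniform
+# 4-bit X leaking its Hamming weight with Gaussian noise of standard deviation 0.5.
+#
+#   from kNNMutualInformation import kNNInformationEstimator
+#   rng = np.random.default_rng(0)
+#   X = rng.integers(0, 16, 10_000)
+#   hw = np.unpackbits(X.astype(np.uint8)[:, None], axis=1).sum(axis=1)
+#   Y = (hw + 0.5 * rng.standard_normal(10_000)).reshape(-1, 1)
+#   est = kNNInformationEstimator(16, X, Y)
+#   mi = est.predict(k=5)                                # point estimate, in bits
+#   mi_med, mi_per_k = est.ensemble_predict([3, 5, 10])  # median and per-k estimates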