From 6f12490f706e395784caae8fd89bb0c5838413c6 Mon Sep 17 00:00:00 2001 From: FrizzoDavide Date: Mon, 20 Nov 2023 14:18:04 +0100 Subject: [PATCH 01/20] Added the DIFFI Methods to iForest --- pyod/models/iforest.py | 597 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 597 insertions(+) diff --git a/pyod/models/iforest.py b/pyod/models/iforest.py index c4c20e278..4726f0a2b 100644 --- a/pyod/models/iforest.py +++ b/pyod/models/iforest.py @@ -14,6 +14,20 @@ from sklearn.utils import check_array from sklearn.utils.validation import check_is_fitted +from sklearn.ensemble._iforest import _average_path_length +from sklearn.utils.validation import _num_samples +from sklearn.utils import gen_batches, get_chunk_n_rows +import numpy as np +from math import ceil +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.pyplot import cm + +import pandas as pd +import os +import pickle +import time + from .base import BaseDetector # noinspection PyProtectedMember from ..utils.utility import invert_order @@ -322,3 +336,586 @@ def feature_importances_(self): all_importances = np.mean(all_importances, axis=0, dtype=np.float64) return all_importances / np.sum(all_importances) + + # The functions below have been adapted from the sklearn source code + + def decision_function_single_tree(self, tree_idx, X): + return self._score_samples(tree_idx, X) - self.offset_ + + + def _score_samples(self, tree_idx, X): + n_feat= self.n_features_in_ + if n_feat != X.shape[1]: + raise ValueError("Number of features of the model must " + "match the input. Model n_features is {0} and " + "input n_features is {1}." 
+ "".format(n_feat, X.shape[1])) + return -self._compute_chunked_score_samples(tree_idx, X) + + + def _compute_chunked_score_samples(self, tree_idx, X): + n_samples = _num_samples(X) + if int(self.max_features*X.shape[1]) == X.shape[1]: + subsample_features = False + else: + subsample_features = True + chunk_n_rows = get_chunk_n_rows(row_bytes=16 * self._max_features, + max_n_rows=n_samples) + slices = gen_batches(n_samples, chunk_n_rows) + scores = np.zeros(n_samples, order="f") + for sl in slices: + scores[sl] = self._compute_score_samples_single_tree(tree_idx, X[sl], subsample_features) + return scores + + + def _compute_score_samples_single_tree(self, tree_idx, X, subsample_features): + n_samples = X.shape[0] + depths = np.zeros(n_samples, order="f") + tree = self.estimators_[tree_idx] + features = self.estimators_features_[tree_idx] + X_subset = X[:, features] if subsample_features else X + leaves_index = tree.apply(X_subset) + node_indicator = tree.decision_path(X_subset) + n_samples_leaf = tree.tree_.n_node_samples[leaves_index] + depths += (np.ravel(node_indicator.sum(axis=1)) + _average_path_length(n_samples_leaf) - 1.0) + scores = 2 ** (-depths / (1 * _average_path_length([self.max_samples_]))) + return scores + + def fs_datasets_hyperparams(self,dataset): + data = { + # cardio + ('cardio'): {'contamination': 0.1, 'max_samples': 64, 'n_estimators': 150}, + # ionosphere + ('ionosphere'): {'contamination': 0.2, 'max_samples': 256, 'n_estimators': 100}, + # lympho + ('lympho'): {'contamination': 0.05, 'max_samples': 64, 'n_estimators': 150}, + # letter + ('letter'): {'contamination': 0.1, 'max_samples': 256, 'n_estimators': 50}, + # musk + ('musk'): {'contamination': 0.05, 'max_samples': 128, 'n_estimators': 100}, + # satellite + ('satellite'): {'contamination': 0.15, 'max_samples': 64, 'n_estimators': 150} + } + return data[dataset] + + def diffi_ib(self, X, adjust_iic=True): # "ib" stands for "in-bag" + # start time + start = time.time() + # initialization 
+ num_feat = X.shape[1] + estimators = self.estimators_ + cfi_outliers_ib = np.zeros(num_feat).astype('float') + cfi_inliers_ib = np.zeros(num_feat).astype('float') + counter_outliers_ib = np.zeros(num_feat).astype('int') + counter_inliers_ib = np.zeros(num_feat).astype('int') + in_bag_samples = self.estimators_samples_ + # for every iTree in the iForest + for k, estimator in enumerate(estimators): + # get in-bag samples indices + in_bag_sample = list(in_bag_samples[k]) + # get in-bag samples (predicted inliers and predicted outliers) + X_ib = X[in_bag_sample,:] + as_ib = self.decision_function_single_tree(k, X_ib) + X_outliers_ib = X_ib[np.where(as_ib < 0)] + X_inliers_ib = X_ib[np.where(as_ib > 0)] + if X_inliers_ib.shape[0] == 0 or X_outliers_ib.shape[0] == 0: + continue + # compute relevant quantities + n_nodes = estimator.tree_.node_count + children_left = estimator.tree_.children_left + children_right = estimator.tree_.children_right + feature = estimator.tree_.feature + node_depth = np.zeros(shape=n_nodes, dtype=np.int64) + is_leaves = np.zeros(shape=n_nodes, dtype=bool) + # compute node depths + stack = [(0, -1)] + while len(stack) > 0: + node_id, parent_depth = stack.pop() + node_depth[node_id] = parent_depth + 1 + # if we have a test node + if (children_left[node_id] != children_right[node_id]): + stack.append((children_left[node_id], parent_depth + 1)) + stack.append((children_right[node_id], parent_depth + 1)) + else: + is_leaves[node_id] = True + # OUTLIERS + # compute IICs for outliers + lambda_outliers_ib = self._get_iic(estimator, X_outliers_ib, is_leaves, adjust_iic) + # update cfi and counter for outliers + node_indicator_all_points_outliers_ib = estimator.decision_path(X_outliers_ib) + node_indicator_all_points_array_outliers_ib = node_indicator_all_points_outliers_ib.toarray() + # for every point judged as abnormal + for i in range(len(X_outliers_ib)): + path = list(np.where(node_indicator_all_points_array_outliers_ib[i] == 1)[0]) + depth = 
node_depth[path[-1]] + for node in path: + current_feature = feature[node] + if lambda_outliers_ib[node] == -1: + continue + else: + cfi_outliers_ib[current_feature] += (1 / depth) * lambda_outliers_ib[node] + counter_outliers_ib[current_feature] += 1 + # INLIERS + # compute IICs for inliers + lambda_inliers_ib = self._get_iic(estimator, X_inliers_ib, is_leaves, adjust_iic) + # update cfi and counter for inliers + node_indicator_all_points_inliers_ib = estimator.decision_path(X_inliers_ib) + node_indicator_all_points_array_inliers_ib = node_indicator_all_points_inliers_ib.toarray() + # for every point judged as normal + for i in range(len(X_inliers_ib)): + path = list(np.where(node_indicator_all_points_array_inliers_ib[i] == 1)[0]) + depth = node_depth[path[-1]] + for node in path: + current_feature = feature[node] + if lambda_inliers_ib[node] == -1: + continue + else: + cfi_inliers_ib[current_feature] += (1 / depth) * lambda_inliers_ib[node] + counter_inliers_ib[current_feature] += 1 + # compute FI + fi_outliers_ib = np.where(counter_outliers_ib > 0, cfi_outliers_ib / counter_outliers_ib, 0) + fi_inliers_ib = np.where(counter_inliers_ib > 0, cfi_inliers_ib / counter_inliers_ib, 0) + fi_ib = fi_outliers_ib / fi_inliers_ib + end = time.time() + exec_time = end - start + return fi_ib, exec_time + + + def local_diffi(self, x): + # start time + start = time.time() + # initialization + estimators = self.estimators_ + cfi = np.zeros(len(x)).astype('float') + counter = np.zeros(len(x)).astype('int') + max_depth = int(np.ceil(np.log2(self.max_samples))) + # for every iTree in the iForest + for estimator in estimators: + n_nodes = estimator.tree_.node_count + children_left = estimator.tree_.children_left + children_right = estimator.tree_.children_right + feature = estimator.tree_.feature + node_depth = np.zeros(shape=n_nodes, dtype=np.int64) + is_leaves = np.zeros(shape=n_nodes, dtype=bool) + # compute node depths + stack = [(0, -1)] + while len(stack) > 0: + node_id, 
parent_depth = stack.pop() + node_depth[node_id] = parent_depth + 1 + # if test node + if (children_left[node_id] != children_right[node_id]): + stack.append((children_left[node_id], parent_depth + 1)) + stack.append((children_right[node_id], parent_depth + 1)) + else: + is_leaves[node_id] = True + # update cumulative importance and counter + x = x.reshape(1,-1) + node_indicator = estimator.decision_path(x) + node_indicator_array = node_indicator.toarray() + path = list(np.where(node_indicator_array == 1)[1]) + leaf_depth = node_depth[path[-1]] + for node in path: + if not is_leaves[node]: + current_feature = feature[node] + cfi[current_feature] += (1 / leaf_depth) - (1 / max_depth) + counter[current_feature] += 1 + # compute FI + fi = np.zeros(len(cfi)) + for i in range(len(cfi)): + if counter[i] != 0: + fi[i] = cfi[i] / counter[i] + end = time.time() + exec_time = end - start + return fi, exec_time + + + def _get_iic(estimator, predictions, is_leaves, adjust_iic): + desired_min = 0.5 + desired_max = 1.0 + epsilon = 0.0 + n_nodes = estimator.tree_.node_count + lambda_ = np.zeros(n_nodes) + children_left = estimator.tree_.children_left + children_right = estimator.tree_.children_right + # compute samples in each node + node_indicator_all_samples = estimator.decision_path(predictions).toarray() + num_samples_in_node = np.sum(node_indicator_all_samples, axis=0) + # ASSIGN INDUCED IMBALANCE COEFFICIENTS (IIC) + for node in range(n_nodes): + # compute relevant quantities for current node + num_samples_in_current_node = num_samples_in_node[node] + num_samples_in_left_children = num_samples_in_node[children_left[node]] + num_samples_in_right_children = num_samples_in_node[children_right[node]] + # if there is only 1 feasible split or node is leaf -> no IIC is assigned + if num_samples_in_current_node == 0 or num_samples_in_current_node == 1 or is_leaves[node]: + lambda_[node] = -1 + # if useless split -> assign epsilon + elif num_samples_in_left_children == 0 or 
num_samples_in_right_children == 0: + lambda_[node] = epsilon + else: + if num_samples_in_current_node%2==0: # even + current_min = 0.5 + else: # odd + current_min = ceil(num_samples_in_current_node/2)/num_samples_in_current_node + current_max = (num_samples_in_current_node-1)/num_samples_in_current_node + tmp = np.max([num_samples_in_left_children, num_samples_in_right_children]) / num_samples_in_current_node + if adjust_iic and current_min!=current_max: + lambda_[node] = ((tmp-current_min)/(current_max-current_min))*(desired_max-desired_min)+desired_min + else: + lambda_[node] = tmp + return lambda_ + + def local_diffi_batch(self, X): + fi = [] + ord_idx = [] + exec_time = [] + for i in range(X.shape[0]): + x_curr = X[i, :] + fi_curr, exec_time_curr = self.local_diffi(x_curr) + fi.append(fi_curr) + ord_idx_curr = np.argsort(fi_curr)[::-1] + ord_idx.append(ord_idx_curr) + exec_time.append(exec_time_curr) + fi = np.vstack(fi) + ord_idx = np.vstack(ord_idx) + return fi, ord_idx, exec_time + + + + + def compute_local_importances(self,X: pd.DataFrame,name: str,pwd_imp_score: str = os.getcwd(), pwd_plt_data: str = os.getcwd()) -> tuple[np.array,dict,str,str]: + """ + Collect useful information that will be successively used by the plt_importances_bars,plt_global_importance_bar and plt_feat_bar_plot + functions. + + Parameters + ---------- + self: Current instance of the Isolation Forest model + X: Input dataset + name: Dataset's name + pwd_imp_score: Directory where the Importance Scores results will be saved as pkl files, by default the current working directory + pwd_plt_data: Directory where the plot data results will be saved as pkl files, by default the current working directory + + Returns: + imps: 2-dimensional array containing the local Feature Importance values for the samples of the input dataset X. The array is also locally saved in a pkl file for the sake of reproducibility. 
+ plt_data: Dictionary containig the average Importance Scores values, the feature order and the standard deviations on the Importance Scores. The dictionary is also locally saved in a pkl file for the sake of reproducibility. + path_fi: Path of the pkl file containing the Importance Scores + path_plt_data: Path of the pkl file containing the plt data + """ + + name='LFI_'+name + fi,_,_=self.local_diffi_batch(X) + + # Save the Importance Scores in a pkl file + path_fi = pwd_imp_score + '/imp_scores_' + name + '.pkl' + with open(path_fi, 'wb') as fl: + pickle.dump(fi,fl) + + """ + Take the mean feature importance scores over the different runs for the Feature Importance Plot + and put it in decreasing order of importance. + To remove the possible np.nan or np.inf values from the mean computation use assign np.nan to the np.inf values + and then ignore the np.nan values using np.nanmean + """ + + fi[fi==np.inf]=np.nan + mean_imp=np.nanmean(fi,axis=0) + std_imp=np.nanstd(fi,axis=0) + mean_imp_val=np.sort(mean_imp) + feat_order=mean_imp.argsort() + + plt_data={'Importances': mean_imp_val, + 'feat_order': feat_order, + 'std': std_imp[mean_imp.argsort()]} + + # Save the plt_data dictionary in a pkl file + path_plt_data = pwd_plt_data + '/plt_data_' + name + '.pkl' + with open(path_plt_data, 'wb') as fl: + pickle.dump(plt_data,fl) + + + return fi,plt_data,path_fi,path_plt_data + + def compute_global_importances(self,X: pd.DataFrame, n_runs:int, name: str,pwd_imp_score: str = os.getcwd(), pwd_plt_data: str = os.getcwd()) -> tuple[np.array,dict,str,str]: + """ + Collect useful information that will be successively used by the plt_importances_bars,plt_global_importance_bar and plt_feat_bar_plot + functions. + + Parameters: + model: An instance of the Isolation Forest model + X: Input Dataset + n_runs: Number of runs to perform in order to compute the Global Feature Importance Scores. 
+ name: Dataset's name + pwd_imp_score: Directory where the Importance Scores results will be saved as pkl files, by default the current working directory + pwd_plt_data: Directory where the plot data results will be saved as pkl files, by default the current working directory + + Returns: + imps: 2-dimensional array containing the local Feature Importance values for the samples of the input dataset X. The array is also locally saved in a pkl file for the sake of reproducibility. + plt_data: Dictionary containig the average Importance Scores values, the feature order and the standard deviations on the Importance Scores. The dictionary is also locally saved in a pkl file for the sake of reproducibility. + path_fi: Path of the pkl file containing the Importance Scores + path_plt_data: Path of the pkl file containing the plt data + """ + + name='GFI_'+name + fi=np.zeros(shape=(n_runs,X.shape[1])) + for i in range(n_runs): + self.fit(X) + fi[i,:],_=self.diffi_ib(X) + + # Save the Importance Scores in a pkl file + path_fi = pwd_imp_score + '/imp_scores_' + name + '.pkl' + with open(path_fi, 'wb') as fl: + pickle.dump(fi,fl) + + + fi[fi==np.inf]=np.nan + mean_imp=np.nanmean(fi,axis=0) + std_imp=np.nanstd(fi,axis=0) + mean_imp_val=np.sort(mean_imp) + feat_order=mean_imp.argsort() + + plt_data={'Importances': mean_imp_val, + 'feat_order': feat_order, + 'std': std_imp[mean_imp.argsort()]} + + # Save the plt_data dictionary in a pkl file + path_plt_data = pwd_plt_data + '/plt_data' + name + '.pkl' + with open(path_plt_data, 'wb') as fl: + pickle.dump(plt_data,fl) + + + return fi,plt_data,path_fi,path_plt_data + + def plt_importances_bars(self,imps_path: str, name: str, pwd: str =os.getcwd(),f: int = 6,save: bool =True): + """ + Obtain the Global Importance Bar Plot given the Importance Scores values computed in the compute_imps function. 
+ + Parameters: + imps_path: Path of the pkl file containing the 2-dimensional array of the LFI/GFI Scores for the input dataset.Obtained from the compute_imps function. + name: Dataset's name + pwd: Directory where the plot will be saved as a PDF file. By default the value of pwd is set to the current working directory. + f: Number of vertical bars to include in the Bar Plot. By default f is set to 6. + save: Boolean variable used to decide weather to save the Bar Plot locally as a PDF or not. + + Returns: + Obtain the Bar Plot which is then saved locally as a PDF. + """ + + #Load the imps array from the pkl file contained in imps_path -> the imps_path is returned from the + #compute_local_importances or compute_global_importances functions so we have it for free + with open(imps_path, 'rb') as file: + importances = pickle.load(file) + + number_colours = 20 + color = plt.cm.get_cmap('tab20',number_colours).colors + patterns = [None, "/" , "\\" , "|" , "-" , "+" , "x", "o", "O", ".", "*" ] + importances_matrix = np.array([np.array(pd.Series(x).sort_values(ascending = False).index).T for x in importances]) + dim=importances.shape[1] + dim=int(dim) + bars = [[(list(importances_matrix[:,j]).count(i)/len(importances_matrix))*100 for i in range(dim)] for j in range(dim)] + bars = pd.DataFrame(bars) + #display(bars) + + tick_names=[] + for i in range(1,f+1): + if i==1: + tick_names.append(r'${}'.format(i) + r'^{st}$') + elif i==2: + tick_names.append(r'${}'.format(i) + r'^{nd}$') + elif i==3: + tick_names.append(r'${}'.format(i) + r'^{rd}$') + else: + tick_names.append(r'${}'.format(i) + r'^{th}$') + + barWidth = 0.85 + r = range(dim) + ncols=1 + if importances.shape[1]>15: + ncols=2 + + + fig, ax = plt.subplots() + + for i in range(dim): + ax.bar(r[:f], bars.T.iloc[i, :f].values, bottom=bars.T.iloc[:i, :f].sum().values, color=color[i % number_colours], edgecolor='white', width=barWidth, label=str(i), hatch=patterns[i // number_colours]) + + ax.set_xlabel("Rank", 
fontsize=20) + ax.set_xticks(range(f), tick_names[:f]) + ax.set_ylabel("Percentage count", fontsize=20) + ax.set_yticks(range(10, 101, 10), [str(x) + "%" for x in range(10, 101, 10)]) + ax.legend(bbox_to_anchor=(1.05, 0.95), loc="upper left",ncol=ncols) + + if save: + plt.savefig(pwd + '/{}_bar_plot.pdf'.format(name), bbox_inches='tight') + + return fig, ax, bars + + + def plt_feat_bar_plot(self,plt_data_path: str,name: str,pwd: str =os.getcwd(),is_local: bool =True,save: bool =True): + """ + Obtain the Global Feature Importance Score Plot exploiting the information obtained from compute_imps function. + + Parameters + ---------- + plt_data_path: Dictionary generated from the compute_imps function with the necessary information to create the Score Plot. + name: Dataset's name + pwd: Directory where the plot will be saved as pkl files. By default the value of pwd is set to the current working directory. + is_local: Boolean variable used to specify weather we are plotting the Global or Local Feature Importance in order to set the file name. + If is_local is True the result will be the LFI Score Plot (based on the LFI scores of the input samples), otherwise the result is the GFI + Score Plot (based on the GFI scores obtained in the different n_runs execution of the model). + save: Boolean variable used to decide weather to save the Score Plot locally as a PDF or not. + + Returns: + Obtain the Score Plot which is also locally saved as a PDF. 
+ """ + #Load the plt_data dictionary from the pkl file contained in plt_data_path -> the plt_data_path is returned from the + #compute_local_importances or compute_global_importances functions so we have it for free + with open(plt_data_path, 'rb') as f: + plt_data = pickle.load(f) + + name_file='Score_plot_'+name + + patterns = [None, "/" , "\\" , "|" , "-" , "+" , "x", "o", "O", ".", "*" ] + imp_vals=plt_data['Importances'] + feat_imp=pd.DataFrame({'Global Importance': np.round(imp_vals,3), + 'Feature': plt_data['feat_order'], + 'std': plt_data['std'] + }) + + if len(feat_imp)>15: + feat_imp=feat_imp.iloc[-15:].reset_index(drop=True) + + dim=feat_imp.shape[0] + + number_colours = 20 + + plt.style.use('default') + plt.rcParams['axes.facecolor'] = '#F2F2F2' + plt.rcParams['axes.axisbelow'] = True + color = plt.cm.get_cmap('tab20',number_colours).colors + ax1=feat_imp.plot(y='Global Importance',x='Feature',kind="barh",color=color[feat_imp['Feature']%number_colours],xerr='std', + capsize=5, alpha=1,legend=False, + hatch=[patterns[i//number_colours] for i in feat_imp['Feature']]) + xlim=np.min(imp_vals)-0.2*np.min(imp_vals) + + ax1.grid(alpha=0.7) + ax2 = ax1.twinx() + # Add labels on the right side of the bars + values=[] + for i, v in enumerate(feat_imp['Global Importance']): + values.append(str(v) + ' +- ' + str(np.round(feat_imp['std'][i],2))) + + ax2.set_ylim(ax1.get_ylim()) + ax2.set_yticks(range(dim)) + ax2.set_yticklabels(values) + ax2.grid(alpha=0) + plt.axvline(x=0, color=".5") + ax1.set_xlabel('Importance Score',fontsize=20) + ax1.set_ylabel('Features',fontsize=20) + plt.xlim(xlim) + plt.subplots_adjust(left=0.3) + if save: + plt.savefig(pwd+'/{}.pdf'.format(name_file),bbox_inches='tight') + + return ax1,ax2 + + + def plot_importance_map(self,name: str, X_train: pd.DataFrame,y_train: np.array ,resolution: int, + pwd: str =os.getcwd(),save: bool =True,m: bool =None,factor: int =3,feats_plot: tuple[int,int] =(0,1),ax=None): + """ + Produce the Local Feature 
Importance Scoremap. + + Parameters: + name: Dataset's name + model: Instance of the Isolation Forest model. + X_train: Training Set + y_train: Dataset training labels + resolution: Scoremap resolution + pwd: Directory where the plot will be saved. By default the value of pwd is set to the current working directory. + save: Boolean variable used to decide weather to save the Score Plot locally as a PDF or not. + m: Boolean variable regulating the plt.pcolor advanced settings. By defualt the value of m is set to None + factor: Integer factor used to define the minimum and maximum value of the points used to create the scoremap. By default the value of f is set to 3. + feats_plot: This tuple contains the indexes of the pair features to compare in the Scoremap. By default the value of feats_plot + is set to (0,1) + plt: Plt object used to create the plot. + + Returns: + Obtain the Scoremap which is also locally saved as a PDF. + """ + mins = X_train.min(axis=0)[list(feats_plot)] + maxs = X_train.max(axis=0)[list(feats_plot)] + mean = X_train.mean(axis = 0) + mins = list(mins-(maxs-mins)*factor/10) + maxs = list(maxs+(maxs-mins)*factor/10) + xx, yy = np.meshgrid(np.linspace(mins[0], maxs[0], resolution), np.linspace(mins[1], maxs[1], resolution)) + mean = np.repeat(np.expand_dims(mean,0),len(xx)**2,axis = 0) + mean[:,feats_plot[0]]=xx.reshape(len(xx)**2) + mean[:,feats_plot[1]]=yy.reshape(len(yy)**2) + + importance_matrix = np.zeros_like(mean) + self.max_samples = len(X_train) + for i in range(importance_matrix.shape[0]): + importance_matrix[i] = self.local_diffi(mean[i])[0] + + sign = np.sign(importance_matrix[:,feats_plot[0]]-importance_matrix[:,feats_plot[1]]) + Score = sign*((sign>0)*importance_matrix[:,feats_plot[0]]+(sign<0)*importance_matrix[:,feats_plot[1]]) + x = X_train[:,feats_plot[0]].squeeze() + y = X_train[:,feats_plot[1]].squeeze() + + Score = Score.reshape(xx.shape) + + # Create a new pyplot object if plt is not provided + if ax is None: + fig, ax = 
plt.subplots() + + if m is not None: + cp = ax.pcolor(xx, yy, Score, cmap=cm.RdBu, vmin=-m, vmax=m, shading='nearest') + else: + cp = ax.pcolor(xx, yy, Score, cmap=cm.RdBu, shading='nearest', norm=colors.CenteredNorm()) + + ax.contour(xx, yy, (importance_matrix[:, feats_plot[0]] + importance_matrix[:, feats_plot[1]]).reshape(xx.shape), levels=7, cmap=cm.Greys, alpha=0.7) + + try: + ax.scatter(x[y_train == 0], y[y_train == 0], s=40, c="tab:blue", marker="o", edgecolors="k", label="inliers") + ax.scatter(x[y_train == 1], y[y_train == 1], s=60, c="tab:orange", marker="*", edgecolors="k", label="outliers") + except IndexError: + print('Handling the IndexError Exception...') + ax.scatter(x[(y_train == 0)[:, 0]], y[(y_train == 0)[:, 0]], s=40, c="tab:blue", marker="o", edgecolors="k", label="inliers") + ax.scatter(x[(y_train == 1)[:, 0]], y[(y_train == 1)[:, 0]], s=60, c="tab:orange", marker="*", edgecolors="k", label="outliers") + + ax.legend() + if save: + plt.savefig(pwd + '/Local_Importance_Scoremap_{}.pdf'.format(name), bbox_inches='tight') + else: + fig,ax=None,None + + return fig, ax + + def plot_complete_scoremap(self,name:str,dim:int,X: pd.DataFrame, y: np.array, pwd:str =os.getcwd()): + """ + Produce the Complete Local Feature Importance Scoremap: a Scoremap for each pair of features in the input dataset. + + Parameters: + name: Dataset's name + dim: Number of input features in the dataset + model: Instance of the Isolation Forest model. + X: Input dataset + y: Dataset labels + pwd: Directory where the plot will be saved. By default the value of pwd is set to the current working directory. + + Returns: + Obtain the Complete Scoremap which is also locally saved as a PDF. + """ + + fig, ax = plt.subplots(dim, dim, figsize=(50, 50)) + for i in range(dim): + for j in range(i+1,dim): + features = [i,j] + # One of the successive two lines can be commented so that we obtain only one "half" of the + #matrix of plots to reduce a little bit the execution time. 
+ _,_=self.plot_importance_map(name,X, y, 50, pwd, feats_plot = (features[0],features[1]), ax=ax[i,j],save=False) + _,_=self.plot_importance_map(name,X, y, 50, pwd, feats_plot = (features[1],features[0]), ax=ax[j,i],save=False) + #fig.suptitle("comparison between DIFFI and ExIFFI "+name+" dataset",fontsize=20) + + plt.savefig(pwd+'/Local_Importance_Scoremap_{}_complete.pdf'.format(name),bbox_inches='tight') + return fig,ax From e3a28d3e3025e9d1eb4ac2ea6d4917813e9feb0e Mon Sep 17 00:00:00 2001 From: FrizzoDavide Date: Mon, 20 Nov 2023 16:07:14 +0100 Subject: [PATCH 02/20] Added _max_features to iForest --- pyod/models/iforest.py | 271 ++++++++++++++++++++++++++++++----------- 1 file changed, 201 insertions(+), 70 deletions(-) diff --git a/pyod/models/iforest.py b/pyod/models/iforest.py index 4726f0a2b..9c0fe2e51 100644 --- a/pyod/models/iforest.py +++ b/pyod/models/iforest.py @@ -281,6 +281,13 @@ def max_samples_(self): Decorator for scikit-learn Isolation Forest attributes. """ return self.detector_.max_samples_ + + @property + def _max_features(self): + """The number of features used by the model (i.e. self.max_features * X.shape[1]). + Decorator for scikit-learn Isolation Forest attributes. + """ + return self.detector_._max_features @property def estimators_features_(self): @@ -340,10 +347,33 @@ def feature_importances_(self): # The functions below have been adapted from the sklearn source code def decision_function_single_tree(self, tree_idx, X): + """Modification of the decision_function method from sklearn.ensemble.IsolationForest which compute the decision function + for a single tree of the forest. + + Parameters + ---------- + tree_idx : Index of the iTree on which the decision function is computed. + X : numpy array of shape (n_samples, n_features) representing the in-bag sample of the tree. + + Returns + ------- + decision_function : numpy array of shape (n_samples,) representing the decision function of the tree on its in-bag sample. 
+ """ return self._score_samples(tree_idx, X) - self.offset_ def _score_samples(self, tree_idx, X): + """Modification of the score_samples method from sklearn.ensemble.IsolationForest which compute the score samples for a single tree of the forest. + + Parameters + ---------- + tree_idx : Index of the iTree on which the decision function is computed. + X : numpy array of shape (n_samples, n_features) representing the in-bag sample of the tree. + + Returns + ------- + score_samples : numpy array of shape (n_samples,) representing the score samples of the tree on its in-bag sample. + """ n_feat= self.n_features_in_ if n_feat != X.shape[1]: raise ValueError("Number of features of the model must " @@ -354,6 +384,18 @@ def _score_samples(self, tree_idx, X): def _compute_chunked_score_samples(self, tree_idx, X): + """Modification of the compute_chunked_score_samples method from sklearn.ensemble.IsolationForest + used to compute the score samples on the maximum number of rows processable by the working memory for a single tree of the forest. + + Parameters + ---------- + tree_idx : Index of the iTree on which the decision function is computed. + X : numpy array of shape (n_samples, n_features) representing the in-bag sample of the tree. + + Returns + ------- + score_samples : numpy array of shape (n_samples,) representing the score samples of the tree on a batch of the in-bag sample. + """ n_samples = _num_samples(X) if int(self.max_features*X.shape[1]) == X.shape[1]: subsample_features = False @@ -369,6 +411,19 @@ def _compute_chunked_score_samples(self, tree_idx, X): def _compute_score_samples_single_tree(self, tree_idx, X, subsample_features): + """Modification of the _compute_score_samples method from sklearn.ensemble.IsolationForest + used to compute the score samples for each sample for a single tree of the forest. + + Parameters + ---------- + tree_idx : Index of the iTree on which the decision function is computed. 
+ X : numpy array of shape (n_samples, n_features) representing the in-bag sample of the tree. + subsample_features : boolean indicating if the tree has been trained on a subsample of the features. + + Returns + ------- + score_samples : numpy array of shape (n_samples,) representing the score samples of the tree in its in-bag sample. + """ n_samples = X.shape[0] depths = np.zeros(n_samples, order="f") tree = self.estimators_[tree_idx] @@ -382,6 +437,12 @@ def _compute_score_samples_single_tree(self, tree_idx, X, subsample_features): return scores def fs_datasets_hyperparams(self,dataset): + """Returns a list of hyperparametr values to train the iForest model for different datasets. + + Parameters + ---------- + dataset : Dataset name. Available names are: 'cardio', 'ionosphere', 'lympho', 'letter', 'musk', 'satellite'. + """ data = { # cardio ('cardio'): {'contamination': 0.1, 'max_samples': 64, 'n_estimators': 150}, @@ -399,6 +460,18 @@ def fs_datasets_hyperparams(self,dataset): return data[dataset] def diffi_ib(self, X, adjust_iic=True): # "ib" stands for "in-bag" + """Computes the Global Feature Importance scores for a set of input samples according to the DIFFI algorithm. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) representing the input samples. + adjust_iic : boolean indicating if the IICs (Induced Imbalance Coefficients) should be adjusted or not. + + Returns + ------- + fi_ib : numpy array of shape (n_features,) representing the Global Feature Importance scores. + exec_time : float representing the execution time of the algorithm. + """ # start time start = time.time() # initialization @@ -482,6 +555,17 @@ def diffi_ib(self, X, adjust_iic=True): # "ib" stands for "in-bag" def local_diffi(self, x): + """Compute the Local Feature Importance scores for a single input sample according to the DIFFI algorithm. + + Parameters + ---------- + x : numpy array of shape (n_features,) representing the input sample. 
+ + Returns + ------- + fi : numpy array of shape (n_features,) representing the Local Feature Importance scores. + exec_time : float representing the execution time of the algorithm. + """ # start time start = time.time() # initialization @@ -529,7 +613,20 @@ def local_diffi(self, x): return fi, exec_time - def _get_iic(estimator, predictions, is_leaves, adjust_iic): + def _get_iic(self,estimator, predictions, is_leaves, adjust_iic): + """Computes the Induced Imbalance Coefficients (IIC) for a tree of the iForest. + + Parameters + ---------- + estimator : Tree of the iForest. + predictions : Subset of the initial training set, containing the inliers or the outliers, on which the IIC are computed. + is_leaves : Boolean array of shape (n_nodes,) indicating if a node is a leaf or not. + adjust_iic : Boolean indicating if the IIC should be adjusted or not. + + Returns + ------- + lambda_ : numpy array of shape (n_nodes,) representing the IIC for each node of the tree. + """ desired_min = 0.5 desired_max = 1.0 epsilon = 0.0 @@ -566,6 +663,19 @@ def _get_iic(estimator, predictions, is_leaves, adjust_iic): return lambda_ def local_diffi_batch(self, X): + """Computes the Local Feature Importance scores for a set of input samples according to the DIFFI algorithm. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) representing the input samples. + + Returns + ------- + fi : numpy array of shape (n_samples, n_features) representing the Local Feature Importance scores. + ord_idx : numpy array of shape (n_samples, n_features) representing the order of the features according to their Local Feature Importance scores. + The samples are sorted in decreasing order of Feature Importance. + exec_time : float representing the execution time of the algorithm. 
+ """ fi = [] ord_idx = [] exec_time = [] @@ -590,17 +700,19 @@ def compute_local_importances(self,X: pd.DataFrame,name: str,pwd_imp_score: str Parameters ---------- - self: Current instance of the Isolation Forest model X: Input dataset name: Dataset's name pwd_imp_score: Directory where the Importance Scores results will be saved as pkl files, by default the current working directory pwd_plt_data: Directory where the plot data results will be saved as pkl files, by default the current working directory - Returns: - imps: 2-dimensional array containing the local Feature Importance values for the samples of the input dataset X. The array is also locally saved in a pkl file for the sake of reproducibility. - plt_data: Dictionary containig the average Importance Scores values, the feature order and the standard deviations on the Importance Scores. The dictionary is also locally saved in a pkl file for the sake of reproducibility. - path_fi: Path of the pkl file containing the Importance Scores - path_plt_data: Path of the pkl file containing the plt data + Returns + ---------- + imps: array of shape (n_samples,n_features) containing the local Feature Importance values for the samples of the input dataset X. + The array is also locally saved in a pkl file for the sake of reproducibility. + plt_data: Dictionary containig the average Importance Scores values, the feature order and the standard deviations on the Importance Scores. + The dictionary is also locally saved in a pkl file for the sake of reproducibility. + path_fi: Path of the pkl file containing the Importance Scores. + path_plt_data: Path of the pkl file containing the plt data. """ name='LFI_'+name @@ -641,19 +753,22 @@ def compute_global_importances(self,X: pd.DataFrame, n_runs:int, name: str,pwd_i Collect useful information that will be successively used by the plt_importances_bars,plt_global_importance_bar and plt_feat_bar_plot functions. 
- Parameters: - model: An instance of the Isolation Forest model - X: Input Dataset - n_runs: Number of runs to perform in order to compute the Global Feature Importance Scores. - name: Dataset's name - pwd_imp_score: Directory where the Importance Scores results will be saved as pkl files, by default the current working directory - pwd_plt_data: Directory where the plot data results will be saved as pkl files, by default the current working directory + Parameters + ---------- + X: Input Dataset + n_runs: Number of runs to perform in order to compute the Global Feature Importance Scores. + name: Dataset's name + pwd_imp_score: Directory where the Importance Scores results will be saved as pkl files, by default the current working directory + pwd_plt_data: Directory where the plot data results will be saved as pkl files, by default the current working directory - Returns: - imps: 2-dimensional array containing the local Feature Importance values for the samples of the input dataset X. The array is also locally saved in a pkl file for the sake of reproducibility. - plt_data: Dictionary containig the average Importance Scores values, the feature order and the standard deviations on the Importance Scores. The dictionary is also locally saved in a pkl file for the sake of reproducibility. - path_fi: Path of the pkl file containing the Importance Scores - path_plt_data: Path of the pkl file containing the plt data + Returns + ---------- + imps: array of shape (n_samples,n_features) containing the local Feature Importance values for the samples of the input dataset X. + The array is also locally saved in a pkl file for the sake of reproducibility. + plt_data: Dictionary containing the average Importance Scores values, the feature order and the standard deviations on the Importance Scores. + The dictionary is also locally saved in a pkl file for the sake of reproducibility. 
+ path_fi: Path of the pkl file containing the Importance Scores + path_plt_data: Path of the pkl file containing the plt data """ name='GFI_'+name @@ -686,20 +801,32 @@ def compute_global_importances(self,X: pd.DataFrame, n_runs:int, name: str,pwd_i return fi,plt_data,path_fi,path_plt_data - def plt_importances_bars(self,imps_path: str, name: str, pwd: str =os.getcwd(),f: int = 6,save: bool =True): + def plt_importances_bars(self,imps_path: str, name: str, pwd: str =os.getcwd(),f: int = 6,is_local: bool=False, save: bool =True): """ - Obtain the Global Importance Bar Plot given the Importance Scores values computed in the compute_imps function. + Obtain the Global Importance Bar Plot given the Importance Scores values computed in the compute_local_importance or compute_global_importance functions. - Parameters: - imps_path: Path of the pkl file containing the 2-dimensional array of the LFI/GFI Scores for the input dataset.Obtained from the compute_imps function. - name: Dataset's name - pwd: Directory where the plot will be saved as a PDF file. By default the value of pwd is set to the current working directory. - f: Number of vertical bars to include in the Bar Plot. By default f is set to 6. - save: Boolean variable used to decide weather to save the Bar Plot locally as a PDF or not. + Parameters + ---------- + imps_path: Path of the pkl file containing the array of shape (n_samples,n_features) with the LFI/GFI Scores for the input dataset. + Obtained from the compute_local_importance or compute_global_importance functions. + name: Dataset's name + pwd: Directory where the plot will be saved as a PDF file. By default the value of pwd is set to the current working directory. + f: Number of vertical bars to include in the Bar Plot. By default f is set to 6. + is_local: Boolean variable used to specify weather we are plotting the Global or Local Feature Importance in order to set the file name. 
+ If is_local is True the result will be the LFI Score Plot (based on the LFI scores of the input samples), otherwise the result is the GFI + Score Plot (based on the GFI scores obtained in the different n_runs execution of the model). By default is_local is set to False. + save: Boolean variable used to decide weather to save the Bar Plot locally as a PDF or not. BY default save is set to True. - Returns: - Obtain the Bar Plot which is then saved locally as a PDF. + Returns + ---------- + fig,ax : plt.figure and plt.axes objects used to create the plot + bars: pd.DataFrame containing the percentage count of the features in the first f positions of the Bar Plot. """ + + name_file='GFI_Bar_plot_'+name + + if is_local: + name_file='LFI_Bar_plot_'+name #Load the imps array from the pkl file contained in imps_path -> the imps_path is returned from the #compute_local_importances or compute_global_importances functions so we have it for free @@ -714,7 +841,6 @@ def plt_importances_bars(self,imps_path: str, name: str, pwd: str =os.getcwd(),f dim=int(dim) bars = [[(list(importances_matrix[:,j]).count(i)/len(importances_matrix))*100 for i in range(dim)] for j in range(dim)] bars = pd.DataFrame(bars) - #display(bars) tick_names=[] for i in range(1,f+1): @@ -746,34 +872,39 @@ def plt_importances_bars(self,imps_path: str, name: str, pwd: str =os.getcwd(),f ax.legend(bbox_to_anchor=(1.05, 0.95), loc="upper left",ncol=ncols) if save: - plt.savefig(pwd + '/{}_bar_plot.pdf'.format(name), bbox_inches='tight') + plt.savefig(pwd + '/{}_bar_plot.pdf'.format(name_file), bbox_inches='tight') return fig, ax, bars - def plt_feat_bar_plot(self,plt_data_path: str,name: str,pwd: str =os.getcwd(),is_local: bool =True,save: bool =True): + def plt_feat_bar_plot(self,plt_data_path: str,name: str,pwd: str =os.getcwd(),is_local: bool =False,save: bool =True): """ - Obtain the Global Feature Importance Score Plot exploiting the information obtained from compute_imps function. 
+ Obtain the Global Feature Importance Score Plot exploiting the information obtained from the compute_local_importance or compute_global_importance functions. Parameters ---------- - plt_data_path: Dictionary generated from the compute_imps function with the necessary information to create the Score Plot. - name: Dataset's name - pwd: Directory where the plot will be saved as pkl files. By default the value of pwd is set to the current working directory. - is_local: Boolean variable used to specify weather we are plotting the Global or Local Feature Importance in order to set the file name. - If is_local is True the result will be the LFI Score Plot (based on the LFI scores of the input samples), otherwise the result is the GFI - Score Plot (based on the GFI scores obtained in the different n_runs execution of the model). - save: Boolean variable used to decide weather to save the Score Plot locally as a PDF or not. + plt_data_path: Dictionary generated from the compute_local_importance or compute_global_importance functions + with the necessary information to create the Score Plot. + name: Dataset's name + pwd: Directory where the plot will be saved as a PDF file. By default the value of pwd is set to the current working directory. + is_local: Boolean variable used to specify weather we are plotting the Global or Local Feature Importance in order to set the file name. + If is_local is True the result will be the LFI Score Plot (based on the LFI scores of the input samples), otherwise the result is the GFI + Score Plot (based on the GFI scores obtained in the different n_runs execution of the model). By default is_local is set to False. + save: Boolean variable used to decide weather to save the Score Plot locally as a PDF or not. By default save is set to True. - Returns: - Obtain the Score Plot which is also locally saved as a PDF. + Returns + ---------- + ax1,ax2: The two plt.axes objects used to create the plot. 
""" #Load the plt_data dictionary from the pkl file contained in plt_data_path -> the plt_data_path is returned from the #compute_local_importances or compute_global_importances functions so we have it for free with open(plt_data_path, 'rb') as f: plt_data = pickle.load(f) - name_file='Score_plot_'+name + name_file='GFI_Score_plot_'+name + + if is_local: + name_file='LFI_Score_plot_'+name patterns = [None, "/" , "\\" , "|" , "-" , "+" , "x", "o", "O", ".", "*" ] imp_vals=plt_data['Importances'] @@ -825,22 +956,23 @@ def plot_importance_map(self,name: str, X_train: pd.DataFrame,y_train: np.array """ Produce the Local Feature Importance Scoremap. - Parameters: - name: Dataset's name - model: Instance of the Isolation Forest model. - X_train: Training Set - y_train: Dataset training labels - resolution: Scoremap resolution - pwd: Directory where the plot will be saved. By default the value of pwd is set to the current working directory. - save: Boolean variable used to decide weather to save the Score Plot locally as a PDF or not. - m: Boolean variable regulating the plt.pcolor advanced settings. By defualt the value of m is set to None - factor: Integer factor used to define the minimum and maximum value of the points used to create the scoremap. By default the value of f is set to 3. - feats_plot: This tuple contains the indexes of the pair features to compare in the Scoremap. By default the value of feats_plot - is set to (0,1) - plt: Plt object used to create the plot. + Parameters + ---------- + name: Dataset's name + X_train: Training Set + y_train: Dataset training labels + resolution: Scoremap resolution + pwd: Directory where the plot will be saved as a PDF file. By default the value of pwd is set to the current working directory. + save: Boolean variable used to decide weather to save the Score Plot locally as a PDF or not. By default save is set to True. + m: Boolean variable regulating the plt.pcolor advanced settings. 
By defualt the value of m is set to None. + factor: Integer factor used to define the minimum and maximum value of the points used to create the scoremap. By default the value of f is set to 3. + feats_plot: This tuple contains the indexes of the pair features to compare in the Scoremap. By default the value of feats_plot + is set to (0,1). + ax: plt.axes object used to create the plot. By default ax is set to None. - Returns: - Obtain the Scoremap which is also locally saved as a PDF. + Returns + ---------- + fig,ax : plt.figure and plt.axes objects used to create the plot """ mins = X_train.min(axis=0)[list(feats_plot)] maxs = X_train.max(axis=0)[list(feats_plot)] @@ -892,19 +1024,19 @@ def plot_importance_map(self,name: str, X_train: pd.DataFrame,y_train: np.array return fig, ax def plot_complete_scoremap(self,name:str,dim:int,X: pd.DataFrame, y: np.array, pwd:str =os.getcwd()): - """ - Produce the Complete Local Feature Importance Scoremap: a Scoremap for each pair of features in the input dataset. + """Produce the Complete Local Feature Importance Scoremap: a Scoremap for each pair of features in the input dataset. - Parameters: - name: Dataset's name - dim: Number of input features in the dataset - model: Instance of the Isolation Forest model. - X: Input dataset - y: Dataset labels - pwd: Directory where the plot will be saved. By default the value of pwd is set to the current working directory. + Parameters + ---------- + name: Dataset's name + dim: Number of input features in the dataset + X: Input dataset + y: Dataset labels + pwd: Directory where the plot will be saved as a PDF file. By default the value of pwd is set to the current working directory. - Returns: - Obtain the Complete Scoremap which is also locally saved as a PDF. 
+ Returns + ---------- + fig,ax : plt.figure and plt.axes objects used to create the plot """ fig, ax = plt.subplots(dim, dim, figsize=(50, 50)) @@ -915,7 +1047,6 @@ def plot_complete_scoremap(self,name:str,dim:int,X: pd.DataFrame, y: np.array, p #matrix of plots to reduce a little bit the execution time. _,_=self.plot_importance_map(name,X, y, 50, pwd, feats_plot = (features[0],features[1]), ax=ax[i,j],save=False) _,_=self.plot_importance_map(name,X, y, 50, pwd, feats_plot = (features[1],features[0]), ax=ax[j,i],save=False) - #fig.suptitle("comparison between DIFFI and ExIFFI "+name+" dataset",fontsize=20) plt.savefig(pwd+'/Local_Importance_Scoremap_{}_complete.pdf'.format(name),bbox_inches='tight') return fig,ax From 774893ba86e433009964fb985e0d9b7cc3efce90 Mon Sep 17 00:00:00 2001 From: FrizzoDavide Date: Mon, 20 Nov 2023 18:03:54 +0100 Subject: [PATCH 03/20] Added unit tests on IForest --- pyod/test/test_iforest.py | 451 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 451 insertions(+) diff --git a/pyod/test/test_iforest.py b/pyod/test/test_iforest.py index 4b80cc532..e1b1ea6a8 100644 --- a/pyod/test/test_iforest.py +++ b/pyod/test/test_iforest.py @@ -6,10 +6,15 @@ import os import sys import unittest +import numpy as np +import pandas as pd +import pickle # noinspection PyProtectedMember from numpy.testing import assert_allclose from numpy.testing import assert_array_less +from numpy.testing import assert_array_equal +from numpy.testing import assert_array_almost_equal from numpy.testing import assert_equal from numpy.testing import assert_raises from scipy.stats import rankdata @@ -54,6 +59,8 @@ def test_parameters(self): self.clf.estimators_samples_ is not None) assert (hasattr(self.clf, 'max_samples_') and self.clf.max_samples_ is not None) + assert (hasattr(self.clf, '_max_features') and + self.clf._max_features is not None) assert (hasattr(self.clf, 'estimators_features_') and self.clf.estimators_features_ is not None) assert (hasattr(self.clf, 
'n_features_in_') and @@ -155,6 +162,450 @@ def test_feature_importances(self): feature_importances = self.clf.feature_importances_ assert (len(feature_importances) == 2) + def test_decision_function_single_tree(self): + + X_train = np.array([[1, 1], [1, 2], [2, 1]]) + clf1 = IForest(contamination=0.1).fit(X_train) + clf2 = IForest().fit(X_train) + X=np.array([[2.0, 2.0]]) + tree_idx=np.random.randint(0,len(clf1.estimators_)) + + assert_array_equal( + clf1.decision_function_single_tree(tree_idx,X), + clf1._score_samples(tree_idx,X) - clf1.offset_, + ) + assert_array_equal( + clf2.decision_function_single_tree(tree_idx,X), + clf2._score_samples(tree_idx,X) - clf2.offset_, + ) + + #The decision function values could not be equal because clf1 and clf2 have + #two different contamination values. + + assert_array_almost_equal( + clf1.decision_function_single_tree(tree_idx,X), clf2.decision_function_single_tree(tree_idx,X), + decimal=1 + ) + + #Check weather the two decision function values are different + + assert not np.array_equal(clf1.decision_function_single_tree(tree_idx,X), clf2.decision_function_single_tree(tree_idx,X)) + + def test_score_samples(self): + + X_train = np.array([[1, 1], [1, 2], [2, 1]]) + clf1 = IForest(contamination=0.1).fit(X_train) + clf2 = IForest().fit(X_train) + X=np.array([[2.0, 2.0]]) + tree_idx=np.random.randint(0,len(clf1.estimators_)) + assert_array_equal( + clf1._score_samples(tree_idx,X), + clf1.decision_function_single_tree(tree_idx,X) + clf1.offset_, + ) + assert_array_equal( + clf1._score_samples(tree_idx,X), + clf2.decision_function_single_tree(tree_idx,X) + clf2.offset_, + ) + assert_array_equal( + clf1._score_samples(tree_idx,X), clf2._score_samples(tree_idx,X) + ) + + def test_compute_chunked_score_samples(self): + + X_train = np.array([[1, 1], [1, 2], [2, 1]]) + clf1 = IForest(contamination=0.1).fit(X_train) + clf2 = IForest().fit(X_train) + X=np.array([[2.0, 2.0]]) + tree_idx=np.random.randint(0,len(clf1.estimators_)) + + 
assert not np.array_equal( + clf1._compute_chunked_score_samples(tree_idx,X), + clf1.decision_function_single_tree(tree_idx,X) + clf1.offset_, + ) + assert not np.array_equal( + clf2._compute_chunked_score_samples(tree_idx,X), + clf2.decision_function_single_tree(tree_idx,X) + clf2.offset_, + ) + + assert_array_equal( + clf1._compute_chunked_score_samples(tree_idx,X), clf2._compute_chunked_score_samples(tree_idx,X) + ) + + def test_compute_score_samples_single_tree(self): + + X_train = np.array([[1, 1], [1, 2], [2, 1]]) + clf1 = IForest(contamination=0.1).fit(X_train) + clf2 = IForest().fit(X_train) + X=np.array([[2.0, 2.0]]) + tree_idx=np.random.randint(0,len(clf1.estimators_)) + subsample_features=np.random.choice([True, False], size=1) + + assert not np.array_equal( + clf1._compute_score_samples_single_tree(tree_idx,X,subsample_features), + clf1.decision_function_single_tree(tree_idx,X) + clf1.offset_, + ) + assert not np.array_equal( + clf2._compute_score_samples_single_tree(tree_idx,X,subsample_features), + clf2.decision_function_single_tree(tree_idx,X) + clf2.offset_, + ) + + assert_array_equal( + clf1._compute_score_samples_single_tree(tree_idx,X,subsample_features), clf2._compute_score_samples_single_tree(tree_idx,X,subsample_features) + ) + + def test_diffi_ib(self): + # create a random dataset + np.random.seed(0) + X = np.random.randn(100, 10) + # create an isolation forest model + iforest = IForest(n_estimators=10, max_samples=64, random_state=0) + iforest.fit(X) + # run the diffi_ib function + fi_ib, exec_time = iforest.diffi_ib(X) + #Check that all the elements of fi_ib are finite + assert np.all(np.isfinite(fi_ib)) == True + # check that the output has the correct shape + assert fi_ib.shape[0] == X.shape[1] + # check that the execuiton time is positive + assert exec_time > 0 + + def test_get_iic(self): + # create a random dataset + np.random.seed(0) + X = np.random.randn(100, 10) + # create an isolation forest model + iforest = 
IForest(n_estimators=10, max_samples=64, random_state=0) + iforest.fit(X) + estimator=iforest.estimators_[np.random.randint(0,iforest.n_estimators)] + is_leaves=np.random.choice([True, False], size=X.shape[0]) + adjust_iic=np.random.choice([True, False], size=1) + lambda_outliers_ib = iforest._get_iic(estimator, X, is_leaves, adjust_iic=adjust_iic) + + assert type(lambda_outliers_ib) == np.ndarray + assert lambda_outliers_ib.shape[0] == estimator.tree_.node_count + assert np.all(lambda_outliers_ib >= -1) == True + + def test_local_diffi(self): + # create a random dataset + np.random.seed(0) + X = np.random.randn(100,10) + # create an isolation forest model + iforest = IForest(n_estimators=10, max_samples=64, random_state=0) + iforest.fit(X) + #Select a single sample from X at random + x=X[np.random.randint(0,X.shape[0]),:] + fi_ib, exec_time = iforest.local_diffi(x) + + assert np.all(np.isfinite(fi_ib)) == True + assert fi_ib.shape[0] == x.shape[0] + assert exec_time >= 0 + + def test_local_diffi_batch(self): + np.random.seed(0) + X = np.random.randn(100,10) + iforest = IForest(n_estimators=10, max_samples=64, random_state=0) + iforest.fit(X) + + fi_ib,ord_idx,exec_time=iforest.local_diffi_batch(X) + + assert np.all(np.isfinite(fi_ib)) == True + assert fi_ib.shape[0] == X.shape[0] + assert ord_idx.shape == X.shape + # Every element in ord_idx must be between 0 and X.shape[0]-1 + assert np.all(ord_idx >= X.shape[1]) == False + assert type(exec_time)==list + assert np.all(np.array(exec_time)>=0) == True + + def test_compute_local_importances(self): + + #Create a path to save the pkl files created by compute_local_importances + test_imp_score_path=os.path.join(os.getcwd(),'tests','test_imp_score_local') + test_plt_data_path=os.path.join(os.getcwd(),'tests','test_plt_data_local') + name='test_local' + + #If the folder do not exist create them: + if not os.path.exists(test_imp_score_path): + os.makedirs(test_imp_score_path) + if not os.path.exists(test_plt_data_path): + 
os.makedirs(test_plt_data_path) + + np.random.seed(0) + X = np.random.randn(100, 10) + # create an isolation forest model + iforest = IForest(n_estimators=10, max_samples=64, random_state=0) + iforest.fit(X) + + fi,plt_data,path_fi,path_plt_data=iforest.compute_local_importances(X,name,pwd_imp_score=test_imp_score_path,pwd_plt_data=test_plt_data_path) + + """ + Tests on the pkl files + """ + #Check that the returned path are strings + assert type(path_fi) == str + assert type(path_plt_data) == str + #Check that the pkl files have been created + assert os.path.exists(path_fi) == True + assert os.path.exists(path_plt_data) == True + #Check that the pkl files are not empty + assert os.path.getsize(path_fi) > 0 + assert os.path.getsize(path_plt_data) > 0 + #Check that the pkl files can be loaded + assert pickle.load(open(path_fi,'rb')) is not None + assert pickle.load(open(path_plt_data,'rb')) is not None + + """ + Tests on fi and plt_data + """ + #Check that all the elements of fi are finite + assert np.all(np.isfinite(fi)) == True + # check that the output has the correct shape + assert fi.shape[0] == X.shape[0] + #Extract the keys of plt_data + plt_data_keys=list(plt_data.keys()) + imp,feat_ord,std=plt_data[plt_data_keys[0]],plt_data[plt_data_keys[1]],plt_data[plt_data_keys[2]] + #Check that all the elements of imp are finite + assert np.all(np.isfinite(imp)) == True + #Check that the size of imp is correct + assert imp.shape[0] == X.shape[1] + #Check that the size of feat_ord is correct + assert feat_ord.shape[0] == X.shape[1] + #Values in feat_ord cannot be greater than X.shape[1] + assert np.all(feat_ord>=X.shape[1]) == False + #Check that the size of std is correct + assert std.shape[0] == X.shape[1] + #Check that all the elements of std are positive (standard deviation cannot be negative) + assert np.all(std>=0) == True + + def test_compute_global_importances(self): + + #Create a path to save the pkl files created by compute_local_importances + 
test_imp_score_path=os.path.join(os.getcwd(),'tests','test_imp_score_global') + test_plt_data_path=os.path.join(os.getcwd(),'tests','test_plt_data_global') + name='test_global' + + #If the folder do not exist create them: + if not os.path.exists(test_imp_score_path): + os.makedirs(test_imp_score_path) + if not os.path.exists(test_plt_data_path): + os.makedirs(test_plt_data_path) + + np.random.seed(0) + X = np.random.randn(100, 10) + # create an isolation forest model + iforest = IForest(n_estimators=10, max_samples=64, random_state=0) + iforest.fit(X) + nruns=np.random.randint(1,10) + + fi,plt_data,path_fi,path_plt_data=iforest.compute_global_importances(X,nruns,name,pwd_imp_score=test_imp_score_path,pwd_plt_data=test_plt_data_path) + + """ + Tests on the pkl files + """ + + #Check that the returned path are strings + assert type(path_fi) == str + assert type(path_plt_data) == str + #Check that the pkl files have been created + assert os.path.exists(path_fi) == True + assert os.path.exists(path_plt_data) == True + #Check that the pkl files are not empty + assert os.path.getsize(path_fi) > 0 + assert os.path.getsize(path_plt_data) > 0 + #Check that the pkl files can be loaded + assert pickle.load(open(path_fi,'rb')) is not None + assert pickle.load(open(path_plt_data,'rb')) is not None + + """ + Tests on fi and plt_data + """ + #Check that nruns is positive + assert nruns >= 0 + #Check that all the elements of fi are finite + assert np.all(np.isfinite(fi)) == True + # check that the output has the correct shape + assert fi.shape[1] == X.shape[1] + #Extract the keys of plt_data + plt_data_keys=list(plt_data.keys()) + imp,feat_ord,std=plt_data[plt_data_keys[0]],plt_data[plt_data_keys[1]],plt_data[plt_data_keys[2]] + #Check that all the elements of imp are finite + assert np.all(np.isfinite(imp)) == True + #Check that the size of imp is correct + assert imp.shape[0] == X.shape[1] + #Check that the size of feat_ord is correct + assert feat_ord.shape[0] == X.shape[1] + 
#Values in feat_ord cannot be greater than X.shape[1] + assert np.all(feat_ord>=X.shape[1]) == False + #Check that the size of std is correct + assert std.shape[0] == X.shape[1] + #Check that all the elements of std are positive (standard deviation cannot be negative) + assert np.all(std>=0) == True + + def test_plot_importances_bars(self): + + # We need a feature importance 2d array with the importance values. + # We can extract it from the pkl files created by the test_compure_global_importances + # and test_compute_local_importances functions + + #We create the plot with plot_importances_bars and we will then compare it with the + #expected result contained in GFI_glass_synt.pdf + imps_path=os.path.join(os.getcwd(),'imp_scores','imp_score_GFI_glass.pkl') + + imps=pickle.load(open(imps_path,'rb')) + + #Create a path to save the plot image + plot_path=os.path.join(os.getcwd(),'tests','test_plots') + + #If the folder do not exist create it: + if not os.path.exists(plot_path): + os.makedirs(plot_path) + + #Create an IForest object to call the plt_importances_bars method + iforest=IForest() + + #Create a name for the plot + name='test_Glass' + f=6 + fig,ax,bars=iforest.plt_importances_bars(imps_path,name,pwd=plot_path,f=f) + + """ + Tests on ax + """ + #Check that the returned ax is not None + assert ax is not None + assert fig is not None + #Check that the returned ax is an axis object + #assert type(ax) == matplotlib.axes._subplots.AxesSubplot + #Check that the x label is correct + assert ax.get_xlabel() == 'Rank' + #Check that the y label is correct + assert ax.get_ylabel() == 'Percentage count' + #Check that the xtick and y tick labels are correct + x_tick_labels = [tick.get_text() for tick in ax.get_xticklabels()] + y_tick_labels = [tick.get_text() for tick in ax.get_yticklabels()] + assert x_tick_labels == ['$1^{st}$', '$2^{nd}$', '$3^{rd}$', '$4^{th}$', '$5^{th}$', '$6^{th}$'] + assert y_tick_labels == ['10%', '20%', '30%', '40%', '50%', '60%', '70%', '80%', 
'90%', '100%'] + + #See if the plot correctly changes if I pass from f=6 (default value) to f=9 + f1=9 + fig1,ax1,bars1=iforest.plt_importances_bars(imps_path,name='test_Glass_9',pwd=plot_path,f=f1) + + #Check that the xtick and y tick labels are correct + x_tick_labels1 = [tick.get_text() for tick in ax1.get_xticklabels()] + assert x_tick_labels1 == ['$1^{st}$', '$2^{nd}$', '$3^{rd}$', '$4^{th}$', '$5^{th}$', '$6^{th}$','$7^{th}$','$8^{th}$','$9^{th}$'] + + """ + Tests on bars + + The main test o perform on bars is that the sum of the percentages values on each column should be 100. + """ + assert type(bars) == pd.DataFrame + assert bars.shape == (imps.shape[1],imps.shape[1]) + assert np.all(bars.sum()==100) == True + #Same on bars1 + assert type(bars1) == pd.DataFrame + assert bars1.shape == (imps.shape[1],imps.shape[1]) + assert np.all(bars1.sum()==100) == True + + def test_plt_feat_bar_plot(self): + + # We need the plt_data array: let's consider the global case with plt_data_GFI_glass.pkl and + # the local case with plt_data_LFI_glass.pkl + + plt_data_global_path=os.path.join(os.getcwd(),'plt_data','plt_data_GFI_glass.pkl') + plt_data_local_path=os.path.join(os.getcwd(),'plt_data','plt_data_LFI_glass.pkl') + + name_global='test_GFI_Glass' + name_local='test_LFI_Glass' + + plot_path=os.path.join(os.getcwd(),'tests','test_plots') + + #Create an IForest object to call the plt_feat_bar_plot method + iforest=IForest() + + ax1,ax2=iforest.plt_feat_bar_plot(plt_data_global_path,name_global,pwd=plot_path,is_local=False) + ax3,ax4=iforest.plt_feat_bar_plot(plt_data_local_path,name_local,pwd=plot_path,is_local=True) + + y_tick_labels_local = [tick.get_text() for tick in ax3.get_yticklabels()] + y_tick_labels2_local = [tick.get_text() for tick in ax4.get_yticklabels()] + y_tick_labels_global = [tick.get_text() for tick in ax1.get_yticklabels()] + y_tick_labels2_global = [tick.get_text() for tick in ax2.get_yticklabels()] + + """ + Tests on ax1,ax2,ax3,ax4 + """ + #Check 
that the returned ax is not None + assert ax1 is not None + assert ax2 is not None + assert ax3 is not None + assert ax4 is not None + #Check that the x label is correct + assert ax1.get_xlabel() == 'Importance Score' + #Check that the y label is correct + assert ax1.get_ylabel() == 'Features' + #Check that the x label is correct + assert ax3.get_xlabel() == 'Importance Score' + #Check that the y label is correct + assert ax3.get_ylabel() == 'Features' + #Check that the xtick and ytick labels are correct + assert np.all(np.array(y_tick_labels_local).astype('float')>=len(y_tick_labels2_local)-1) == False + assert np.all(np.array(y_tick_labels_global).astype('float')>=len(y_tick_labels2_global)-1) == False + + def test_plot_importance_map(self): + + # Let's perform the test on the Glass dataset + with open(os.path.join(os.getcwd(), 'data', 'local', 'glass.pkl'), 'rb') as f: + data = pickle.load(f) + # training data (inliers and outliers) + X_tr = np.concatenate((data['X_in'], data['X_out_5'], data['X_out_6'])) + y_tr = np.concatenate((data['y_in'], data['y_out_5'], data['y_out_6'])) + X_tr, y_tr = shuffle(X_tr, y_tr, random_state=0) + # test outliers + X_te = data['X_out_7'] + y_te = data['y_out_7'] + y_te=np.ones(shape=X_te.shape[0]) + X=np.r_[X_tr,X_te] + y=np.r_[y_tr,y_te] + name='Glass' + # create an isolation forest model + iforest = IForest(n_estimators=10, max_samples=64, random_state=0) + iforest.fit(X_tr) + plot_path=os.path.join(os.getcwd(),'tests','test_plots') + + fig,ax=iforest.plot_importance_map(name,X,y,30,pwd=plot_path) + + """ + Tests on ax + """ + + #Check that the returned ax is not None + assert ax is not None + assert fig is not None + + def test_plot_complete_scoremap(self): + + # Here we'll use a random dataset with just 3 features otherwise it takes too much time to + #create the plots + np.random.seed(0) + X = np.random.randn(100, 3) + #Assign at random the anomalous/not anomaoous labels + #Create a random array of 0 and 1 of shape=(100,) + 
y=np.random.randint(0,2,size=100) + name='test_complete' + # create an isolation forest model + iforest = IForest(n_estimators=10, max_samples=64, random_state=0) + iforest.fit(X) + plot_path=os.path.join(os.getcwd(),'tests','test_plots') + + fig,ax=iforest.plot_complete_scoremap(name,X.shape[1],iforest,X,y,pwd=plot_path) + + """ + Tests on ax + """ + + #Check that the returned ax is not None + assert ax is not None + assert fig is not None + def tearDown(self): pass From 5c47bee535e71895c6d0f8c416fb6f8bb796ce23 Mon Sep 17 00:00:00 2001 From: FrizzoDavide Date: Tue, 21 Nov 2023 08:07:14 +0100 Subject: [PATCH 04/20] Update test_iforest.py --- pyod/test/test_iforest.py | 79 ++++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 35 deletions(-) diff --git a/pyod/test/test_iforest.py b/pyod/test/test_iforest.py index e1b1ea6a8..04c43f4e6 100644 --- a/pyod/test/test_iforest.py +++ b/pyod/test/test_iforest.py @@ -9,6 +9,7 @@ import numpy as np import pandas as pd import pickle +import scipy # noinspection PyProtectedMember from numpy.testing import assert_allclose @@ -20,6 +21,7 @@ from scipy.stats import rankdata from sklearn.base import clone from sklearn.metrics import roc_auc_score +from sklearn.utils import shuffle # temporary solution for relative imports in case pyod is not installed # if pyod is installed, no need to use the following line @@ -162,6 +164,8 @@ def test_feature_importances(self): feature_importances = self.clf.feature_importances_ assert (len(feature_importances) == 2) + # New tests inserted from here + def test_decision_function_single_tree(self): X_train = np.array([[1, 1], [1, 2], [2, 1]]) @@ -319,9 +323,9 @@ def test_local_diffi_batch(self): def test_compute_local_importances(self): #Create a path to save the pkl files created by compute_local_importances - test_imp_score_path=os.path.join(os.getcwd(),'tests','test_imp_score_local') - test_plt_data_path=os.path.join(os.getcwd(),'tests','test_plt_data_local') - 
name='test_local' + test_imp_score_path=os.path.join(os.getcwd(),'test_data','test_imp_score_local') + test_plt_data_path=os.path.join(os.getcwd(),'test_data','test_plt_data_local') + name='test_local_pima' #If the folder do not exist create them: if not os.path.exists(test_imp_score_path): @@ -329,8 +333,13 @@ def test_compute_local_importances(self): if not os.path.exists(test_plt_data_path): os.makedirs(test_plt_data_path) - np.random.seed(0) - X = np.random.randn(100, 10) + # We will use the data contained in pima.mat for the tests + path = os.path.join(os.getcwd(), 'data', 'ufs', 'pima.mat') + data = scipy.io.loadmat(path) + X_tr=data['X'] + y_tr=data['y'] + X, _ = shuffle(X_tr, y_tr, random_state=0) + # create an isolation forest model iforest = IForest(n_estimators=10, max_samples=64, random_state=0) iforest.fit(X) @@ -379,9 +388,9 @@ def test_compute_local_importances(self): def test_compute_global_importances(self): #Create a path to save the pkl files created by compute_local_importances - test_imp_score_path=os.path.join(os.getcwd(),'tests','test_imp_score_global') - test_plt_data_path=os.path.join(os.getcwd(),'tests','test_plt_data_global') - name='test_global' + test_imp_score_path=os.path.join(os.getcwd(),'test_data','test_imp_score_global') + test_plt_data_path=os.path.join(os.getcwd(),'test_data','test_plt_data_global') + name='test_global_pima' #If the folder do not exist create them: if not os.path.exists(test_imp_score_path): @@ -389,8 +398,13 @@ def test_compute_global_importances(self): if not os.path.exists(test_plt_data_path): os.makedirs(test_plt_data_path) - np.random.seed(0) - X = np.random.randn(100, 10) + # We will use the data contained in pima.mat for the tests + path = os.path.join(os.getcwd(), 'data', 'ufs', 'pima.mat') + data = scipy.io.loadmat(path) + X_tr=data['X'] + y_tr=data['y'] + X, _ = shuffle(X_tr, y_tr, random_state=0) + # create an isolation forest model iforest = IForest(n_estimators=10, max_samples=64, random_state=0) 
iforest.fit(X) @@ -448,12 +462,12 @@ def test_plot_importances_bars(self): #We create the plot with plot_importances_bars and we will then compare it with the #expected result contained in GFI_glass_synt.pdf - imps_path=os.path.join(os.getcwd(),'imp_scores','imp_score_GFI_glass.pkl') + imps_path=os.path.join(os.getcwd(),'test_data','test_imp_score_global','imp_score_LFI_test_global_pima.pkl') imps=pickle.load(open(imps_path,'rb')) #Create a path to save the plot image - plot_path=os.path.join(os.getcwd(),'tests','test_plots') + plot_path=os.path.join(os.getcwd(),'test_data','test_plots') #If the folder do not exist create it: if not os.path.exists(plot_path): @@ -463,7 +477,7 @@ def test_plot_importances_bars(self): iforest=IForest() #Create a name for the plot - name='test_Glass' + name='test_pima' f=6 fig,ax,bars=iforest.plt_importances_bars(imps_path,name,pwd=plot_path,f=f) @@ -487,7 +501,7 @@ def test_plot_importances_bars(self): #See if the plot correctly changes if I pass from f=6 (default value) to f=9 f1=9 - fig1,ax1,bars1=iforest.plt_importances_bars(imps_path,name='test_Glass_9',pwd=plot_path,f=f1) + fig1,ax1,bars1=iforest.plt_importances_bars(imps_path,name='test_pima_9',pwd=plot_path,f=f1) #Check that the xtick and y tick labels are correct x_tick_labels1 = [tick.get_text() for tick in ax1.get_xticklabels()] @@ -511,13 +525,13 @@ def test_plt_feat_bar_plot(self): # We need the plt_data array: let's consider the global case with plt_data_GFI_glass.pkl and # the local case with plt_data_LFI_glass.pkl - plt_data_global_path=os.path.join(os.getcwd(),'plt_data','plt_data_GFI_glass.pkl') - plt_data_local_path=os.path.join(os.getcwd(),'plt_data','plt_data_LFI_glass.pkl') + plt_data_global_path=os.path.join(os.getcwd(),'plt_data','plt_data_GFI_test_global_pima.pkl') + plt_data_local_path=os.path.join(os.getcwd(),'plt_data','plt_data_LFI_test_local_pima.pkl') - name_global='test_GFI_Glass' - name_local='test_LFI_Glass' + name_global='test_GFI_pima' + 
name_local='test_LFI_pima' - plot_path=os.path.join(os.getcwd(),'tests','test_plots') + plot_path=os.path.join(os.getcwd(),'test_data','test_plots') #Create an IForest object to call the plt_feat_bar_plot method iforest=IForest() @@ -552,24 +566,19 @@ def test_plt_feat_bar_plot(self): def test_plot_importance_map(self): - # Let's perform the test on the Glass dataset - with open(os.path.join(os.getcwd(), 'data', 'local', 'glass.pkl'), 'rb') as f: - data = pickle.load(f) - # training data (inliers and outliers) - X_tr = np.concatenate((data['X_in'], data['X_out_5'], data['X_out_6'])) - y_tr = np.concatenate((data['y_in'], data['y_out_5'], data['y_out_6'])) - X_tr, y_tr = shuffle(X_tr, y_tr, random_state=0) - # test outliers - X_te = data['X_out_7'] - y_te = data['y_out_7'] - y_te=np.ones(shape=X_te.shape[0]) - X=np.r_[X_tr,X_te] - y=np.r_[y_tr,y_te] - name='Glass' + # Let's perform the test on the pima.mat dataset + path = os.path.join(os.getcwd(), 'data', 'ufs', 'pima.mat') + data = scipy.io.loadmat(path) + X_tr=data['X'] + y_tr=data['y'] + X, y = shuffle(X_tr, y_tr, random_state=0) + + name='test_pima' + # create an isolation forest model iforest = IForest(n_estimators=10, max_samples=64, random_state=0) iforest.fit(X_tr) - plot_path=os.path.join(os.getcwd(),'tests','test_plots') + plot_path=os.path.join(os.getcwd(),'test_data','test_plots') fig,ax=iforest.plot_importance_map(name,X,y,30,pwd=plot_path) @@ -594,7 +603,7 @@ def test_plot_complete_scoremap(self): # create an isolation forest model iforest = IForest(n_estimators=10, max_samples=64, random_state=0) iforest.fit(X) - plot_path=os.path.join(os.getcwd(),'tests','test_plots') + plot_path=os.path.join(os.getcwd(),'test_data','test_plots') fig,ax=iforest.plot_complete_scoremap(name,X.shape[1],iforest,X,y,pwd=plot_path) From 7359d86ac3f06bc647dbdb868f27d54757f1b6cd Mon Sep 17 00:00:00 2001 From: FrizzoDavide Date: Tue, 21 Nov 2023 12:18:25 +0100 Subject: [PATCH 05/20] Correct typo in iforest.py --- 
pyod/models/iforest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyod/models/iforest.py b/pyod/models/iforest.py index 9c0fe2e51..080a66580 100644 --- a/pyod/models/iforest.py +++ b/pyod/models/iforest.py @@ -794,7 +794,7 @@ def compute_global_importances(self,X: pd.DataFrame, n_runs:int, name: str,pwd_i 'std': std_imp[mean_imp.argsort()]} # Save the plt_data dictionary in a pkl file - path_plt_data = pwd_plt_data + '/plt_data' + name + '.pkl' + path_plt_data = pwd_plt_data + '/plt_data_' + name + '.pkl' with open(path_plt_data, 'wb') as fl: pickle.dump(plt_data,fl) From c332d77e0fedac59d07d50e6840940471f4a9934 Mon Sep 17 00:00:00 2001 From: FrizzoDavide Date: Tue, 21 Nov 2023 15:01:23 +0100 Subject: [PATCH 06/20] Inserted labels parameter in plot_importance_map --- pyod/models/iforest.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pyod/models/iforest.py b/pyod/models/iforest.py index 080a66580..a229622b6 100644 --- a/pyod/models/iforest.py +++ b/pyod/models/iforest.py @@ -952,7 +952,7 @@ def plt_feat_bar_plot(self,plt_data_path: str,name: str,pwd: str =os.getcwd(),is def plot_importance_map(self,name: str, X_train: pd.DataFrame,y_train: np.array ,resolution: int, - pwd: str =os.getcwd(),save: bool =True,m: bool =None,factor: int =3,feats_plot: tuple[int,int] =(0,1),ax=None): + pwd: str =os.getcwd(),save: bool =True,m: bool =None,factor: int =3,feats_plot: tuple[int,int] =(0,1),ax=None,labels: bool=True): """ Produce the Local Feature Importance Scoremap. @@ -969,6 +969,8 @@ def plot_importance_map(self,name: str, X_train: pd.DataFrame,y_train: np.array feats_plot: This tuple contains the indexes of the pair features to compare in the Scoremap. By default the value of feats_plot is set to (0,1). ax: plt.axes object used to create the plot. By default ax is set to None. + labels: Boolean variable used to decide weather to include the x and y label name in the plot. 
+ When calling the plot_importance_map function inside plot_complete_scoremap this parameter will be set to False Returns ---------- @@ -1014,8 +1016,13 @@ def plot_importance_map(self,name: str, X_train: pd.DataFrame,y_train: np.array print('Handling the IndexError Exception...') ax.scatter(x[(y_train == 0)[:, 0]], y[(y_train == 0)[:, 0]], s=40, c="tab:blue", marker="o", edgecolors="k", label="inliers") ax.scatter(x[(y_train == 1)[:, 0]], y[(y_train == 1)[:, 0]], s=60, c="tab:orange", marker="*", edgecolors="k", label="outliers") - + + if labels: + ax.set_xlabel(f'Feature {feats_plot[0]}') + ax.set_ylabel(f'Feature {feats_plot[1]}') + ax.legend() + if save: plt.savefig(pwd + '/Local_Importance_Scoremap_{}.pdf'.format(name), bbox_inches='tight') else: @@ -1045,8 +1052,8 @@ def plot_complete_scoremap(self,name:str,dim:int,X: pd.DataFrame, y: np.array, p features = [i,j] # One of the successive two lines can be commented so that we obtain only one "half" of the #matrix of plots to reduce a little bit the execution time. 
- _,_=self.plot_importance_map(name,X, y, 50, pwd, feats_plot = (features[0],features[1]), ax=ax[i,j],save=False) - _,_=self.plot_importance_map(name,X, y, 50, pwd, feats_plot = (features[1],features[0]), ax=ax[j,i],save=False) + _,_=self.plot_importance_map(name,X, y, 50, pwd, feats_plot = (features[0],features[1]), ax=ax[i,j],save=False,labels=False) + _,_=self.plot_importance_map(name,X, y, 50, pwd, feats_plot = (features[1],features[0]), ax=ax[j,i],save=False,labels=False) plt.savefig(pwd+'/Local_Importance_Scoremap_{}_complete.pdf'.format(name),bbox_inches='tight') return fig,ax From 41cfc0a0285f140561a68f0f74ed4543d58e9982 Mon Sep 17 00:00:00 2001 From: FrizzoDavide Date: Tue, 21 Nov 2023 15:01:35 +0100 Subject: [PATCH 07/20] Update tests --- pyod/test/test_iforest.py | 57 ++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/pyod/test/test_iforest.py b/pyod/test/test_iforest.py index 04c43f4e6..2800cc4e8 100644 --- a/pyod/test/test_iforest.py +++ b/pyod/test/test_iforest.py @@ -323,8 +323,8 @@ def test_local_diffi_batch(self): def test_compute_local_importances(self): #Create a path to save the pkl files created by compute_local_importances - test_imp_score_path=os.path.join(os.getcwd(),'test_data','test_imp_score_local') - test_plt_data_path=os.path.join(os.getcwd(),'test_data','test_plt_data_local') + test_imp_score_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_imp_score') + test_plt_data_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_plt_data') name='test_local_pima' #If the folder do not exist create them: @@ -334,7 +334,7 @@ def test_compute_local_importances(self): os.makedirs(test_plt_data_path) # We will use the data contained in pima.mat for the tests - path = os.path.join(os.getcwd(), 'data', 'ufs', 'pima.mat') + path = os.path.join(os.getcwd(), 'pyod','test','data','pima.mat') data = scipy.io.loadmat(path) X_tr=data['X'] y_tr=data['y'] @@ -388,8 +388,8 @@ def 
test_compute_local_importances(self): def test_compute_global_importances(self): #Create a path to save the pkl files created by compute_local_importances - test_imp_score_path=os.path.join(os.getcwd(),'test_data','test_imp_score_global') - test_plt_data_path=os.path.join(os.getcwd(),'test_data','test_plt_data_global') + test_imp_score_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_imp_score') + test_plt_data_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_plt_data') name='test_global_pima' #If the folder do not exist create them: @@ -399,14 +399,14 @@ def test_compute_global_importances(self): os.makedirs(test_plt_data_path) # We will use the data contained in pima.mat for the tests - path = os.path.join(os.getcwd(), 'data', 'ufs', 'pima.mat') + path = os.path.join(os.getcwd(),'pyod','test', 'data', 'pima.mat') data = scipy.io.loadmat(path) X_tr=data['X'] y_tr=data['y'] X, _ = shuffle(X_tr, y_tr, random_state=0) # create an isolation forest model - iforest = IForest(n_estimators=10, max_samples=64, random_state=0) + iforest = IForest(n_estimators=10, max_samples=64) iforest.fit(X) nruns=np.random.randint(1,10) @@ -462,12 +462,12 @@ def test_plot_importances_bars(self): #We create the plot with plot_importances_bars and we will then compare it with the #expected result contained in GFI_glass_synt.pdf - imps_path=os.path.join(os.getcwd(),'test_data','test_imp_score_global','imp_score_LFI_test_global_pima.pkl') + imps_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_imp_score','imp_scores_GFI_test_global_pima.pkl') imps=pickle.load(open(imps_path,'rb')) #Create a path to save the plot image - plot_path=os.path.join(os.getcwd(),'test_data','test_plots') + plot_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_plots') #If the folder do not exist create it: if not os.path.exists(plot_path): @@ -510,28 +510,39 @@ def test_plot_importances_bars(self): """ Tests on bars - The main test o perform on bars is that the sum 
of the percentages values on each column should be 100. + The main test to perform on bars is that the sum of the percentages values on each column should be 100. """ assert type(bars) == pd.DataFrame assert bars.shape == (imps.shape[1],imps.shape[1]) - assert np.all(bars.sum()==100) == True + + #Check that the sum of the values in each column of bars is almost equal to 100 + bars_sum=np.array([bars[i].sum() for i in range(bars.shape[1])]) + assert_array_almost_equal(bars_sum,np.full(bars.shape[1],100)) + #Same on bars1 assert type(bars1) == pd.DataFrame assert bars1.shape == (imps.shape[1],imps.shape[1]) - assert np.all(bars1.sum()==100) == True + + #Check that the sum of the values in each column of bars1 is almost equal to 100 + bars1_sum=np.array([bars1[i].sum() for i in range(bars1.shape[1])]) + assert_array_almost_equal(bars1_sum,np.full(bars1.shape[1],100)) def test_plt_feat_bar_plot(self): # We need the plt_data array: let's consider the global case with plt_data_GFI_glass.pkl and # the local case with plt_data_LFI_glass.pkl - plt_data_global_path=os.path.join(os.getcwd(),'plt_data','plt_data_GFI_test_global_pima.pkl') - plt_data_local_path=os.path.join(os.getcwd(),'plt_data','plt_data_LFI_test_local_pima.pkl') + plt_data_global_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_plt_data','plt_data_GFI_test_global_pima.pkl') + plt_data_local_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_plt_data','plt_data_LFI_test_local_pima.pkl') name_global='test_GFI_pima' name_local='test_LFI_pima' - plot_path=os.path.join(os.getcwd(),'test_data','test_plots') + plot_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_plots') + + #If the folder do not exist create it: + if not os.path.exists(plot_path): + os.makedirs(plot_path) #Create an IForest object to call the plt_feat_bar_plot method iforest=IForest() @@ -567,7 +578,7 @@ def test_plt_feat_bar_plot(self): def test_plot_importance_map(self): # Let's perform the test on the 
pima.mat dataset - path = os.path.join(os.getcwd(), 'data', 'ufs', 'pima.mat') + path = os.path.join(os.getcwd(),'pyod','test','data','pima.mat') data = scipy.io.loadmat(path) X_tr=data['X'] y_tr=data['y'] @@ -578,7 +589,11 @@ def test_plot_importance_map(self): # create an isolation forest model iforest = IForest(n_estimators=10, max_samples=64, random_state=0) iforest.fit(X_tr) - plot_path=os.path.join(os.getcwd(),'test_data','test_plots') + plot_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_plots') + + #If the folder do not exist create it: + if not os.path.exists(plot_path): + os.makedirs(plot_path) fig,ax=iforest.plot_importance_map(name,X,y,30,pwd=plot_path) @@ -603,9 +618,13 @@ def test_plot_complete_scoremap(self): # create an isolation forest model iforest = IForest(n_estimators=10, max_samples=64, random_state=0) iforest.fit(X) - plot_path=os.path.join(os.getcwd(),'test_data','test_plots') + plot_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_plots') + + #If the folder do not exist create it: + if not os.path.exists(plot_path): + os.makedirs(plot_path) - fig,ax=iforest.plot_complete_scoremap(name,X.shape[1],iforest,X,y,pwd=plot_path) + fig,ax=iforest.plot_complete_scoremap(name,X.shape[1],X,y,pwd=plot_path) """ Tests on ax From 5eda2cec5db3d607940c80e771a40dc611603df3 Mon Sep 17 00:00:00 2001 From: FrizzoDavide Date: Fri, 1 Dec 2023 11:47:10 +0100 Subject: [PATCH 08/20] Update iforest and test_iforest.py --- pyod/models/iforest.py | 10 ++++++++++ pyod/test/test_iforest.py | 4 ++++ 2 files changed, 14 insertions(+) diff --git a/pyod/models/iforest.py b/pyod/models/iforest.py index a229622b6..06268d386 100644 --- a/pyod/models/iforest.py +++ b/pyod/models/iforest.py @@ -718,6 +718,11 @@ def compute_local_importances(self,X: pd.DataFrame,name: str,pwd_imp_score: str name='LFI_'+name fi,_,_=self.local_diffi_batch(X) + # Handle the case in which there are some np.nan or np.inf values in the fi array + if np.isnan(fi).any() or 
np.isinf(fi).any(): + #Substitute the np.nan values with 0 and the np.inf values with the maximum value of the fi array plus 1. + fi=np.nan_to_num(fi,nan=0,posinf=np.nanmax(fi[np.isfinite(fi)])+1) + # Save the Importance Scores in a pkl file path_fi = pwd_imp_score + '/imp_scores_' + name + '.pkl' with open(path_fi, 'wb') as fl: @@ -777,6 +782,11 @@ def compute_global_importances(self,X: pd.DataFrame, n_runs:int, name: str,pwd_i self.fit(X) fi[i,:],_=self.diffi_ib(X) + # Handle the case in which there are some np.nan or np.inf values in the fi array + if np.isnan(fi).any() or np.isinf(fi).any(): + #Substitute the np.nan values with 0 and the np.inf values with the maximum value of the fi array plus 1. + fi=np.nan_to_num(fi,nan=0,posinf=np.nanmax(fi[np.isfinite(fi)])+1) + # Save the Importance Scores in a pkl file path_fi = pwd_imp_score + '/imp_scores_' + name + '.pkl' with open(path_fi, 'wb') as fl: diff --git a/pyod/test/test_iforest.py b/pyod/test/test_iforest.py index 2800cc4e8..bd78ec72d 100644 --- a/pyod/test/test_iforest.py +++ b/pyod/test/test_iforest.py @@ -367,6 +367,8 @@ def test_compute_local_importances(self): """ #Check that all the elements of fi are finite assert np.all(np.isfinite(fi)) == True + #Check that all the elements of fi are not nan + assert np.all(np.isnan(fi)) == False # check that the output has the correct shape assert fi.shape[0] == X.shape[0] #Extract the keys of plt_data @@ -436,6 +438,8 @@ def test_compute_global_importances(self): assert nruns >= 0 #Check that all the elements of fi are finite assert np.all(np.isfinite(fi)) == True + #Check that all the elements of fi are not nan + assert np.all(np.isnan(fi)) == False # check that the output has the correct shape assert fi.shape[1] == X.shape[1] #Extract the keys of plt_data From 1410f17c5a8973b02bcb20ca1539a1db6f32d80b Mon Sep 17 00:00:00 2001 From: FrizzoDavide Date: Fri, 1 Dec 2023 12:26:35 +0100 Subject: [PATCH 09/20] Update iforest.py --- pyod/models/iforest.py | 4 +++- 1 
file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyod/models/iforest.py b/pyod/models/iforest.py index 06268d386..9a2d2413c 100644 --- a/pyod/models/iforest.py +++ b/pyod/models/iforest.py @@ -27,6 +27,7 @@ import os import pickle import time +import random from .base import BaseDetector # noinspection PyProtectedMember @@ -845,7 +846,8 @@ def plt_importances_bars(self,imps_path: str, name: str, pwd: str =os.getcwd(),f number_colours = 20 color = plt.cm.get_cmap('tab20',number_colours).colors - patterns = [None, "/" , "\\" , "|" , "-" , "+" , "x", "o", "O", ".", "*" ] + special_characters='!@#$%^&*()-=_+[]{}|;:\'l,.<>/?`~\\abcdefghi' + patterns = random.sample(special_characters, len(special_characters)) importances_matrix = np.array([np.array(pd.Series(x).sort_values(ascending = False).index).T for x in importances]) dim=importances.shape[1] dim=int(dim) From 477b8e1a9601eda332b57b4e23f089797036ccf9 Mon Sep 17 00:00:00 2001 From: FrizzoDavide Date: Fri, 1 Dec 2023 12:30:38 +0100 Subject: [PATCH 10/20] Typo in iforest.py --- pyod/models/iforest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyod/models/iforest.py b/pyod/models/iforest.py index 9a2d2413c..dac282728 100644 --- a/pyod/models/iforest.py +++ b/pyod/models/iforest.py @@ -846,7 +846,7 @@ def plt_importances_bars(self,imps_path: str, name: str, pwd: str =os.getcwd(),f number_colours = 20 color = plt.cm.get_cmap('tab20',number_colours).colors - special_characters='!@#$%^&*()-=_+[]{}|;:\'l,.<>/?`~\\abcdefghi' + special_characters="!@#$%^&*()-=_+[]{}|;:\'l,.<>/?`~\\abcdefghi" patterns = random.sample(special_characters, len(special_characters)) importances_matrix = np.array([np.array(pd.Series(x).sort_values(ascending = False).index).T for x in importances]) dim=importances.shape[1] From 93ad02f1b51ef5884f70d2875792debcf9fd6295 Mon Sep 17 00:00:00 2001 From: FrizzoDavide Date: Fri, 1 Dec 2023 12:35:13 +0100 Subject: [PATCH 11/20] Typo in iforest.py --- pyod/models/iforest.py | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyod/models/iforest.py b/pyod/models/iforest.py index dac282728..ccbdb5a74 100644 --- a/pyod/models/iforest.py +++ b/pyod/models/iforest.py @@ -846,7 +846,7 @@ def plt_importances_bars(self,imps_path: str, name: str, pwd: str =os.getcwd(),f number_colours = 20 color = plt.cm.get_cmap('tab20',number_colours).colors - special_characters="!@#$%^&*()-=_+[]{}|;:\'l,.<>/?`~\\abcdefghi" + special_characters="!@#$%^&*°()-=_+[]{}|;:\l,.<>/?`~\\abcdefghi" patterns = random.sample(special_characters, len(special_characters)) importances_matrix = np.array([np.array(pd.Series(x).sort_values(ascending = False).index).T for x in importances]) dim=importances.shape[1] From 2c12b268fbcf8185675565565646d658e0e7c592 Mon Sep 17 00:00:00 2001 From: FrizzoDavide Date: Fri, 1 Dec 2023 15:57:43 +0100 Subject: [PATCH 12/20] Update iforest.py --- pyod/models/iforest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyod/models/iforest.py b/pyod/models/iforest.py index ccbdb5a74..297a3f676 100644 --- a/pyod/models/iforest.py +++ b/pyod/models/iforest.py @@ -846,8 +846,8 @@ def plt_importances_bars(self,imps_path: str, name: str, pwd: str =os.getcwd(),f number_colours = 20 color = plt.cm.get_cmap('tab20',number_colours).colors - special_characters="!@#$%^&*°()-=_+[]{}|;:\l,.<>/?`~\\abcdefghi" - patterns = random.sample(special_characters, len(special_characters)) + patterns=[None,'!','@','#','$','^','&','*','°','(',')','-','_','+','=','[',']','{','}', + '|',';',':','\l',',','.','<','>','/','?','`','~','\\','!!','@@','##','$$','^^','&&','**','°°','(('] importances_matrix = np.array([np.array(pd.Series(x).sort_values(ascending = False).index).T for x in importances]) dim=importances.shape[1] dim=int(dim) From 9a9a6a159b5a11693b6922994eeeaf95364325fc Mon Sep 17 00:00:00 2001 From: FrizzoDavide Date: Fri, 1 Dec 2023 16:08:04 +0100 Subject: [PATCH 13/20] Update iforest.py --- pyod/models/iforest.py | 9 ++++++++- 1 
file changed, 8 insertions(+), 1 deletion(-) diff --git a/pyod/models/iforest.py b/pyod/models/iforest.py index 297a3f676..25eed6b04 100644 --- a/pyod/models/iforest.py +++ b/pyod/models/iforest.py @@ -870,7 +870,14 @@ def plt_importances_bars(self,imps_path: str, name: str, pwd: str =os.getcwd(),f ncols=1 if importances.shape[1]>15: ncols=2 - + elif importances.shape[1]>30: + ncols=3 + elif importances.shape[1]>45: + ncols=4 + elif importances.shape[1]>60: + ncols=5 + elif importances.shape[1]>75: + ncols=6 fig, ax = plt.subplots() From 3091da03170d001f85867704e77abee02405e2ad Mon Sep 17 00:00:00 2001 From: FrizzoDavide Date: Fri, 1 Dec 2023 16:13:07 +0100 Subject: [PATCH 14/20] Update iforest.py --- pyod/models/iforest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyod/models/iforest.py b/pyod/models/iforest.py index 25eed6b04..4bca7c904 100644 --- a/pyod/models/iforest.py +++ b/pyod/models/iforest.py @@ -925,7 +925,8 @@ def plt_feat_bar_plot(self,plt_data_path: str,name: str,pwd: str =os.getcwd(),is if is_local: name_file='LFI_Score_plot_'+name - patterns = [None, "/" , "\\" , "|" , "-" , "+" , "x", "o", "O", ".", "*" ] + patterns=[None,'!','@','#','$','^','&','*','°','(',')','-','_','+','=','[',']','{','}', + '|',';',':','\l',',','.','<','>','/','?','`','~','\\','!!','@@','##','$$','^^','&&','**','°°','(('] imp_vals=plt_data['Importances'] feat_imp=pd.DataFrame({'Global Importance': np.round(imp_vals,3), 'Feature': plt_data['feat_order'], From 0c2a610806f6087322e1180ed92268bfc02b321d Mon Sep 17 00:00:00 2001 From: FrizzoDavide Date: Sun, 3 Dec 2023 10:45:34 +0100 Subject: [PATCH 15/20] Added pima.csv file for tests --- pyod/test/data/pima.csv | 769 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 769 insertions(+) create mode 100644 pyod/test/data/pima.csv diff --git a/pyod/test/data/pima.csv b/pyod/test/data/pima.csv new file mode 100644 index 000000000..db6f31768 --- /dev/null +++ b/pyod/test/data/pima.csv @@ -0,0 +1,769 @@ 
+Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome +6,148,72,35,0,33.6,0.627,50,1 +1,85,66,29,0,26.6,0.351,31,0 +8,183,64,0,0,23.3,0.672,32,1 +1,89,66,23,94,28.1,0.167,21,0 +0,137,40,35,168,43.1,2.288,33,1 +5,116,74,0,0,25.6,0.201,30,0 +3,78,50,32,88,31,0.248,26,1 +10,115,0,0,0,35.3,0.134,29,0 +2,197,70,45,543,30.5,0.158,53,1 +8,125,96,0,0,0,0.232,54,1 +4,110,92,0,0,37.6,0.191,30,0 +10,168,74,0,0,38,0.537,34,1 +10,139,80,0,0,27.1,1.441,57,0 +1,189,60,23,846,30.1,0.398,59,1 +5,166,72,19,175,25.8,0.587,51,1 +7,100,0,0,0,30,0.484,32,1 +0,118,84,47,230,45.8,0.551,31,1 +7,107,74,0,0,29.6,0.254,31,1 +1,103,30,38,83,43.3,0.183,33,0 +1,115,70,30,96,34.6,0.529,32,1 +3,126,88,41,235,39.3,0.704,27,0 +8,99,84,0,0,35.4,0.388,50,0 +7,196,90,0,0,39.8,0.451,41,1 +9,119,80,35,0,29,0.263,29,1 +11,143,94,33,146,36.6,0.254,51,1 +10,125,70,26,115,31.1,0.205,41,1 +7,147,76,0,0,39.4,0.257,43,1 +1,97,66,15,140,23.2,0.487,22,0 +13,145,82,19,110,22.2,0.245,57,0 +5,117,92,0,0,34.1,0.337,38,0 +5,109,75,26,0,36,0.546,60,0 +3,158,76,36,245,31.6,0.851,28,1 +3,88,58,11,54,24.8,0.267,22,0 +6,92,92,0,0,19.9,0.188,28,0 +10,122,78,31,0,27.6,0.512,45,0 +4,103,60,33,192,24,0.966,33,0 +11,138,76,0,0,33.2,0.42,35,0 +9,102,76,37,0,32.9,0.665,46,1 +2,90,68,42,0,38.2,0.503,27,1 +4,111,72,47,207,37.1,1.39,56,1 +3,180,64,25,70,34,0.271,26,0 +7,133,84,0,0,40.2,0.696,37,0 +7,106,92,18,0,22.7,0.235,48,0 +9,171,110,24,240,45.4,0.721,54,1 +7,159,64,0,0,27.4,0.294,40,0 +0,180,66,39,0,42,1.893,25,1 +1,146,56,0,0,29.7,0.564,29,0 +2,71,70,27,0,28,0.586,22,0 +7,103,66,32,0,39.1,0.344,31,1 +7,105,0,0,0,0,0.305,24,0 +1,103,80,11,82,19.4,0.491,22,0 +1,101,50,15,36,24.2,0.526,26,0 +5,88,66,21,23,24.4,0.342,30,0 +8,176,90,34,300,33.7,0.467,58,1 +7,150,66,42,342,34.7,0.718,42,0 +1,73,50,10,0,23,0.248,21,0 +7,187,68,39,304,37.7,0.254,41,1 +0,100,88,60,110,46.8,0.962,31,0 +0,146,82,0,0,40.5,1.781,44,0 +0,105,64,41,142,41.5,0.173,22,0 +2,84,0,0,0,0,0.304,21,0 
+8,133,72,0,0,32.9,0.27,39,1 +5,44,62,0,0,25,0.587,36,0 +2,141,58,34,128,25.4,0.699,24,0 +7,114,66,0,0,32.8,0.258,42,1 +5,99,74,27,0,29,0.203,32,0 +0,109,88,30,0,32.5,0.855,38,1 +2,109,92,0,0,42.7,0.845,54,0 +1,95,66,13,38,19.6,0.334,25,0 +4,146,85,27,100,28.9,0.189,27,0 +2,100,66,20,90,32.9,0.867,28,1 +5,139,64,35,140,28.6,0.411,26,0 +13,126,90,0,0,43.4,0.583,42,1 +4,129,86,20,270,35.1,0.231,23,0 +1,79,75,30,0,32,0.396,22,0 +1,0,48,20,0,24.7,0.14,22,0 +7,62,78,0,0,32.6,0.391,41,0 +5,95,72,33,0,37.7,0.37,27,0 +0,131,0,0,0,43.2,0.27,26,1 +2,112,66,22,0,25,0.307,24,0 +3,113,44,13,0,22.4,0.14,22,0 +2,74,0,0,0,0,0.102,22,0 +7,83,78,26,71,29.3,0.767,36,0 +0,101,65,28,0,24.6,0.237,22,0 +5,137,108,0,0,48.8,0.227,37,1 +2,110,74,29,125,32.4,0.698,27,0 +13,106,72,54,0,36.6,0.178,45,0 +2,100,68,25,71,38.5,0.324,26,0 +15,136,70,32,110,37.1,0.153,43,1 +1,107,68,19,0,26.5,0.165,24,0 +1,80,55,0,0,19.1,0.258,21,0 +4,123,80,15,176,32,0.443,34,0 +7,81,78,40,48,46.7,0.261,42,0 +4,134,72,0,0,23.8,0.277,60,1 +2,142,82,18,64,24.7,0.761,21,0 +6,144,72,27,228,33.9,0.255,40,0 +2,92,62,28,0,31.6,0.13,24,0 +1,71,48,18,76,20.4,0.323,22,0 +6,93,50,30,64,28.7,0.356,23,0 +1,122,90,51,220,49.7,0.325,31,1 +1,163,72,0,0,39,1.222,33,1 +1,151,60,0,0,26.1,0.179,22,0 +0,125,96,0,0,22.5,0.262,21,0 +1,81,72,18,40,26.6,0.283,24,0 +2,85,65,0,0,39.6,0.93,27,0 +1,126,56,29,152,28.7,0.801,21,0 +1,96,122,0,0,22.4,0.207,27,0 +4,144,58,28,140,29.5,0.287,37,0 +3,83,58,31,18,34.3,0.336,25,0 +0,95,85,25,36,37.4,0.247,24,1 +3,171,72,33,135,33.3,0.199,24,1 +8,155,62,26,495,34,0.543,46,1 +1,89,76,34,37,31.2,0.192,23,0 +4,76,62,0,0,34,0.391,25,0 +7,160,54,32,175,30.5,0.588,39,1 +4,146,92,0,0,31.2,0.539,61,1 +5,124,74,0,0,34,0.22,38,1 +5,78,48,0,0,33.7,0.654,25,0 +4,97,60,23,0,28.2,0.443,22,0 +4,99,76,15,51,23.2,0.223,21,0 +0,162,76,56,100,53.2,0.759,25,1 +6,111,64,39,0,34.2,0.26,24,0 +2,107,74,30,100,33.6,0.404,23,0 +5,132,80,0,0,26.8,0.186,69,0 +0,113,76,0,0,33.3,0.278,23,1 +1,88,30,42,99,55,0.496,26,1 
+3,120,70,30,135,42.9,0.452,30,0 +1,118,58,36,94,33.3,0.261,23,0 +1,117,88,24,145,34.5,0.403,40,1 +0,105,84,0,0,27.9,0.741,62,1 +4,173,70,14,168,29.7,0.361,33,1 +9,122,56,0,0,33.3,1.114,33,1 +3,170,64,37,225,34.5,0.356,30,1 +8,84,74,31,0,38.3,0.457,39,0 +2,96,68,13,49,21.1,0.647,26,0 +2,125,60,20,140,33.8,0.088,31,0 +0,100,70,26,50,30.8,0.597,21,0 +0,93,60,25,92,28.7,0.532,22,0 +0,129,80,0,0,31.2,0.703,29,0 +5,105,72,29,325,36.9,0.159,28,0 +3,128,78,0,0,21.1,0.268,55,0 +5,106,82,30,0,39.5,0.286,38,0 +2,108,52,26,63,32.5,0.318,22,0 +10,108,66,0,0,32.4,0.272,42,1 +4,154,62,31,284,32.8,0.237,23,0 +0,102,75,23,0,0,0.572,21,0 +9,57,80,37,0,32.8,0.096,41,0 +2,106,64,35,119,30.5,1.4,34,0 +5,147,78,0,0,33.7,0.218,65,0 +2,90,70,17,0,27.3,0.085,22,0 +1,136,74,50,204,37.4,0.399,24,0 +4,114,65,0,0,21.9,0.432,37,0 +9,156,86,28,155,34.3,1.189,42,1 +1,153,82,42,485,40.6,0.687,23,0 +8,188,78,0,0,47.9,0.137,43,1 +7,152,88,44,0,50,0.337,36,1 +2,99,52,15,94,24.6,0.637,21,0 +1,109,56,21,135,25.2,0.833,23,0 +2,88,74,19,53,29,0.229,22,0 +17,163,72,41,114,40.9,0.817,47,1 +4,151,90,38,0,29.7,0.294,36,0 +7,102,74,40,105,37.2,0.204,45,0 +0,114,80,34,285,44.2,0.167,27,0 +2,100,64,23,0,29.7,0.368,21,0 +0,131,88,0,0,31.6,0.743,32,1 +6,104,74,18,156,29.9,0.722,41,1 +3,148,66,25,0,32.5,0.256,22,0 +4,120,68,0,0,29.6,0.709,34,0 +4,110,66,0,0,31.9,0.471,29,0 +3,111,90,12,78,28.4,0.495,29,0 +6,102,82,0,0,30.8,0.18,36,1 +6,134,70,23,130,35.4,0.542,29,1 +2,87,0,23,0,28.9,0.773,25,0 +1,79,60,42,48,43.5,0.678,23,0 +2,75,64,24,55,29.7,0.37,33,0 +8,179,72,42,130,32.7,0.719,36,1 +6,85,78,0,0,31.2,0.382,42,0 +0,129,110,46,130,67.1,0.319,26,1 +5,143,78,0,0,45,0.19,47,0 +5,130,82,0,0,39.1,0.956,37,1 +6,87,80,0,0,23.2,0.084,32,0 +0,119,64,18,92,34.9,0.725,23,0 +1,0,74,20,23,27.7,0.299,21,0 +5,73,60,0,0,26.8,0.268,27,0 +4,141,74,0,0,27.6,0.244,40,0 +7,194,68,28,0,35.9,0.745,41,1 +8,181,68,36,495,30.1,0.615,60,1 +1,128,98,41,58,32,1.321,33,1 +8,109,76,39,114,27.9,0.64,31,1 +5,139,80,35,160,31.6,0.361,25,1 
+3,111,62,0,0,22.6,0.142,21,0 +9,123,70,44,94,33.1,0.374,40,0 +7,159,66,0,0,30.4,0.383,36,1 +11,135,0,0,0,52.3,0.578,40,1 +8,85,55,20,0,24.4,0.136,42,0 +5,158,84,41,210,39.4,0.395,29,1 +1,105,58,0,0,24.3,0.187,21,0 +3,107,62,13,48,22.9,0.678,23,1 +4,109,64,44,99,34.8,0.905,26,1 +4,148,60,27,318,30.9,0.15,29,1 +0,113,80,16,0,31,0.874,21,0 +1,138,82,0,0,40.1,0.236,28,0 +0,108,68,20,0,27.3,0.787,32,0 +2,99,70,16,44,20.4,0.235,27,0 +6,103,72,32,190,37.7,0.324,55,0 +5,111,72,28,0,23.9,0.407,27,0 +8,196,76,29,280,37.5,0.605,57,1 +5,162,104,0,0,37.7,0.151,52,1 +1,96,64,27,87,33.2,0.289,21,0 +7,184,84,33,0,35.5,0.355,41,1 +2,81,60,22,0,27.7,0.29,25,0 +0,147,85,54,0,42.8,0.375,24,0 +7,179,95,31,0,34.2,0.164,60,0 +0,140,65,26,130,42.6,0.431,24,1 +9,112,82,32,175,34.2,0.26,36,1 +12,151,70,40,271,41.8,0.742,38,1 +5,109,62,41,129,35.8,0.514,25,1 +6,125,68,30,120,30,0.464,32,0 +5,85,74,22,0,29,1.224,32,1 +5,112,66,0,0,37.8,0.261,41,1 +0,177,60,29,478,34.6,1.072,21,1 +2,158,90,0,0,31.6,0.805,66,1 +7,119,0,0,0,25.2,0.209,37,0 +7,142,60,33,190,28.8,0.687,61,0 +1,100,66,15,56,23.6,0.666,26,0 +1,87,78,27,32,34.6,0.101,22,0 +0,101,76,0,0,35.7,0.198,26,0 +3,162,52,38,0,37.2,0.652,24,1 +4,197,70,39,744,36.7,2.329,31,0 +0,117,80,31,53,45.2,0.089,24,0 +4,142,86,0,0,44,0.645,22,1 +6,134,80,37,370,46.2,0.238,46,1 +1,79,80,25,37,25.4,0.583,22,0 +4,122,68,0,0,35,0.394,29,0 +3,74,68,28,45,29.7,0.293,23,0 +4,171,72,0,0,43.6,0.479,26,1 +7,181,84,21,192,35.9,0.586,51,1 +0,179,90,27,0,44.1,0.686,23,1 +9,164,84,21,0,30.8,0.831,32,1 +0,104,76,0,0,18.4,0.582,27,0 +1,91,64,24,0,29.2,0.192,21,0 +4,91,70,32,88,33.1,0.446,22,0 +3,139,54,0,0,25.6,0.402,22,1 +6,119,50,22,176,27.1,1.318,33,1 +2,146,76,35,194,38.2,0.329,29,0 +9,184,85,15,0,30,1.213,49,1 +10,122,68,0,0,31.2,0.258,41,0 +0,165,90,33,680,52.3,0.427,23,0 +9,124,70,33,402,35.4,0.282,34,0 +1,111,86,19,0,30.1,0.143,23,0 +9,106,52,0,0,31.2,0.38,42,0 +2,129,84,0,0,28,0.284,27,0 +2,90,80,14,55,24.4,0.249,24,0 +0,86,68,32,0,35.8,0.238,25,0 
+12,92,62,7,258,27.6,0.926,44,1 +1,113,64,35,0,33.6,0.543,21,1 +3,111,56,39,0,30.1,0.557,30,0 +2,114,68,22,0,28.7,0.092,25,0 +1,193,50,16,375,25.9,0.655,24,0 +11,155,76,28,150,33.3,1.353,51,1 +3,191,68,15,130,30.9,0.299,34,0 +3,141,0,0,0,30,0.761,27,1 +4,95,70,32,0,32.1,0.612,24,0 +3,142,80,15,0,32.4,0.2,63,0 +4,123,62,0,0,32,0.226,35,1 +5,96,74,18,67,33.6,0.997,43,0 +0,138,0,0,0,36.3,0.933,25,1 +2,128,64,42,0,40,1.101,24,0 +0,102,52,0,0,25.1,0.078,21,0 +2,146,0,0,0,27.5,0.24,28,1 +10,101,86,37,0,45.6,1.136,38,1 +2,108,62,32,56,25.2,0.128,21,0 +3,122,78,0,0,23,0.254,40,0 +1,71,78,50,45,33.2,0.422,21,0 +13,106,70,0,0,34.2,0.251,52,0 +2,100,70,52,57,40.5,0.677,25,0 +7,106,60,24,0,26.5,0.296,29,1 +0,104,64,23,116,27.8,0.454,23,0 +5,114,74,0,0,24.9,0.744,57,0 +2,108,62,10,278,25.3,0.881,22,0 +0,146,70,0,0,37.9,0.334,28,1 +10,129,76,28,122,35.9,0.28,39,0 +7,133,88,15,155,32.4,0.262,37,0 +7,161,86,0,0,30.4,0.165,47,1 +2,108,80,0,0,27,0.259,52,1 +7,136,74,26,135,26,0.647,51,0 +5,155,84,44,545,38.7,0.619,34,0 +1,119,86,39,220,45.6,0.808,29,1 +4,96,56,17,49,20.8,0.34,26,0 +5,108,72,43,75,36.1,0.263,33,0 +0,78,88,29,40,36.9,0.434,21,0 +0,107,62,30,74,36.6,0.757,25,1 +2,128,78,37,182,43.3,1.224,31,1 +1,128,48,45,194,40.5,0.613,24,1 +0,161,50,0,0,21.9,0.254,65,0 +6,151,62,31,120,35.5,0.692,28,0 +2,146,70,38,360,28,0.337,29,1 +0,126,84,29,215,30.7,0.52,24,0 +14,100,78,25,184,36.6,0.412,46,1 +8,112,72,0,0,23.6,0.84,58,0 +0,167,0,0,0,32.3,0.839,30,1 +2,144,58,33,135,31.6,0.422,25,1 +5,77,82,41,42,35.8,0.156,35,0 +5,115,98,0,0,52.9,0.209,28,1 +3,150,76,0,0,21,0.207,37,0 +2,120,76,37,105,39.7,0.215,29,0 +10,161,68,23,132,25.5,0.326,47,1 +0,137,68,14,148,24.8,0.143,21,0 +0,128,68,19,180,30.5,1.391,25,1 +2,124,68,28,205,32.9,0.875,30,1 +6,80,66,30,0,26.2,0.313,41,0 +0,106,70,37,148,39.4,0.605,22,0 +2,155,74,17,96,26.6,0.433,27,1 +3,113,50,10,85,29.5,0.626,25,0 +7,109,80,31,0,35.9,1.127,43,1 +2,112,68,22,94,34.1,0.315,26,0 +3,99,80,11,64,19.3,0.284,30,0 +3,182,74,0,0,30.5,0.345,29,1 
+3,115,66,39,140,38.1,0.15,28,0 +6,194,78,0,0,23.5,0.129,59,1 +4,129,60,12,231,27.5,0.527,31,0 +3,112,74,30,0,31.6,0.197,25,1 +0,124,70,20,0,27.4,0.254,36,1 +13,152,90,33,29,26.8,0.731,43,1 +2,112,75,32,0,35.7,0.148,21,0 +1,157,72,21,168,25.6,0.123,24,0 +1,122,64,32,156,35.1,0.692,30,1 +10,179,70,0,0,35.1,0.2,37,0 +2,102,86,36,120,45.5,0.127,23,1 +6,105,70,32,68,30.8,0.122,37,0 +8,118,72,19,0,23.1,1.476,46,0 +2,87,58,16,52,32.7,0.166,25,0 +1,180,0,0,0,43.3,0.282,41,1 +12,106,80,0,0,23.6,0.137,44,0 +1,95,60,18,58,23.9,0.26,22,0 +0,165,76,43,255,47.9,0.259,26,0 +0,117,0,0,0,33.8,0.932,44,0 +5,115,76,0,0,31.2,0.343,44,1 +9,152,78,34,171,34.2,0.893,33,1 +7,178,84,0,0,39.9,0.331,41,1 +1,130,70,13,105,25.9,0.472,22,0 +1,95,74,21,73,25.9,0.673,36,0 +1,0,68,35,0,32,0.389,22,0 +5,122,86,0,0,34.7,0.29,33,0 +8,95,72,0,0,36.8,0.485,57,0 +8,126,88,36,108,38.5,0.349,49,0 +1,139,46,19,83,28.7,0.654,22,0 +3,116,0,0,0,23.5,0.187,23,0 +3,99,62,19,74,21.8,0.279,26,0 +5,0,80,32,0,41,0.346,37,1 +4,92,80,0,0,42.2,0.237,29,0 +4,137,84,0,0,31.2,0.252,30,0 +3,61,82,28,0,34.4,0.243,46,0 +1,90,62,12,43,27.2,0.58,24,0 +3,90,78,0,0,42.7,0.559,21,0 +9,165,88,0,0,30.4,0.302,49,1 +1,125,50,40,167,33.3,0.962,28,1 +13,129,0,30,0,39.9,0.569,44,1 +12,88,74,40,54,35.3,0.378,48,0 +1,196,76,36,249,36.5,0.875,29,1 +5,189,64,33,325,31.2,0.583,29,1 +5,158,70,0,0,29.8,0.207,63,0 +5,103,108,37,0,39.2,0.305,65,0 +4,146,78,0,0,38.5,0.52,67,1 +4,147,74,25,293,34.9,0.385,30,0 +5,99,54,28,83,34,0.499,30,0 +6,124,72,0,0,27.6,0.368,29,1 +0,101,64,17,0,21,0.252,21,0 +3,81,86,16,66,27.5,0.306,22,0 +1,133,102,28,140,32.8,0.234,45,1 +3,173,82,48,465,38.4,2.137,25,1 +0,118,64,23,89,0,1.731,21,0 +0,84,64,22,66,35.8,0.545,21,0 +2,105,58,40,94,34.9,0.225,25,0 +2,122,52,43,158,36.2,0.816,28,0 +12,140,82,43,325,39.2,0.528,58,1 +0,98,82,15,84,25.2,0.299,22,0 +1,87,60,37,75,37.2,0.509,22,0 +4,156,75,0,0,48.3,0.238,32,1 +0,93,100,39,72,43.4,1.021,35,0 +1,107,72,30,82,30.8,0.821,24,0 +0,105,68,22,0,20,0.236,22,0 
+1,109,60,8,182,25.4,0.947,21,0 +1,90,62,18,59,25.1,1.268,25,0 +1,125,70,24,110,24.3,0.221,25,0 +1,119,54,13,50,22.3,0.205,24,0 +5,116,74,29,0,32.3,0.66,35,1 +8,105,100,36,0,43.3,0.239,45,1 +5,144,82,26,285,32,0.452,58,1 +3,100,68,23,81,31.6,0.949,28,0 +1,100,66,29,196,32,0.444,42,0 +5,166,76,0,0,45.7,0.34,27,1 +1,131,64,14,415,23.7,0.389,21,0 +4,116,72,12,87,22.1,0.463,37,0 +4,158,78,0,0,32.9,0.803,31,1 +2,127,58,24,275,27.7,1.6,25,0 +3,96,56,34,115,24.7,0.944,39,0 +0,131,66,40,0,34.3,0.196,22,1 +3,82,70,0,0,21.1,0.389,25,0 +3,193,70,31,0,34.9,0.241,25,1 +4,95,64,0,0,32,0.161,31,1 +6,137,61,0,0,24.2,0.151,55,0 +5,136,84,41,88,35,0.286,35,1 +9,72,78,25,0,31.6,0.28,38,0 +5,168,64,0,0,32.9,0.135,41,1 +2,123,48,32,165,42.1,0.52,26,0 +4,115,72,0,0,28.9,0.376,46,1 +0,101,62,0,0,21.9,0.336,25,0 +8,197,74,0,0,25.9,1.191,39,1 +1,172,68,49,579,42.4,0.702,28,1 +6,102,90,39,0,35.7,0.674,28,0 +1,112,72,30,176,34.4,0.528,25,0 +1,143,84,23,310,42.4,1.076,22,0 +1,143,74,22,61,26.2,0.256,21,0 +0,138,60,35,167,34.6,0.534,21,1 +3,173,84,33,474,35.7,0.258,22,1 +1,97,68,21,0,27.2,1.095,22,0 +4,144,82,32,0,38.5,0.554,37,1 +1,83,68,0,0,18.2,0.624,27,0 +3,129,64,29,115,26.4,0.219,28,1 +1,119,88,41,170,45.3,0.507,26,0 +2,94,68,18,76,26,0.561,21,0 +0,102,64,46,78,40.6,0.496,21,0 +2,115,64,22,0,30.8,0.421,21,0 +8,151,78,32,210,42.9,0.516,36,1 +4,184,78,39,277,37,0.264,31,1 +0,94,0,0,0,0,0.256,25,0 +1,181,64,30,180,34.1,0.328,38,1 +0,135,94,46,145,40.6,0.284,26,0 +1,95,82,25,180,35,0.233,43,1 +2,99,0,0,0,22.2,0.108,23,0 +3,89,74,16,85,30.4,0.551,38,0 +1,80,74,11,60,30,0.527,22,0 +2,139,75,0,0,25.6,0.167,29,0 +1,90,68,8,0,24.5,1.138,36,0 +0,141,0,0,0,42.4,0.205,29,1 +12,140,85,33,0,37.4,0.244,41,0 +5,147,75,0,0,29.9,0.434,28,0 +1,97,70,15,0,18.2,0.147,21,0 +6,107,88,0,0,36.8,0.727,31,0 +0,189,104,25,0,34.3,0.435,41,1 +2,83,66,23,50,32.2,0.497,22,0 +4,117,64,27,120,33.2,0.23,24,0 +8,108,70,0,0,30.5,0.955,33,1 +4,117,62,12,0,29.7,0.38,30,1 +0,180,78,63,14,59.4,2.42,25,1 
+1,100,72,12,70,25.3,0.658,28,0 +0,95,80,45,92,36.5,0.33,26,0 +0,104,64,37,64,33.6,0.51,22,1 +0,120,74,18,63,30.5,0.285,26,0 +1,82,64,13,95,21.2,0.415,23,0 +2,134,70,0,0,28.9,0.542,23,1 +0,91,68,32,210,39.9,0.381,25,0 +2,119,0,0,0,19.6,0.832,72,0 +2,100,54,28,105,37.8,0.498,24,0 +14,175,62,30,0,33.6,0.212,38,1 +1,135,54,0,0,26.7,0.687,62,0 +5,86,68,28,71,30.2,0.364,24,0 +10,148,84,48,237,37.6,1.001,51,1 +9,134,74,33,60,25.9,0.46,81,0 +9,120,72,22,56,20.8,0.733,48,0 +1,71,62,0,0,21.8,0.416,26,0 +8,74,70,40,49,35.3,0.705,39,0 +5,88,78,30,0,27.6,0.258,37,0 +10,115,98,0,0,24,1.022,34,0 +0,124,56,13,105,21.8,0.452,21,0 +0,74,52,10,36,27.8,0.269,22,0 +0,97,64,36,100,36.8,0.6,25,0 +8,120,0,0,0,30,0.183,38,1 +6,154,78,41,140,46.1,0.571,27,0 +1,144,82,40,0,41.3,0.607,28,0 +0,137,70,38,0,33.2,0.17,22,0 +0,119,66,27,0,38.8,0.259,22,0 +7,136,90,0,0,29.9,0.21,50,0 +4,114,64,0,0,28.9,0.126,24,0 +0,137,84,27,0,27.3,0.231,59,0 +2,105,80,45,191,33.7,0.711,29,1 +7,114,76,17,110,23.8,0.466,31,0 +8,126,74,38,75,25.9,0.162,39,0 +4,132,86,31,0,28,0.419,63,0 +3,158,70,30,328,35.5,0.344,35,1 +0,123,88,37,0,35.2,0.197,29,0 +4,85,58,22,49,27.8,0.306,28,0 +0,84,82,31,125,38.2,0.233,23,0 +0,145,0,0,0,44.2,0.63,31,1 +0,135,68,42,250,42.3,0.365,24,1 +1,139,62,41,480,40.7,0.536,21,0 +0,173,78,32,265,46.5,1.159,58,0 +4,99,72,17,0,25.6,0.294,28,0 +8,194,80,0,0,26.1,0.551,67,0 +2,83,65,28,66,36.8,0.629,24,0 +2,89,90,30,0,33.5,0.292,42,0 +4,99,68,38,0,32.8,0.145,33,0 +4,125,70,18,122,28.9,1.144,45,1 +3,80,0,0,0,0,0.174,22,0 +6,166,74,0,0,26.6,0.304,66,0 +5,110,68,0,0,26,0.292,30,0 +2,81,72,15,76,30.1,0.547,25,0 +7,195,70,33,145,25.1,0.163,55,1 +6,154,74,32,193,29.3,0.839,39,0 +2,117,90,19,71,25.2,0.313,21,0 +3,84,72,32,0,37.2,0.267,28,0 +6,0,68,41,0,39,0.727,41,1 +7,94,64,25,79,33.3,0.738,41,0 +3,96,78,39,0,37.3,0.238,40,0 +10,75,82,0,0,33.3,0.263,38,0 +0,180,90,26,90,36.5,0.314,35,1 +1,130,60,23,170,28.6,0.692,21,0 +2,84,50,23,76,30.4,0.968,21,0 +8,120,78,0,0,25,0.409,64,0 
+12,84,72,31,0,29.7,0.297,46,1 +0,139,62,17,210,22.1,0.207,21,0 +9,91,68,0,0,24.2,0.2,58,0 +2,91,62,0,0,27.3,0.525,22,0 +3,99,54,19,86,25.6,0.154,24,0 +3,163,70,18,105,31.6,0.268,28,1 +9,145,88,34,165,30.3,0.771,53,1 +7,125,86,0,0,37.6,0.304,51,0 +13,76,60,0,0,32.8,0.18,41,0 +6,129,90,7,326,19.6,0.582,60,0 +2,68,70,32,66,25,0.187,25,0 +3,124,80,33,130,33.2,0.305,26,0 +6,114,0,0,0,0,0.189,26,0 +9,130,70,0,0,34.2,0.652,45,1 +3,125,58,0,0,31.6,0.151,24,0 +3,87,60,18,0,21.8,0.444,21,0 +1,97,64,19,82,18.2,0.299,21,0 +3,116,74,15,105,26.3,0.107,24,0 +0,117,66,31,188,30.8,0.493,22,0 +0,111,65,0,0,24.6,0.66,31,0 +2,122,60,18,106,29.8,0.717,22,0 +0,107,76,0,0,45.3,0.686,24,0 +1,86,66,52,65,41.3,0.917,29,0 +6,91,0,0,0,29.8,0.501,31,0 +1,77,56,30,56,33.3,1.251,24,0 +4,132,0,0,0,32.9,0.302,23,1 +0,105,90,0,0,29.6,0.197,46,0 +0,57,60,0,0,21.7,0.735,67,0 +0,127,80,37,210,36.3,0.804,23,0 +3,129,92,49,155,36.4,0.968,32,1 +8,100,74,40,215,39.4,0.661,43,1 +3,128,72,25,190,32.4,0.549,27,1 +10,90,85,32,0,34.9,0.825,56,1 +4,84,90,23,56,39.5,0.159,25,0 +1,88,78,29,76,32,0.365,29,0 +8,186,90,35,225,34.5,0.423,37,1 +5,187,76,27,207,43.6,1.034,53,1 +4,131,68,21,166,33.1,0.16,28,0 +1,164,82,43,67,32.8,0.341,50,0 +4,189,110,31,0,28.5,0.68,37,0 +1,116,70,28,0,27.4,0.204,21,0 +3,84,68,30,106,31.9,0.591,25,0 +6,114,88,0,0,27.8,0.247,66,0 +1,88,62,24,44,29.9,0.422,23,0 +1,84,64,23,115,36.9,0.471,28,0 +7,124,70,33,215,25.5,0.161,37,0 +1,97,70,40,0,38.1,0.218,30,0 +8,110,76,0,0,27.8,0.237,58,0 +11,103,68,40,0,46.2,0.126,42,0 +11,85,74,0,0,30.1,0.3,35,0 +6,125,76,0,0,33.8,0.121,54,1 +0,198,66,32,274,41.3,0.502,28,1 +1,87,68,34,77,37.6,0.401,24,0 +6,99,60,19,54,26.9,0.497,32,0 +0,91,80,0,0,32.4,0.601,27,0 +2,95,54,14,88,26.1,0.748,22,0 +1,99,72,30,18,38.6,0.412,21,0 +6,92,62,32,126,32,0.085,46,0 +4,154,72,29,126,31.3,0.338,37,0 +0,121,66,30,165,34.3,0.203,33,1 +3,78,70,0,0,32.5,0.27,39,0 +2,130,96,0,0,22.6,0.268,21,0 +3,111,58,31,44,29.5,0.43,22,0 +2,98,60,17,120,34.7,0.198,22,0 
+1,143,86,30,330,30.1,0.892,23,0 +1,119,44,47,63,35.5,0.28,25,0 +6,108,44,20,130,24,0.813,35,0 +2,118,80,0,0,42.9,0.693,21,1 +10,133,68,0,0,27,0.245,36,0 +2,197,70,99,0,34.7,0.575,62,1 +0,151,90,46,0,42.1,0.371,21,1 +6,109,60,27,0,25,0.206,27,0 +12,121,78,17,0,26.5,0.259,62,0 +8,100,76,0,0,38.7,0.19,42,0 +8,124,76,24,600,28.7,0.687,52,1 +1,93,56,11,0,22.5,0.417,22,0 +8,143,66,0,0,34.9,0.129,41,1 +6,103,66,0,0,24.3,0.249,29,0 +3,176,86,27,156,33.3,1.154,52,1 +0,73,0,0,0,21.1,0.342,25,0 +11,111,84,40,0,46.8,0.925,45,1 +2,112,78,50,140,39.4,0.175,24,0 +3,132,80,0,0,34.4,0.402,44,1 +2,82,52,22,115,28.5,1.699,25,0 +6,123,72,45,230,33.6,0.733,34,0 +0,188,82,14,185,32,0.682,22,1 +0,67,76,0,0,45.3,0.194,46,0 +1,89,24,19,25,27.8,0.559,21,0 +1,173,74,0,0,36.8,0.088,38,1 +1,109,38,18,120,23.1,0.407,26,0 +1,108,88,19,0,27.1,0.4,24,0 +6,96,0,0,0,23.7,0.19,28,0 +1,124,74,36,0,27.8,0.1,30,0 +7,150,78,29,126,35.2,0.692,54,1 +4,183,0,0,0,28.4,0.212,36,1 +1,124,60,32,0,35.8,0.514,21,0 +1,181,78,42,293,40,1.258,22,1 +1,92,62,25,41,19.5,0.482,25,0 +0,152,82,39,272,41.5,0.27,27,0 +1,111,62,13,182,24,0.138,23,0 +3,106,54,21,158,30.9,0.292,24,0 +3,174,58,22,194,32.9,0.593,36,1 +7,168,88,42,321,38.2,0.787,40,1 +6,105,80,28,0,32.5,0.878,26,0 +11,138,74,26,144,36.1,0.557,50,1 +3,106,72,0,0,25.8,0.207,27,0 +6,117,96,0,0,28.7,0.157,30,0 +2,68,62,13,15,20.1,0.257,23,0 +9,112,82,24,0,28.2,1.282,50,1 +0,119,0,0,0,32.4,0.141,24,1 +2,112,86,42,160,38.4,0.246,28,0 +2,92,76,20,0,24.2,1.698,28,0 +6,183,94,0,0,40.8,1.461,45,0 +0,94,70,27,115,43.5,0.347,21,0 +2,108,64,0,0,30.8,0.158,21,0 +4,90,88,47,54,37.7,0.362,29,0 +0,125,68,0,0,24.7,0.206,21,0 +0,132,78,0,0,32.4,0.393,21,0 +5,128,80,0,0,34.6,0.144,45,0 +4,94,65,22,0,24.7,0.148,21,0 +7,114,64,0,0,27.4,0.732,34,1 +0,102,78,40,90,34.5,0.238,24,0 +2,111,60,0,0,26.2,0.343,23,0 +1,128,82,17,183,27.5,0.115,22,0 +10,92,62,0,0,25.9,0.167,31,0 +13,104,72,0,0,31.2,0.465,38,1 +5,104,74,0,0,28.8,0.153,48,0 +2,94,76,18,66,31.6,0.649,23,0 
+7,97,76,32,91,40.9,0.871,32,1 +1,100,74,12,46,19.5,0.149,28,0 +0,102,86,17,105,29.3,0.695,27,0 +4,128,70,0,0,34.3,0.303,24,0 +6,147,80,0,0,29.5,0.178,50,1 +4,90,0,0,0,28,0.61,31,0 +3,103,72,30,152,27.6,0.73,27,0 +2,157,74,35,440,39.4,0.134,30,0 +1,167,74,17,144,23.4,0.447,33,1 +0,179,50,36,159,37.8,0.455,22,1 +11,136,84,35,130,28.3,0.26,42,1 +0,107,60,25,0,26.4,0.133,23,0 +1,91,54,25,100,25.2,0.234,23,0 +1,117,60,23,106,33.8,0.466,27,0 +5,123,74,40,77,34.1,0.269,28,0 +2,120,54,0,0,26.8,0.455,27,0 +1,106,70,28,135,34.2,0.142,22,0 +2,155,52,27,540,38.7,0.24,25,1 +2,101,58,35,90,21.8,0.155,22,0 +1,120,80,48,200,38.9,1.162,41,0 +11,127,106,0,0,39,0.19,51,0 +3,80,82,31,70,34.2,1.292,27,1 +10,162,84,0,0,27.7,0.182,54,0 +1,199,76,43,0,42.9,1.394,22,1 +8,167,106,46,231,37.6,0.165,43,1 +9,145,80,46,130,37.9,0.637,40,1 +6,115,60,39,0,33.7,0.245,40,1 +1,112,80,45,132,34.8,0.217,24,0 +4,145,82,18,0,32.5,0.235,70,1 +10,111,70,27,0,27.5,0.141,40,1 +6,98,58,33,190,34,0.43,43,0 +9,154,78,30,100,30.9,0.164,45,0 +6,165,68,26,168,33.6,0.631,49,0 +1,99,58,10,0,25.4,0.551,21,0 +10,68,106,23,49,35.5,0.285,47,0 +3,123,100,35,240,57.3,0.88,22,0 +8,91,82,0,0,35.6,0.587,68,0 +6,195,70,0,0,30.9,0.328,31,1 +9,156,86,0,0,24.8,0.23,53,1 +0,93,60,0,0,35.3,0.263,25,0 +3,121,52,0,0,36,0.127,25,1 +2,101,58,17,265,24.2,0.614,23,0 +2,56,56,28,45,24.2,0.332,22,0 +0,162,76,36,0,49.6,0.364,26,1 +0,95,64,39,105,44.6,0.366,22,0 +4,125,80,0,0,32.3,0.536,27,1 +5,136,82,0,0,0,0.64,69,0 +2,129,74,26,205,33.2,0.591,25,0 +3,130,64,0,0,23.1,0.314,22,0 +1,107,50,19,0,28.3,0.181,29,0 +1,140,74,26,180,24.1,0.828,23,0 +1,144,82,46,180,46.1,0.335,46,1 +8,107,80,0,0,24.6,0.856,34,0 +13,158,114,0,0,42.3,0.257,44,1 +2,121,70,32,95,39.1,0.886,23,0 +7,129,68,49,125,38.5,0.439,43,1 +2,90,60,0,0,23.5,0.191,25,0 +7,142,90,24,480,30.4,0.128,43,1 +3,169,74,19,125,29.9,0.268,31,1 +0,99,0,0,0,25,0.253,22,0 +4,127,88,11,155,34.5,0.598,28,0 +4,118,70,0,0,44.5,0.904,26,0 +2,122,76,27,200,35.9,0.483,26,0 
+6,125,78,31,0,27.6,0.565,49,1 +1,168,88,29,0,35,0.905,52,1 +2,129,0,0,0,38.5,0.304,41,0 +4,110,76,20,100,28.4,0.118,27,0 +6,80,80,36,0,39.8,0.177,28,0 +10,115,0,0,0,0,0.261,30,1 +2,127,46,21,335,34.4,0.176,22,0 +9,164,78,0,0,32.8,0.148,45,1 +2,93,64,32,160,38,0.674,23,1 +3,158,64,13,387,31.2,0.295,24,0 +5,126,78,27,22,29.6,0.439,40,0 +10,129,62,36,0,41.2,0.441,38,1 +0,134,58,20,291,26.4,0.352,21,0 +3,102,74,0,0,29.5,0.121,32,0 +7,187,50,33,392,33.9,0.826,34,1 +3,173,78,39,185,33.8,0.97,31,1 +10,94,72,18,0,23.1,0.595,56,0 +1,108,60,46,178,35.5,0.415,24,0 +5,97,76,27,0,35.6,0.378,52,1 +4,83,86,19,0,29.3,0.317,34,0 +1,114,66,36,200,38.1,0.289,21,0 +1,149,68,29,127,29.3,0.349,42,1 +5,117,86,30,105,39.1,0.251,42,0 +1,111,94,0,0,32.8,0.265,45,0 +4,112,78,40,0,39.4,0.236,38,0 +1,116,78,29,180,36.1,0.496,25,0 +0,141,84,26,0,32.4,0.433,22,0 +2,175,88,0,0,22.9,0.326,22,0 +2,92,52,0,0,30.1,0.141,22,0 +3,130,78,23,79,28.4,0.323,34,1 +8,120,86,0,0,28.4,0.259,22,1 +2,174,88,37,120,44.5,0.646,24,1 +2,106,56,27,165,29,0.426,22,0 +2,105,75,0,0,23.3,0.56,53,0 +4,95,60,32,0,35.4,0.284,28,0 +0,126,86,27,120,27.4,0.515,21,0 +8,65,72,23,0,32,0.6,42,0 +2,99,60,17,160,36.6,0.453,21,0 +1,102,74,0,0,39.5,0.293,42,1 +11,120,80,37,150,42.3,0.785,48,1 +3,102,44,20,94,30.8,0.4,26,0 +1,109,58,18,116,28.5,0.219,22,0 +9,140,94,0,0,32.7,0.734,45,1 +13,153,88,37,140,40.6,1.174,39,0 +12,100,84,33,105,30,0.488,46,0 +1,147,94,41,0,49.3,0.358,27,1 +1,81,74,41,57,46.3,1.096,32,0 +3,187,70,22,200,36.4,0.408,36,1 +6,162,62,0,0,24.3,0.178,50,1 +4,136,70,0,0,31.2,1.182,22,1 +1,121,78,39,74,39,0.261,28,0 +3,108,62,24,0,26,0.223,25,0 +0,181,88,44,510,43.3,0.222,26,1 +8,154,78,32,0,32.4,0.443,45,1 +1,128,88,39,110,36.5,1.057,37,1 +7,137,90,41,0,32,0.391,39,0 +0,123,72,0,0,36.3,0.258,52,1 +1,106,76,0,0,37.5,0.197,26,0 +6,190,92,0,0,35.5,0.278,66,1 +2,88,58,26,16,28.4,0.766,22,0 +9,170,74,31,0,44,0.403,43,1 +9,89,62,0,0,22.5,0.142,33,0 +10,101,76,48,180,32.9,0.171,63,0 +2,122,70,27,0,36.8,0.34,27,0 
+5,121,72,23,112,26.2,0.245,30,0 +1,126,60,0,0,30.1,0.349,47,1 +1,93,70,31,0,30.4,0.315,23,0 \ No newline at end of file From b9237115d9aff6d70be94e73894d159a2721230d Mon Sep 17 00:00:00 2001 From: FrizzoDavide Date: Sun, 3 Dec 2023 10:46:00 +0100 Subject: [PATCH 16/20] Added new functionalities to the plot methods --- pyod/models/iforest.py | 65 ++++++++++++++++++++++++++++----------- pyod/test/test_iforest.py | 37 ++++++++++++++++++++++ 2 files changed, 84 insertions(+), 18 deletions(-) diff --git a/pyod/models/iforest.py b/pyod/models/iforest.py index 4bca7c904..c34efd180 100644 --- a/pyod/models/iforest.py +++ b/pyod/models/iforest.py @@ -460,7 +460,7 @@ def fs_datasets_hyperparams(self,dataset): } return data[dataset] - def diffi_ib(self, X, adjust_iic=True): # "ib" stands for "in-bag" + def diffi_ib(self, X: np.array, adjust_iic=True): # "ib" stands for "in-bag" """Computes the Global Feature Importance scores for a set of input samples according to the DIFFI algorithm. Parameters @@ -555,7 +555,7 @@ def diffi_ib(self, X, adjust_iic=True): # "ib" stands for "in-bag" return fi_ib, exec_time - def local_diffi(self, x): + def local_diffi(self, x: np.array): """Compute the Local Feature Importance scores for a single input sample according to the DIFFI algorithm. Parameters @@ -663,7 +663,7 @@ def _get_iic(self,estimator, predictions, is_leaves, adjust_iic): lambda_[node] = tmp return lambda_ - def local_diffi_batch(self, X): + def local_diffi_batch(self, X: np.array): """Computes the Local Feature Importance scores for a set of input samples according to the DIFFI algorithm. 
Parameters @@ -694,14 +694,14 @@ def local_diffi_batch(self, X): - def compute_local_importances(self,X: pd.DataFrame,name: str,pwd_imp_score: str = os.getcwd(), pwd_plt_data: str = os.getcwd()) -> tuple[np.array,dict,str,str]: + def compute_local_importances(self,X: np.array,name: str,pwd_imp_score: str = os.getcwd(), pwd_plt_data: str = os.getcwd()) -> tuple[np.array,dict,str,str]: """ Collect useful information that will be successively used by the plt_importances_bars,plt_global_importance_bar and plt_feat_bar_plot functions. Parameters ---------- - X: Input dataset + X: Input dataset,np.array of shape (n_samples,n_features) name: Dataset's name pwd_imp_score: Directory where the Importance Scores results will be saved as pkl files, by default the current working directory pwd_plt_data: Directory where the plot data results will be saved as pkl files, by default the current working directory @@ -754,14 +754,14 @@ def compute_local_importances(self,X: pd.DataFrame,name: str,pwd_imp_score: str return fi,plt_data,path_fi,path_plt_data - def compute_global_importances(self,X: pd.DataFrame, n_runs:int, name: str,pwd_imp_score: str = os.getcwd(), pwd_plt_data: str = os.getcwd()) -> tuple[np.array,dict,str,str]: + def compute_global_importances(self,X: np.array, n_runs:int, name: str,pwd_imp_score: str = os.getcwd(), pwd_plt_data: str = os.getcwd()) -> tuple[np.array,dict,str,str]: """ Collect useful information that will be successively used by the plt_importances_bars,plt_global_importance_bar and plt_feat_bar_plot functions. Parameters ---------- - X: Input Dataset + X: Input Dataset,np.array of shape (n_samples,n_features) n_runs: Number of runs to perform in order to compute the Global Feature Importance Scores. 
name: Dataset's name pwd_imp_score: Directory where the Importance Scores results will be saved as pkl files, by default the current working directory @@ -812,7 +812,7 @@ def compute_global_importances(self,X: pd.DataFrame, n_runs:int, name: str,pwd_i return fi,plt_data,path_fi,path_plt_data - def plt_importances_bars(self,imps_path: str, name: str, pwd: str =os.getcwd(),f: int = 6,is_local: bool=False, save: bool =True): + def plt_importances_bars(self,imps_path: str, name: str, pwd: str =os.getcwd(),f: int = 6,col_names = None, is_local: bool=False, save: bool =True): """ Obtain the Global Importance Bar Plot given the Importance Scores values computed in the compute_local_importance or compute_global_importance functions. @@ -822,7 +822,8 @@ def plt_importances_bars(self,imps_path: str, name: str, pwd: str =os.getcwd(),f Obtained from the compute_local_importance or compute_global_importance functions. name: Dataset's name pwd: Directory where the plot will be saved as a PDF file. By default the value of pwd is set to the current working directory. - f: Number of vertical bars to include in the Bar Plot. By default f is set to 6. + f: Number of vertical bars to include in the Bar Plot. By default f is set to 6. + col_names: List with the names of the features of the input dataset, by default None. is_local: Boolean variable used to specify weather we are plotting the Global or Local Feature Importance in order to set the file name. If is_local is True the result will be the LFI Score Plot (based on the LFI scores of the input samples), otherwise the result is the GFI Score Plot (based on the GFI scores obtained in the different n_runs execution of the model). By default is_local is set to False. 
@@ -882,7 +883,10 @@ def plt_importances_bars(self,imps_path: str, name: str, pwd: str =os.getcwd(),f fig, ax = plt.subplots() for i in range(dim): - ax.bar(r[:f], bars.T.iloc[i, :f].values, bottom=bars.T.iloc[:i, :f].sum().values, color=color[i % number_colours], edgecolor='white', width=barWidth, label=str(i), hatch=patterns[i // number_colours]) + if col_names is not None: + ax.bar(r[:f], bars.T.iloc[i, :f].values, bottom=bars.T.iloc[:i, :f].sum().values, color=color[i % number_colours], edgecolor='white', width=barWidth, label=col_names[i], hatch=patterns[i // number_colours]) + else: + ax.bar(r[:f], bars.T.iloc[i, :f].values, bottom=bars.T.iloc[:i, :f].sum().values, color=color[i % number_colours], edgecolor='white', width=barWidth, label=str(i), hatch=patterns[i // number_colours]) ax.set_xlabel("Rank", fontsize=20) ax.set_xticks(range(f), tick_names[:f]) @@ -896,7 +900,7 @@ def plt_importances_bars(self,imps_path: str, name: str, pwd: str =os.getcwd(),f return fig, ax, bars - def plt_feat_bar_plot(self,plt_data_path: str,name: str,pwd: str =os.getcwd(),is_local: bool =False,save: bool =True): + def plt_feat_bar_plot(self,plt_data_path: str,name: str,pwd: str =os.getcwd(),col_names=None,is_local: bool =False,save: bool =True): """ Obtain the Global Feature Importance Score Plot exploiting the information obtained from the compute_local_importance or compute_global_importance functions. @@ -905,7 +909,8 @@ def plt_feat_bar_plot(self,plt_data_path: str,name: str,pwd: str =os.getcwd(),is plt_data_path: Dictionary generated from the compute_local_importance or compute_global_importance functions with the necessary information to create the Score Plot. name: Dataset's name - pwd: Directory where the plot will be saved as a PDF file. By default the value of pwd is set to the current working directory. + pwd: Directory where the plot will be saved as a PDF file. By default the value of pwd is set to the current working directory. 
+ col_names: List with the names of the features of the input dataset, by default None. is_local: Boolean variable used to specify weather we are plotting the Global or Local Feature Importance in order to set the file name. If is_local is True the result will be the LFI Score Plot (based on the LFI scores of the input samples), otherwise the result is the GFI Score Plot (based on the GFI scores obtained in the different n_runs execution of the model). By default is_local is set to False. @@ -965,14 +970,21 @@ def plt_feat_bar_plot(self,plt_data_path: str,name: str,pwd: str =os.getcwd(),is ax1.set_ylabel('Features',fontsize=20) plt.xlim(xlim) plt.subplots_adjust(left=0.3) + + if col_names is not None: + ax1.set_yticks(range(dim)) + ax1.set_yticklabels(col_names) + + if save: plt.savefig(pwd+'/{}.pdf'.format(name_file),bbox_inches='tight') return ax1,ax2 - def plot_importance_map(self,name: str, X_train: pd.DataFrame,y_train: np.array ,resolution: int, - pwd: str =os.getcwd(),save: bool =True,m: bool =None,factor: int =3,feats_plot: tuple[int,int] =(0,1),ax=None,labels: bool=True): + def plot_importance_map(self,name: str, X_train: np.array,y_train: np.array ,resolution: int, + pwd: str =os.getcwd(),save: bool =True,m: bool =None,factor: int =3, feats_plot: tuple =(0,1), + col_names=None,ax=None,labels: bool=True): """ Produce the Local Feature Importance Scoremap. @@ -987,7 +999,9 @@ def plot_importance_map(self,name: str, X_train: pd.DataFrame,y_train: np.array m: Boolean variable regulating the plt.pcolor advanced settings. By defualt the value of m is set to None. factor: Integer factor used to define the minimum and maximum value of the points used to create the scoremap. By default the value of f is set to 3. feats_plot: This tuple contains the indexes of the pair features to compare in the Scoremap. By default the value of feats_plot - is set to (0,1). + is set to (0,1). Do not use in case we pass the col_names parameter. 
+ col_names: List with the names of the features of the input dataset, by default None. + two features will be compared. ax: plt.axes object used to create the plot. By default ax is set to None. labels: Boolean variable used to decide weather to include the x and y label name in the plot. When calling the plot_importance_map function inside plot_complete_scoremap this parameter will be set to False @@ -996,6 +1010,7 @@ def plot_importance_map(self,name: str, X_train: pd.DataFrame,y_train: np.array ---------- fig,ax : plt.figure and plt.axes objects used to create the plot """ + mins = X_train.min(axis=0)[list(feats_plot)] maxs = X_train.max(axis=0)[list(feats_plot)] mean = X_train.mean(axis = 0) @@ -1037,9 +1052,12 @@ def plot_importance_map(self,name: str, X_train: pd.DataFrame,y_train: np.array ax.scatter(x[(y_train == 0)[:, 0]], y[(y_train == 0)[:, 0]], s=40, c="tab:blue", marker="o", edgecolors="k", label="inliers") ax.scatter(x[(y_train == 1)[:, 0]], y[(y_train == 1)[:, 0]], s=60, c="tab:orange", marker="*", edgecolors="k", label="outliers") - if labels: - ax.set_xlabel(f'Feature {feats_plot[0]}') - ax.set_ylabel(f'Feature {feats_plot[1]}') + if (labels) and (col_names is not None): + ax.set_xlabel(col_names[feats_plot[0]],fontsize=20) + ax.set_ylabel(col_names[feats_plot[1]],fontsize=20) + elif (labels) and (col_names is None): + ax.set_xlabel(f'Feature {feats_plot[0]}',fontsize=20) + ax.set_ylabel(f'Feature {feats_plot[1]}',fontsize=20) ax.legend() @@ -1049,6 +1067,17 @@ def plot_importance_map(self,name: str, X_train: pd.DataFrame,y_train: np.array fig,ax=None,None return fig, ax + + def plot_importance_map_col_names(self,name: str, X:pd.DataFrame, X_train: np.array,y_train: np.array ,resolution: int, + pwd: str =os.getcwd(),save: bool =True,m: bool =None,factor: int =3, + col_names=None,ax=None,labels: bool=True): + + feats_plot=tuple((X.columns.get_loc(col_names[0]),X.columns.get_loc(col_names[1]))) + col_names=list(X.columns) + + return 
self.plot_importance_map(name,X_train,y_train,resolution,pwd,save,m,factor,feats_plot,col_names,ax,labels) + + #col_names: A list with the names of the two features to compare in the Scoremap. By default the value of col_names is set to None and the first def plot_complete_scoremap(self,name:str,dim:int,X: pd.DataFrame, y: np.array, pwd:str =os.getcwd()): """Produce the Complete Local Feature Importance Scoremap: a Scoremap for each pair of features in the input dataset. diff --git a/pyod/test/test_iforest.py b/pyod/test/test_iforest.py index bd78ec72d..46ec63ece 100644 --- a/pyod/test/test_iforest.py +++ b/pyod/test/test_iforest.py @@ -361,6 +361,9 @@ def test_compute_local_importances(self): #Check that the pkl files can be loaded assert pickle.load(open(path_fi,'rb')) is not None assert pickle.load(open(path_plt_data,'rb')) is not None + #Check that X_tr and y_tr are of type np.ndarray + assert type(X_tr) == np.ndarray + assert type(y_tr) == np.ndarray """ Tests on fi and plt_data @@ -430,6 +433,9 @@ def test_compute_global_importances(self): #Check that the pkl files can be loaded assert pickle.load(open(path_fi,'rb')) is not None assert pickle.load(open(path_plt_data,'rb')) is not None + #Check that X_tr and y_tr are of type np.ndarray + assert type(X_tr) == np.ndarray + assert type(y_tr) == np.ndarray """ Tests on fi and plt_data @@ -609,6 +615,37 @@ def test_plot_importance_map(self): assert ax is not None assert fig is not None + def test_plot_importance_map_col_names(self): + + # Let's perform the test on the pima.csv dataset + path = os.path.join(os.getcwd(),'pyod','test','data','pima.csv') + data=pd.read_csv(path) + X_tr=data.drop(columns=['Outcome']) + y_tr=data['Outcome'] + X_tr,y_tr=shuffle(X_tr,y_tr,random_state=0) + X,y=X_tr.values,y_tr.values + + name='test_pima_col_names' + + # create an isolation forest model + iforest = IForest(n_estimators=10, max_samples=64, random_state=0) + iforest.fit(X_tr) +
plot_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_plots') + + #If the folder does not exist, create it: + if not os.path.exists(plot_path): + os.makedirs(plot_path) + + fig,ax=iforest.plot_importance_map_col_names(name,X_tr,X,y,30,pwd=plot_path,col_names=['Pregnancies','Glucose']) + + """ + Tests on ax + """ + + #Check that the returned ax is not None + assert ax is not None + assert fig is not None + def test_plot_complete_scoremap(self): # Here we'll use a random dataset with just 3 features otherwise it takes too much time to From c273710d338b2e33f22e79921044c75ef630f089 Mon Sep 17 00:00:00 2001 From: FrizzoDavide Date: Sun, 3 Dec 2023 17:35:05 +0100 Subject: [PATCH 17/20] Added plot_importance_map_col_names method --- pyod/models/iforest.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pyod/models/iforest.py b/pyod/models/iforest.py index c34efd180..4c654822f 100644 --- a/pyod/models/iforest.py +++ b/pyod/models/iforest.py @@ -1071,6 +1071,23 @@ def plot_importance_map(self,name: str, X_train: np.array,y_train: np.array ,res def plot_importance_map_col_names(self,name: str, X:pd.DataFrame, X_train: np.array,y_train: np.array ,resolution: int, pwd: str =os.getcwd(),save: bool =True,m: bool =None,factor: int =3, col_names=None,ax=None,labels: bool=True): + """Stub method of plot_importance_map used to give the user the possibility of specifying the names of the features to compare in the Scoremap. + + Parameters + ---------- + name: Dataset's name + X: Input dataset as a pd.DataFrame + X_train: Training Set + y_train: Dataset training labels + resolution: Scoremap resolution + pwd: Directory where the plot will be saved as a PDF file. By default the value of pwd is set to the current working directory. + save: Boolean variable used to decide whether to save the Score Plot locally as a PDF or not. By default save is set to True. + m: Boolean variable regulating the plt.pcolor advanced settings.
By default the value of m is set to None. + factor: Integer factor used to define the minimum and maximum value of the points used to create the scoremap. By default the value of factor is set to 3. + col_names: List with the names of the two features that will be compared, by default None. + ax: plt.axes object used to create the plot. By default ax is set to None. + labels: Boolean variable used to decide whether to include the x and y label name in the plot. + """ feats_plot=tuple((X.columns.get_loc(col_names[0]),X.columns.get_loc(col_names[1]))) col_names=list(X.columns) From 742163c4420c38f3ad9b23a9f95cc46de07de10b Mon Sep 17 00:00:00 2001 From: FrizzoDavide Date: Sun, 3 Dec 2023 18:57:49 +0100 Subject: [PATCH 18/20] Correct typo in iforest.py --- pyod/models/iforest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyod/models/iforest.py b/pyod/models/iforest.py index 4c654822f..a6645c9a5 100644 --- a/pyod/models/iforest.py +++ b/pyod/models/iforest.py @@ -973,7 +973,9 @@ def plt_feat_bar_plot(self,plt_data_path: str,name: str,pwd: str =os.getcwd(),co if col_names is not None: ax1.set_yticks(range(dim)) - ax1.set_yticklabels(col_names) + idx=list(feat_imp['Feature']) + yticks=[col_names[i] for i in idx] + ax1.set_yticklabels(yticks) if save: From adc330cacad2e07277f083cb0778c2a33eb5c2c3 Mon Sep 17 00:00:00 2001 From: FrizzoDavide Date: Tue, 19 Dec 2023 16:21:15 +0100 Subject: [PATCH 19/20] Added tearDown method --- pyod/test/test_iforest.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/pyod/test/test_iforest.py b/pyod/test/test_iforest.py index 46ec63ece..401595beb 100644 --- a/pyod/test/test_iforest.py +++ b/pyod/test/test_iforest.py @@ -44,6 +44,23 @@ def setUp(self): self.clf = IForest(contamination=self.contamination, random_state=42) self.clf.fit(self.X_train) + def tearDown(self): + test_imp_score_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_imp_score') +
test_plt_data_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_plt_data') + test_plots_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_plots') + files_to_delete = [os.path.join(test_imp_score_path,'imp_scores_GFI_test_global_pima.pkl'),os.path.join(test_imp_score_path,'imp_scores_LFI_test_global_pima.pkl'), + os.path.join(test_plt_data_path,'plt_data_GFI_test_global_pima.pkl'),os.path.join(test_plt_data_path,'plt_data_LFI_test_global_pima.pkl'), + os.path.join(test_plots_path,'GFI_Bar_plot_test_pima_bar_plot.pdf'),os.path.join(test_plots_path,'GFI_Bar_plot_test_pima_9_bar_plot.pdf'), + os.path.join(test_plots_path,'GFI_Score_Plot_test_GFI_pima.pdf'),os.path.join(test_plots_path,'GFI_Score_Plot_test_LFI_pima.pdf'), + os.path.join(test_plots_path,'Local_Importance_Scoremap_test_pima.pdf'),os.path.join(test_plots_path,'Local_Importance_Scoremap_test_pima_col_names.pdf'), + os.path.join(test_plots_path,'Local_Importance_Scoremap_test_pima_complete')] + + for file in files_to_delete: + try: + os.remove(file) + except FileNotFoundError: + print('File not found') + def test_parameters(self): assert (hasattr(self.clf, 'decision_scores_') and self.clf.decision_scores_ is not None) @@ -193,7 +210,9 @@ def test_decision_function_single_tree(self): #Check weather the two decision function values are different - assert not np.array_equal(clf1.decision_function_single_tree(tree_idx,X), clf2.decision_function_single_tree(tree_idx,X)) + assert_array_almost_equal( + clf1.decision_function_single_tree(tree_idx,X), clf2.decision_function_single_tree(tree_idx,X), + decimal=1) def test_score_samples(self): @@ -655,7 +674,7 @@ def test_plot_complete_scoremap(self): #Assign at random the anomalous/not anomaoous labels #Create a random array of 0 and 1 of shape=(100,) y=np.random.randint(0,2,size=100) - name='test_complete' + name='test_pima' # create an isolation forest model iforest = IForest(n_estimators=10, max_samples=64, random_state=0) iforest.fit(X) From 
ac64a57e71dffbdc8e500078064c145380074254 Mon Sep 17 00:00:00 2001 From: FrizzoDavide Date: Tue, 19 Dec 2023 17:56:33 +0100 Subject: [PATCH 20/20] Update test_iforest.py --- pyod/test/test_iforest.py | 62 ++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/pyod/test/test_iforest.py b/pyod/test/test_iforest.py index 401595beb..b5cdc8951 100644 --- a/pyod/test/test_iforest.py +++ b/pyod/test/test_iforest.py @@ -10,6 +10,7 @@ import pandas as pd import pickle import scipy +import shutil # noinspection PyProtectedMember from numpy.testing import assert_allclose @@ -43,24 +44,7 @@ def setUp(self): self.clf = IForest(contamination=self.contamination, random_state=42) self.clf.fit(self.X_train) - - def tearDown(self): - test_imp_score_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_imp_score') - test_plt_data_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_plt_data') - test_plots_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_plots') - files_to_delete = [os.path.join(test_imp_score_path,'imp_scores_GFI_test_global_pima.pkl'),os.path.join(test_imp_score_path,'imp_scores_LFI_test_global_pima.pkl'), - os.path.join(test_plt_data_path,'plt_data_GFI_test_global_pima.pkl'),os.path.join(test_plt_data_path,'plt_data_LFI_test_global_pima.pkl'), - os.path.join(test_plots_path,'GFI_Bar_plot_test_pima_bar_plot.pdf'),os.path.join(test_plots_path,'GFI_Bar_plot_test_pima_9_bar_plot.pdf'), - os.path.join(test_plots_path,'GFI_Score_Plot_test_GFI_pima.pdf'),os.path.join(test_plots_path,'GFI_Score_Plot_test_LFI_pima.pdf'), - os.path.join(test_plots_path,'Local_Importance_Scoremap_test_pima.pdf'),os.path.join(test_plots_path,'Local_Importance_Scoremap_test_pima_col_names.pdf'), - os.path.join(test_plots_path,'Local_Importance_Scoremap_test_pima_complete')] - - for file in files_to_delete: - try: - os.remove(file) - except FileNotFoundError: - print('File not found') - + def test_parameters(self): 
assert (hasattr(self.clf, 'decision_scores_') and self.clf.decision_scores_ is not None) @@ -340,6 +324,12 @@ def test_local_diffi_batch(self): assert np.all(np.array(exec_time)>=0) == True def test_compute_local_importances(self): + + test_dir_path=os.path.join(os.getcwd(),'pyod','test','test_data') + + #If the folder do not exist create it: + if not os.path.exists(test_dir_path): + os.makedirs(test_dir_path) #Create a path to save the pkl files created by compute_local_importances test_imp_score_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_imp_score') @@ -491,9 +481,10 @@ def test_plot_importances_bars(self): #We create the plot with plot_importances_bars and we will then compare it with the #expected result contained in GFI_glass_synt.pdf - imps_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_imp_score','imp_scores_GFI_test_global_pima.pkl') + imps_path_global=os.path.join(os.getcwd(),'pyod','test','test_data','test_imp_score','imp_scores_GFI_test_global_pima.pkl') + imps_path_local=os.path.join(os.getcwd(),'pyod','test','test_data','test_imp_score','imp_scores_LFI_test_local_pima.pkl') - imps=pickle.load(open(imps_path,'rb')) + imps=pickle.load(open(imps_path_global,'rb')) #Create a path to save the plot image plot_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_plots') @@ -508,7 +499,7 @@ def test_plot_importances_bars(self): #Create a name for the plot name='test_pima' f=6 - fig,ax,bars=iforest.plt_importances_bars(imps_path,name,pwd=plot_path,f=f) + fig,ax,bars=iforest.plt_importances_bars(imps_path_global,name,pwd=plot_path,f=f) """ Tests on ax @@ -530,7 +521,7 @@ def test_plot_importances_bars(self): #See if the plot correctly changes if I pass from f=6 (default value) to f=9 f1=9 - fig1,ax1,bars1=iforest.plt_importances_bars(imps_path,name='test_pima_9',pwd=plot_path,f=f1) + fig1,ax1,bars1=iforest.plt_importances_bars(imps_path_global,name='test_pima_9',pwd=plot_path,f=f1) #Check that the xtick and y tick 
labels are correct x_tick_labels1 = [tick.get_text() for tick in ax1.get_xticklabels()] @@ -556,6 +547,10 @@ def test_plot_importances_bars(self): bars1_sum=np.array([bars1[i].sum() for i in range(bars1.shape[1])]) assert_array_almost_equal(bars1_sum,np.full(bars1.shape[1],100)) + #At the end of the test delete the pkl file + os.remove(imps_path_global) + os.remove(imps_path_local) + def test_plt_feat_bar_plot(self): # We need the plt_data array: let's consider the global case with plt_data_GFI_glass.pkl and @@ -604,6 +599,10 @@ def test_plt_feat_bar_plot(self): assert np.all(np.array(y_tick_labels_local).astype('float')>=len(y_tick_labels2_local)-1) == False assert np.all(np.array(y_tick_labels_global).astype('float')>=len(y_tick_labels2_global)-1) == False + #At the end of the test delete the pkl file + os.remove(plt_data_global_path) + os.remove(plt_data_local_path) + def test_plot_importance_map(self): # Let's perform the test on the pima.mat dataset @@ -695,7 +694,24 @@ def test_plot_complete_scoremap(self): assert fig is not None def tearDown(self): - pass + test_imp_score_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_imp_score') + test_plt_data_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_plt_data') + test_plots_path=os.path.join(os.getcwd(),'pyod','test','test_data','test_plots') + files_to_delete = [os.path.join(test_imp_score_path,'imp_scores_GFI_test_global_pima.pkl'),os.path.join(test_imp_score_path,'imp_scores_LFI_test_local_pima.pkl'), + os.path.join(test_plt_data_path,'plt_data_GFI_test_global_pima.pkl'),os.path.join(test_plt_data_path,'plt_data_LFI_test_local_pima.pkl'), + os.path.join(test_plots_path,'GFI_Bar_plot_test_pima_bar_plot.pdf'),os.path.join(test_plots_path,'GFI_Bar_plot_test_pima_9_bar_plot.pdf'), + os.path.join(test_plots_path,'GFI_Score_plot_test_GFI_pima.pdf'),os.path.join(test_plots_path,'LFI_Score_plot_test_LFI_pima.pdf'), + 
os.path.join(test_plots_path,'Local_Importance_Scoremap_test_pima.pdf'),os.path.join(test_plots_path,'Local_Importance_Scoremap_test_pima_col_names.pdf'), + os.path.join(test_plots_path,'Local_Importance_Scoremap_test_pima_complete.pdf')] + + for file in files_to_delete: + try: + if file.endswith('.pkl'): + pass + else: + os.remove(file) + except FileNotFoundError: + pass if __name__ == '__main__':