code/models/Data.py

"""
    This module handles all dataset related functionalities.
"""

import cv2
import datetime
import glob
import leargist
import numpy as np
import os
import random
import scipy
import pefile
import time

# from tensorflow.keras.utils import Sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.python.keras.utils import data_utils

from PIL import Image
#from skimage.feature import local_binary_pattern
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from vendor.pe_injector.injector import PEInjector
from vendor.pe_injector.shifter import PEShifter

class OutputManager:
    def __init__(self, output_mode=True):
        # Define output paths
        self.logger = Logger(time_format="%Y%m%d_%H:%M:%S")
        self.model_path = os.path.dirname(os.path.realpath(__file__))
        self.output_path = self.model_path + '/../output/'
        self.tmp_output_path = self.output_path + self.logger.get_init_time_formatted()
        self.output_mode = output_mode
        self.tmp_model_output_path = self.tmp_output_path + '/models'
        self.log_file = None

        if self.output_mode is True:
            # Create output folders/files
            try:
                os.mkdir(self.tmp_output_path)
                os.mkdir(self.tmp_model_output_path)
            except Exception as identifier:
                self.logger.exception(identifier)
                raise Exception('Failure during output files creation.')

    def create_log_file(self, temp=True, name="output.txt"):
        self.log_file = open(
            self.tmp_output_path + '/' + name,
            "w"
        )

    def create_file(self, name, temp=True):
        if self.output_mode is not True:
            raise Exception('Writing to log without output mode enabled.')

        if temp:
            save_path = self.tmp_output_path
        else:
            save_path = self.output_path

        return open(
            save_path + '/' + name,
            "w"
        )

    def save_to_log(self, data):
        if self.output_mode is not True:
            raise Exception('Writing to log without output mode enabled.')

        self.log_file.write(
            "{}: {}"
            .format(
                self.logger.get_time_formatted(),
                data
            )
        )

    def close_log_file(self):
        self.log_file.close()

    def save_txt(self, filename, data, temp=True, format='%s', delimiter=','):
        if self.output_mode is not True:
            raise Exception('Writing to log without output mode enabled.')

        if temp:
            save_path = self.tmp_output_path
        else:
            save_path = self.output_path

        np.savetxt(save_path + '/' + filename,
            data,
            fmt=format,
            delimiter=delimiter
        )

    def save_numpy_data(self, filename, data, temp=True):
        if temp:
            save_path = self.tmp_output_path
        else:
            save_path = self.output_path

        np.save(save_path + '/' + filename, data)

class MalImgDataset(OutputManager, data_utils.Sequence):
    """
    A class that handles interactions with a dataset of malware images.

    This class manages datasets in the same format as
    Nataraj et. al. (2011) malware image dataset.

    Attributes
    ----------
    dataset_path : str
        Path to malware dataset
    class_map : dict
        Mapping of labels for each class
    class_size : dict
        Mapping of sizes of each class
    X_paths : dict
        Mapping of file paths for each class
    subset : dict
        Mapping of class names from Nataraj's smaller subset (8 classes)
    train_set : list
        List of files from train set.
        2D Arrays -> each position is an array with an image path and a label
        train_set[i][0] -> image of class i
        train_set[i][1] -> label of class i
    validation_set : list
        List of files from validation set
        2D Arrays -> each position is an array with an image path and a label
        validation_set[i][0] -> image of class i
        validation_set[i][1] -> label of class i
    test_set : list
        List of files from test set
        2D Arrays -> each position is an array with an image path and a label
        test_set[i][0] -> image of class i
        test_set[i][1] -> label of class i
    validation_percentage : float
        Percentage of images to use as validation (default 0.10)
    test_percentage : float
        Percentage of images to use as test (default 0.10)

    Methods
    -------
    __init__(sound=None)
        Class initialization
    """

    # Dicts to hold mapping from family names (and paths) to numbers
    class_map = {}
    class_size = {}
    X_paths = {}
    n_classes = 0

    # Lists to hold out data
    # 2D Arrays -> each position is an array with an image path and a label
    # training_images[i][0] -> image of class i
    # training_images[i][1] -> label of class i
    training_set = []
    test_set = []
    validation_set = []

    # If we need to check which class is being loaded
    valid_classes = [
        "Adialer.C",
        "Agent.FYI",
        "Allaple.A",
        "Allaple.L",
        "Alueron.gen!J",
        "Autorun.K",
        "C2LOP.gen!g",
        "C2LOP.P",
        "Dialplatform.B",
        "Dontovo.A",
        "Fakerean",
        "Instantaccess",
        "Lolyda.AA1",
        "Lolyda.AA2",
        "Lolyda.AA3",
        "Lolyda.AT",
        "Malex.gen!J",
        "Obfuscator.AD",
        "Rbot!gen",
        "Skintrim.N",
        "Swizzor.gen!E",
        "Swizzor.gen!I",
        "VB.AT",
        "Wintrim.BX",
        "Yuner.A",
        "benign",
        "malware"
    ]

    # If we decide to use the smaller subset from Nataraj's paper
    subset = {
        'Instantaccess': 335,
        'Yuner.A': 485,
        'Obfuscator.AD': 111,
        'Skintrim.N': 80,
        'Fakerean': 298,
        'Wintrim.BX': 88,
        'VB.AT': 97,
        'Allaple.A': 219
    }

    use_subset = False

    def __init__(
        self,
        path,
        extension="png",
        flatten=False,
        test_percentage=0.10,
        validation_percentage=0.10,
        batch_size=64,
        shuffle=True,
        output_mode=True
    ):

        # Init parent class
        OutputManager.__init__(self, output_mode)

        # Init instance variables
        self.dataset_path = os.path.abspath(path)
        self.flatten = flatten
        self.validation_percentage = validation_percentage
        self.test_percentage = test_percentage
        self.shuffle = shuffle
        self.extension = extension

        self.batch_size = batch_size

        self.X = []
        self.y = []

        # Control the number of folds splitted
        self.folds = 1
        self.current_fold = 1

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        if self.shuffle == True:
            np.random.shuffle(self.training_set)

    def __len__(self):
        # Denotes the number of batches per epoch
        return self.total_images // self.batch_size

    def __getitem__(self, index):
        X, y = self.load_raw_images(
            self.training_set[index * self.batch_size:(index + 1) * self.batch_size]
        )

        # self.logger.info("{} x {}".format(X.shape, y.shape))

        return X, y

    def load_dataset(self, extension='png'):
        # the parent folder with sub-folders
        os.chdir(self.dataset_path)

        self.logger.info("** Loading dataset from: {}".format(self.dataset_path))

        # vector of strings with family names
        # without 'str.lower' sorted function swaps C2LOP variants order
        list_fams = sorted(filter(os.path.isdir, os.listdir(os.getcwd())))

        # No. of samples per family
        no_imgs = []

        # Use external counter to avoid problems with one hot vectors
        id = 0
        for i in range(len(list_fams)):
            if (\
                (self.use_subset and list_fams[i] not in self.subset.keys())\
                or \
                (list_fams[i] not in self.valid_classes)\
            ):
                continue

            # Change to family directory
            os.chdir(list_fams[i])
            self.class_map[list_fams[i]] = id
            # Assuming the images are stored as 'png'
            file_list = glob.glob('*.{}'.format(extension))
            len1 = len(file_list)
            self.class_size[list_fams[i]] = len1
            no_imgs.append(len1)
            # Get paths for each family
            self.X_paths[list_fams[i]] = []
            for f in file_list:
                self.X_paths[list_fams[i]].append(os.path.abspath(f))
            # Return to parent folder
            os.chdir('..')
            id += 1
        self.n_classes = len(np.unique(list(self.class_map.values())))
        self.total_images = sum(no_imgs) # total number of all samples

        print(self.class_map)

        return

    def load_binarized_dataset(self, family, extension='exe'):

        negative_class = 'others'
        self.class_size = {family: 0, negative_class: 0}
        self.X_paths = { family: [], negative_class: [] }
        self.id_map = {}

        # the parent folder with sub-folders
        os.chdir(self.dataset_path)

        self.logger.info("** Binarizing dataset from: {}".format(self.dataset_path))

        # vector of strings with family names
        list_fams = sorted(filter(os.path.isdir, os.listdir(os.getcwd())))

        print(list_fams)

        # No. of samples per family
        no_imgs = []

        # Use external counter to avoid problems with one hot vectors
        id = 0
        for i in range(len(list_fams)):

            # Save a map with {id: family_str}
            self.id_map[i] = list_fams[i]

            if list_fams[i] == family:
                idx = 1
                name = family
            else:
                idx = 0
                name = negative_class

            # Change to family directory
            os.chdir(list_fams[i])
            self.class_map[name] = idx
            # Assuming the images are stored as 'png'
            file_list = glob.glob('*.{}'.format(extension))
            len1 = len(file_list)
            self.class_size[name] += len1
            no_imgs.append(len1)
            # Get paths for each family
            # X_paths[list_fams[i]] = []
            for f in file_list:
                self.X_paths[name].append(os.path.abspath(f))
            # Return to parent folder
            os.chdir('..')
            id += 1
        self.n_classes = len(np.unique(list(self.class_map.values())))
        self.total_images = sum(no_imgs) # total number of all samples

        # Undersample negative class
        np.random.shuffle(self.X_paths[negative_class])
        self.X_paths[negative_class] = self.X_paths[negative_class][:self.class_size[family]]
        self.total_images = self.class_size[family] * 2
        self.class_size[negative_class] = self.class_size[family]
        self.n_classes = 2

        print(self.class_map)
        print(self.class_size)
        print(len(self.X_paths[family]), len(self.X_paths[negative_class]))

        return

    def nataraj_split(self):
        for malware_class in self.X_paths:

            if self.use_subset and malware_class not in self.subset.keys():
                continue

            class_images = self.X_paths[malware_class]
            label = self.class_map[malware_class]

            # Shuffle list to get different images each time
            random.shuffle(class_images)
            if self.use_subset:
                class_images_len = self.subset[malware_class]
            else:
                class_images_len = len(class_images)

            # Split into training, test and validation based on idx
            validation_size = int(np.floor(class_images_len * self.validation_percentage))
            test_size = int(np.floor(class_images_len * self.test_percentage))
            training_idx = class_images_len - (test_size+validation_size)
            validation_idx = training_idx+validation_size

            # Get splits paths
            training_paths = class_images[:training_idx] # First block for training
            validation_paths = class_images[training_idx:validation_idx] # Second block for validation
            test_paths = class_images[validation_idx:class_images_len] # Test with whatever is left

            # self.logger.info("\'{}\' size: {};"\
            #     " Splits: TR: {}:{}:{} | V: {}:{}:{} | TE: {}:{}:{}"
            #     .format(
            #         malware_class,
            #         class_images_len,
            #         0,
            #         training_idx-1,
            #         len(training_paths),
            #         training_idx,
            #         validation_idx-1,
            #         len(validation_paths),
            #         validation_idx,
            #         class_images_len-1,
            #         len(test_paths)
            #     )
            # )

            # Append to 2D array
            for img in training_paths:
                self.training_set.append([img, label])
            for img in validation_paths:
                self.validation_set.append([img, label])
            for img in test_paths:
                self.test_set.append([img, label])
        return

    def get_all_paths_and_labels(self):
        self.X = []
        self.y = []

        for malware_class in self.X_paths:

            if self.use_subset and malware_class not in self.subset.keys():
                continue

            label = self.class_map[malware_class]
            class_images = np.array(self.X_paths[malware_class])
            if self.use_subset:
                # If using subset, get the amount of images defined previously
                curr_size = self.class_size[malware_class]
                class_size_on_subset = self.subset[malware_class]
                random_indices = np.random.randint(0, curr_size, class_size_on_subset)
                class_images = class_images[random_indices]

            for path in class_images:
                self.X.append(path)
                self.y.append(label)
        return

    def split_dataset(self, n_folds=1, split_method='nataraj', use_subset=False, binarize_dataset=None):
        """
            Splits dataset into train set, test set and validation set
        """

        self.logger.info("** Splitting dataset!")

        # Control the number of folds splitted
        self.folds = n_folds
        self.current_fold = 1
        self.use_subset = use_subset

        # Load dataset paths
        if binarize_dataset is None:
            self.load_dataset(extension=self.extension)
        elif binarize_dataset in self.valid_classes:
            self.load_binarized_dataset(binarize_dataset, extension=self.extension)
        else:
            raise Exception('Error during dataset load!')

        if split_method == 'nataraj':
            self.split_generator = None
        elif split_method == 'stratified_kfold':
            self.get_all_paths_and_labels()
            skf = StratifiedKFold(self.folds)
            self.split_generator = skf.split(self.X, self.y)
        elif split_method == 'regular_kfold':
            self.get_all_paths_and_labels()
            kf = KFold(self.folds)
            self.split_generator = kf.split(self.X, self.y)
        else:
            raise Exception('This split method is not defined!')

        return

    def compute_fold_split(self):
        if self.current_fold > self.folds:
            raise Exception('Number of folds exceeded!')

        # Recreate log file
        if (
            self.log_file is not None
            and not self.log_file.closed
        ):
            self.log_file.close()

        if self.output_mode is True:
            self.create_log_file(name="output_fold_{}.txt".format(self.current_fold))

        # Empty lists to avoid memory overflow
        self.training_set = []
        self.validation_set = []
        self.test_set = []

        if self.split_generator is not None:
            train_indices, test_indices = next(self.split_generator)

            # Get 10% from train_set to use as validation
            # TODO: check if that's the best method
            train_indices, validation_indices = train_test_split(
                train_indices,
                test_size=0.1,
                shuffle=True
            )

            for idx in train_indices:
                img = self.X[idx]
                label = self.y[idx]
                self.training_set.append([img, label])

            for idx in validation_indices:
                img = self.X[idx]
                label = self.y[idx]
                self.validation_set.append([img, label])

            for idx in test_indices:
                img = self.X[idx]
                label = self.y[idx]
                self.test_set.append([img, label])
        else:
            # Since Nataraj's method uses a random split, we just call it again
            self.nataraj_split()

        if self.output_mode is True:
            # These arrays are now set
            self.save_txt('training_set_' + str(self.current_fold), self.training_set)
            self.save_txt('validation_set_' + str(self.current_fold), self.validation_set)
            self.save_txt('test_set_' + str(self.current_fold), self.test_set)

        self.logger.warning("Training: {}; "\
            "Validation: {}; Testing: {}; TOTAL: {}"
            .format(
                len(self.training_set),
                len(self.validation_set),
                len(self.test_set),
                (len(self.training_set) + len(self.validation_set) + len(self.test_set))
            )
        )

        self.current_fold += 1

        return

    def get_test_set_from_timestamp(self, runtime):
        self.logger.warning("Loading test set from {}.".format(self.output_path + runtime))

        return np.loadtxt(
            self.output_path \
            + runtime \
            + '/test_set',
            dtype=np.ndarray,
            delimiter=','
        )

    def get_train_data(self, merge_validation=False):
        if merge_validation:
            # If 'merge_validation' == True, we merge those arrays
            return [*self.training_set, *self.validation_set]

        return self.training_set

    def get_validation_data(self):
        return self.validation_set

    def get_test_data(self):
        return self.test_set

    def get_all_data(self):
        return [*self.training_set, *self.validation_set, *self.test_set]

    def load_raw_images(self, data_array, height=-1, width=-1, channels=1, should_stack=False, should_reshape=False, categorical_y=True):
        """
        Parameters
        ----------
        data_array : list of tuples
            Tuple's list with images paths and labels
        height : int
            Height of the malware image (default -1)
        width : int
            Width of the malware image (default -1)
        channels : int
            Number of channels of the malware image (default 1)
        should_stack : Boolean
            Flag to indicate if one should stack image channels (default False)
        """
        images = []
        labels = []
        for _tuple in data_array:
            image_path = _tuple[0]
            label = _tuple[1]
            try:
                image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            except Exception as e:
                self.logger.warning("Exception {} while reading {}.".format(e, image_path))
                continue
            # If new height and width is provided, we resize the image
            if (height != -1 and width != -1):
                if should_reshape:
                    image = image.reshape(image.shape[0]*image.shape[1], 1)
                image = cv2.resize(image, (height, width))
            # To avoid numerical problems, we normalize pixel values
            if should_stack:
                # Stack image to 3 channels
                image = np.stack((image,)*3, axis=-1) / 255.0
            else:
                image = image.reshape(image.shape[0], image.shape[1], channels) / 255.0
            images.append(image)
            labels.append(label)

        if categorical_y:
            y = to_categorical(np.array(labels),
                               num_classes=self.n_classes)
        else:
            y = labels

        return np.array(images), y

    def load_exe_sections(self, data_array, height=-1, width=-1, categorical_y=True):
        """
        Parameters
        ----------
        data_array : list of tuples
            Tuple's list with images paths and labels
        height : int
            Height of the malware image (default -1)
        width : int
            Width of the malware image (default -1)
        """

        images = []
        labels = []
        sections_weights  = []
        for s_tuple in data_array:
            exe_path = s_tuple[0]
            label    = s_tuple[1]

            exe = pefile.PE(name=exe_path, fast_load=True)

            for section in exe.sections:
                data = section.get_data()
                n_bytes = len(data)

                img = np.frombuffer(data, dtype=np.uint8)

                if (n_bytes == 0 or img.size == 0 or np.count_nonzero(img) == 0):
                    continue

                # If new height and width is provided, we resize the image
                if (height != -1 and width != -1):
                    # img = cv2.resize(img, (height, width))
                    if n_bytes <= width:
                        # pad with Zeros
                        img = np.concatenate([img, np.array([0x00] * (width-n_bytes))]).reshape(width, height) / 255.0
                        # print("A:{}".format(img.shape))
                        images.append(img)
                        labels.append(label)
                        sections_weights.append(n_bytes)
                    else:
                        # # Truncate
                        # img = img[:width].reshape(width, height)
                        # # print("B:{}".format(img.shape))
                        # # # Resize
                        # # img = cv2.resize(img, (height, width))
                        # images.append(img)
                        # labels.append(label)
                        # sections_weights.append(n_bytes)

                        # Split section data into smaller sections
                        rem = n_bytes%width # If n_bytes is not a multiple of width
                        blocks = int(n_bytes/width)

                        # avoid sections too big
                        if blocks > self.batch_size*10:
                            self.logger.warning(
                                "{} -> {} b split into {} blocks and {} rem".format(exe_path, n_bytes, blocks, rem))
                            continue

                        for b in range(0, n_bytes-rem, width):
                            sec = img[b:b+width].reshape(width, height) / 255.0
                            # Add section data with max length as a separate section
                            images.append(sec)
                            labels.append(label)
                            sections_weights.append(width)
                        if rem > 0:
                            # Get remaining bytes and pad with Zeros
                            img = np.concatenate([img[-rem:], np.array([0x00] * (width-rem))]).reshape(width, height) / 255.0
                            images.append(img)
                            labels.append(label)
                            sections_weights.append(rem)
        if categorical_y:
            y = to_categorical(np.array(labels),
                               num_classes=self.n_classes)
        else:
            y = labels

        return np.array(images), y, np.array(sections_weights)

    def load_lstm(self, data_array, length, chunk_size, n_chunks, from_data=False, categorical_y=True):

        samples = []
        labels  = []
        for s_tuple in data_array:
            exe_path = s_tuple[0]
            label    = s_tuple[1]

            if from_data:
                bin_stream = exe_path / 255.0 # No need to read file
            else:
                bin_stream = np.fromfile(exe_path, dtype='uint8') / 255.0

            # Gets only first 'length' bytes from sample
            # sample = image[:length]
            # Reshapes to a 3D Vector to work with Tensorflow LSTM input dimension
            # shape(1, number_of_chunks, size_of_each_chunk)
            # image = sample.reshape(n_chunks, chunk_size)
            bin_stream = np.array([ bin_stream[pos:pos+chunk_size] for pos in range(0, bin_stream.shape[0], chunk_size) ][:length])
            # print("{} {} {}".format(chunk_size, length, bin_stream.shape))

            samples.append(bin_stream)
            labels.append(label)

            if categorical_y:
                y = to_categorical(np.array(labels),
                                num_classes=self.n_classes)
            else:
                y = np.array(labels)

        return np.array(samples), y

    def load_sequence(self,
                      data_array,
                      length=None,
                      from_data=False,
                      padding_char=256,
                      categorical_y=False,
                      augmenters=[]):
        """_summary_

        Args:
            data_array (list): List of (path, label) tuples
            length (int, optional): Max length of the sequence. Defaults to None.
            from_data (bool, optional): If input is raw data or a file path. Defaults to False.
            padding_char (int, optional): Value to be used as padding. Defaults to 256.
            categorical_y (bool, optional): If label should be categorical. Defaults to False.
            augment (list, optional): _description_. Defaults to None.

        Returns:
            tuple: (data, labels)
        """

        def read(path):
            return np.fromfile(path, dtype='uint8')

        def shorten(data):
            return data[:length]

        def process(data):
            # Get an array of padding chars with desired length
            # int32 because torch does not accept uint16
            stream = np.ones(length, dtype=np.int32) * padding_char

            # Copy initial bytes from original data
            n_bytes          = data.shape[0]
            stream[:n_bytes] = data

            return stream

        # Transform to numpy array to allow column indexing
        paths = np.array(data_array, dtype=np.object)

        # Add original samples to output
        samples = []
        if from_data:
            samples = [ process(shorten(x)) for x in paths[:, 0] ]
        else:
            samples = [ process(shorten(read(x))) for x in paths[:, 0] ]
        labels = list(map(int, paths[:, 1]))

        # Iterate over augmenter list and merge lists
        augmented_samples = []
        for f in augmenters:
            if from_data:
                augmented_samples += [ process(shorten(f(x))) for x in paths[:, 0] ]
            else:
                augmented_samples += [ process(shorten(f(read(x)))) for x in paths[:, 0] ]
            labels += list(map(int, paths[:, 1])) # Duplicate labels for each augmentation operation

        if categorical_y:
            y = to_categorical(np.array(labels),
                               num_classes=self.n_classes)
        else:
            y = np.array(labels)

        return np.array(samples + augmented_samples), y


    def load_raw_exe(self,
                     data_array,
                     height=-1,
                     width=-1,
                     categorical_y=True,
                     from_data=False,
                     augmenters=[]):
        """
        Parameters
        ----------
        data_array : list of tuples
            Tuple's list with images paths and labels
        height : int
            Height of the malware image (default -1)
        width : int
            Width of the malware image (default -1)
        """

        def process(path):
            if isinstance(path, str) and os.path.isfile(path):
                data = np.fromfile(path, dtype='uint8')
            else:
                data = path

            if (height != -1 and width != -1):
                data = self.buffer_to_image(data, height, width)
            return data

        # Transform to numpy array to allow column indexing
        paths = np.array(data_array, dtype=np.object)

        # Add original samples to output
        samples = list(map(process, paths[:, 0]))
        labels = list(map(int, paths[:, 1]))

        # Iterate over augmenter list and merge lists
        augmented_samples = []
        for f in augmenters:
            augmented_samples += [ process(f(x)) for x in paths[:, 0] ]
            labels += list(map(int, paths[:, 1])) # Duplicate labels for each augmentation operation

        if categorical_y:
            y = to_categorical(np.array(labels),
                               num_classes=self.n_classes)
        else:
            y = np.array(labels)

        return np.array(samples + augmented_samples), y

    def split_sections_data(self, data_array, sequence_size, from_data=False, categorical_y=False):
        """
        Transforms the exe into an array of sequences using sections data only

        Parameters
        ----------
        data_array : list of tuples
            Tuple's list with images paths and labels
        sequence_size : int
            Length of each sequence
        from_data : boolean
            If True, input data is treated as numpy buffer
        """

        samples = []
        labels  = []
        for s_tuple in data_array:
            exe_path = s_tuple[0]
            label    = s_tuple[1]

            if from_data:
                exe = pefile.PE(data=exe_path.tobytes())
            else:
                exe = pefile.PE(name=exe_path, fast_load=True)

            sections = []
            for section in exe.sections:
                data    = section.get_data()
                n_bytes = len(data)

                data = np.frombuffer(data, dtype=np.uint8)

                # Invalid or memory-only section
                if (n_bytes == 0 or data.size == 0 or np.count_nonzero(data) == 0):
                    continue

                if n_bytes < sequence_size:
                    # Pad with random data
                    seq = np.random.randint(0, 255, sequence_size)
                    seq[:n_bytes] = data

                    sections.append(seq / 255.0)
                else:
                    # Remainder of division
                    remaining   = n_bytes % sequence_size
                    # Total number of sequences
                    n_sequences = int(n_bytes/sequence_size)

                    sequences = []
                    for idx in range(0, n_bytes, sequence_size):
                        seq = data[idx:idx+sequence_size] / 255.0
                        # Add section data with max length as a separate section
                        sequences.append(seq)
                    if remaining > 0:
                        # Get remaining bytes and pad with random bytes
                        seq = np.random.randint(0, 255, sequence_size)
                        seq[:remaining] = data[-remaining:]
                        sequences[-1] = seq / 255.0
                    for s in sequences:
                        sections.append(s)
            samples.append(np.array(sections))
            labels.append(label)

        if categorical_y:
            y = to_categorical(np.array(labels),
                               num_classes=self.n_classes)
        else:
            y = np.array(labels)

        return np.array(samples), y

    def split_to_sequences(self, data_array, sequence_size, from_data=False, categorical_y=False, pad_batch=False):
        """
        Transforms the exe into an array of sequences

        Parameters
        ----------
        data_array : list of tuples
            Tuple's list with images paths and labels
        sequence_size : int
            Length of each sequence
        from_data : boolean
            If True, input data is treated as numpy buffer
        """

        # Used to compute the greatest sequence of the batch, used to pad the
        # other sequences
        batch_padding_limit = 0

        samples = []
        labels  = []
        for s_tuple in data_array:
            exe_path = s_tuple[0]
            label    = s_tuple[1]

            if from_data:
                bin_stream = exe_path # No need to read file
            else:
                bin_stream = np.fromfile(exe_path, dtype='uint8')

            # Total number of bytes
            bin_size    = bin_stream.shape[0]
            # Remainder of division
            remaining   = bin_size % sequence_size
            # Total number of sequences
            n_sequences = int(bin_size/sequence_size)

            if n_sequences > batch_padding_limit:
                batch_padding_limit = n_sequences

            # sequences = np.array([ bin_stream[idx:idx+sequence_size] / 255.0 for idx in range(0, bin_stream.shape[0], sequence_size) ], dtype=np.float64)
            sequences = []
            for idx in range(0, bin_stream.shape[0], sequence_size):
                seq = bin_stream[idx:idx+sequence_size]
                # Add section data with max length as a separate section
                sequences.append(seq)
            if remaining > 0:
                # Get remaining bytes and pad with Zeros
                # sequences[-1] = np.concatenate([bin_stream[-remaining:], np.array([0x90] * (sequence_size-remaining), dtype=np.float64)]) / 255.0
                seq             = np.ones(sequence_size, dtype=np.uint8) * 0x90
                seq[:remaining] = bin_stream[-remaining:]
                sequences[-1]   = seq
            # samples.append(np.array(sequences) / 255.0)
            samples.append(np.array(sequences))
            labels.append(label)

        # if pad_batch:
        #     # Pad each sequence with less than the batch_padding_limit with the
        #     # remaining number of sequences (zeroed)
        #     for k in range(len(samples)):
        #         n_seqs = samples[k].shape[0]
        #         if n_seqs < batch_padding_limit + 1:
        #             samples[k] = np.concatenate((samples[k], np.random.normal(0, 0, ((batch_padding_limit+1)-n_seqs, sequence_size))), 0)
        #     # print(samples)

        if categorical_y:
            y = to_categorical(np.array(labels),
                               num_classes=self.n_classes)
        else:
            y = np.array(labels)

        return np.array(samples), y

    def exe_to_PIL_image(self, path, height, width, from_data=False, return_as_array=False):
        if from_data == True:
            bin_stream = path
        else:
            bin_stream = np.fromfile(path, dtype='uint8')

        bin_stream = bin_stream.reshape(bin_stream.shape[0], 1)

        image = Image.fromarray(bin_stream)
        image = image.resize((height, width), Image.ANTIALIAS)

        if return_as_array == True:
            return np.asarray(image)

        return image

    def extract_GIST_features(self,
                              paths,
                              height,
                              width,
                              n_features=320,
                              from_data=False,
                              augmenters=[]):
        """
        Parameters
        ----------
        paths : list of tuples
            Tuple's list with images paths and labels
        height : int
            Height of the malware image
        width : int
            Width of the malware image
        n_features : int
            Amount of features to be returned. Since most images are grayscale, we need only first 320 values (default 320)
        """
        images = []
        labels = []
        features = None

        # Transform to numpy array to allow column indexing
        paths = np.array(paths, dtype=np.object)

        # Inner function to get image from path
        def image_from_buffer(path):
            if self.extension == 'exe':
                if isinstance(path, str) and os.path.isfile(path):
                    bin_stream = np.fromfile(path, dtype='uint8')
                else:
                    bin_stream = path

                bin_stream = bin_stream.reshape(bin_stream.shape[0], 1)

                # Read with numpy, convert to image with OpenCV and then PIL image.
                # We need to convert from 1D buffer to a 2D image with OpenCV,
                # otherwise resizing with PIL (using ANTIALIAS) takes forever
                # to complete

                stream = self.exe_to_img(bin_stream)
                image = Image.fromarray(stream, mode='L')
            else:
                image = Image.open(open(path, 'rb'))

            image = image.resize((height, width), Image.ANTIALIAS)

            features = leargist.color_gist(image)[:n_features]

            return np.array(features)

        # Add original samples to output
        images = list(map(image_from_buffer, paths[:, 0]))
        labels = list(map(int, paths[:, 1]))

        # Iterate over augmenter list and merge lists
        augmented_images = []
        for f in augmenters:
            augmented_images += [ image_from_buffer(f(x)) for x in paths[:, 0] ]
            labels += list(map(int, paths[:, 1])) # Duplicate labels for each augmentation operation


        return np.array(images + augmented_images), np.array(labels)

    def extract_LBP_features(self, paths, height, width, radius=2, neighbours=16, eps=1e-8):
        """
        Parameters
        ----------
        paths : list of tuples
            Tuple's list with images paths and labels
        height : int
            Height of the malware image
        width : int
            Width of the malware image
        n_features : int
            Amount of features to be returned. Since most images are grayscale, we need only first 320 values (default 320)
        """
        images = []
        labels = []
        features = None

        for tuple in paths:
            image_path = tuple[0]
            label = int(tuple[1])

            # OpenCV
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            image = cv2.resize(image, (height, width))

            lbp = local_binary_pattern(image, neighbours, radius, method="uniform")
            (features, _) = np.histogram(lbp.ravel(), bins=np.arange(0, neighbours + 3), range=(0, neighbours + 2))
            features = features.astype("float")
            features /= (features.sum() + eps)

            images.append(np.array(features))
            labels.append(label)
        return np.array(images), np.array(labels)

    def split_into_chunks(self, data_array, length, chunk_size, n_chunks):
        """
        Parameters
        ----------
        paths : list of tuples
            Tuple's list with images paths and labels
        height : int
            Height of the malware image
        width : int
            Width of the malware image
        n_features : int
            Amount of features to be returned. Since most images are grayscale, we need only first 320 values (default 320)
        """
        images = []
        labels = []
        for tuple in data_array:
            image_path = tuple[0]
            label = tuple[1]
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            if self.flatten:
                # Reshapes image to a 1D vector
                image = image.flatten()
                # Gets only first 'length' bytes from sample
                # sample = image[:length]
                # Reshapes to a 3D Vector to work with Tensorflow LSTM input dimension
                # shape(1, number_of_chunks, size_of_each_chunk)
                # image = sample.reshape(n_chunks, chunk_size)
                image = np.array([ image[idx:idx+chunk_size] for idx in range(0, image.shape[0], chunk_size) ][:length])
            images.append(image)
            labels.append(label)
        return np.array(images), np.array(labels)


    def inject_from_file(self,
                         malware_path,
                         section_source_path,
                         n_bytes,
                         n_injected_sections,
                         height,
                         width,
                         return_raw=False):
        """
        Reads an PE file and injects it with data

        Parameters
        ----------
        malware_path : str
            Path to malware to be injected
        section_source_path : int
            Path to pe32 file with sections to be taken
        n_injected_sections : int
            Number of sections to be injected into the malware file
        return_raw : bool
            If True returns raw injection data without resizing
        """

        injector = PEInjector(malware_path,
                              n_bytes,
                              n_sections_to_inject=n_injected_sections,
                              middleware_path=section_source_path,
                              verbose=False)

        data = np.frombuffer(injector.run(), dtype=np.uint8)

        if return_raw:
            return data

        return self.buffer_to_image(data, height, width)


    def buffer_to_image(self, buffer, height, width):

        # reshape the buffer to (buffer_size, 1)
        buffer = buffer.reshape(buffer.shape[0], 1)

        # Transform to image
        buffer = cv2.resize(buffer, (height, width)) / 255.0

        return buffer

    def exe_to_img(self, buffer):
        def get_width_by_filesize(size):
            if size < 10 * 1e3:
                return 32
            elif (size >= 10 * 1e3 and size < 30 * 1e3):
                return 64
            elif (size >= 30 * 1e3 and size < 60 * 1e3):
                return 128
            elif (size >= 60 * 1e3 and size < 100 * 1e3):
                return 256
            elif (size >= 100 * 1e3 and size < 200 * 1e3):
                return 384
            elif (size >= 200 * 1e3 and size < 500 * 1e3):
                return 512
            elif (size >= 500 * 1e3 and size < 1000 * 1e3):
                return 768
            elif (size >= 1000 * 1e3 and size < 2000 * 1e3):
                return 1024
            elif (size >= 2000 * 1e3):
                return 2048

        # length of file in bytes (using only bytes from malware)
        length = buffer.shape[0]
        width  = get_width_by_filesize(length)
        rem    = length%width

        # Discard remaining bytes
        img_data = buffer[:-rem] if rem > 0 else buffer
        height   = int(img_data.shape[0]/width)

        return np.reshape(img_data, (height, width))

class Logger:

    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    def __init__(self, time_format="%Y%m%d %H:%M:%S"):
        self.init_time = datetime.datetime.now()
        self.time_format = time_format

    def get_init_time(self):
        return self.init_time

    def get_time(self):
        return datetime.datetime.now()

    def get_init_time_formatted(self):
        return self.get_init_time().strftime(self.time_format)

    def get_time_formatted(self):
        return self.get_time().strftime(self.time_format)

    def info(self, message):
        print(
            "{} => \t {}{}INFO:{} {}"
            .format(
                self.get_time_formatted(),
                self.OKGREEN,
                self.BOLD,
                self.ENDC,
                message
            )
        )

    def warning(self, message):
        print(
            "{} => \t {}{}WARN:{} {}"
            .format(
                self.get_time_formatted(),
                self.WARNING,
                self.BOLD,
                self.ENDC,
                message
            )
        )

    def exception(self, instance):
        print(
            "{} => \t {}{}" \
            "An unexpected Exception of type {} occurred. " \
            "Here is the message provided: " \
            "\n\t\t\t {}." \
            "\n\t\t\t Exiting. {}"
            .format(
                self.get_time_formatted(),
                self.FAIL,
                self.BOLD,
                type(instance),
                instance.args,
                self.ENDC
            )
        )