# mms_waf_modeling_ml_algo.py

# -*- coding: utf-8 -*-
"""mms waf modeling ml algo.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1h2OcDhgHiLVx6_u5HlfTz9roYKkPH9-K

# Modeling and evaluation of machine learning model

Import all dependencies
"""

# Commented out IPython magic to ensure Python compatibility.
# %matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import string
from IPython.display import display
#evaluations
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve

# Load the payload dataset; the CSV's "index" column becomes the DataFrame row index.
data = pd.read_csv("/home/ubuntu/tools/mms_waf/final-allpayload.csv",index_col="index")
#data

"""Selecting dependent and independent variables"""

# Target (dependent variable): the `is_malicious` column.
Y = data['is_malicious']
# The string literal below is disabled code kept as a no-op expression statement.
"""independent_variables = data.columns
independent_variables = independent_variables.delete(1)
independent_variables"""
# Features (independent variables): every column from positional index 3 onward.
# NOTE(review): assumes the first three columns are not model inputs - confirm against the CSV schema.
X = data.iloc[:,3:]
# Bare expression left over from the notebook export; it has no effect when run as a script.
X

"""# Modeling data with Xgboost Classifier"""

# Imported here rather than at the top of the file, following the notebook's cell order.
import xgboost as xgb
#?xgb.XGBClassifier()
# Train an XGBoost classifier, constructed with no arguments, on the full X / Y set.
# The name `xgb_classifer` (sic) is read by calculate_features_and_predict, so it
# must not be renamed here without updating that function as well.
xgb_classifer = xgb.XGBClassifier()
xgb_classifer.fit(X,Y)

# Commented out IPython magic to ensure Python compatibility.
# %time xgb_classifer.fit(X,Y)

# Predict on the same rows that were used for fitting and store the result next to
# the labels; any comparison of the two columns therefore reflects training-set fit,
# not held-out performance.
data['predicted_is_malicious'] = xgb_classifer.predict(X)

# Notebook display leftovers: these bare expressions have no effect when run as a script.
data.head(30)

data[["is_malicious","predicted_is_malicious"]]

#?plt.plot()

"""# Integration with website"""

# Names of the ten features that calculate_features_and_predict computes per payload.
independent_variables=['length', 'non-printable','punctuation', 'min-byte', 'max-byte', 'mean-byte', 'std-byte','distinct-byte', 'sql-keywords', 'js-keywords']
independent_variables

# NOTE(review): this rebinding discards the list above. `data.columns` at this point
# also contains `is_malicious` and the `predicted_is_malicious` column added earlier,
# so it is not a pure feature list - confirm which value is intended.
independent_variables=data.columns

# Keyword lists behind the `sql-keywords` / `js-keywords` features. Each file must
# provide a `Keyword` column, which is what calculate_features_and_predict reads.
# NOTE(review): with pandas' default NA parsing, entries such as NULL, null or NaN
# would be loaded as missing values instead of strings - confirm whether
# keep_default_na=False is needed and whether the training features were built the
# same way before changing it.
sql_keywords = pd.read_csv('/home/ubuntu/tools/mms_waf/SQLKeywords.txt', index_col=False)
js_keywords = pd.read_csv("/home/ubuntu/tools/mms_waf/JavascriptKeywords.txt",index_col=False)
def calculate_features_and_predict(payload):
  """Compute the model's input features for one payload and classify it.

  The payload is converted with ``str()`` and described by ten features, in
  this order: character length, number of non-printable characters, number of
  punctuation characters, min / max / mean / standard deviation of its UTF-8
  byte values, number of distinct byte values, and how many entries of the
  SQL and JavaScript keyword lists occur in it (case-insensitive substring
  match). The resulting one-row DataFrame is shown with ``display`` and passed
  to the module-level ``xgb_classifer``.

  An empty payload has no bytes, so its four byte statistics are reported as
  zero instead of raising ``ValueError`` / producing NaN.

  Args:
    payload: Value to classify; any object is accepted and stringified.

  Returns:
    The first (only) element of ``xgb_classifer.predict`` for the payload.
  """
  payload = str(payload)
  # Encode and lowercase once instead of once per feature / per keyword.
  payload_bytes = bytearray(payload, 'utf-8')
  payload_lower = payload.lower()

  # Insertion order defines the DataFrame column order handed to the model,
  # so the keys below must stay in this sequence.
  features = {}
  features['length'] = len(payload)
  features['non-printable'] = sum(1 for letter in payload if letter not in string.printable)
  features['punctuation'] = sum(1 for letter in payload if letter in string.punctuation)
  if payload_bytes:
    features['min-byte'] = min(payload_bytes)
    features['max-byte'] = max(payload_bytes)
    features['mean-byte'] = np.mean(payload_bytes)
    features['std-byte'] = np.std(payload_bytes)
  else:
    # min()/max() raise ValueError on an empty sequence and np.mean()/np.std()
    # return NaN with a warning; use neutral zeros for the empty payload.
    features['min-byte'] = 0
    features['max-byte'] = 0
    features['mean-byte'] = 0.0
    features['std-byte'] = 0.0
  features['distinct-byte'] = len(set(payload_bytes))
  features['sql-keywords'] = sum(
      1 for keyword in sql_keywords['Keyword'] if str(keyword).lower() in payload_lower)
  features['js-keywords'] = sum(
      1 for keyword in js_keywords['Keyword'] if str(keyword).lower() in payload_lower)

  payload_df = pd.DataFrame(features, index=[0])
  display(payload_df)
  result = xgb_classifer.predict(payload_df)
  return result[0]

# Run the pipeline once on a fixed payload as a smoke test (return value unused).
calculate_features_and_predict("<>")

# Interactive prompt: classify each entered payload until the user types 'exit'.
while True:
  payload = input("Enter payload")
  # Check the sentinel before classifying; otherwise the word 'exit' itself
  # would be scored and reported as a payload before the loop ends.
  if payload == 'exit':
    break
  result = calculate_features_and_predict(payload)
  if result > 0:
    print(f"Your payload {payload} is malicious - 403 error\n")
  else:
    print(f"Your payload {payload} is safe 200 OK\n")