diff --git a/.gitignore b/.gitignore
index 3775733..5814112 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,7 @@ crashlytics-build.properties
 fabric.properties
 .idea/httpRequests
 .idea/caches/build_file_checksums.ser
+.idea/
 .DS_Store
 .AppleDouble
 .LSOverride
diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000..55aac82
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,12 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+python-dotenv = "*"
+
+[dev-packages]
+
+[requires]
+python_version = "3.8"
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 0000000..1e71e8e
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,29 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "580cb6a9b28d54ce847142d6c2dfcdf68fdc46d1cd31141832b3b21d4776b3fb"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_version": "3.8"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "python-dotenv": {
+            "hashes": [
+                "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba",
+                "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"
+            ],
+            "index": "pypi",
+            "version": "==1.0.0"
+        }
+    },
+    "develop": {}
+}
diff --git a/app/__init__.py b/app/__init__.py
index e69de29..e096662 100644
--- a/app/__init__.py
+++ b/app/__init__.py
@@ -0,0 +1,22 @@
+"""
+In a Python project, an __init__.py file within a directory signals to Python that the directory should be considered a
+Python package. This means that the directory can be imported as a module, and other Python files within the directory
+can be organized as submodules.
+
+The presence of an __init__.py file allows you to create a hierarchical structure of modules, making code organization
+cleaner and more maintainable.
+
+Here's a brief overview of what the __init__.py file can do:
+
+Package Initialization: You can place initialization code for your package inside the __init__.py file, and it will be
+executed when the package is imported.
+
+Define __all__ for Wildcard Imports: By defining a list called __all__ in your __init__.py file, you can control what
+is imported when a user performs a wildcard import (from package import *).
+
+Submodule Import Control: You can write import statements in __init__.py to make submodules available at the package
+level, allowing users to access them with shorter import paths.
+
+Empty __init__.py: Even an empty __init__.py file serves the purpose of marking a directory as a package, though it
+doesn't provide any of the additional functionality mentioned above.
+"""
diff --git a/app/data.py b/app/data.py
index 61e69e5..50216ce 100644
--- a/app/data.py
+++ b/app/data.py
@@ -1,4 +1,6 @@
 from os import getenv
+from typing import Dict, Iterable, Iterator
+# from random import randrange
 
 from certifi import where
 from dotenv import load_dotenv
@@ -8,18 +10,256 @@
 
 
 class Database:
+    """
+    The Database class is an interface between a PyMongo database
+    hosted on Atlas https://cloud.mongodb.com/ using MongoClient.
 
-    def seed(self, amount):
-        pass
+    The interface supports CRUD, and has business logic functions as well.
+    There are also functions to seed the database, reset it, wrap the data
+    in a dataframe, and create an HTML table from a dataframe.
+
+    Class Attributes:
+    ---------
+    None
+
+    Instance Attributes: (defined in __init__() )
+    ---------
+    self.client : MongoClient
+        A database driver for a MongoDB. The connection.
+    self.db : PyMongo Database (https://pymongo.readthedocs.io/en/stable/api/pymongo/database.html#pymongo.
+        database.Database)
+        A PyMongo Database object. The specific database.
+    self.collection : PyMongo Collection object (https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.
+        html#pymongo.collection.Collection)
+        A PyMongo Collection object. The specific collection.
+
+    Init and CRUD Methods:
+    ---------
+    __init__(self) -> None:
+        Initializes a PyMongo connection, with instance objects: client, db, collection
+    create_one(self, record: Dict = None) -> bool:
+        CRUD method: creates a single Monster in the database.
+    read_one(self, query: Dict = None) -> Dict:
+        CRUD method: reads a single record matching the query.
+    update_one(self, query: Dict, update: Dict) -> bool:
+        CRUD method: updates a record in the database using a query and an update dictionary.
+    delete_one(self, query: Dict) -> bool:
+        CRUD method: deletes a single record in the database using a query.
+    create_many(self, records: Iterable[Dict]) -> bool:
+        CRUD method: creates many records in the database using an iterable
+        containing dictionary records.
+    read_many(self, query: Dict) -> Iterator[Dict]:
+        CRUD method: reads many records from the database that match the query.
+    update_many(self, query: Dict, update: Dict) -> bool:
+        CRUD method: updates many records that match the query with the update dictionary.
+    delete_many(self, query: Dict) -> bool:
+        CRUD method: deletes many records that match the query.
+
+    Business logic methods:
+    ---------
+    seed(self, amount: int):
+        Creates the input amount of Monsters in the database.
+    reset(self):
+        Resets the database to be empty.
+    count(self) -> int:
+        Returns a count of the total objects in the database's current collection.
+    dataframe(self) -> DataFrame:
+        Returns a Pandas dataframe with all objects in the database's current collection.
+    html_table(self) -> str:
+        Returns a string containing an HTML table of all the Monsters
+        in the database's current collection.
+    """
+
+    def __init__(self) -> None:
+        """
+        The init function creates a connection to the Atlas hosted database
+        using a connection string from the environment file. It also sets
+        instance attributes for the database 'db' and the Monster collection 'collection'.
+
+        :return: Database, an instance of the Database interface class
+        with instance attributes.
+        """
+
+        # Load environmental variables
+        load_dotenv()
+
+        # Create a connection to the MongoDB server
+        self.client = MongoClient(getenv("DB_URL"), tlsCAFile=where())
+
+        # Select the database
+        self.db = self.client['Database']
+
+        # Select the collection
+        self.collection = self.db['Monsters']
+
+    def create_one(self, record: Dict = None) -> bool:
+        """
+        CRUD method: creates a single Monster in the database.
+
+        If no input Monster record is given, it creates a random Monster.
+        It uses the pymongo insert_one method: https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#
+        pymongo.collection.Collection.insert_one
+        It returns the acknowledged bool from the pymongo.results.InsertOneResult
+
+        :param record: Dict, the attributes of the Monster to create.
+        :return: bool, representing if the Monster was created.
+        """
+        if record is None:
+            record = Monster().to_dict()
+        return self.collection.insert_one(record).acknowledged
+
+    def read_one(self, query: Dict = None) -> Dict:
+        """
+        CRUD method: reads a single record matching the query.
+
+        If no query is given, then it returns the first record.
+        In the MongoDB context, passing None as the query to find_one will
+        return the first document in the collection without any filter,
+        excluding the _id field.
+
+        :param query: Dict, the attributes to find a single Monster.
+        :return: Dict, the attributes of the found Monster.
+        """
+        # if query is None:
+        #     pipeline = [
+        #         {'$sample': {'size': 1}}
+        #     ]
+        #     record = list(self.collection.aggregate(pipeline))
+        return self.collection.find_one(query, {"_id": False})
+
+    def update_one(self, query: Dict, update: Dict) -> bool:
+        """
+        CRUD method: updates a record in the database using a query and an update dictionary.
+
+        It updates the first record matching the query with the new info.
+        The $set operator replaces the value of the field or creates it if it does
+        not exist.
+        Returns a pymongo.results.UpdateResult which has properties: acknowledged,
+        matched_count (num of docs matching), modified_count (num of docs modified),
+        raw_result (raw doc returned from server), upserted_id (id of upserted doc)
+
+        :param query: Dict, the attributes to match to find a single Monster to update.
+        :param update: Dict, the attributes to update.
+        :return: bool, representing if the Monster was updated.
+        """
+        return self.collection.update_one(query, {"$set": update}).acknowledged
+
+    def delete_one(self, query: Dict) -> bool:
+        """
+        CRUD method: deletes a single record in the database using a query.
+
+        :param query: Dict, the Monster attributes to find and delete a single Monster.
+        :return: bool, representing if the Monster was deleted.
+        """
+        return self.collection.delete_one(query).acknowledged
+
+    def create_many(self, records: Iterable[Dict]) -> bool:
+        """
+        CRUD method: creates many records in the database using an iterable
+        containing dictionary records.
+
+        :param records: Iterable[Dict], Dicts of Monsters to create.
+        :return: bool, representing if the Monsters were created.
+        """
+        return self.collection.insert_many(records).acknowledged
+
+    def read_many(self, query: Dict) -> Iterator[Dict]:
+        """
+        CRUD method: reads many records from the database that match the query.
+
+        :param query: Dict, Monster attributes to find matching Monsters.
+        :return: Iterator[Dict], the matching Monster records, without the _id field.
+        """
+        return self.collection.find(query, {"_id": False})
+
+    def update_many(self, query: Dict, update: Dict) -> bool:
+        """
+        CRUD method: updates many records that match the query with the update dictionary.
+
+        :param query: Dict, Monster attributes to find Monsters to update.
+        :param update: Dict, the attribute changes to be made to matching Monsters.
+        :return: bool, representing if the Monsters were updated.
+        """
+        return self.collection.update_many(query, {"$set": update}).acknowledged
+
+    def delete_many(self, query: Dict) -> bool:
+        """
+        CRUD method: deletes many records that match the query.
+
+        :param query: Dict, Monster attributes that are used to delete Monsters
+        from the database.
+        :return: bool, representing if the objects were deleted successfully.
+        """
+        return self.collection.delete_many(query).acknowledged
+
+    def seed(self, amount: int):
+        """
+        Creates the input amount of Monsters in the database.
+
+        :param amount: int, the desired number of Monsters to create in the database.
+        :return: bool, representing if the objects were created successfully.
+        """
+        records = [Monster().to_dict() for _ in range(amount)]
+        return self.create_many(records)
 
     def reset(self):
-        pass
+        """
+        Resets the database to be empty.
+
+        :return: bool, representing if the objects were deleted successfully.
+        """
+        records = {}
+        return self.delete_many(records)
 
     def count(self) -> int:
-        pass
+        """
+        Returns a count of the total objects in the database's current collection.
+
+        :return: int, count of Monsters.
+        """
+        query = {}
+        count = self.collection.count_documents(query)  # {} means no filter, so it counts all documents
+        print(f'There are {count} documents in the collection.')
+        return count
 
     def dataframe(self) -> DataFrame:
-        pass
+        """
+        Returns a Pandas dataframe with all objects in the database's current collection.
+
+        :return: Pandas DataFrame object, of Monsters.
+        """
+        query = {}
+        df = DataFrame(list(self.read_many(query)))
+        return df
 
     def html_table(self) -> str:
-        pass
+        """
+        Returns a string containing an HTML table of all the Monsters
+        in the database's current collection.
+
+        :return: str, HTML formatted in a table.
+        """
+        df = self.dataframe()
+        html_table = df.to_html(border=1, classes='dataframe', index=True)
+        return html_table
+
+
+if __name__ == '__main__':
+    '''
+    This code is run when the data.py file is the main program.
+    If imported into another file, this block of code will not run.
+    '''
+    print('Running data.py, the Database interface, as main...')
+
+    db = Database()
+
+    print(f'The client: {db.client}')
+
+    databases = db.client.list_database_names()
+    print(f'The databases: {databases}')
+
+    # use the list_collection_names method (on the database, not the client) to get a list of all collection names
+    collections = db.db.list_collection_names()
+    # print the collections
+    for collection in collections:
+        print(f'Collection: {collection}')
diff --git a/app/graph.py b/app/graph.py
index 7fb68f1..861e031 100644
--- a/app/graph.py
+++ b/app/graph.py
@@ -1,5 +1,48 @@
-from altair import Chart
+from altair import Chart, Tooltip
+from pandas import DataFrame
+import altair as alt
 
 
+# Define a custom theme
+def custom_theme():
+    return {
+        'config': {
+            'axis': {
+                'labelColor': '#aaaaaa',
+                'titleColor': '#aaaaaa',
+                'tickColor': '#4f4f4f',
+                'domainColor': '#aaaaaa',
+                'gridColor': '#4f4f4f'
+            },
+            'legend': {
+                'labelColor': '#aaaaaa',
+                'titleColor': '#aaaaaa',
+            },
+            'title': {
+                'color': '#aaaaaa',
+                'fontSize': 24
+            },
+        }
+    }
+
+# Register and enable the custom theme
+alt.themes.register('custom', custom_theme)
+alt.themes.enable('custom')
+
+
+def chart(df: DataFrame, x: str, y: str, target: str) -> Chart:
+    graph = Chart(
+        df,
+        title=f"{y} by {x} for {target}",
+    ).mark_circle(size=100).encode(
+        x=x,
+        y=y,
+        color=target,
+        tooltip=Tooltip(df.columns.to_list())
+    ).properties(
+        width=600,
+        height=600,
+        background='#2b2b2b',
+        padding=25
+    )
+    return graph
-def chart(df, x, y, target) -> Chart:
-    pass
diff --git a/app/machine.py b/app/machine.py
index 1785a57..9e43e58 100644
--- a/app/machine.py
+++ b/app/machine.py
@@ -1,17 +1,143 @@
-class Machine:
+from pandas import DataFrame
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score
+from sklearn.preprocessing import LabelEncoder
+from joblib import load, dump
+from datetime import datetime
+import os
+
+# Define the project root path
+PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+class Machine:
 
     def __init__(self, df):
-        pass
+        print(f"--------Initializing a new machine instance...")
+        # Create a Machine instance.
+        # (In __init__(), Try to load an existing model. If no model exists, train one).
+        # Use __init__() with open(). OR train a model and save it with save(). Fill in machine attributes using
+        # the model (loaded or created) and info(): self.name, self.timestamp, self.target, self.features, self.model
+
+        # Try to load an existing model. Use open()
+        filepath = os.path.join(PROJECT_ROOT, 'app', 'model.joblib')
+        if os.path.exists(filepath):
+            print("Model already exists! Loading model...")
+            model = self.open(filepath)
+            self.name = model
+            self.timestamp = datetime.now()
+            self.target = df["Rarity"]
+            self.features = df.drop(columns=["Rarity"])
+            self.model = model
+
+            # Save the model labels/classes as part of the Machine object, for future use
+            self.labels = ['Rank 0', 'Rank 1', 'Rank 2', 'Rank 3', 'Rank 4', 'Rank 5']
+
+        else:
+            print("Model does not exist! Creating one...")
+            # If no model exists, train one, then save with save().
+
+            # Cleanup the dataframe for training
+            le = LabelEncoder()
+            df.loc[:, 'Rarity'] = le.fit_transform(df['Rarity']).astype(int)
+            df['Rarity'] = df['Rarity'].astype(int)
+
+            columns_to_drop = ['Timestamp', 'Damage', 'Name', 'Type']
+            columns_to_drop = [col for col in columns_to_drop if col in df.columns]
+            if columns_to_drop:
+                df.drop(columns=columns_to_drop, inplace=True)
+
+            self.target = df["Rarity"]
+            self.features = df.drop(columns=["Rarity"])
+
+            # Initializing the Random Forest Classifier
+            self.model = RandomForestClassifier(random_state=42)
+
+            # Splitting the data into training and testing sets
+            X_train, X_test, y_train, y_test = train_test_split(self.features, self.target, test_size=0.2,
+                                                                random_state=42)
+
+            # Fitting the model
+            self.model.fit(X_train, y_train)
+
+            # Predicting the target variable
+            y_pred = self.model.predict(X_test)
+
+            # Calculating the accuracy
+            accuracy = accuracy_score(y_test, y_pred)
+
+            # Save the trained model
+            self.save(filepath)
+
+            self.name = self.model
+            self.timestamp = datetime.now()
+            self.target = df["Rarity"]
+            self.features = df.drop(columns=["Rarity"])
+
+            # Save the labels/classes for future use
+            self.labels = ['Rank 0', 'Rank 1', 'Rank 2', 'Rank 3', 'Rank 4', 'Rank 5']
+            print(f"Model created with these labels: {self.labels}")
+
+    def __call__(self, pred_basis: DataFrame):
+        print(f"Calling machine instance with __call__() to return prediction and confidence...")
+        prediction, *_ = self.model.predict(pred_basis)
+        confidence = max(self.model.predict_proba(pred_basis)[0])
+        return prediction, confidence
+
+    def retrain(self, df):
+        print(f"Retraining starting...")
+        filepath = os.path.join(PROJECT_ROOT, 'app', 'model.joblib')
+
+        # Cleanup the dataframe for training
+        le = LabelEncoder()
+        df.loc[:, 'Rarity'] = le.fit_transform(df['Rarity']).astype(int)
+        df['Rarity'] = df['Rarity'].astype(int)
+
+        columns_to_drop = ['Timestamp', 'Damage', 'Name', 'Type']
+        columns_to_drop = [col for col in columns_to_drop if col in df.columns]
+        if columns_to_drop:
+            df.drop(columns=columns_to_drop, inplace=True)
+
+        self.target = df["Rarity"]
+        self.features = df.drop(columns=["Rarity"])
+
+        # Initializing the Random Forest Classifier
+        self.model = RandomForestClassifier(random_state=42)
+
+        # Splitting the data into training and testing sets
+        X_train, X_test, y_train, y_test = train_test_split(self.features, self.target, test_size=0.2,
+                                                            random_state=42)
+
+        # Fitting the model
+        self.model.fit(X_train, y_train)
+
+        # Predicting the target variable
+        y_pred = self.model.predict(X_test)
+
+        # Calculating the accuracy
+        accuracy = accuracy_score(y_test, y_pred)
+
+        # Save the trained model
+        self.save(filepath)
+
+        self.name = self.model
+        self.timestamp = datetime.now()
+        self.target = df["Rarity"]
+        self.features = df.drop(columns=["Rarity"])
 
-    def __call__(self, feature_basis):
-        pass
+        # Save the labels/classes for future use
+        self.labels = ['Rank 0', 'Rank 1', 'Rank 2', 'Rank 3', 'Rank 4', 'Rank 5']
+        print(f"RETRAINED Model created with these labels: {self.labels}")
 
     def save(self, filepath):
-        pass
+        print(f"Saving self.model {self.model} to filepath {filepath}")
+        # Save the model to a file
+        dump(self.model, filepath)
 
     @staticmethod
     def open(filepath):
-        pass
+        model = load(filepath)
+        return model
 
     def info(self):
-        pass
+        model_info = f'Base Model: {self.name}\nTimestamp: {self.timestamp.strftime("%Y-%m-%d %I:%M:%S %p")}'
+        return model_info
diff --git a/app/main.py b/app/main.py
index 1f9e0b0..5c903cf 100644
--- a/app/main.py
+++ b/app/main.py
@@ -3,14 +3,19 @@
 from Fortuna import random_int, random_float
 from MonsterLab import Monster
-from flask import Flask, render_template, request
+from flask import Flask, render_template, request, redirect, url_for
 from pandas import DataFrame
 
 from app.data import Database
 from app.graph import chart
-from app.machine import Machine
+from app.machine import Machine, PROJECT_ROOT
 
-SPRINT = 0
+import logging
+
+
+logging.basicConfig(filename='application.log', level=logging.DEBUG)
+
+SPRINT = 3
 APP = Flask(__name__)
@@ -40,17 +45,22 @@ def data():
 def view():
     if SPRINT < 2:
         return render_template("view.html")
+
     db = Database()
+
     options = ["Level", "Health", "Energy", "Sanity", "Rarity"]
+
     x_axis = request.values.get("x_axis") or options[1]
     y_axis = request.values.get("y_axis") or options[2]
     target = request.values.get("target") or options[4]
+
     graph = chart(
         df=db.dataframe(),
         x=x_axis,
         y=y_axis,
         target=target,
     ).to_json()
+
     return render_template(
         "view.html",
         options=options,
@@ -64,37 +74,98 @@ def view():
 @APP.route("/model", methods=["GET", "POST"])
 def model():
+    logging.debug("Entered model route")
+
     if SPRINT < 3:
-        return render_template("model.html")
-    db = Database()
-    options = ["Level", "Health", "Energy", "Sanity", "Rarity"]
-    filepath = os.path.join("app", "model.joblib")
-    if not os.path.exists(filepath):
-        df = db.dataframe()
-        machine = Machine(df[options])
-        machine.save(filepath)
+        return render_template("model.html",
+                               level = 4)
+
     else:
-        machine = Machine.open(filepath)
-    stats = [round(random_float(1, 250), 2) for _ in range(3)]
-    level = request.values.get("level", type=int) or random_int(1, 20)
-    health = request.values.get("health", type=float) or stats.pop()
-    energy = request.values.get("energy", type=float) or stats.pop()
-    sanity = request.values.get("sanity", type=float) or stats.pop()
-    prediction, confidence = machine(DataFrame(
-        [dict(zip(options, (level, health, energy, sanity)))]
-    ))
-    info = machine.info()
-    return render_template(
-        "model.html",
-        info=info,
-        level=level,
-        health=health,
-        energy=energy,
-        sanity=sanity,
-        prediction=prediction,
-        confidence=f"{confidence:.2%}",
-    )
+        # Setup:
+        # Create a database instance and dataframe instance.
+        db = Database()
+        df = db.dataframe()
+
+
+        # Create a Machine instance.
+        # (In __init__(), Try to load an existing model. If no model exists, train one).
+        # Use __init__() with open(). OR train a model and save it with save(). Fill in machine attributes using
+        # the model (loaded or created) and info().
+        machine = Machine(df)
+
+
+        # Assign the variables to load the page:
+        info = machine.info()
+        level = request.values.get("level", type=int) or random_int(1, 20)
+        stats = [round(random_float(1, 250), 2) for _ in range(3)]
+        health = request.values.get("health", type=float) or stats.pop()
+        energy = request.values.get("energy", type=float) or stats.pop()
+        sanity = request.values.get("sanity", type=float) or stats.pop()
+
+
+        # Make a Prediction.
+        # If no values are present in the form, give random numbers. And immediately use those random
+        # numbers to predict Rarity with a confidence number using the existing model (either loaded or trained).
+        # Use __call__() to make a prediction
+
+        options = ["Level", "Health", "Energy", "Sanity", "Rarity"]
+        prediction, confidence = machine(DataFrame([dict(zip(options, (level, health, energy, sanity)))]))
+        print(f"Labels are: {machine.labels}, type: {type(machine.labels)}")
+        string_prediction = machine.labels[prediction]
+        print(f"String prediction is: {string_prediction}")
+        print(f"Prediction {prediction}. Confidence {confidence}")
+
+
+        # Make a new Prediction.
+        # Option A - yes retrain)
+        # If the user changes those numbers and clicks the button ‘Retrain’, then create a new model
+        # (with the full database), save it, make a prediction (with the input numbers) and return that prediction.
+        # Delete the model.
+        # Then create a new Machine instance
+        retrain = request.form.get('retrain')
+        print(f"Retrain: {retrain}")
+
+        if retrain == 'True':
+            # Checkbox was checked
+            print("Retrain is checked")
+            filepath = os.path.join(PROJECT_ROOT, 'app', 'model.joblib')
+
+            # Check if the model file exists, then delete it
+            if os.path.exists(filepath):
+                os.remove(filepath)
+                print(f"Model file at {filepath} has been deleted")
+
+                machine.retrain(df)
+
+                options = ["Level", "Health", "Energy", "Sanity", "Rarity"]
+                prediction, confidence = machine(DataFrame([dict(zip(options, (level, health, energy, sanity)))]))
+                print(f"Labels are: {machine.labels}, type: {type(machine.labels)}")
+                string_prediction = machine.labels[prediction]
+                print(f"String prediction is: {string_prediction}")
+                print(f"Prediction {prediction}. Confidence {confidence}")
+
+            else:
+                print(f"No such model file at {filepath}")
+
+        else:
+            # Checkbox was not checked
+            print("Retrain is not checked")
+            # Option B - no retrain)
+            # If the user changes those numbers and clicks ‘predict rarity’ (without ‘Retrain’
+            # clicked) then use the existing loaded model, and new numbers, to make a prediction and return that
+            # prediction.
+            # The prediction/confidence will change automatically as the page is reloaded when the button is clicked.
+        return render_template(
+            "model.html",
+            info=info,
+            level=level,
+            health=health,
+            energy=energy,
+            sanity=sanity,
+            prediction=string_prediction,
+            confidence=f"{confidence:.2%}",
+        )
 
 
 if __name__ == '__main__':
-    APP.run()
+    APP.run(debug=True)
diff --git a/app/model.joblib b/app/model.joblib
new file mode 100644
index 0000000..8a0b292
Binary files /dev/null and b/app/model.joblib differ
diff --git a/app/static/css/reset.css b/app/static/css/reset.css
index 81d90fa..a9016c1 100644
--- a/app/static/css/reset.css
+++ b/app/static/css/reset.css
@@ -78,4 +78,4 @@ hr {
 }
 input, select {
 vertical-align:middle;
-}
+}
\ No newline at end of file
diff --git a/app/static/css/style.css b/app/static/css/style.css
index c6d2bfb..7155ed7 100644
--- a/app/static/css/style.css
+++ b/app/static/css/style.css
@@ -161,6 +161,12 @@ form input {
     float: right;
 }
 
+form input[type=checkbox] {
+    margin: 5px 107px 0 -47px;
+    cursor: pointer;
+    padding: 0 0 0 0;
+}
+
 form select {
     width: 164px;
     margin: 2px 20px 5px 5px;
@@ -216,3 +222,4 @@ article table td {
 .vega-embed summary {
     display: none;
 }
+
diff --git a/app/templates/home.html b/app/templates/home.html
index ae82ee4..e809c8d 100644
--- a/app/templates/home.html
+++ b/app/templates/home.html
@@ -9,6 +9,7 @@
 Data Science Team
 
   • April Fairweather: Database Engineer
  • Thackery Binx: Data Analyst
  • Eugene Albright: Machine Learning Engineer
+ • Stephen Mordue: Data Scientist Student, Bloomtech 2023
  • Tech Stack
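Reviewer note, not part of the diff above: a minimal sketch of how the new Database interface in app/data.py can be exercised locally. It assumes a .env file that defines DB_URL for the Atlas cluster and that the app package is importable; the method names come straight from the diff.

    # Illustrative sketch only -- assumes .env provides DB_URL and the
    # 'Monsters' collection used in app/data.py.
    from app.data import Database

    db = Database()
    db.reset()                    # delete_many({}) empties the collection
    db.seed(1024)                 # insert 1024 random MonsterLab Monsters
    print(db.count())             # expected: 1024
    print(db.dataframe().head())  # Monsters wrapped in a pandas DataFrame
    print(db.html_table()[:300])  # start of the rendered HTML table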
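Reviewer note, not part of the diff above: the /model route builds a one-row DataFrame, calls the Machine instance, and maps the integer class returned by __call__ back to a 'Rank N' string through machine.labels. A standalone sketch of that flow, assuming a seeded database and that app/model.joblib either already exists or can be trained on the spot; the example feature values are made up.

    # Illustrative sketch only -- mirrors the prediction path in app/main.py.
    from pandas import DataFrame
    from app.data import Database
    from app.machine import Machine

    db = Database()
    machine = Machine(db.dataframe())   # loads app/model.joblib, or trains and saves one

    features = ["Level", "Health", "Energy", "Sanity"]
    row = DataFrame([dict(zip(features, (10, 120.5, 33.3, 40.0)))])
    prediction, confidence = machine(row)   # integer class and its top probability
    print(machine.labels[prediction], f"{confidence:.2%}")
    print(machine.info())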