From 0bccb9e76d692055dc9b8ced2b9b93ac57cb433d Mon Sep 17 00:00:00 2001 From: Mikhail Date: Sun, 10 May 2020 11:44:25 +0300 Subject: [PATCH] Update MLPanel BASE edition - v0.2 --- LICENSE | 21 - README.md | 243 +----- build.sh | 29 + common/utils.py | 30 +- config/.gitignore | 2 - docker-compose.yaml | 50 -- functions.sh | 60 ++ restart.sh | 14 + services/auth/Dockerfile | 6 + services/auth/requirements.txt | 5 + services/auth/src/app.py | 270 +++++++ services/deploy/2_model.log | 8 + services/deploy/Dockerfile | 43 -- services/deploy/README.md | 14 - services/deploy/src/app.py | 21 +- services/deploy/src/config.py | 117 +-- services/deploy/src/deployments/__init__.py | 435 ----------- .../src/deployments/gcp_deploy_utils.py | 11 +- services/deploy/src/deployments/local.py | 21 +- services/deploy/src/deployments/manager.py | 704 ++++++++++++++++++ services/deploy/src/deployments/utils.py | 553 +++++++++++++- services/deploy/src/routers/deployments.py | 120 ++- services/deploy/src/utils.py | 56 ++ services/deploy/startup.sh | 7 + .../{ => integration/base}/model/MLmodel | 0 .../{ => integration/base}/model/conda.yaml | 0 .../{ => integration/base}/model/model.pkl | Bin .../tests/{ => integration/base}/test_app.py | 52 +- services/deploy/tests/test.sh | 59 +- services/projects/.dockerignore | 1 + services/projects/Dockerfile | 44 -- services/projects/README.md | 17 +- services/projects/examples/data/.gitignore | 3 +- services/projects/examples/data/iris.csv | 151 ++++ services/projects/examples/demo_ws.py | 29 +- services/projects/examples/example2.py | 52 +- services/projects/examples/example3.py | 53 +- services/projects/examples/requirements.txt | 6 +- services/projects/src/app.py | 90 +-- services/projects/src/config.py | 83 ++- services/projects/src/project_management.py | 220 ++++-- services/projects/src/routers/artifacts.py | 11 +- services/projects/src/routers/deployments.py | 79 +- services/projects/src/routers/experiments.py | 36 +- services/projects/src/routers/misc.py | 11 +- services/projects/src/routers/projects.py | 71 +- .../projects/src/routers/registered_models.py | 45 +- services/projects/src/routers/runs.py | 25 +- services/projects/src/routers/utils.py | 6 +- services/projects/src/utils.py | 40 +- services/projects/startup.sh | 7 + .../tests/{ => integration}/test_app.py | 3 +- services/projects/tests/test.sh | 64 +- .../projects/tests/test_project_management.py | 107 --- services/ui/Dockerfile | 20 - start.sh | 13 + stop.sh | 12 + swagger.yaml | 251 +++++-- 58 files changed, 3096 insertions(+), 1405 deletions(-) delete mode 100644 LICENSE create mode 100755 build.sh delete mode 100644 config/.gitignore delete mode 100644 docker-compose.yaml create mode 100644 functions.sh create mode 100755 restart.sh create mode 100644 services/auth/Dockerfile create mode 100644 services/auth/requirements.txt create mode 100644 services/auth/src/app.py create mode 100644 services/deploy/2_model.log delete mode 100644 services/deploy/Dockerfile create mode 100644 services/deploy/src/deployments/manager.py create mode 100755 services/deploy/startup.sh rename services/deploy/tests/{ => integration/base}/model/MLmodel (100%) rename services/deploy/tests/{ => integration/base}/model/conda.yaml (100%) rename services/deploy/tests/{ => integration/base}/model/model.pkl (100%) rename services/deploy/tests/{ => integration/base}/test_app.py (68%) create mode 100644 services/projects/.dockerignore delete mode 100644 services/projects/Dockerfile create mode 100644 
services/projects/examples/data/iris.csv create mode 100755 services/projects/startup.sh rename services/projects/tests/{ => integration}/test_app.py (99%) delete mode 100644 services/projects/tests/test_project_management.py delete mode 100644 services/ui/Dockerfile create mode 100755 start.sh create mode 100755 stop.sh diff --git a/LICENSE b/LICENSE deleted file mode 100644 index f089dbb..0000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2020 Machine Learning REPA - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/README.md b/README.md index 1ca34fc..fd864d0 100644 --- a/README.md +++ b/README.md @@ -1,234 +1,71 @@ -# mlpanel +# MLPanel -GitBook docs: https://mlrepa.gitbook.io/mlpanel/ +![MLPanel](https://gblobscdn.gitbook.com/assets%2F-Lzh1AArhTugENuEmRGS%2F-M6jmR5ihP5f3uoAqt1Y%2F-M6se7wKEtDD0HjtK3_O%2Fimage.png?alt=media&token=a5428f6c-2bfb-448d-8972-e25311515f4f) -## Preparations -### Create *config/.env* - -* **if you are going to use GCS bucket as artifact storage**, put into *config/* Google credentials json +Docs: [MLPanel documentation](https://mlrepa.gitbook.io/mlpanel/) -* create file *config/.env*: -```.env - -# Common -HOST_MODE=[local|remote] -WORKSPACE=/path/to/workspace/folder/on/host -GOOGLE_APPLICATION_CREDENTIALS=/home/config/ - -# Deploy -GCP_PROJECT=vision-230607 -GCP_ZONE=us-east1-b -GCP_MACHINE_TYPE=g1-small -GCP_OS_IMAGE=ubuntu1804-docker -GCP_BUCKET= -MODEL_DEPLOY_DOCKER_IMAGE=meisterurian/mlflow-deploy -MODEL_DEPLOY_DEFAULT_PORT=5000 -MODEL_DEPLOY_FIREWALL_RULE=mlflow-deploy - -# Projects -LOGLEVEL= -ARTIFACT_STORE=[mlruns|gs://] -MLFLOW_TRACKING_SERVERS_PORTS_RANGE=5000-5020 - -# UI -MLPANEL_UI_BRANCH=feature/ui-features -REACT_APP_API_URL=http://0.0.0.0:8080/ -``` - - -### Description of environment variables - -#### Common - -##### ***HOST_MODE*** - -Required: No. - -Default: local. - -Defines ip address of tracking servers: - -* if *local* then ip address = "0.0.0.0" (e.i. **mlpanel** is running and available locally); -* if *remote* then ip address = (e.i. **mlpanel** is running on some remote server and available by external ip); -external ip is detected automatically. - - -##### ***WORKSPACE*** - -Required: Yes. - -WORKSPACE sets path to workspace folder - any path on you disk (taking into account permission). 
- -The folder contains: - -* projects database; -* deploy database; -* folders for each project, each folder is named by the project id and contains: - * mlflow.db - mlflow database; - * artifact storage folder - if variable *ARTIFACT_STORE* points to local path. - -**Note**: if artifacts are stored locally then folder WORKSPACE can have huge size, it depends on -number of projects and artifacts (especially ML models) volume. - - -##### ***GOOGLE_APPLICATION_CREDENTIALS*** - -Required: No. - -Defines path to your Google credentials JSON (https://cloud.google.com/docs/authentication/getting-started) -inside docker container. - -Consists from two parts: fixed (/home/config/ - exact path of folder config inside container) and name of your JSON. -The JSON you must put to folder config/. - -**Note** define *GOOGLE_APPLICATION_CREDENTIALS* if you are going to use Google Cloud Storage -(https://cloud.google.com/storage) or/and deploy models on Google Compute Engine (GCE, https://cloud.google.com/compute). - - -#### Deploy - -Now two types of deployments exist: - -* local - deploys model locally (as new MLflow model server process inside container); -* gcp - start new GCE instance and run MLflow model server process on it. - -##### ***GCP_PROJECT*** - -Required: No (Yes for deploying on GCE). - -Id of your project Google Cloud Platform. Define it if you are going to deploy models on GCE. - - -##### ***GCP_ZONE*** - -Required: No (Yes for deploying on GCE). - -Zone type for instances. More about zone: https://cloud.google.com/compute/docs/regions-zones. - -##### ***GCP_MACHINE_TYPE*** - -Required: No (Yes for deploying on GCE). - -GCE machine type. More about machine types: https://cloud.google.com/compute/docs/machine-types. - -Minimum requirement of machine type is *g1-small*. - -##### ***GCP_OS_IMAGE*** - -Required: No (Yes for deploying on GCE). - -Your private (custom) OS image. Docker must be installed inside OS image. -Working with OS images: https://cloud.google.com/compute/docs/images/create-delete-deprecate-private-images. - - -##### ***GCP_BUCKET*** - -Required: No (Yes for deploying on GCE). - -Google Storage bucket name. - - -##### ***MODEL_DEPLOY_DOCKER_IMAGE*** - -Required: No (Yes for deploying on GCE). - -Name of docker image which provides running of deployment container on GCE instance. -MLflow must be installed in the image. - - -##### ***MODEL_DEPLOY_DEFAULT_PORT*** - -Required: No (Yes for deploying on GCE). - -Port on which deployments are available. - -**Note**: all gcp deployment have the same port, but theirs external ip addresses are different. - - -##### ***MODEL_DEPLOY_FIREWALL_RULE*** - -Required: No (Yes for deploying on GCE). - -Name of firewall rule. Firewall rule is required to open some port(s) for communication with services running on -GCE instances. More about firewall: https://cloud.google.com/vpc/docs/using-firewalls. - - -#### Projects - -##### ***LOGLEVEL*** - -Required: No. - -Default: INFO. - -Logging level in service *projects*. - - -##### ***ARTIFACT_STORE*** - -Required: Yes. - -Path to artifact store. It can be: +Examples: [MLPanel examples](https://mlrepa.gitbook.io/mlpanel/) -* name of folder - it means relative path in $WORKSPACE/, e.g.: -``` -ARTIFACT_STORE=mlruns => full path of artifact store is $WORKSPACE//mlruns. -``` +Find us on [Discord](https://discord.gg/DAMrSAy) -* Google Storage bucket in format **gs://**; this case requires to -define variable *GOOGLE_APPLICATION_CREDENTIALS*. 
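When *ARTIFACT_STORE* points to a `gs://` bucket, a quick way to confirm that *GOOGLE_APPLICATION_CREDENTIALS* is picked up correctly is to probe the bucket with the `google-cloud-storage` client. A minimal sketch, with a placeholder bucket name and assuming the service account has storage permissions:

```python
# Minimal check for a gs:// artifact store (bucket name is a placeholder).
# storage.Client() reads GOOGLE_APPLICATION_CREDENTIALS from the environment.
from google.cloud import storage

def artifact_store_is_reachable(bucket_name: str = "my-mlpanel-artifacts") -> bool:
    client = storage.Client()
    return client.bucket(bucket_name).exists()

if __name__ == "__main__":
    print(artifact_store_is_reachable())
```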
+# Features -##### ***MLFLOW_TRACKING_SERVERS_PORTS_RANGE*** +- interface to collaborate on multiple Machine Learning and Data Science projects +- create and manage Projects +- manage Machine Learning Experiments: metrics tracking and models management +- 1-click-deploy +- customizable -Required: Yes. +![Projects Panel](https://lh5.googleusercontent.com/DLt_aeA71h5a6zZI6_DOEInAU-6I9cjRyshF7Tihywi_khMOlq_OX6ToXQ8kR-43KM8Li1ENEWp22R_dTBRpIu1C52jqky08uuihXC28NOwXzkfq3eTw-62JPesN5OJP1kHUhpCR5-w) -Format: -. -Range of ports for tracking servers. It also defines how many projects can be -created: one port corresponds to one project (tracking server). +# How to start +1) look into [MLPanel Discord channel](https://discord.gg/DAMrSAy) +2) [Install MLPanel](https://mlrepa.gitbook.io/mlpanel/get-started-1/install) +3) [Run examples](https://mlrepa.gitbook.io/mlpanel/tutorials/examples-with-jupyter-notebook) +4) Try MLPanel in your projects -#### UI -##### ***MLPANEL_UI_BRANCH*** -Required: Yes. +# Feature requests and feedback -[mlpanel-ui](https://github.com/mlrepa/mlpanel-ui) branch. -It can be any existing branch name. +The best way to send feedback is to file an issue or message on [Discord](https://discord.gg/DAMrSAy) +If you are proposing a feature: -##### ***REACT_APP_API_URL*** +* Give a short use case +* Describe how it would work +* Provide support materials like charts, diagrams, code samples +* Feel free to make a pull request :) -Required: Yes. -Fixed value: http://0.0.0.0:8080/. +# Development -URL to API which UI service calls. +1. Fork [MLPanel](https://github.com/mlrepa/mlpanel) + +2. Clone your fork locally: -## Build + git clone git@github.com:mlrepa/mlpanel.git -```bash -ln -sf config/.env && docker-compose build -``` +3. Create a branch for local development:: -## Run services + git checkout -b name-of-feature-branch -```bash -docker-compose up -``` +3. Add your awesome changes !!! -### Enter UI +4. Run tests -`http://:3000`, -where `host` = *localhost* or *remote server ip* + ./services/projects/tests/test.sh -vv + ./services/deploy/tests/test.sh -vv -## Tests and examples +5. Commit your changes and push your branch to GitHub:: -Read **README.md** files in *services/*: + git add . + git commit -m "Describe changes made" + git push origin name-of-feature-branch -* [projects](services/projects/README.md); -* [deploy](services/deploy/README.md). +6. Submit a pull request through the GitHub website. 
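To try MLPanel in your own code, log experiments to a project's tracking server with the standard MLflow client. A minimal sketch, assuming a project tracking server reachable at `http://localhost:5000` (substitute the address shown for your project in the Projects panel):

```python
# Minimal sketch: log one run to an MLPanel project's tracking server.
# The tracking URI below is an assumption; use your project's actual address.
import mlflow

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("quickstart")

with mlflow.start_run():
    mlflow.log_param("alpha", 0.5)       # hyperparameter
    mlflow.log_metric("accuracy", 0.93)  # result metric
```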
\ No newline at end of file diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..059a578 --- /dev/null +++ b/build.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +source functions.sh +handle_help build +check_input_edition + +if [ 'base' == $EDITION ]; then + + DOCKER_FOLDER="config/base/docker" + + echo "***Base image***" + docker build -t mlrepa/mlpanel-base:v0.2 $DOCKER_FOLDER/base + + echo "***Deploy image***" + docker build -t mlrepa/mlpanel-base-deploy:v0.2 $DOCKER_FOLDER/deploy + + echo "***Projects image***" + docker build -t mlrepa/mlpanel-base-projects:v0.2 $DOCKER_FOLDER/projects + + echo "***UI image***" + docker build -t mlrepa/mlpanel-base-ui:v0.2 $DOCKER_FOLDER/ui + + echo "***Web image***" + docker build -t mlrepa/mlpanel-base-web:v0.2 $DOCKER_FOLDER/web + +else + echo "Edition $EDITION undefined" + +fi; \ No newline at end of file diff --git a/common/utils.py b/common/utils.py index 34c512d..0cb36b6 100644 --- a/common/utils.py +++ b/common/utils.py @@ -10,6 +10,14 @@ from starlette.responses import JSONResponse +class ModelDoesNotExistError(Exception): + """Model does not exists""" + + +class InvalidEnvVarValueError(Exception): + """Invalid env var value""" + + def get_rfc3339_time() -> Text: """Get timestamp in rfc3339 format @@ -57,9 +65,6 @@ def build_error_response(status_code: HTTPStatus, e: Exception) -> JSONResponse: message=str(e) ) - # Set CORS - error_resp.headers['Access-Control-Allow-Origin'] = '*' - return error_resp @@ -95,6 +100,10 @@ def kill(proc_pid: int) -> None: Arguments: proc_pid {int} -- process id """ + + if not psutil.pid_exists(proc_pid): + return + process = psutil.Process(proc_pid) for proc in process.children(recursive=True): @@ -118,5 +127,16 @@ def is_remote(path: Text) -> bool: return False -class ModelDoesNotExistError(Exception): - """Model does not exists""" \ No newline at end of file +def validate_env_var(name: Text, value: Text) -> None: + + if ' ' in value: + raise InvalidEnvVarValueError(f'Invalid value "{value}" of env var {name}') + + +def get_utc_timestamp() -> Text: + """Get utc timestamp. 
+ Returns: + Text: utc timestamp + """ + + return str(datetime.datetime.utcnow().timestamp() * 1000) \ No newline at end of file diff --git a/config/.gitignore b/config/.gitignore deleted file mode 100644 index c96a04f..0000000 --- a/config/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore \ No newline at end of file diff --git a/docker-compose.yaml b/docker-compose.yaml deleted file mode 100644 index 70bf2cb..0000000 --- a/docker-compose.yaml +++ /dev/null @@ -1,50 +0,0 @@ -version: '3' - -services: - projects: - - env_file: - - .env - build: - context: services/projects - image: mlpanel-projects:latest - container_name: mlpanel-projects - ports: - - "$MLFLOW_TRACKING_SERVERS_PORTS_RANGE:$MLFLOW_TRACKING_SERVERS_PORTS_RANGE" - - 8080:8080 - expose: - - "$MLFLOW_TRACKING_SERVERS_PORTS_RANGE" - - 8080 - volumes: - - ./config:/home/config - - ./services/projects:/home/projects - - ./common:/home/common - - $WORKSPACE:$WORKSPACE - - deploy: - - env_file: - - .env - build: - context: ./ - dockerfile: services/deploy/Dockerfile - image: mlpanel-deploy:latest - container_name: mlpanel-deploy - ports: - - 9000:9000 - volumes: - - ./config:/home/config - - ./services/deploy:/home/deploy - - ./common:/home/common - - $WORKSPACE:$WORKSPACE - - ui: - env_file: - - .env - build: - context: ./ - dockerfile: services/ui/Dockerfile - image: mlpanel-ui:latest - container_name: mlpanel-ui - ports: - - 3000:3000 diff --git a/functions.sh b/functions.sh new file mode 100644 index 0000000..1bd352c --- /dev/null +++ b/functions.sh @@ -0,0 +1,60 @@ + +POSITIONAL_ARGS=() + +while [ "$1" != "" ]; do + case "$1" in + --edition) + shift + EDITION=$1 + ;; + *) + POSITIONAL_ARGS+=("$1") + esac + shift +done; + + +check_input_edition(){ + if [ -z $EDITION ]; then + echo "WARNING: edition not specified, base edition selected by default." + EDITION='base' + fi; +} + + +help(){ + + SCRIPT=$1 + EDITION_INFO="[--edition ].\nDefault edition: base." + case $SCRIPT in + "build") + printf "Builds images.\nSyntax: ./build.sh $EDITION_INFO\nExamples: + \t./build.sh\n\t./build.sh --edition base\n" + ;; + + "start") + printf "Starts mlpanel.\nSyntax: ./start.sh $EDITION_INFO\nExamples: + \t./build.sh\n\t./start.sh --edition base\n" + ;; + + "restart") + printf "Restarts several or all services. If services not specified then restarts all + \nSyntax: ./retart.sh [, , ...] 
$EDITION_INFO\nExamples: + \t./restart.sh\n\t./restart.sh web\n\t./restart.sh web --edition base\n" + ;; + + "stop") + + printf "Stops mlpanel.\nSyntax: ./stop.sh $EDITION_INFO\nExamples: + \t./build.sh\n\t./build.sh --edition base\n" + ;; + esac + + exit +} + +handle_help(){ + if [ '--help' == "${POSITIONAL_ARGS[0]}" ]; then + help $1 + fi; +} \ No newline at end of file diff --git a/restart.sh b/restart.sh new file mode 100755 index 0000000..cd07296 --- /dev/null +++ b/restart.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +source functions.sh +handle_help restart +check_input_edition + +if [ 'base' == $EDITION ]; then + export $(cat config/base/.env | grep "^[^#;]") + docker-compose -f config/base/docker-compose.yaml down "${POSITIONAL_ARGS[@]}" + docker-compose -f config/base/docker-compose.yaml up "${POSITIONAL_ARGS[@]}" +else + echo "Edition $EDITION undefined" +fi; + diff --git a/services/auth/Dockerfile b/services/auth/Dockerfile new file mode 100644 index 0000000..c0f47e3 --- /dev/null +++ b/services/auth/Dockerfile @@ -0,0 +1,6 @@ +FROM python:3.7-slim + +COPY ./requirements.txt /tmp/requirements.txt +RUN pip install -r /tmp/requirements.txt + +CMD gunicorn --bind 0.0.0.0:1234 --workers $AUTH_SERVER_GUNICORN_WORKERS home.auth.src.app:app \ No newline at end of file diff --git a/services/auth/requirements.txt b/services/auth/requirements.txt new file mode 100644 index 0000000..c730ca6 --- /dev/null +++ b/services/auth/requirements.txt @@ -0,0 +1,5 @@ +flask==1.1.1 +flask-cors==3.0.8 +gunicorn==20.0.4 +python-jose==3.1.0 +requests==2.22.0 \ No newline at end of file diff --git a/services/auth/src/app.py b/services/auth/src/app.py new file mode 100644 index 0000000..a67915c --- /dev/null +++ b/services/auth/src/app.py @@ -0,0 +1,270 @@ +""" +Auth server. 
+Provides endpoints: + * obtain access token; + * check authorization (validate access token); +""" + + +from flask import Flask, request, jsonify +from jose import jwt +import json +import logging +import os +import requests +from typing import Text + + +LOGLEVEL = os.getenv('LOGLEVEL', 'INFO').upper() +logging.basicConfig(level=LOGLEVEL) + +AUTH0_DOMAIN = os.getenv('AUTH0_DOMAIN') +AUTH0_API_IDENTIFIER = os.getenv('AUTH0_API_IDENTIFIER') +ALGORITHMS = ['RS256'] + +app = Flask(__name__) + + +class JWKSCache: + + JWKS_CACHED_JSON = '/tmp/jwks.json' + + def __init__(self, auth0_domain): + self.auth0_domain = auth0_domain + + def get_jwks(self): + + self.refresh_jwks_json() + + with open(self.JWKS_CACHED_JSON, 'r') as jwks_json_file: + return json.load(fp=jwks_json_file) + + def refresh_jwks_json(self): + + # TODO (Alex): add JWKS json structure validation + # TODO (Alex): handle error when specified auth0_domain not found + if not os.path.exists(self.JWKS_CACHED_JSON): + + jwks_resp = requests.get( + 'https://' + self.auth0_domain + '/.well-known/jwks.json' + ) + jwks = jwks_resp.json() + + with open(self.JWKS_CACHED_JSON, 'w') as jwks_json_file: + json.dump(obj=jwks, fp=jwks_json_file) + + +class AuthError(Exception): + """Format error response and append status code.""" + + def __init__(self, error, status_code): + + self.error = error + self.status_code = status_code + + +JWKS_CACHE = JWKSCache(auth0_domain=AUTH0_DOMAIN) + + +@app.errorhandler(AuthError) +def handle_auth_error(ex: AuthError): + + logging.debug(f'error handling: {str(ex)}') + + response = jsonify(ex.error) + response.status_code = ex.status_code + + return response + + +def get_token_auth_header(): + """Obtains the access token from the Authorization Header""" + + logging.debug('obtain token from header') + + auth = request.headers.get('Authorization', None) + + if not auth: + + logging.debug('authorization header missing') + + raise AuthError({ + 'code': 'authorization_header_missing', + 'description': 'Authorization header is expected' + }, 401) + + parts = auth.split() + + if parts[0].lower() != 'bearer': + + logging.debug('invalid header: authorization header must start with bearer') + + raise AuthError({ + 'code': 'invalid_header', + 'description': 'Authorization header must start with Bearer' + }, 401) + + elif len(parts) == 1: + + logging.debug('invalid header: token not found') + + raise AuthError({ + 'code': 'invalid_header', + 'description': 'Token not found' + }, 401) + + elif len(parts) > 2: + + logging.debug('invalid header: authorization header must be Bearer token') + + raise AuthError({ + 'code': 'invalid_header', + 'description': 'Authorization header must be Bearer token' + }, 401) + + token = parts[1] + + return token + + +def check_auth(access_token: Text): + """Check authorization - validate access_token""" + + logging.info('check authorization') + + jwks = JWKS_CACHE.get_jwks() + + try: + + unverified_header = jwt.get_unverified_header(access_token) + + except jwt.JWTError: + + logging.debug('invalid_header: use an rs256 signed jwt access token') + + raise AuthError({ + 'code': 'invalid_header', + 'description': 'Invalid header. Use an RS256 signed JWT Access Token' + }, 401) + + if unverified_header['alg'] == 'HS256': + + logging.debug('invalid_header: use an rs256 signed jwt access token') + + raise AuthError({ + 'code': 'invalid_header', + 'description': 'Invalid header. 
Use an RS256 signed JWT Access Token' + }, 401) + + rsa_key = {} + + for key in jwks.get('keys', []): + + if key['kid'] == unverified_header['kid']: + + rsa_key = { + 'kty': key['kty'], + 'kid': key['kid'], + 'use': key['use'], + 'n': key['n'], + 'e': key['e'] + } + + if rsa_key: + try: + + jwt.decode( + access_token, + rsa_key, + algorithms=ALGORITHMS, + audience=AUTH0_API_IDENTIFIER, + issuer='https://'+AUTH0_DOMAIN+'/' + ) + + except jwt.ExpiredSignatureError: + + logging.debug('token expired') + + JWKS_CACHE.refresh_jwks_json() + + raise AuthError({ + 'code': 'token_expired', + 'description': 'token is expired' + }, 401) + + except jwt.JWTClaimsError: + + logging.debug('incorrect claims, please check the audience and issuer') + + raise AuthError({ + 'code': 'invalid_claims', + 'description': 'incorrect claims, please check the audience and issuer' + }, 401) + + except Exception as e: + logging.error(e, exc_info=True) + logging.debug('invalid_header: unable to parse authentication token') + + raise AuthError({ + 'code': 'invalid_header', + 'description': 'Unable to parse authentication token' + }, 401) + + return + + logging.debug('invalid_header: to find appropriate key') + + raise AuthError({ + 'code': 'invalid_header', + 'description': 'Unable to find appropriate key' + }, 401) + + +@app.route('/token', methods=['POST']) +def token(): + """Get access token""" + + logging.debug('get access token') + + headers = {'content-type': 'application/json'} + payload = { + 'client_id': request.form.get('client_id'), + 'client_secret': request.form.get('client_secret'), + 'audience': AUTH0_API_IDENTIFIER, + 'grant_type':'client_credentials' + } + resp = requests.post( + url=f'https://{AUTH0_DOMAIN}/oauth/token', + json=payload, + headers=headers + ) + data = resp.json() + + if 'access_token' not in data: + + logging.debug('auth0 response does not contain access token') + + raise AuthError({ + 'code': data.get('error', ''), + 'description': data.get('error_description', ''), + }, resp.status_code) + + access_token = data['access_token'] + + return {'access_token': access_token} + + +@app.route('/token/validate') +def validate_token(): + """Validate (confirm) access token""" + + logging.debug('validate token') + token = get_token_auth_header() + check_auth(token) + + return '' + + +if __name__ == '__main__': + + app.run(host='0.0.0.0', port=1234, debug=True) diff --git a/services/deploy/2_model.log b/services/deploy/2_model.log new file mode 100644 index 0000000..e94aebe --- /dev/null +++ b/services/deploy/2_model.log @@ -0,0 +1,8 @@ +Usage: mlflow models serve [OPTIONS] +Try 'mlflow models serve --help' for help. + +Error: Got unexpected extra argument (2_model) +Usage: mlflow models serve [OPTIONS] +Try 'mlflow models serve --help' for help. 
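A client of the auth service (`services/auth/src/app.py`) first exchanges its Auth0 client credentials for an access token via `POST /token`, then presents it as a Bearer token; `GET /token/validate` answers 200 when the token is accepted. A minimal sketch, assuming the service listens on `localhost:1234` and using placeholder credentials:

```python
# Minimal client sketch for the auth service; host, port and credentials are placeholders.
import requests

AUTH_URL = "http://localhost:1234"

# Exchange client credentials for an access token (the endpoint expects form data).
resp = requests.post(
    f"{AUTH_URL}/token",
    data={"client_id": "my-client-id", "client_secret": "my-client-secret"},
)
access_token = resp.json()["access_token"]

# Validate the token; status code 200 means it was accepted.
check = requests.get(
    f"{AUTH_URL}/token/validate",
    headers={"Authorization": f"Bearer {access_token}"},
)
print(check.status_code)
```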
+ +Error: Got unexpected extra argument (2_model) diff --git a/services/deploy/Dockerfile b/services/deploy/Dockerfile deleted file mode 100644 index 2a5f274..0000000 --- a/services/deploy/Dockerfile +++ /dev/null @@ -1,43 +0,0 @@ -FROM python:3.7 - -RUN pip install email-validator==1.0.5 \ - fastapi==0.42.0 \ - google-api-python-client==1.7.9 \ - google-cloud-storage==1.20.0 \ - jinja2==2.10.3 \ - mlflow==1.4.0 \ - oauth2client==3.0.0 \ - psutil==5.6.2 \ - python-multipart==0.0.5 \ - pytest==5.2.2 \ - pyyaml==5.1.2 \ - requests==2.22.0 \ - scikit-learn==0.20.4 \ - uvicorn==0.10.8 - -RUN apt-get update && \ - apt-get install -y sudo && \ - useradd -m user -u 1000 && \ - echo 'user:user' | chpasswd user && \ - echo "user ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/user && \ - chmod 0440 /etc/sudoers.d/user && \ - chown -R user /home - - -RUN curl https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz > /tmp/google-cloud-sdk.tar.gz - -# Installing the package -RUN mkdir -p /usr/local/gcloud \ - && tar -C /usr/local/gcloud -xvf /tmp/google-cloud-sdk.tar.gz \ - && /usr/local/gcloud/google-cloud-sdk/install.sh - -# Adding the package path to local -ENV PATH $PATH:/usr/local/gcloud/google-cloud-sdk/bin - -USER user - -WORKDIR /home/deploy - -ENV PYTHONPATH=/home - -CMD sudo chown -R user $WORKSPACE && uvicorn --host 0.0.0.0 --port 9000 src.app:app --reload diff --git a/services/deploy/README.md b/services/deploy/README.md index 48f8f78..d6ab3f0 100644 --- a/services/deploy/README.md +++ b/services/deploy/README.md @@ -1,17 +1,3 @@ # deploy Service for deploying ML models locally - -## Run tests (in docker) - -```bash -chmod +x tests/test.sh -./tests/test.sh -``` - -or with any `pytest` key, for example, verbose flag: - -```bash -chmod +x tests/test.sh -./tests/test.sh -v -``` \ No newline at end of file diff --git a/services/deploy/src/app.py b/services/deploy/src/app.py index 04b6d3f..dad324b 100644 --- a/services/deploy/src/app.py +++ b/services/deploy/src/app.py @@ -5,15 +5,20 @@ from fastapi import FastAPI from http import HTTPStatus +import logging from starlette.middleware.cors import CORSMiddleware from starlette.responses import Response from starlette.requests import Request from common.utils import build_error_response, ModelDoesNotExistError -from deploy.src import config -from deploy.src.deployments import DeployManager, DeploymentNotFoundError, InvalidDeploymentType +from deploy.src.deployments.manager import DeploymentNotFoundError, InvalidDeploymentType, \ + DeployDbSchema, DeployManager +from deploy.src.deployments.utils import BadInputDataSchemaError from deploy.src.routers import default, deployments + +logging.basicConfig(level=logging.DEBUG) + app = FastAPI() # pylint: disable=invalid-name app.setup() @@ -31,8 +36,9 @@ def init() -> None: """Init on application startup""" - config.create_workspace_dir() - config.load_gcp_config_env_vars() + # TODO: refactor (same as project) + DeployDbSchema() + deploy_manager = DeployManager() deploy_manager.check_and_update_deployments_statuses() @@ -50,13 +56,14 @@ async def before_and_after_request(request: Request, call_next) -> Response: try: response = await call_next(request) - except (DeploymentNotFoundError, InvalidDeploymentType) as e: + except (DeploymentNotFoundError, ModelDoesNotExistError) as e: return build_error_response(HTTPStatus.NOT_FOUND, e) - except ModelDoesNotExistError as e: - return build_error_response(HTTPStatus.NOT_FOUND, e) + except (BadInputDataSchemaError, InvalidDeploymentType) as e: + return 
build_error_response(HTTPStatus.BAD_REQUEST, e) except Exception as e: + logging.error(e, exc_info=True) return build_error_response(HTTPStatus.INTERNAL_SERVER_ERROR, e) return response diff --git a/services/deploy/src/config.py b/services/deploy/src/config.py index 0660395..b6a24c8 100644 --- a/services/deploy/src/config.py +++ b/services/deploy/src/config.py @@ -4,54 +4,71 @@ import os -GCP_CONFIG = None -WORKSPACE = os.getenv('WORKSPACE') -DEPLOY_DB_NAME = 'deploy.db' -DEPLOYMENTS_TABLE = 'deployment' +from common.utils import validate_env_var + + +class Config: + + def __init__(self): + + self._load_env_vars() + self._check_env_vars() + + self.deployments_logs_dir = os.path.join(self.get('WORKSPACE'), 'deployments_logs') + os.makedirs(self.get('WORKSPACE'), exist_ok=True) + os.makedirs(self.deployments_logs_dir, exist_ok=True) + + def get(self, env_var): + return self.env_vars.get(env_var) + + def get_gcp_config(self): + + gcp_config = { + 'gcp_project': self.env_vars.get('GCP_PROJECT'), + 'zone': self.env_vars.get('GCP_ZONE'), + 'machine_type_name': self.env_vars.get('GCP_MACHINE_TYPE'), + 'image': self.env_vars.get('GCP_OS_IMAGE'), + 'bucket': self.env_vars.get('GCP_BUCKET'), + 'docker_image': self.env_vars.get('MODEL_DEPLOY_DOCKER_IMAGE'), + 'firewall': self.env_vars.get('MODEL_DEPLOY_FIREWALL_RULE'), + 'port': self.env_vars.get('MODEL_DEPLOY_DEFAULT_PORT'), + 'google_credentials_json': self.env_vars.get('GOOGLE_APPLICATION_CREDENTIALS') + } + + return gcp_config + + def _load_env_vars(self): + + self.env_vars = { + 'WORKSPACE': os.getenv('WORKSPACE'), + 'DEPLOY_DB_NAME': os.getenv('DEPLOY_DB_NAME'), + 'DB_HOST': os.getenv('DB_HOST'), + 'DB_PORT': os.getenv('DB_PORT'), + 'DB_USER': os.getenv('POSTGRES_USER'), + 'DEPLOY_SERVER_WORKERS': os.getenv('DEPLOY_SERVER_WORKERS', 1), + 'DB_PASSWORD': os.getenv('POSTGRES_PASSWORD'), + 'GCP_PROJECT': os.getenv('GCP_PROJECT', ''), + 'GCP_ZONE': os.getenv('GCP_ZONE', ''), + 'GCP_MACHINE_TYPE': os.getenv('GCP_MACHINE_TYPE', ''), + 'GCP_OS_IMAGE': os.getenv('GCP_OS_IMAGE', ''), + 'GCP_BUCKET': os.getenv('GCP_BUCKET', ''), + 'MODEL_DEPLOY_DOCKER_IMAGE': os.getenv('MODEL_DEPLOY_DOCKER_IMAGE', ''), + 'MODEL_DEPLOY_FIREWALL_RULE': os.getenv('MODEL_DEPLOY_FIREWALL_RULE', ''), + 'MODEL_DEPLOY_DEFAULT_PORT': os.getenv('MODEL_DEPLOY_DEFAULT_PORT', ''), + 'GOOGLE_APPLICATION_CREDENTIALS': os.getenv('GOOGLE_APPLICATION_CREDENTIALS', '') + } + + def _check_env_vars(self): + """Check if all required environment variables are set. + Raises: + EnvironmentError: if some required environment variable is not set + InvalidEnvVarValueError: if some environment variable is invalid + """ + + for name, value in self.env_vars.items(): + + if value is None: + raise EnvironmentError(f'Failed because {name} env var is not set') + + validate_env_var(name, str(value)) - -def create_workspace_dir() -> None: - """Create workspace directory""" - - os.makedirs(WORKSPACE, exist_ok=True) - - -def check_environment_variable() -> None: - """Check if all required environment variables are set. 
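The new `Config` class reads all settings from the environment and fails fast when a required variable is missing or contains spaces. A minimal sketch of the variables it expects, using placeholder values (the GCP-related variables default to empty strings and may be omitted):

```python
# Minimal environment for the deploy service's Config; all values are placeholders.
import os

os.environ.update({
    "WORKSPACE": "/tmp/mlpanel-workspace",
    "DEPLOY_DB_NAME": "deploy",
    "DB_HOST": "postgres",
    "DB_PORT": "5432",
    "POSTGRES_USER": "mlpanel",
    "POSTGRES_PASSWORD": "mlpanel",
})

from deploy.src.config import Config

conf = Config()  # raises EnvironmentError / InvalidEnvVarValueError on bad values
print(conf.get("WORKSPACE"))
```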
- Raises: - TypeError: if GCP_CONFIG is not dictionary - EnvironmentError: if some required environment variable is not set - """ - - global GCP_CONFIG - - if WORKSPACE is None: - raise EnvironmentError('Failed because WORKSPACE env var is not set') - - if not isinstance(GCP_CONFIG, dict): - raise TypeError(f'GCP_CONFIG must be dictionary, not {type(GCP_CONFIG)}') - - for name, value in GCP_CONFIG.items(): - if value is None: - raise EnvironmentError(f'Failed because {name} env var is not set') - - -def load_gcp_config_env_vars() -> None: - """Load environment variables.""" - - global GCP_CONFIG - - GCP_CONFIG = { - 'gcp_project': os.getenv('GCP_PROJECT'), - 'zone': os.getenv('GCP_ZONE'), - 'machine_type_name': os.getenv('GCP_MACHINE_TYPE'), - 'image': os.getenv('GCP_OS_IMAGE'), - 'bucket': os.getenv('GCP_BUCKET'), - 'docker_image': os.getenv('MODEL_DEPLOY_DOCKER_IMAGE'), - 'firewall': os.getenv('MODEL_DEPLOY_FIREWALL_RULE'), - 'port': os.getenv('MODEL_DEPLOY_DEFAULT_PORT'), - 'google_credentials_json': os.getenv('GOOGLE_APPLICATION_CREDENTIALS') - } - - if GCP_CONFIG.get('google_credentials_json') is not None: - check_environment_variable() diff --git a/services/deploy/src/deployments/__init__.py b/services/deploy/src/deployments/__init__.py index cea32de..e69de29 100644 --- a/services/deploy/src/deployments/__init__.py +++ b/services/deploy/src/deployments/__init__.py @@ -1,435 +0,0 @@ -""" -This module contains class for deployments management - DeployManager. -It allows to create, run, stop and delete deployments of different types. - -Deployment is process which loads model and is available by http. -Models are deployed via MLflow. Deployment has two endpoints: - * /ping = healthcheck; - * /invocations = predict. -More information about MLflow models deploy: - https://www.mlflow.org/docs/latest/models.html#deploy-mlflow-models - -Now it's supported types of deployments: - * local - runs new process locally; - * gcp - creates new GCE instance and runs MLflow model deploy process on it. -""" - -# pylint: disable=wrong-import-order - -import os -import requests -import sqlite3 -from typing import Dict, List, Text - -from common.types import StrEnum -from common.utils import get_rfc3339_time, is_remote -from deploy.src import config -from deploy.src.deployments.gcp import create_gcp_deployment, stop_gcp_deployment, wait_gcp_host_ip -from deploy.src.deployments.gcp_deploy_utils import gcp_get_external_ip, generate_gcp_instance_name -from deploy.src.deployments.local import create_local_deployment, stop_local_deployment -from deploy.src.deployments.utils import mlflow_model_predict - - -class DeploymentNotFoundError(Exception): - """Deployment not found""" - - -class InvalidDeploymentType(Exception): - """Invalid deployment type""" - - -class LocalModelDeployRemotelyError(Exception): - """Local model cannot be deployed remotely""" - - -class DeploymentStatus(StrEnum): - """Deployment status enum. - Statuses: - * NOT_FOUND - there is no record in database; - * RUNNING - deployment process is running; - * STOPPED - deployment process is stopped; - * DELETED - deployment is marked as deleted in database. - """ - - NOT_FOUND = 'not found' - RUNNING = 'running' - STOPPED = 'stopped' - DELETED = 'deleted' - - -class DeploymentType(StrEnum): - """Deployment type""" - - LOCAL = 'local' - GCP = 'gcp' - - -class DeployManager: - """ - Manage deployments. - Methods: - create(Deployment): creates new deployment. - get_status(Deployment): get deployment status. 
- predict(Deployment, str): predict data on deployment. - stop(Deployment): stop deployment. - list(): get deployments list. - """ - # pylint: disable=too-many-instance-attributes - def __init__(self): - """ - Args: - workspace {Text}: workspace folder - gcp_config {Dict}: config for GCP deployment - """ - # pylint: disable=invalid-name - - self._WORKSPACE = config.WORKSPACE - self._GCP_CONFIG = config.GCP_CONFIG - self._LOCAL_DEPLOYMENT_HOST = '0.0.0.0' - self._GCP_INSTANCE_CONNECTION_TIMEOUT = 30 - self._DEPLOY_DB_NAME = config.DEPLOY_DB_NAME - self._DEPLOY_DB = os.path.join(self._WORKSPACE, self._DEPLOY_DB_NAME) - self._DEPLOYMENTS_TABLE = config.DEPLOYMENTS_TABLE - self._deploy_table_schema = { - 'id': 'INTEGER UNIQUE', - 'project_id': 'INTEGER', - 'model_id': 'TEXT', - 'version': 'TEXT', - 'model_uri': 'TEXT', - 'host': 'TEXT', - 'port': 'INTEGER', - 'pid': 'INTEGER', - 'instance_name': 'TEXT', - 'type': 'TEXT', - 'created_at': 'TIMESTAMP', - 'last_updated_at': 'TIMESTAMP', - 'status': 'TEXT' - } - self._connect_db() - self._local_deployments_processes = {} - - def create_deployment(self, project_id: int, model_id: Text, model_version: Text, - model_uri: Text, deployment_type: Text) -> int: - """Create deployment. - Args: - project_id {int}: project id - model_id {Text}: model id (name) - model_version {Text}: model version - model_uri {Text}: path to model package - deployment_type {Text}: deployment type - Returns: - int: id of created deployment - """ - # pylint: disable=too-many-arguments - - instance_name = '' - pid = -1 - - if deployment_type == DeploymentType.LOCAL: - - host = self._LOCAL_DEPLOYMENT_HOST - process, port = create_local_deployment(model_uri, self._LOCAL_DEPLOYMENT_HOST) - pid = process.pid - self._local_deployments_processes[pid] = process - - elif deployment_type == DeploymentType.GCP: - - if not is_remote(model_uri): - raise LocalModelDeployRemotelyError( - f'Model {model_uri} is local, it cannot be deployed remotely') - - instance_name = generate_gcp_instance_name() - create_gcp_deployment(model_uri, self._GCP_CONFIG, instance_name) - port = self._GCP_CONFIG.get('port') - host = wait_gcp_host_ip( - instance_name, self._GCP_CONFIG, self._GCP_INSTANCE_CONNECTION_TIMEOUT) - - else: - raise InvalidDeploymentType(f'Invalid deployment type: {deployment_type}') - - deployment_id = self._insert_new_deployment_in_db( - project_id, model_id, model_version, model_uri, - host, port, pid, instance_name, deployment_type - ) - - return deployment_id - - def run(self, deployment_id: int) -> None: - """Run deployment. 
- Args: - deployment_id {int}: deployment id - """ - - self._cursor.execute( - f'SELECT type, model_uri, instance_name, status ' - f'FROM {self._DEPLOYMENTS_TABLE} ' - f'WHERE id = {deployment_id}' - ) - deployment = self._cursor.fetchone() - - if deployment is None: - raise DeploymentNotFoundError(f'Deployment with ID {deployment_id} not found') - - deployment_type, model_uri, instance_name, status = deployment - - if status == DeploymentStatus.RUNNING: - return - - if deployment_type == DeploymentType.LOCAL: - - host = self._LOCAL_DEPLOYMENT_HOST - process, port = create_local_deployment(model_uri, self._LOCAL_DEPLOYMENT_HOST) - pid = process.pid - self._local_deployments_processes[pid] = process - - elif deployment_type == DeploymentType.GCP: - - pid = -1 - port = self._GCP_CONFIG.get('port') - create_gcp_deployment(model_uri, self._GCP_CONFIG, instance_name) - host = wait_gcp_host_ip( - instance_name, self._GCP_CONFIG, self._GCP_INSTANCE_CONNECTION_TIMEOUT) - - else: - raise InvalidDeploymentType(f'Invalid deployment type: {deployment_type}') - - self._cursor.execute( - f'UPDATE {self._DEPLOYMENTS_TABLE} ' - f'SET host = ?, port = ?, status = ?, pid = ?, last_updated_at = ? ' - f'WHERE id = {deployment_id}', - (host, port, str(DeploymentStatus.RUNNING), pid, get_rfc3339_time()) - ) - self._conn.commit() - - def stop(self, deployment_id: int) -> None: - """Stop deployment. - Args: - deployment_id {int}: deployment id - """ - - self._cursor.execute( - f'SELECT status, type, pid, instance_name ' - f'FROM {self._DEPLOYMENTS_TABLE} ' - f'WHERE id = {deployment_id} AND ' - f' status <> "{str(DeploymentStatus.DELETED)}"' - ) - deployment = self._cursor.fetchone() - - if deployment is None: - raise DeploymentNotFoundError(f'Deployment with ID {deployment_id} not found') - - status, deployment_type, pid, instance_name = deployment - - if status == DeploymentStatus.STOPPED: - return - - if deployment_type == DeploymentType.LOCAL: - proc = self._local_deployments_processes.get(pid) - stop_local_deployment(pid, proc) - - elif deployment_type == DeploymentType.GCP: - stop_gcp_deployment(instance_name, self._GCP_CONFIG) - - else: - raise InvalidDeploymentType(f'Invalid deployment type: {deployment_type}') - - self._cursor.execute( - f'UPDATE {self._DEPLOYMENTS_TABLE} ' - f'SET status = ?, host = ?, port = ?, last_updated_at = ? ' - f'WHERE id = {deployment_id}', - (str(DeploymentStatus.STOPPED), None, None, get_rfc3339_time()) - ) - self._conn.commit() - - def predict(self, deployment_id: int, data: Text) -> requests.Response: - """Predict data on deployment. - Args: - deployment_id {int}: deployment id - data {Text}: data to predict - """ - - self._cursor.execute( - f'SELECT host, port ' - f'FROM {self._DEPLOYMENTS_TABLE} ' - f'WHERE id = {deployment_id} AND ' - f' status <> "{str(DeploymentStatus.DELETED)}"' - ) - host_port = self._cursor.fetchone() - - if host_port is None: - raise DeploymentNotFoundError(f'Deployment with ID {deployment_id} not found') - - host, port = host_port - response = mlflow_model_predict(host, port, data) - - return response - - def delete(self, deployment_id: int) -> None: - """Delete deployment (mark as deleted. - Args: - deployment_id {int}: deployment id - """ - - self.stop(deployment_id) - self._cursor.execute( - f'UPDATE {self._DEPLOYMENTS_TABLE} ' - f'SET status = ?, host = ?, port = ?, last_updated_at = ? 
' - f'WHERE id = {deployment_id}', - (str(DeploymentStatus.DELETED), None, None, get_rfc3339_time()) - ) - self._conn.commit() - - def list(self) -> List[Dict]: - """Get list of deployments info. - Returns: - List[Dict] - """ - - self._cursor.execute( - f'SELECT id, project_id, model_id, version, model_uri, ' - f'type, created_at, instance_name, status, host, port ' - f'FROM {self._DEPLOYMENTS_TABLE} ' - f'WHERE status <> "{str(DeploymentStatus.DELETED)}"' - ) - rows = self._cursor.fetchall() - deployments = [] - - for row in rows: - deployments.append({ - 'id': str(row[0]), - 'project_id': str(row[1]), - 'model_id': row[2], - 'version': row[3], - 'model_uri': row[4], - 'type': row[5], - 'created_at': row[6], - 'status': row[8], - 'host': row[9], - 'port': str(row[10]) if row[10] is not None else row[10] - }) - - return deployments - - def ping(self, deployment_id: int) -> bool: - """Ping deployment. - Args: - deployment_id {int}: deployment id - Returns: - True deployment is available by http, otherwise False - """ - - self._cursor.execute( - f'SELECT host, port, status ' - f'FROM {self._DEPLOYMENTS_TABLE} ' - f'WHERE id = "{deployment_id}"' - ) - deployment = self._cursor.fetchone() - - if deployment is None: - raise DeploymentNotFoundError(f'Deployment with ID {deployment_id} not found') - - host, port, status = deployment - - if status != DeploymentStatus.RUNNING: - return False - - try: - requests.get(f'http://{host}:{port}/ping') - return True - except requests.exceptions.ConnectionError: - return False - - def check_and_update_deployments_statuses(self) -> None: - """Check if deployment status. - If status "running" is not confirmed, change status to "stopped" - """ - - self._cursor.execute( - f'SELECT id, host, port FROM {self._DEPLOYMENTS_TABLE} ' - f'WHERE status = "{str(DeploymentStatus.RUNNING)}"' - ) - running_local_deployments = self._cursor.fetchall() - - for deployment_id, host, port in running_local_deployments: - - try: - requests.get( - url=f'http://{host}:{port}/ping', - timeout=self._GCP_INSTANCE_CONNECTION_TIMEOUT // 5 - ) - except requests.exceptions.ConnectionError: - self._cursor.execute( - f'UPDATE {self._DEPLOYMENTS_TABLE} ' - f'SET host = ?, port = ?, status = ? ' - f'WHERE id = "{deployment_id}"', - (None, None, str(DeploymentStatus.STOPPED)) - ) - - self._conn.commit() - - def _connect_db(self) -> None: - """Connect to deploy database. - - Create deployments table and open connection to database. - """ - - self._conn = sqlite3.connect(self._DEPLOY_DB, check_same_thread=False) - self._cursor = self._conn.cursor() - - columns_description = ', '.join([ - col_name + ' ' + col_type for col_name, col_type in self._deploy_table_schema.items() - ]) - - self._cursor.execute( - f'CREATE TABLE IF NOT EXISTS {self._DEPLOYMENTS_TABLE} ({columns_description})' - ) - - def _next_id(self) -> int: - """Get value that is one more than current maximum id""" - - self._cursor.execute(f'SELECT IFNULL(MAX(id), 0) FROM {self._DEPLOYMENTS_TABLE}') - deploy_id = self._cursor.fetchone()[0] + 1 - - return deploy_id - - def _insert_new_deployment_in_db( - self, project_id: int, model_id: Text, model_version: Text, model_uri: Text, - host: Text, port: int, pid: int, instance_name: Text, deployment_type: Text - ) -> int: - """Insert new deployment record in database. 
- Args: - project_id {int}: project id - model_id {Text}: model id (name) - model_version {Text}: model version - model_uri {Text}: path to model package - host {Text}: host address - port {int}: port number - pid {int}: deployment process number - instance_name {Text}: name of instance - deployment_type {Text}: deployment type - Returns: - int: id of insert deployment record - Notes: - * pid: is some positive integer in case of local deployment - and -1, if deployment type is remote; - * instance_name: is empty string for local and non-empty - string (name of remote virtual machine) for remote deployment. - """ - # pylint: disable=too-many-arguments - - deployment_id = self._next_id() - creation_datetime = get_rfc3339_time() - self._cursor.execute( - f'INSERT INTO {self._DEPLOYMENTS_TABLE} ' - f'(id, project_id, model_id, version, model_uri, host, port, ' - f'pid, instance_name, type, created_at, last_updated_at, status) ' - f'VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)', - ( - deployment_id, project_id, model_id, model_version, model_uri, - host, port, pid, instance_name, deployment_type, creation_datetime, - creation_datetime, str(DeploymentStatus.RUNNING) - ) - ) - self._conn.commit() - - return deployment_id diff --git a/services/deploy/src/deployments/gcp_deploy_utils.py b/services/deploy/src/deployments/gcp_deploy_utils.py index 1fb8de4..8dc7df2 100644 --- a/services/deploy/src/deployments/gcp_deploy_utils.py +++ b/services/deploy/src/deployments/gcp_deploy_utils.py @@ -4,13 +4,15 @@ """ # pylint: disable=wrong-import-order -import time - import googleapiclient.discovery import googleapiclient.errors +import time from typing import Dict, List, Optional, Text +from deploy.src.config import Config + + def gcp_create_instance(name: Text, gcp_project: Text, zone: Text, machine_type_name: Text, image: Text, bucket: Text, startup_script: Text) -> Dict: """Create Google Compute Engine instance. 
@@ -44,7 +46,7 @@ def gcp_create_instance(name: Text, gcp_project: Text, zone: Text, machine_type_ # pylint: disable=too-many-arguments compute = googleapiclient.discovery.build('compute', 'v1') - image_response = compute.images().get(image=image, project=gcp_project).execute() # pylint: disable=no-member + image_response = compute.images().get(image=image, project='cos-cloud').execute() # pylint: disable=no-member source_disk_image = image_response['selfLink'] machine_type = f'zones/{zone}/machineTypes/{machine_type_name}' config = { @@ -341,7 +343,8 @@ def gcp_deploy_model(model_uri: Text, docker_image: Text, instance_name: Text, f'docker run -v /home/gac.json:/root/gac.json -p {port}:{port} {docker_image} ' \ f'/bin/bash -c ' \ f'"export GOOGLE_APPLICATION_CREDENTIALS=/root/gac.json && ' \ - f'mlflow models serve --no-conda -m {model_uri} -h 0.0.0.0 -p {port}" ' + f'mlflow models serve --no-conda -m {model_uri} --host 0.0.0.0 --port {port} ' \ + f'--workers {Config().get("DEPLOY_SERVER_WORKERS")} | tee -a deployment.log" ' gcp_create_instance( name=instance_name, gcp_project=gcp_project, diff --git a/services/deploy/src/deployments/local.py b/services/deploy/src/deployments/local.py index 7bd58ed..50c9a33 100644 --- a/services/deploy/src/deployments/local.py +++ b/services/deploy/src/deployments/local.py @@ -1,27 +1,31 @@ """This module provides functions for working with local deployments.""" +import os import subprocess as sp -from typing import Optional, Text, Tuple +from typing import Text, Tuple -from deploy.src.utils import get_free_tcp_port from common.utils import kill +from deploy.src.config import Config +from deploy.src.utils import get_free_tcp_port -def create_local_deployment(model_uri: Text, host: Text) -> Tuple[sp.Popen, int]: +def create_local_deployment(model_uri: Text) -> Tuple[sp.Popen, int]: """Create local deployment process. Args: model_uri {Text}: path to model package - host {Text}: host address Returns: Tuple[subprocess.Popen, int]: (process, deployment port) """ + conf = Config() port = get_free_tcp_port() + log_path = os.path.join(conf.deployments_logs_dir, model_uri.replace('/','_') + '.log') process = sp.Popen( [ f'mlflow models serve --no-conda -m ' f'{model_uri} ' - f'-h {host} -p {port}' + f'--host 0.0.0.0 --port {port} --workers {conf.get("DEPLOY_SERVER_WORKERS")} ' + f'2>&1 | tee -a {log_path}' ], shell=True ) @@ -29,7 +33,7 @@ def create_local_deployment(model_uri: Text, host: Text) -> Tuple[sp.Popen, int] return process, port -def stop_local_deployment(pid: int, proc: Optional[sp.Popen]) -> None: +def stop_local_deployment(pid: int) -> None: """Stop local deployment. Args: pid {int}: process id @@ -37,8 +41,3 @@ def stop_local_deployment(pid: int, proc: Optional[sp.Popen]) -> None: """ kill(pid) - - # TODO (Alex): find how to avoid zombie processes without call method "communicate()" - if isinstance(proc, sp.Popen) and proc.poll() is None: - # Call method "communicate()" to really stop process (avoids zombie process) - proc.communicate() diff --git a/services/deploy/src/deployments/manager.py b/services/deploy/src/deployments/manager.py new file mode 100644 index 0000000..150ee4c --- /dev/null +++ b/services/deploy/src/deployments/manager.py @@ -0,0 +1,704 @@ +""" +This module contains class for deployments management - DeployManager. +It allows to create, run, stop and delete deployments of different types. + +Deployment is process which loads model and is available by http. +Models are deployed via MLflow. 
Deployment has two endpoints: + * /ping = healthcheck; + * /invocations = predict. +More information about MLflow models deploy: + https://www.mlflow.org/docs/latest/models.html#deploy-mlflow-models + +Now it's supported types of deployments: + * local - runs new process locally; + * gcp - creates new GCE instance and runs MLflow model deploy process on it. +""" + +# pylint: disable=wrong-import-order + + +import json +import logging +import os +import psycopg2 +import psycopg2.errors +from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT +import requests +from typing import Dict, List, Optional, Text, Tuple + +from common.types import StrEnum +from common.utils import is_model, ModelDoesNotExistError, is_remote, get_rfc3339_time,\ + get_utc_timestamp +from deploy.src.config import Config +from deploy.src.deployments.gcp import create_gcp_deployment, wait_gcp_host_ip, stop_gcp_deployment +from deploy.src.deployments.gcp_deploy_utils import generate_gcp_instance_name +from deploy.src.deployments.local import create_local_deployment, stop_local_deployment +from deploy.src.deployments.utils import get_schema_file_path, validate_data, \ + BadInputDataSchemaError, predict_data_to_mlflow_data_format, mlflow_model_predict,\ + schema_file_exists, tfdv_object_to_dict, read_tfdv_statistics, get_gcp_deployment_config,\ + get_local_deployment_config +from deploy.src.utils import local_model_uri_to_gs_blob, upload_local_mlflow_model_to_gs + + +logging.basicConfig(level=logging.DEBUG) + + +class DeploymentNotFoundError(Exception): + """Deployment not found""" + + +class InvalidDeploymentType(Exception): + """Invalid deployment type""" + + +class LocalModelDeployRemotelyError(Exception): + """Local model cannot be deployed remotely""" + + +class DeploymentStatus(StrEnum): + """Deployment status enum. + Statuses: + * NOT_FOUND - there is no record in database; + * RUNNING - deployment process is running; + * STOPPED - deployment process is stopped; + * DELETED - deployment is marked as deleted in database. + """ + + NOT_FOUND = 'not found' + RUNNING = 'running' + STOPPED = 'stopped' + DELETED = 'deleted' + + +class DeploymentType(StrEnum): + """Deployment type""" + + LOCAL = 'local' + GCP = 'gcp' + + +class DeployDbSchema: + + CONFIG = Config() + DB_NAME = CONFIG.get('DEPLOY_DB_NAME') + DEPLOYMENTS_TABLE = 'deployment' + INCOMING_DATA_TABLE = 'incoming_data' + + def __init__(self): + + self._create_db() + + self._connection = psycopg2.connect( + database=self.CONFIG.get('PROJECTS_DB_NAME'), + host=self.CONFIG.get('DB_HOST'), + port=self.CONFIG.get('DB_PORT'), + user=self.CONFIG.get('DB_USER'), + password=self.CONFIG.get('DB_PASSWORD') + ) + self._cursor = self._connection.cursor() + + self.create_deployments_table() + self.create_incoming_data_table() + + def create_deployments_table(self): + + schema = { + 'id': 'SERIAL PRIMARY KEY', + 'project_id': 'INT', + 'model_id': 'TEXT', + 'version': 'TEXT', + 'model_uri': 'TEXT', + 'host': 'TEXT', + 'port': 'INT', + 'pid': 'INT', + 'instance_name': 'TEXT', + 'type': 'TEXT', + 'created_at': 'TEXT', + 'last_updated_at': 'TEXT', + 'status': 'TEXT' + } + self._create_table(self.DEPLOYMENTS_TABLE, schema) + + def create_incoming_data_table(self): + + schema = { + 'id': 'SERIAL PRIMARY KEY', + 'deployment_id': 'INT', + 'incoming_data': 'TEXT', # TODO: как лучше сохранять данные, в каком формате? 
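            # One row is written per prediction request handled by
            # DeployManager.predict(): incoming_data keeps the raw request payload
            # as TEXT (the TODO above asks which storage format suits this best),
            # timestamp is a UTC timestamp in milliseconds, is_valid stores the
            # result of the optional input-schema validation (-1 when
            # VALIDATE_ON_PREDICT is disabled), and anomalies holds the detected
            # anomalies serialized as JSON.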
+ 'timestamp': 'REAL', + 'is_valid': 'INT', + 'anomalies': 'TEXT' + } + self._create_table(self.INCOMING_DATA_TABLE, schema) + + def _create_table(self, table_name: Text, table_schema: Dict): + + columns_description = ', '.join([ + col_name + ' ' + col_type for col_name, col_type in table_schema.items() + ]) + + self._cursor.execute( + f'CREATE TABLE IF NOT EXISTS {table_name} ({columns_description})' + ) + + self._connection.commit() + + def _create_db(self): + + connection = psycopg2.connect( + host=self.CONFIG.get('DB_HOST'), + port=self.CONFIG.get('DB_PORT'), + user=self.CONFIG.get('DB_USER'), + password=self.CONFIG.get('DB_PASSWORD') + ) + connection.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) + cursor = connection.cursor() + + try: + cursor.execute(f'CREATE DATABASE {self.DB_NAME};') + except psycopg2.errors.DuplicateDatabase: + pass + + connection.close() + + +class Deployment: + """ + Deployment. + """ + + def __init__(self, **deployment_config): + + self.config = deployment_config + + def up(self, model_uri: Text) -> Tuple[Text, int, int, Text]: + """ + Up new deployment. + Args: + model_uri {Text}: model uri + Returns: + Tuple[Text, int, int, Text]: (host, port, pid, instance_name) + """ + raise NotImplementedError('To be implemented') + + def stop(self, process_id: int, instance_name: Text) -> None: + """ + Stop deployment. + Args: + process_id {int}: process id + instance_name {Text}: instance name + """ + raise NotImplementedError('To be implemented') + + def predict(self, model_uri: Text, host: Text, port: int, data: Text) \ + -> Tuple[int, Dict, Optional[requests.Response]]: + + """ + Predict data. + Args: + model_uri {Text}: model uri + host {Text}: host ip or domain name + port {int}: port number + data {Text}: data to predict + Returns: + Tuple[int, Dict, Optional[requests.Response]]: + (data_is_valid_flag, anomalies_dictionary, requests.Response or None) + """ + + logging.info('predict') + + self._check_model_exists(model_uri) + + schema_file_path = get_schema_file_path(model_uri) + data_is_valid = -1 + anomalies = {} + response = None + + if os.getenv('VALIDATE_ON_PREDICT') == 'true': + data_is_valid, anomalies = validate_data(data, schema_file_path) + + if data_is_valid: + converted_data = predict_data_to_mlflow_data_format(data) + response = mlflow_model_predict(host, port, converted_data) + + return data_is_valid, anomalies, response + + def ping(self, host: Text, port: int) -> bool: + """ + Ping deployment. + Args: + host {Text}: host ip or domain name + port {int}: port number + Returns: + bool: True if deployment with host and port is reachable, otherwise False + """ + + logging.info(f'ping http://{host}:{port}') + + try: + requests.get(f'http://{host}:{port}/ping') + return True + except requests.exceptions.ConnectionError: + return False + + def schema(self, model_uri: Text) -> Dict: + """ + Get deployment's model schema. + Args: + model_uri {Text}: model uri + Returns: + Dict: model schema. + """ + + logging.info(f'get schema for {model_uri}') + + schema_file_path = get_schema_file_path(model_uri) + + if schema_file_exists(schema_file_path): + return tfdv_object_to_dict(read_tfdv_statistics(schema_file_path)) + else: + return {} + + def _check_model_exists(self, model_uri: Text) -> None: + """ + Check if model exists. 
+ Args: + model_uri {Text}: model uri + Raises: + ModelDoesNotExistError: if model does not exists or is not MLflow model + """ + + if not is_model(model_uri): + raise ModelDoesNotExistError( + f'Model {model_uri} does not exist or is not MLflow model') + + +class LocalDeployment(Deployment): + """ + Local deployment. + """ + + def up(self, model_uri: Text) -> Tuple[Text, int, int, Text]: + """ + Up new deployment. + Args: + model_uri {Text}: model uri + Returns: + Tuple[Text, int, int, Text]: (host, port, pid, instance_name) + """ + + logging.info('up local deployment') + + self._check_model_exists(model_uri) + + instance_name = '' + host = '0.0.0.0' + process, port = create_local_deployment(model_uri) + pid = process.pid + + return host, port, pid, instance_name + + def stop(self, process_id: int = None, instance_name: Text = None): + """ + Stop deployment. + Args: + process_id {int}: process id + instance_name {Text}: instance name + """ + + logging.info('stop local deployment') + stop_local_deployment(process_id) + + +class GCPDeployment(Deployment): + """ + GCP deployment. + """ + + def __init__(self, **deployment_config): + + super().__init__(**deployment_config) + self._GCP_INSTANCE_CONNECTION_TIMEOUT = 30 + + def up(self, model_uri: Text) -> Tuple[Text, int, int, Text]: + """ + Up new deployment. + Args: + model_uri {Text}: model uri + Returns: + Tuple[Text, int, int, Text]: (host, port, pid, instance_name) + """ + + logging.info('up gcp deployment') + + self._check_model_exists(model_uri) + + pid = -1 + cached_model_uri = model_uri + + if not is_remote(model_uri): + + logging.info('local model, gcp deployment') + cached_model_uri = os.path.join( + 'gs://', self.config.get('bucket'), + local_model_uri_to_gs_blob(model_uri) + ) + + logging.info(f'cached_model_uri: {cached_model_uri}') + + if not is_model(cached_model_uri): + logging.info('upload local model to gs bucket') + upload_local_mlflow_model_to_gs(model_uri) + + instance_name = generate_gcp_instance_name() + create_gcp_deployment(cached_model_uri, self.config, instance_name) + port = self.config.get('port') + host = wait_gcp_host_ip(instance_name, self.config, self._GCP_INSTANCE_CONNECTION_TIMEOUT) + + return host, port, pid, instance_name + + def stop(self, process_id: int = None, instance_name: Text = None): + """ + Stop deployment. + Args: + process_id {int}: process id + instance_name {Text}: instance name + """ + + logging.info(f'stop gcp instance {instance_name}') + stop_gcp_deployment(instance_name, self.config) + + +class DeployManager: + """ + Manage deployments. + Methods: + create(Deployment): creates new deployment. + get_status(Deployment): get deployment status. + predict(Deployment, str): predict data on deployment. + stop(Deployment): stop deployment. + list(): get deployments list. 
+ """ + # pylint: disable=too-many-instance-attributes + def __init__(self): + """ + Args: + workspace {Text}: workspace folder + gcp_config {Dict}: config for GCP deployment + """ + # pylint: disable=invalid-name + self.CONFIG = Config() + self._WORKSPACE = self.CONFIG.get('WORKSPACE') + self._GCP_CONFIG = self.CONFIG.get_gcp_config() + self._GCP_INSTANCE_CONNECTION_TIMEOUT = 30 + + self._connection = psycopg2.connect( + database=self.CONFIG.get('PROJECTS_DB_NAME'), + host=self.CONFIG.get('DB_HOST'), + port=self.CONFIG.get('DB_PORT'), + user=self.CONFIG.get('DB_USER'), + password=self.CONFIG.get('DB_PASSWORD') + ) + self._cursor = self._connection.cursor() + + def create_deployment(self, project_id: int, model_id: Text, model_version: Text, + model_uri: Text, deployment_type: Text) -> int: + """Create deployment. + Args: + project_id {int}: project id + model_id {Text}: model id (name) + model_version {Text}: model version + model_uri {Text}: path to model package + deployment_type {Text}: deployment type + Returns: + int: id of created deployment + """ + # pylint: disable=too-many-arguments + + deployment = self._make_deployment(deployment_type) + host, port, pid, instance_name = deployment.up(model_uri) + + deployment_id = self._insert_new_deployment_in_db( + project_id, model_id, model_version, model_uri, + host, port, pid, instance_name, deployment_type + ) + + return deployment_id + + def run(self, deployment_id: int) -> None: + """Run deployment. + Args: + deployment_id {int}: deployment id + """ + + self._cursor.execute( + f'SELECT type, model_uri, status ' + f'FROM {DeployDbSchema.DEPLOYMENTS_TABLE} ' + f'WHERE id = {deployment_id}' + ) + deployment_row = self._cursor.fetchone() + + if deployment_row is None: + raise DeploymentNotFoundError(f'Deployment with ID {deployment_id} not found') + + deployment_type, model_uri, status = deployment_row + + if status == DeploymentStatus.RUNNING: + return + + deployment = self._make_deployment(deployment_type) + host, port, pid, instance_name = deployment.up(model_uri) + + self._cursor.execute( + f'UPDATE {DeployDbSchema.DEPLOYMENTS_TABLE} ' + f'SET host = %s, port = %s, status = %s, pid = %s, instance_name = %s, last_updated_at = %s ' + f'WHERE id = {deployment_id}', + (host, port, str(DeploymentStatus.RUNNING), pid, instance_name, get_rfc3339_time()) + ) + self._connection.commit() + + def stop(self, deployment_id: int) -> None: + """Stop deployment. + Args: + deployment_id {int}: deployment id + """ + + self._cursor.execute( + f'SELECT status, type, pid, instance_name ' + f'FROM {DeployDbSchema.DEPLOYMENTS_TABLE} ' + f'WHERE id = {deployment_id} AND ' + f' status <> \'{str(DeploymentStatus.DELETED)}\'' + ) + deployment_row = self._cursor.fetchone() + + if deployment_row is None: + raise DeploymentNotFoundError(f'Deployment with ID {deployment_id} not found') + + status, deployment_type, pid, instance_name = deployment_row + + if status == DeploymentStatus.STOPPED: + return + + deployment = self._make_deployment(deployment_type) + deployment.stop(pid, instance_name) + + self._cursor.execute( + f'UPDATE {DeployDbSchema.DEPLOYMENTS_TABLE} ' + f'SET status = %s, host = %s, port = %s, last_updated_at = %s ' + f'WHERE id = {deployment_id}', + (str(DeploymentStatus.STOPPED), None, None, get_rfc3339_time()) + ) + self._connection.commit() + + def delete(self, deployment_id: int) -> None: + """Delete deployment (mark as deleted. 
+ Args: + deployment_id {int}: deployment id + """ + + self.stop(deployment_id) + self._cursor.execute( + f'UPDATE {DeployDbSchema.DEPLOYMENTS_TABLE} ' + f'SET status = %s, host = %s, port = %s, last_updated_at = %s ' + f'WHERE id = {deployment_id}', + (str(DeploymentStatus.DELETED), None, None, get_rfc3339_time()) + ) + self._connection.commit() + + def predict(self, deployment_id: int, data: Text) -> requests.Response: + """Predict data on deployment. + Args: + deployment_id {int}: deployment id + data {Text}: data to predict + """ + + self._cursor.execute( + f'SELECT model_uri, host, port, type ' + f'FROM {DeployDbSchema.DEPLOYMENTS_TABLE} ' + f'WHERE id = {deployment_id} AND ' + f' status <> \'{str(DeploymentStatus.DELETED)}\'' + ) + deployment_row = self._cursor.fetchone() + + if deployment_row is None: + raise DeploymentNotFoundError(f'Deployment with ID {deployment_id} not found') + + model_uri, host, port, deployment_type = deployment_row + deployment = self._make_deployment(deployment_type) + data_is_valid, anomalies, response = deployment.predict(model_uri, host, port, data) + + self._cursor.execute( + f'INSERT INTO {DeployDbSchema.INCOMING_DATA_TABLE} ' + f'(deployment_id,incoming_data,timestamp,is_valid,anomalies) ' + f'VALUES (%s,%s,%s,%s,%s)', + (deployment_id, data, get_utc_timestamp(), int(data_is_valid), json.dumps(anomalies)) + ) + self._connection.commit() + + if not data_is_valid: + raise BadInputDataSchemaError({'anomalies': anomalies}) + + return response + + def list(self) -> List[Dict]: + """Get list of deployments info. + Returns: + List[Dict] + """ + + self._cursor.execute( + f'SELECT id, project_id, model_id, version, model_uri, ' + f'type, created_at, instance_name, status, host, port ' + f'FROM {DeployDbSchema.DEPLOYMENTS_TABLE} ' + f'WHERE status <> \'{str(DeploymentStatus.DELETED)}\'' + ) + rows = self._cursor.fetchall() + deployments = [] + + for row in rows: + deployments.append({ + 'id': str(row[0]), + 'project_id': str(row[1]), + 'model_id': row[2], + 'version': row[3], + 'model_uri': row[4], + 'type': row[5], + 'created_at': row[6], + 'status': row[8], + 'host': row[9], + 'port': str(row[10]) if row[10] is not None else row[10] + }) + + return deployments + + def ping(self, deployment_id: int) -> bool: + """Ping deployment. 
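+        Returns False without issuing an HTTP request if the stored status
+        of the deployment is not 'running'.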
+ Args: + deployment_id {int}: deployment id + Returns: + True deployment is available by http, otherwise False + """ + + self._cursor.execute( + f'SELECT host, port, status, type ' + f'FROM {DeployDbSchema.DEPLOYMENTS_TABLE} ' + f'WHERE id = {deployment_id}' + ) + deployment_row = self._cursor.fetchone() + + if deployment_row is None: + raise DeploymentNotFoundError(f'Deployment with ID {deployment_id} not found') + + host, port, status, deployment_type = deployment_row + + if status != DeploymentStatus.RUNNING: + return False + + deployment = self._make_deployment(deployment_type) + + return deployment.ping(host, port) + + def deployment_schema(self, deployment_id: int) -> Dict: + + self._cursor.execute( + f'SELECT model_uri, type ' + f'FROM {DeployDbSchema.DEPLOYMENTS_TABLE} ' + f'WHERE id = {deployment_id} AND status <> \'{str(DeploymentStatus.DELETED)}\'' + ) + deployment_row = self._cursor.fetchone() + + if not deployment_row: + raise DeploymentNotFoundError(f'Deployment with ID {deployment_id} not found') + + model_uri, deployment_type = deployment_row + deployment = self._make_deployment(deployment_type) + + return deployment.schema(model_uri) + + def check_and_update_deployments_statuses(self) -> None: + """Check if deployment status. + If status "running" is not confirmed, change status to "stopped" + """ + + self._cursor.execute( + f'SELECT id, host, port FROM {DeployDbSchema.DEPLOYMENTS_TABLE} ' + f'WHERE status = \'{str(DeploymentStatus.RUNNING)}\'' + ) + running_local_deployments = self._cursor.fetchall() + + for deployment_id, host, port in running_local_deployments: + + try: + requests.get( + url=f'http://{host}:{port}/ping', + timeout=self._GCP_INSTANCE_CONNECTION_TIMEOUT // 5 + ) + except requests.exceptions.ConnectionError: + self._cursor.execute( + f'UPDATE {DeployDbSchema.DEPLOYMENTS_TABLE} ' + f'SET host = %s, port = %s, status = %s ' + f'WHERE id = {deployment_id}', + (None, None, str(DeploymentStatus.STOPPED)) + ) + + self._connection.commit() + + def _insert_new_deployment_in_db( + self, project_id: int, model_id: Text, model_version: Text, model_uri: Text, + host: Text, port: int, pid: int, instance_name: Text, deployment_type: Text + ) -> int: + """Insert new deployment record in database. + Args: + project_id {int}: project id + model_id {Text}: model id (name) + model_version {Text}: model version + model_uri {Text}: path to model package + host {Text}: host address + port {int}: port number + pid {int}: deployment process number + instance_name {Text}: name of instance + deployment_type {Text}: deployment type + Returns: + int: id of insert deployment record + Notes: + * pid: is some positive integer in case of local deployment + and -1, if deployment type is remote; + * instance_name: is empty string for local and non-empty + string (name of remote virtual machine) for remote deployment. 
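+        Example:
+            Illustrative call for a local deployment (all values are hypothetical):
+
+                deployment_id = self._insert_new_deployment_in_db(
+                    project_id=1, model_id='IrisLogregModel', model_version='1',
+                    model_uri='./tests/integration/base/model',
+                    host='0.0.0.0', port=5000, pid=12345, instance_name='',
+                    deployment_type='local'
+                )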
+ """ + # pylint: disable=too-many-arguments + + creation_datetime = get_rfc3339_time() + self._cursor.execute( + f'INSERT INTO {DeployDbSchema.DEPLOYMENTS_TABLE} ' + f'(project_id, model_id, version, model_uri, host, port, ' + f'pid, instance_name, type, created_at, last_updated_at, status) ' + f'VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ' + f'RETURNING id', + ( + project_id, model_id, model_version, model_uri, + host, port, pid, instance_name, deployment_type, creation_datetime, + creation_datetime, str(DeploymentStatus.RUNNING) + ) + ) + deployment_id = self._cursor.fetchone()[0] + + self._connection.commit() + + return deployment_id + + @staticmethod + def _make_deployment(deployment_type): + + if deployment_type == DeploymentType.LOCAL: + deployment_config = get_local_deployment_config() + return LocalDeployment(**deployment_config) + + elif deployment_type == DeploymentType.GCP: + # TODO: think about config.get_gcp_confgi() -> GCP_CONFIG + deployment_config = get_gcp_deployment_config() + return GCPDeployment(**deployment_config) + + else: + raise InvalidDeploymentType(f'Invalid deployment type: {deployment_type}') + diff --git a/services/deploy/src/deployments/utils.py b/services/deploy/src/deployments/utils.py index fd8a032..d84dd24 100644 --- a/services/deploy/src/deployments/utils.py +++ b/services/deploy/src/deployments/utils.py @@ -2,8 +2,56 @@ # pylint: disable=wrong-import-order +try: + from google.cloud import storage + from google.protobuf.json_format import MessageToDict +except ImportError: + pass + +import os +import pandas as pd +from pandas.io.json import build_table_schema import requests -from typing import Text +import time + +try: + import tensorflow_data_validation as tfdv + from tensorflow_metadata.proto.v0.statistics_pb2 import DatasetFeatureStatisticsList + from tensorflow_metadata.proto.v0.schema_pb2 import Schema +except ImportError: + pass + +from typing import Dict, NewType, Text, Tuple + +from deploy.src import config + + +if 'DatasetFeatureStatisticsList' not in dir(): + DatasetFeatureStatisticsList = NewType('DatasetFeatureStatisticsList', object) + +if 'Schema' not in dir(): + Schema = NewType('Schema', object) + + +class BadInputDataSchemaError(Exception): + """Bad input data schema""" + + +class IncorrectDataError(Exception): + """Incorrect data""" + + +TFDV_PANDAS_TYPES = { + 'INT': 'integer', + 'FLOAT': 'number', + 'STRING': 'string', + 'BOOL': 'bool' +} + +PANDAS_TFDV_TYPES = { + pandas_type: tfdv_type + for tfdv_type, pandas_type in TFDV_PANDAS_TYPES.items() +} def mlflow_model_predict(host: Text, port: int, data: Text) -> requests.Response: @@ -23,3 +71,506 @@ def mlflow_model_predict(host: Text, port: int, data: Text) -> requests.Response data=data ) return predict_resp + + +def schema_file_exists(schema_file: Text) -> bool: + """ + Check if schema file exists. 
+ Args: + schema_file {Text}: path to schema file + Returns: + True if schema file exists, otherwise False + """ + + if schema_file.startswith('gs://'): + bucket, *schema_file_path_parts = schema_file.strip('gs://').split('/') + schema_file_path = '/'.join(schema_file_path_parts) + client = storage.Client() + bucket = client.get_bucket(bucket) + schema_file_blob = bucket.blob(schema_file_path) + + return schema_file_blob.exists() + + if os.path.exists(schema_file): + return True + + +def get_schema_file_name() -> Text: + """ + Returns: + Text: get schema filename + """ + + # TODO (Alex): remove hardcoded schema name + SCHEMA_NAME = 'stats.tfdv' + + return SCHEMA_NAME + + +def get_schema_file_path(model_uri: Text) -> Text: + """ + Get schema file full path. + Args: + model_uri {Text}: full path to model + Returns: + Text: full path to schema + """ + + schema_path = os.path.join(os.path.dirname(model_uri), get_schema_file_name()) + + return schema_path + + +def download_blob(blob_full_path: Text, destination_file_name: Text) -> None: + """Downloads a blob from the bucket. + Args: + blob_full_path {Text}: full path to blob (gs:///path/to/file) + destination_file_name {Text}: destination to download file + """ + + blob_full_path = blob_full_path.strip('gs://') + bucket_name, *blob_path_parts = blob_full_path.split('/') + source_blob_name = '/'.join((blob_path_parts)) + + storage_client = storage.Client() + bucket = storage_client.get_bucket(bucket_name) + blob = bucket.blob(source_blob_name) + + blob.download_to_filename(destination_file_name) + + +def read_tfdv_statistics(schema_path: Text) -> DatasetFeatureStatisticsList: + """ + Read TensorFlow data validation (TFDV) statistics. + Args: + schema_file {Text}: path to TFDV statistics file + Returns: + DatasetFeatureStatisticsList: TFDV statistics + """ + + schema_file = schema_path + + if schema_path.startswith('gs://'): + dest_folder = f'/tmp/{time.time()}' + schema_file = os.path.join(dest_folder, get_schema_file_name()) + os.makedirs(os.path.dirname(dest_folder), exist_ok=True) + download_blob(schema_path, schema_path) + + with open(schema_file, 'rb') as inp_stats: + string_stats = inp_stats.read() + statistics = DatasetFeatureStatisticsList().FromString(string_stats) + + return statistics + + +def tfdv_object_to_dict(tfdv_object: object) -> Dict: + """ + Convert TFDV object to dictionary. + Args: + tfdv_object {DatasetFeatureStatisticsList}: TFDV object + Returns: + Dict: dictionary + """ + + return MessageToDict(tfdv_object) + + +def get_numeric_features_intervals(statistics: DatasetFeatureStatisticsList) -> Dict[str, Tuple]: + """ + Get numeric features intervals. + Args: + statistics {DatasetFeatureStatisticsList}: TFDV statistics. + Returns: + Dict[str, Tuple]: dictionary with structure: + { + : (min_value, max_value) + } + """ + + statistics_dict = tfdv_object_to_dict(statistics) + features = statistics_dict['datasets'][0]['features'] + features_intervals = {} + + for ft in features: + + if ft['type'] in ['INT', 'FLOAT']: + ft_num_stats = ft['numStats'] + ft_interval = ( + ft_num_stats['min'], + ft_num_stats['max'], + ) + features_intervals[ft['path']['step'][0]] = ft_interval + + return features_intervals + + +def get_pandas_df_schema(df: pd.DataFrame) -> Dict[Text, Text]: + """ + Get dataframe schema using pandas.io.json.build_table_schema. 
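+    For the iris dataframe, for instance, the result looks like
+    {'sepal_length': 'number', 'sepal_width': 'number', 'petal_length': 'number',
+     'petal_width': 'number', 'species': 'string'} (illustrative values).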
+ Args: + df {pandas.DataFrame}: dataframe + Returns: + Dict[Text, Text]: dictionary with structure: + { + : + } + """ + + return {f['name']: f['type'] for f in build_table_schema(df, index=False)['fields']} + + +def tfdv_schema_to_dict(schema: Schema) -> Dict[Text, Text]: + """ + Convert TFDV schema to flat dictionary. + Args: + schema {tensorflow_metadata.proto.v0.schema_pb2.Schema}: TFDV schema + Returns: + Dict[Text, Text]: dictionary with structure: + { + : + } + """ + + raw_schema = MessageToDict(schema) + schema_dict = {} + + for ft in raw_schema['feature']: + schema_dict[ft['name']] = TFDV_PANDAS_TYPES[ft.get('type', 'INT')] + + return schema_dict + + +def load_data(data: Text) -> pd.DataFrame: + """ + Load data to dataframe from string. + Args: + data {Text}: text json string convertible to pandas dataframe, orient=table + Returns: + pandas.DataFrame: dataframe + """ + + df = pd.read_json(data, orient='table') + + return df + + +def tfdv_pandas_schemas_anomalies(tfdv_schema: Schema, + df_schema: Dict[Text, Text]) -> Tuple[bool, Dict]: + """ + Compare TFDV and pandas schemas - compare and check column names and types. + Args: + schema {tensorflow_metadata.proto.v0.schema_pb2.Schema}: TFDV schema + df_schema {Dict[Text, Text]}: dictified pandas schema + Returns: + Tuple[bool, Dict]: + True if anomalies are detected, otherwise False + dictionary with structure: + { + : { + 'description': , + 'severity': 'ERROR', + 'shortDescription': , + 'reason': [{'type': , + 'shortDescription': , + 'description': }], + 'path': {'step': []} + } + } + """ + + tfdv_dictified_schema = tfdv_schema_to_dict(tfdv_schema) + required_columns = set(tfdv_dictified_schema.keys()) + existing_columns = set(df_schema.keys()) + columns = required_columns.union(existing_columns) + anomalies = {} + + for col in columns: + + if col not in existing_columns: + + anomalies[col] = { + 'description': 'The feature was present in fewer examples than expected.', + 'severity': 'ERROR', + 'shortDescription': 'Column dropped', + 'reason': [{'type': 'FEATURE_TYPE_LOW_FRACTION_PRESENT', + 'shortDescription': 'Column dropped', + 'description': 'The feature was present in fewer examples than expected.'}], + 'path': {'step': [col]} + } + + continue + + if col not in required_columns: + + anomalies[col] = { + 'description': 'New column (column in data but not in schema)', + 'severity': 'ERROR', + 'shortDescription': 'New column', + 'reason': [{'type': 'SCHEMA_NEW_COLUMN', + 'shortDescription': 'New column', + 'description': 'New column (column in data but not in schema)'}], + 'path': {'step': [col]} + } + + continue + + required_type = tfdv_dictified_schema[col] + real_type = df_schema[col] + + if required_type != real_type: + + short_description = f'Expected data of type: {PANDAS_TFDV_TYPES[required_type]} ' \ + f'but got {PANDAS_TFDV_TYPES[required_type]}' + anomalies[col] = { + 'description': '', + 'severity': 'ERROR', + 'shortDescription': short_description, + 'reason': [{'type': 'UNKNOWN_TYPE', + 'shortDescription': short_description, + 'description': ''}], + 'path': {'step': [col]} + } + + return len(anomalies) > 0, anomalies + + +def data_intervals_anomalies(df: pd.DataFrame, + statistics: DatasetFeatureStatisticsList) -> Tuple[bool, Dict]: + """ + Check if values of each column are between min and max value of the same column from schema. 
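+    Only numeric columns (dtype other than object) that are present in the stored
+    statistics are checked; the column minimum and maximum are compared against
+    the interval recorded in the schema.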
+ Args: + df {pandas.DataFrame}: dataframe + statistics {DatasetFeatureStatisticsList}: TFDV statistics + Returns: + Tuple[bool, Dict]: + True if anomalies are detected, otherwise False, + dictionary with structure: + { + : { + 'description': '', + 'severity': 'ERROR', + 'shortDescription': , + 'reason': [{'type': 'UNKNOWN_TYPE', + 'shortDescription': ]} + } + } + """ + + features_intervals = get_numeric_features_intervals(statistics) + anomalies = {} + + for col in df.columns: + + if col in features_intervals and df[col].dtype.name != 'object': + + df_col_min, df_col_max = df[col].min().item(), df[col].max().item() + schema_col_min, schema_col_max = features_intervals[col] + + if type(df_col_min) == type(schema_col_min) and df_col_min < schema_col_min: + + short_description = f'Min value {df_col_min} is less ' \ + f'schema min value {schema_col_min}' + + anomalies[col] = { + 'description': '', + 'severity': 'ERROR', + 'shortDescription': short_description, + 'reason': [{'type': 'UNKNOWN_TYPE', + 'shortDescription': short_description, + 'description': ''}], + 'path': {'step': [col]} + } + + if type(df_col_max) == type(schema_col_max) and df_col_max > schema_col_max: + + short_description = f'Max value {df_col_max} is more ' \ + f'schema max value {schema_col_max}' + anomalies[col] = { + 'description': '', + 'severity': 'ERROR', + 'shortDescription': short_description, + 'reason': [{'type': 'UNKNOWN_TYPE', + 'shortDescription': short_description, + 'description': ''}], + 'path': {'step': [col]} + } + + return len(anomalies) > 0, anomalies + + +def tfdv_statistics_anomalies(source_statistics: DatasetFeatureStatisticsList, + input_statistics: DatasetFeatureStatisticsList) -> Tuple[bool, Dict]: + """ + Compare two TDFV statistics and return anomalies. + Args: + source_statistics {tensorflow_metadata.proto.v0.statistics_pb2.DatasetFeatureStatisticsList}: source statistics + input_statistics {tensorflow_metadata.proto.v0.statistics_pb2.DatasetFeatureStatisticsList}: input statistics + Returns: + Tuple[bool, Dict]: + True if anomalies are detected, otherwise False, + dictionary with structure: + { + : { + 'description': , + 'severity': 'ERROR', + 'shortDescription': , + 'reason': [{'type': , + 'shortDescription': , + 'description': }], + 'path': {'step': []} + } + } + """ + + schema = tfdv.infer_schema(source_statistics) + anomalies = tfdv_object_to_dict( + tfdv.validate_statistics(statistics=input_statistics, schema=schema) + ) + tfdv_anomalies = anomalies.get('anomalyInfo', {}) + + return len(tfdv_anomalies) > 0, tfdv_anomalies + + +def tfdv_and_additional_anomalies(df: pd.DataFrame, + tfdv_statistics: DatasetFeatureStatisticsList) -> Tuple[bool, Dict]: + """ + Get TFDV and additional anomalies. 
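+    Interval anomalies are computed only when the environment variable
+    CHECK_NUMERIC_INTERVALS_ON_PREDICT is set to 'true'; otherwise only
+    TFDV statistics anomalies are reported.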
+ Args: + df {pandas.DataFrame}: dataframe + tfdv_statistics {tensorflow_metadata.proto.v0.statistics_pb2.DatasetFeatureStatisticsList}: TFDV statistics + Returns: + Tuple[bool, Dict]: + True if anomalies are detected, otherwise False, + dictionary with structure: + { + : { + 'description': , + 'severity': 'ERROR', + 'shortDescription': , + 'reason': [{'type': , + 'shortDescription': , + 'description': }], + 'path': {'step': []} + } + } + """ + + df_statistics = tfdv.generate_statistics_from_dataframe(df) + interval_anomalies_detected = False + tfdv_anomalies_detected, tfdv_anomalies = tfdv_statistics_anomalies( + tfdv_statistics, df_statistics + ) + interval_anomalies = {} + + if len(tfdv_anomalies) > 0: + tfdv_anomalies_detected = True + + if os.getenv('CHECK_NUMERIC_INTERVALS_ON_PREDICT') == 'true': + interval_anomalies_detected, interval_anomalies = data_intervals_anomalies(df, tfdv_statistics) + + anomalies_detected = tfdv_anomalies_detected or interval_anomalies_detected + anomalies = {**tfdv_anomalies, **interval_anomalies} + + return anomalies_detected, anomalies + + +def pandas_schema_anomalies(df: pd.DataFrame, + tfdv_statistics: DatasetFeatureStatisticsList) -> Tuple[bool, Dict]: + """ + Get dataframe schema anomalies comparing pandas and TFDV schema. + Args: + df {pandas.DataFrame}: dataframe + tfdv_statistics {tensorflow_metadata.proto.v0.statistics_pb2.DatasetFeatureStatisticsList}: TFDV statistics + Returns: + Tuple[bool, Dict]: + True if anomalies are detected, otherwise False, + dictionary with structure: + { + : { + 'description': , + 'severity': 'ERROR', + 'shortDescription': , + 'reason': [{'type': , + 'shortDescription': , + 'description': }], + 'path': {'step': []} + } + } + """ + + tfdv_schema = tfdv.infer_schema(tfdv_statistics) + pandas_schema = get_pandas_df_schema(df) + + return tfdv_pandas_schemas_anomalies(tfdv_schema, pandas_schema) + + +def validate_data(data: Text, schema_file_path: Text) -> Tuple[bool, Dict]: + """ + Validate data sent for prediction. + Args: + data {Text}: json string which can be loaded by panda.read_json(_, orient='split') + schema_file_path {Text}: path schema file + Returns: + Tuple[bool, Dict]: + True if data is valid, otherwise False, + dictionary with structure: + { + : { + 'description': , + 'severity': 'ERROR', + 'shortDescription': , + 'reason': [{'type': , + 'shortDescription': , + 'description': }], + 'path': {'step': []} + } + } + """ + + if not schema_file_exists(schema_file_path): + return True, {} + + tfdv_statistics = read_tfdv_statistics(schema_file_path) + + try: + df = load_data(data) + except Exception as e: + raise BadInputDataSchemaError( + f'Bad input data schema, pandas cannot load data, details: {str(e)}') + + if df.shape[0] >= int(os.getenv('BIG_DATASET_MIN_SIZE', 10e7)): + validate_func = tfdv_and_additional_anomalies + else: + validate_func = pandas_schema_anomalies + + anomalies_detected, anomalies = validate_func(df, tfdv_statistics) + + return not anomalies_detected, anomalies + + +def predict_data_to_mlflow_data_format(data: Text) -> Text: + """ + Convert data sent for prediction to format usable by MLflow model server. 
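+    For example, a single-row table-oriented payload with sepal_length=5.1,
+    sepal_width=3.5, petal_length=1.4 and petal_width=0.2 is converted to the
+    string '[[5.1, 3.5, 1.4, 0.2]]' (illustrative values).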
+ Args: + data {Text}: json string which can be loaded by panda.read_json(_, orient='split') + Returns: + Text: pandas-records string + """ + + df = load_data(data) + data_in_mlflow_format = str(df.to_numpy().tolist()) + + return data_in_mlflow_format + + +def get_local_deployment_config() -> Dict: + + return {} + + +def get_gcp_deployment_config() -> Dict: + + return config.Config().get_gcp_config() + diff --git a/services/deploy/src/routers/deployments.py b/services/deploy/src/routers/deployments.py index ac5e864..6549a39 100644 --- a/services/deploy/src/routers/deployments.py +++ b/services/deploy/src/routers/deployments.py @@ -4,13 +4,23 @@ from fastapi import APIRouter, Form from http import HTTPStatus +import pandas as pd +import psycopg2 from starlette.responses import JSONResponse, Response + +try: + import tensorflow_data_validation as tfdv +except ImportError: + pass + from typing import Text -from common.utils import error_response, is_model, ModelDoesNotExistError -from deploy.src import config -from deploy.src.deployments import DeployManager -from deploy.src.deployments import DeploymentNotFoundError +from common.utils import error_response +from deploy.src.config import Config +from deploy.src.deployments.manager import DeploymentNotFoundError, DeployDbSchema, DeployManager +from deploy.src.deployments.utils import get_schema_file_path, read_tfdv_statistics,\ + tfdv_object_to_dict, load_data, schema_file_exists, tfdv_statistics_anomalies,\ + data_intervals_anomalies router = APIRouter() # pylint: disable=invalid-name @@ -35,9 +45,6 @@ def create_and_run_deployment( """ deploy_manager = DeployManager() - if not is_model(model_uri): - raise ModelDoesNotExistError(f'Model {model_uri} does not exist or is not MLflow model') - deployment_id = deploy_manager.create_deployment( project_id, model_id, version, model_uri, type ) @@ -88,7 +95,7 @@ def predict(deployment_id: int, data: Text = Form(...)) -> JSONResponse: if response.status_code != HTTPStatus.OK: return error_response( http_response_code=response.status_code, - message=response.json().message() + message=response.json().get('message') ) return JSONResponse({'prediction': response.text}) @@ -155,3 +162,100 @@ def ping(deployment_id: int) -> Response: return Response(status_code=HTTPStatus.OK) else: return Response(status_code=HTTPStatus.BAD_REQUEST) + + +@router.get('/deployments/{deployment_id}/schema') +def get_deployment_data_schema(deployment_id: int) -> JSONResponse: + """Get deployment data schema. 
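+    Returns the TFDV statistics stored next to the deployed model as a JSON
+    dictionary, or an empty dictionary if no schema file was logged for the model.
+    Example request, assuming the service listens on port 9000 as in startup.sh:
+
+        curl http://0.0.0.0:9000/deployments/1/schema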
+ Args: + deployment_id {int}: deployment id + Returns: + starlette.responses.JSONResponse + """ + + deploy_manager = DeployManager() + deployment_schema = deploy_manager.deployment_schema(deployment_id) + + return JSONResponse(deployment_schema) + + +@router.get('/deployments/{deployment_id}/validation-report') +def get_validation_report(deployment_id: int, + timestamp_from: float, + timestamp_to: float) -> JSONResponse: + + conf = Config() + connection = psycopg2.connect( + database=conf.get('PROJECTS_DB_NAME'), + host=conf.get('DB_HOST'), + port=conf.get('DB_PORT'), + user=conf.get('DB_USER'), + password=conf.get('DB_PASSWORD') + ) + cursor = connection.cursor() + + cursor.execute( + f'SELECT model_uri FROM {DeployDbSchema.DEPLOYMENTS_TABLE} ' + f'WHERE id = {deployment_id}' + ) + + try: + model_uri = cursor.fetchone()[0] + except TypeError: + raise DeploymentNotFoundError(f'Deployment with ID {deployment_id} not found') + + cursor.execute( + f'SELECT incoming_data FROM {DeployDbSchema.INCOMING_DATA_TABLE} ' + f'WHERE deployment_id = {deployment_id} AND ' + f' timestamp >= {timestamp_from} AND timestamp <= {timestamp_to}' + ) + + schema_file_path = get_schema_file_path(model_uri) + + if not schema_file_exists(schema_file_path): + return JSONResponse({}) + + tfdv_statistics = read_tfdv_statistics(schema_file_path) + tfdv_statistics_dict = tfdv_object_to_dict(tfdv_statistics) + + data_batches = cursor.fetchall() + dataframes = [] + + for batch in data_batches: + df = load_data(batch[0]) + dataframes.append(df) + + if len(dataframes) == 0: + return JSONResponse({}) + + incoming_data_df = pd.concat(dataframes, ignore_index=False) + + """ + Convert object columns to string to avoid errors when trying to generate TFDV + statistics. If column contains string and numeric values it's type is object, but + the column still contains values of different types. 
And TFDV tries to convert string + value to numeric (integer, float) => error + """ + object_columns = incoming_data_df.select_dtypes(include='object').columns.tolist() + incoming_data_df[object_columns] = incoming_data_df[object_columns].astype('str') + + incoming_data_statistics = tfdv.generate_statistics_from_dataframe(incoming_data_df) + incoming_data_statistics_dict = tfdv_object_to_dict(incoming_data_statistics) + + tfdv_anomalies_detected, tfdv_anomalies = tfdv_statistics_anomalies( + tfdv_statistics, incoming_data_statistics + ) + interval_anomalies_detected, interval_anomalies = data_intervals_anomalies( + incoming_data_df, tfdv_statistics + ) + anomalies_detected = tfdv_anomalies_detected or interval_anomalies_detected + anomalies = {**tfdv_anomalies, **interval_anomalies} + + return JSONResponse({ + 'timestamp_from': timestamp_from, + 'timestamp_to': timestamp_to, + 'model_statistics': tfdv_statistics_dict, + 'incoming_data_statistics': incoming_data_statistics_dict, + 'anomalies_detected': anomalies_detected, + 'anomalies_info': anomalies + }) diff --git a/services/deploy/src/utils.py b/services/deploy/src/utils.py index 191a814..27e6843 100644 --- a/services/deploy/src/utils.py +++ b/services/deploy/src/utils.py @@ -2,7 +2,15 @@ # pylint: disable=wrong-import-order +from google.cloud import storage +import os import socket +from typing import Text + +from deploy.src.config import Config + + +conf = Config() def get_free_tcp_port() -> int: @@ -17,3 +25,51 @@ def get_free_tcp_port() -> int: tcp.close() return port + + +def upload_blob_to_gs(bucket_name: Text, source_file_name: Text, + destination_blob_name: Text) -> None: + """ + Upload file to Google Cloud Storage. + Args: + bucket_name {Text}: bucket name + source_file_name {Text}: name of file to upload + destination_blob_name {Text}: blob name in the storage + """ + + storage_client = storage.Client() + bucket = storage_client.bucket(bucket_name) + blob = bucket.blob(destination_blob_name) + + blob.upload_from_filename(source_file_name) + + +def local_model_uri_to_gs_blob(model_uri: Text) -> Text: + """ + Get gs blob name for local model. + Args: + model_uri {Text}: model uri + Returns: + Text: model blob name + """ + + return os.path.join( + 'mlpanel/cache/models', model_uri.replace(conf.get('WORKSPACE'), '').strip('/') + ) + + +def upload_local_mlflow_model_to_gs(model_uri: Text) -> None: + """ + Upload local mlflow model to Google Cloud Storage. + Args: + model_uri {Text}: model uri + """ + + bucket_name = conf.get('bucket') + model_blob = local_model_uri_to_gs_blob(model_uri) + + for file in os.listdir(model_uri): + + source_file_name = os.path.join(model_uri, file) + destination_blob_name = os.path.join(model_blob, file) + upload_blob_to_gs(bucket_name, source_file_name, destination_blob_name) diff --git a/services/deploy/startup.sh b/services/deploy/startup.sh new file mode 100755 index 0000000..eb5f897 --- /dev/null +++ b/services/deploy/startup.sh @@ -0,0 +1,7 @@ +sudo chown user $WORKSPACE + +while ! pg_isready -h $DB_HOST -p $DB_PORT; do + echo "Wait for db server..." 
&& sleep 1s; +done; + +uvicorn --host 0.0.0.0 --port 9000 src.app:app --reload \ No newline at end of file diff --git a/services/deploy/tests/model/MLmodel b/services/deploy/tests/integration/base/model/MLmodel similarity index 100% rename from services/deploy/tests/model/MLmodel rename to services/deploy/tests/integration/base/model/MLmodel diff --git a/services/deploy/tests/model/conda.yaml b/services/deploy/tests/integration/base/model/conda.yaml similarity index 100% rename from services/deploy/tests/model/conda.yaml rename to services/deploy/tests/integration/base/model/conda.yaml diff --git a/services/deploy/tests/model/model.pkl b/services/deploy/tests/integration/base/model/model.pkl similarity index 100% rename from services/deploy/tests/model/model.pkl rename to services/deploy/tests/integration/base/model/model.pkl diff --git a/services/deploy/tests/test_app.py b/services/deploy/tests/integration/base/test_app.py similarity index 68% rename from services/deploy/tests/test_app.py rename to services/deploy/tests/integration/base/test_app.py index 1ba43c3..64e28ec 100644 --- a/services/deploy/tests/test_app.py +++ b/services/deploy/tests/integration/base/test_app.py @@ -1,7 +1,9 @@ +import json import pytest import re import shutil from starlette.testclient import TestClient +import time from deploy.src import app, config @@ -9,15 +11,20 @@ @pytest.fixture(scope='module') def client(): - config.WORKSPACE = 'tmp_ws' app.init() return TestClient(app.app) +@pytest.fixture(scope='module') +def deployment_run_timeout(): + + return 20 + + def teardown_module(): - shutil.rmtree(config.WORKSPACE, ignore_errors=True) + shutil.rmtree(config.Config().get('WORKSPACE'), ignore_errors=True) # Tests on default (just created = empty) database @@ -78,7 +85,7 @@ def test_create_deployment(client): 'project_id': 1, 'model_id': 'IrisLogregModel', 'version': '1', - 'model_uri': './tests/model', + 'model_uri': './tests/integration/base/model', 'type': 'local' } ) @@ -98,6 +105,40 @@ def test_create_deployment(client): assert re.match(r'''^([\s\d]+)$''', deployment.get('port', '')) is not None +def test_predict(client, deployment_run_timeout): + + deployment_is_running = False + start = time.time() + + while not deployment_is_running: + + ping_resp = client.get('/deployments/1/ping') + + if ping_resp.status_code == 200: + deployment_is_running = True + + if time.time() - start > deployment_run_timeout: + break + + predict_resp = client.post( + '/deployments/1/predict', + data={ + 'data': '{"schema": {"fields":[{"name":"index","type":"integer"},' + '{"name":"sepal_length","type":"number"},{"name":"sepal_width","type":"number"},' + '{"name":"petal_length","type":"number"},{"name":"petal_width","type":"number"}],' + '"primaryKey":["index"],"pandas_version":"0.20.0"}, ' + '"data": [{"index":0,"sepal_length":5.1,"sepal_width":3.5,"petal_length":1.4,' + '"petal_width":0.2},{"index":1,"sepal_length":4.9,"sepal_width":3.0,"petal_length":1.4,' + '"petal_width":0.2},{"index":2,"sepal_length":4.7,"sepal_width":3.2,"petal_length":1.3,' + '"petal_width":0.2},{"index":3,"sepal_length":4.6,"sepal_width":3.1,"petal_length":1.5,' + '"petal_width":0.2}]}' + } + ) + + assert predict_resp.status_code == 200 + assert len(json.loads(predict_resp.json()['prediction'])) == 4 + + # # POST /deployments def test_create_bad_type_deployment(client): @@ -107,12 +148,12 @@ def test_create_bad_type_deployment(client): 'project_id': 1, 'model_id': 'IrisLogregModel', 'version': '1', - 'model_uri': './tests/model', + 'model_uri': 
'./tests/integration/model_without_schema/model', 'type': 'unknown' } ) - assert create_response.status_code == 404 + assert create_response.status_code == 400 assert create_response.json().get('message') == 'Invalid deployment type: unknown' @@ -148,3 +189,4 @@ def test_delete_deployment(client): assert get_response.status_code == 404 assert get_response.json().get('message') == 'Deployment with ID 1 not found' + diff --git a/services/deploy/tests/test.sh b/services/deploy/tests/test.sh index 9fe1f4c..eb0af1a 100755 --- a/services/deploy/tests/test.sh +++ b/services/deploy/tests/test.sh @@ -1,8 +1,59 @@ -#!/usr/bin/env bash +#!/usr/bin/env sh # Run tests in docker + +realpath() { + OURPWD=$PWD + cd "$(dirname "$1")" + LINK=$(readlink "$(basename "$1")") + while [ "$LINK" ]; do + cd "$(dirname "$LINK")" + LINK=$(readlink "$(basename "$1")") + done + REALPATH="$PWD/$(basename "$1")" + cd "$OURPWD" + echo "$REALPATH" +} + KEYS=$1 +CURDIR=$(dirname $(realpath "$0")) +NETWORK=mlpanel-test-projects-network +DEPLOY_DB_NAME=deploy_db +POSTGRES_USER=user +POSTGRES_PASSWORD=passw +DB_HOST=mlpanel-database-test +DB_PORT=5432 + +docker network rm $NETWORK +docker network create -d bridge --internal $NETWORK -docker run -v $(pwd):/home/deploy \ - -v $(pwd)/../../common:/home/common \ +echo "Up PostgreSQL..." +docker run \ + -d \ + --network=$NETWORK \ + --name=$DB_HOST \ + -e POSTGRES_PASSWORD=passw \ + -e POSTGRES_USER=user \ + -e PGDATA=/var/lib/postgresql/data/pgdata \ + --rm \ + --expose $DB_PORT \ + postgres + +echo "Wait for postgres server..." +docker exec \ + $DB_HOST \ + /bin/bash -c \ + 'while ! pg_isready; do sleep 1; done;' + +echo "Run tests..." +docker run --network=$NETWORK \ + -v $CURDIR/..:/home/deploy \ + -v $CURDIR/../../../common:/home/common \ -e WORKSPACE=/home/workspace \ - mlpanel-deploy /bin/bash -c "pytest $KEYS tests/" \ No newline at end of file + -e DEPLOY_DB_NAME=$DEPLOY_DB_NAME \ + -e POSTGRES_USER=$POSTGRES_USER \ + -e POSTGRES_PASSWORD=$POSTGRES_PASSWORD \ + -e DB_HOST=$DB_HOST \ + -e DB_PORT=$DB_PORT \ + mlrepa/mlpanel-base-deploy:latest /bin/bash -c "pytest $KEYS tests/" + +docker stop $DB_HOST \ No newline at end of file diff --git a/services/projects/.dockerignore b/services/projects/.dockerignore new file mode 100644 index 0000000..11b9b62 --- /dev/null +++ b/services/projects/.dockerignore @@ -0,0 +1 @@ +examples/venv/ \ No newline at end of file diff --git a/services/projects/Dockerfile b/services/projects/Dockerfile deleted file mode 100644 index 50f31bd..0000000 --- a/services/projects/Dockerfile +++ /dev/null @@ -1,44 +0,0 @@ -FROM python:3.7 - -RUN pip install aiofiles==0.4.0 \ - email-validator==1.0.5 \ - fastapi==0.42.0 \ - flask==1.1.1 \ - flask-cors==3.0.8 \ - google-api-python-client==1.7.9 \ - google-cloud-storage==1.20.0 \ - jinja2==2.10.3 \ - mlflow==1.4.0 \ - oauth2client==3.0.0 \ - psutil==5.6.2 \ - pytest==5.2.2 \ - python-multipart==0.0.5 \ - pyyaml==5.1.2 \ - uvicorn==0.10.8 - - -RUN apt-get update && \ - apt-get install -y sudo && \ - useradd -m user -u 1000 && \ - echo 'user:user' | chpasswd user && \ - echo "user ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/user && \ - chmod 0440 /etc/sudoers.d/user && \ - chown -R user /home - -RUN curl https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz > /tmp/google-cloud-sdk.tar.gz - -# Installing the package -RUN mkdir -p /usr/local/gcloud \ - && tar -C /usr/local/gcloud -xvf /tmp/google-cloud-sdk.tar.gz \ - && /usr/local/gcloud/google-cloud-sdk/install.sh - -# Adding the package path to local 
-ENV PATH $PATH:/usr/local/gcloud/google-cloud-sdk/bin - -USER user - -WORKDIR /home/projects - -ENV PYTHONPATH=/home - -CMD sudo chown -R user $WORKSPACE && uvicorn --host 0.0.0.0 --port 8080 src.app:app --reload \ No newline at end of file diff --git a/services/projects/README.md b/services/projects/README.md index 49c614b..cbe3ddf 100644 --- a/services/projects/README.md +++ b/services/projects/README.md @@ -1,21 +1,10 @@ # projects -## Run tests (in docker) - -```bash -chmod +x tests/test.sh -./tests/test.sh -``` - -or with any `pytest` key, for example, verbose flag: - -```bash -chmod +x tests/test.sh -./tests/test.sh -v -``` - ## Run examples +**Note!** Variable `AUTH_REQUIRED` must be `false` (in `.env`) +before running script `demo_ws.py`. + Folder *examples/* contains simple examples of client for communication with tracking servers. diff --git a/services/projects/examples/data/.gitignore b/services/projects/examples/data/.gitignore index c1200f4..26482fc 100644 --- a/services/projects/examples/data/.gitignore +++ b/services/projects/examples/data/.gitignore @@ -1,3 +1,2 @@ -iris_data.joblib -/* +labeled_iris.csv !.gitignore \ No newline at end of file diff --git a/services/projects/examples/data/iris.csv b/services/projects/examples/data/iris.csv new file mode 100644 index 0000000..6464198 --- /dev/null +++ b/services/projects/examples/data/iris.csv @@ -0,0 +1,151 @@ +sepal_length,sepal_width,petal_length,petal_width,species +5.1,3.5,1.4,0.2,setosa +4.9,3,1.4,0.2,setosa +4.7,3.2,1.3,0.2,setosa +4.6,3.1,1.5,0.2,setosa +5,3.6,1.4,0.2,setosa +5.4,3.9,1.7,0.4,setosa +4.6,3.4,1.4,0.3,setosa +5,3.4,1.5,0.2,setosa +4.4,2.9,1.4,0.2,setosa +4.9,3.1,1.5,0.1,setosa +5.4,3.7,1.5,0.2,setosa +4.8,3.4,1.6,0.2,setosa +4.8,3,1.4,0.1,setosa +4.3,3,1.1,0.1,setosa +5.8,4,1.2,0.2,setosa +5.7,4.4,1.5,0.4,setosa +5.4,3.9,1.3,0.4,setosa +5.1,3.5,1.4,0.3,setosa +5.7,3.8,1.7,0.3,setosa +5.1,3.8,1.5,0.3,setosa +5.4,3.4,1.7,0.2,setosa +5.1,3.7,1.5,0.4,setosa +4.6,3.6,1,0.2,setosa +5.1,3.3,1.7,0.5,setosa +4.8,3.4,1.9,0.2,setosa +5,3,1.6,0.2,setosa +5,3.4,1.6,0.4,setosa +5.2,3.5,1.5,0.2,setosa +5.2,3.4,1.4,0.2,setosa +4.7,3.2,1.6,0.2,setosa +4.8,3.1,1.6,0.2,setosa +5.4,3.4,1.5,0.4,setosa +5.2,4.1,1.5,0.1,setosa +5.5,4.2,1.4,0.2,setosa +4.9,3.1,1.5,0.1,setosa +5,3.2,1.2,0.2,setosa +5.5,3.5,1.3,0.2,setosa +4.9,3.1,1.5,0.1,setosa +4.4,3,1.3,0.2,setosa +5.1,3.4,1.5,0.2,setosa +5,3.5,1.3,0.3,setosa +4.5,2.3,1.3,0.3,setosa +4.4,3.2,1.3,0.2,setosa +5,3.5,1.6,0.6,setosa +5.1,3.8,1.9,0.4,setosa +4.8,3,1.4,0.3,setosa +5.1,3.8,1.6,0.2,setosa +4.6,3.2,1.4,0.2,setosa +5.3,3.7,1.5,0.2,setosa +5,3.3,1.4,0.2,setosa +7,3.2,4.7,1.4,versicolor +6.4,3.2,4.5,1.5,versicolor +6.9,3.1,4.9,1.5,versicolor +5.5,2.3,4,1.3,versicolor +6.5,2.8,4.6,1.5,versicolor +5.7,2.8,4.5,1.3,versicolor +6.3,3.3,4.7,1.6,versicolor +4.9,2.4,3.3,1,versicolor +6.6,2.9,4.6,1.3,versicolor +5.2,2.7,3.9,1.4,versicolor +5,2,3.5,1,versicolor +5.9,3,4.2,1.5,versicolor +6,2.2,4,1,versicolor +6.1,2.9,4.7,1.4,versicolor +5.6,2.9,3.6,1.3,versicolor +6.7,3.1,4.4,1.4,versicolor +5.6,3,4.5,1.5,versicolor +5.8,2.7,4.1,1,versicolor +6.2,2.2,4.5,1.5,versicolor +5.6,2.5,3.9,1.1,versicolor +5.9,3.2,4.8,1.8,versicolor +6.1,2.8,4,1.3,versicolor +6.3,2.5,4.9,1.5,versicolor +6.1,2.8,4.7,1.2,versicolor +6.4,2.9,4.3,1.3,versicolor +6.6,3,4.4,1.4,versicolor +6.8,2.8,4.8,1.4,versicolor +6.7,3,5,1.7,versicolor +6,2.9,4.5,1.5,versicolor +5.7,2.6,3.5,1,versicolor +5.5,2.4,3.8,1.1,versicolor +5.5,2.4,3.7,1,versicolor +5.8,2.7,3.9,1.2,versicolor +6,2.7,5.1,1.6,versicolor 
+5.4,3,4.5,1.5,versicolor +6,3.4,4.5,1.6,versicolor +6.7,3.1,4.7,1.5,versicolor +6.3,2.3,4.4,1.3,versicolor +5.6,3,4.1,1.3,versicolor +5.5,2.5,4,1.3,versicolor +5.5,2.6,4.4,1.2,versicolor +6.1,3,4.6,1.4,versicolor +5.8,2.6,4,1.2,versicolor +5,2.3,3.3,1,versicolor +5.6,2.7,4.2,1.3,versicolor +5.7,3,4.2,1.2,versicolor +5.7,2.9,4.2,1.3,versicolor +6.2,2.9,4.3,1.3,versicolor +5.1,2.5,3,1.1,versicolor +5.7,2.8,4.1,1.3,versicolor +6.3,3.3,6,2.5,virginica +5.8,2.7,5.1,1.9,virginica +7.1,3,5.9,2.1,virginica +6.3,2.9,5.6,1.8,virginica +6.5,3,5.8,2.2,virginica +7.6,3,6.6,2.1,virginica +4.9,2.5,4.5,1.7,virginica +7.3,2.9,6.3,1.8,virginica +6.7,2.5,5.8,1.8,virginica +7.2,3.6,6.1,2.5,virginica +6.5,3.2,5.1,2,virginica +6.4,2.7,5.3,1.9,virginica +6.8,3,5.5,2.1,virginica +5.7,2.5,5,2,virginica +5.8,2.8,5.1,2.4,virginica +6.4,3.2,5.3,2.3,virginica +6.5,3,5.5,1.8,virginica +7.7,3.8,6.7,2.2,virginica +7.7,2.6,6.9,2.3,virginica +6,2.2,5,1.5,virginica +6.9,3.2,5.7,2.3,virginica +5.6,2.8,4.9,2,virginica +7.7,2.8,6.7,2,virginica +6.3,2.7,4.9,1.8,virginica +6.7,3.3,5.7,2.1,virginica +7.2,3.2,6,1.8,virginica +6.2,2.8,4.8,1.8,virginica +6.1,3,4.9,1.8,virginica +6.4,2.8,5.6,2.1,virginica +7.2,3,5.8,1.6,virginica +7.4,2.8,6.1,1.9,virginica +7.9,3.8,6.4,2,virginica +6.4,2.8,5.6,2.2,virginica +6.3,2.8,5.1,1.5,virginica +6.1,2.6,5.6,1.4,virginica +7.7,3,6.1,2.3,virginica +6.3,3.4,5.6,2.4,virginica +6.4,3.1,5.5,1.8,virginica +6,3,4.8,1.8,virginica +6.9,3.1,5.4,2.1,virginica +6.7,3.1,5.6,2.4,virginica +6.9,3.1,5.1,2.3,virginica +5.8,2.7,5.1,1.9,virginica +6.8,3.2,5.9,2.3,virginica +6.7,3.3,5.7,2.5,virginica +6.7,3,5.2,2.3,virginica +6.3,2.5,5,1.9,virginica +6.5,3,5.2,2,virginica +6.2,3.4,5.4,2.3,virginica +5.9,3,5.1,1.8,virginica diff --git a/services/projects/examples/demo_ws.py b/services/projects/examples/demo_ws.py index 45f984c..e0eaead 100644 --- a/services/projects/examples/demo_ws.py +++ b/services/projects/examples/demo_ws.py @@ -14,22 +14,23 @@ logging.basicConfig(level=logging.DEBUG) -URI = 'http://0.0.0.0:8080' -ENVS = dotenv_values('../../../config/.env') +URI = 'http://0.0.0.0:8089' +ENVS = dotenv_values('../../../config/base/.env') GOOGLE_APPLICATION_CREDENTIALS = ENVS.get('GOOGLE_APPLICATION_CREDENTIALS', '/').split('/')[-1] -GOOGLE_APPLICATION_CREDENTIALS = f'../../../config/{GOOGLE_APPLICATION_CREDENTIALS}' +GOOGLE_APPLICATION_CREDENTIALS = f'../../../config/credentials/{GOOGLE_APPLICATION_CREDENTIALS}' def generate_project(name): logging.info(f'Creating project "{name}"') - create_resp = requests.post(url=f'{URI}/projects', data={'name': name}).json() + create_resp = requests.post(url=f'{URI}/projects', data={'name': name}, verify=False).json() + print(create_resp) project_id = create_resp.get('id') tracking_uri = create_resp.get('mlflowUri') logging.info(f'Run tracking server of project "{name}"') - requests.put(url=f'{URI}/projects/{project_id}/run') - project_healthcheck_resp = requests.get(url=f'{URI}/projects/{project_id}/healthcheck') + requests.put(url=f'{URI}/projects/{project_id}/run', verify=False) + project_healthcheck_resp = requests.get(url=f'{URI}/projects/{project_id}/healthcheck', verify=False) if project_healthcheck_resp.status_code == 200: @@ -41,7 +42,7 @@ def generate_project(name): while True: print(str(waiting_time) + ' seconds...') try: - r = requests.get(tracking_uri) + r = requests.get(tracking_uri, verify=False) if r.status_code == HTTPStatus.OK: tracking_server_loaded = True @@ -61,7 +62,9 @@ def generate_project(name): logging.info('Run examples...') logging.info('run 
example1.py ...') p1 = sp.Popen( - f'export MLFLOW_TRACKING_URI={tracking_uri} && python example1.py', + f'export MLFLOW_TRACKING_URI={tracking_uri} && ' + f'export MLFLOW_TRACKING_INSECURE_TLS="true" && ' + f'python example1.py', shell=True ) p1.wait() @@ -70,6 +73,7 @@ def generate_project(name): logging.info('run example2.py with default parameters...') p2 = sp.Popen( f'export MLFLOW_TRACKING_URI={tracking_uri} && ' + f'export MLFLOW_TRACKING_INSECURE_TLS="true" && ' f'export GOOGLE_APPLICATION_CREDENTIALS={GOOGLE_APPLICATION_CREDENTIALS} && ' f'python example2.py', shell=True @@ -80,6 +84,7 @@ def generate_project(name): logging.info('run example2.py with cli parameters...') p3 = sp.Popen( f'export MLFLOW_TRACKING_URI={tracking_uri} && ' + f'export MLFLOW_TRACKING_INSECURE_TLS="true" && ' f'export GOOGLE_APPLICATION_CREDENTIALS={GOOGLE_APPLICATION_CREDENTIALS} && ' f'python example2.py --C 0.1 --solver newton-cg', shell=True @@ -90,6 +95,7 @@ def generate_project(name): logging.info('run example3.py with default parameters...') p4 = sp.Popen( f'export MLFLOW_TRACKING_URI={tracking_uri} && ' + f'export MLFLOW_TRACKING_INSECURE_TLS="true" && ' f'export GOOGLE_APPLICATION_CREDENTIALS={GOOGLE_APPLICATION_CREDENTIALS} && ' f'python example3.py', shell=True @@ -100,6 +106,7 @@ def generate_project(name): logging.info('run example3.py with cli parameters...') p5 = sp.Popen( f'export MLFLOW_TRACKING_URI={tracking_uri} && ' + f'export MLFLOW_TRACKING_INSECURE_TLS="true" && ' f'export GOOGLE_APPLICATION_CREDENTIALS={GOOGLE_APPLICATION_CREDENTIALS} && ' f'python example3.py --C 0.1 --kernel linear', shell=True @@ -109,7 +116,7 @@ def generate_project(name): print() logging.info('Terminate tracking server') - requests.put(url=f'{URI}/projects/{project_id}/terminate') + requests.put(url=f'{URI}/projects/{project_id}/terminate', verify=False) else: @@ -128,7 +135,7 @@ def generate_project(name): healthcheck_resp = None try: - healthcheck_resp = requests.get(f'{URI}/healthcheck') + healthcheck_resp = requests.get(f'{URI}/healthcheck', verify=False) except requests.ConnectionError: pass @@ -143,4 +150,4 @@ def generate_project(name): else: print() - logging.error('FastAPI app is not running!') \ No newline at end of file + logging.error('FastAPI app is not running!') diff --git a/services/projects/examples/example2.py b/services/projects/examples/example2.py index 2ad4f72..90b902f 100644 --- a/services/projects/examples/example2.py +++ b/services/projects/examples/example2.py @@ -5,14 +5,14 @@ """ import argparse -import joblib -import numpy as np import mlflow from mlflow.sklearn import log_model -import os -from sklearn import datasets +import pandas as pd from sklearn.linear_model import LogisticRegression from sklearn.metrics import f1_score +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder +import tensorflow_data_validation as tfdv if __name__ == '__main__': @@ -23,45 +23,45 @@ args = parser.parse_args() - path = os.path.dirname(os.path.abspath(__file__)) experiment_name = 'IrisLogreg' mlflow.set_experiment(experiment_name) - with mlflow.start_run() as run: + DATASET = 'data/iris.csv' + TARGET_LABELED_DATASET = 'data/labeled_iris.csv' + TARGET_COLUMN = 'species' + IRIS_STATISTICS = '/tmp/stats.tfdv' - # import some data to play with - iris = datasets.load_iris() + with mlflow.start_run() as run: - dataset = os.path.join(path, 'data/iris_data.joblib') + dataset = pd.read_csv(DATASET) + dataset[TARGET_COLUMN] = 
LabelEncoder().fit_transform(dataset[TARGET_COLUMN]) + dataset.to_csv(TARGET_LABELED_DATASET, index=False) - joblib.dump(iris, dataset) - # with open(dataset, 'w') as data_f: - # json.dump(iris, data_f) + statistics = tfdv.generate_statistics_from_dataframe(dataset.drop(TARGET_COLUMN, axis=1)) - mlflow.log_artifact(dataset) + with open(IRIS_STATISTICS, 'wb') as out_stats: + out_stats.write(statistics.SerializeToString()) - iris_X = iris.data - iris_y = iris.target + mlflow.log_artifact(DATASET) + mlflow.log_artifact(TARGET_LABELED_DATASET) + mlflow.log_artifact(IRIS_STATISTICS) - indices = np.random.permutation(len(iris_X)) + train, test = train_test_split(dataset, test_size=0.2, random_state=42) - mlflow.log_param('indices', indices.tolist()) + X_train = train.drop(TARGET_COLUMN, axis=1).astype('float32') + y_train = train[TARGET_COLUMN].astype('int32') - iris_X_train = iris_X[indices[:-10]] - iris_y_train = iris_y[indices[:-10]] - iris_X_test = iris_X[indices[-10:]] - iris_y_test = iris_y[indices[-10:]] + X_test = test.drop(TARGET_COLUMN, axis=1).astype('float32') + y_test = test[TARGET_COLUMN].astype('int32') logreg = LogisticRegression(C=args.C, solver=args.solver, multi_class='multinomial') - # Create an instance of Logistic Regression Classifier and fit the data. - logreg.fit(iris_X_train, iris_y_train) + logreg.fit(X_train, y_train) mlflow.log_params(logreg.get_params()) - prediction = logreg.predict(iris_X_test) - - f1 = f1_score(iris_y_test, prediction, average='macro') + prediction = logreg.predict(X_test) + f1 = f1_score(y_test, prediction, average='macro') mlflow.log_metric('f1', f1) diff --git a/services/projects/examples/example3.py b/services/projects/examples/example3.py index 7a63f64..9a9b6c9 100644 --- a/services/projects/examples/example3.py +++ b/services/projects/examples/example3.py @@ -5,14 +5,14 @@ """ import argparse -import joblib -import numpy as np import mlflow from mlflow.sklearn import log_model -import os -from sklearn import datasets -from sklearn.svm import SVC +import pandas as pd from sklearn.metrics import f1_score +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder +from sklearn.svm import SVC +import tensorflow_data_validation as tfdv if __name__ == '__main__': @@ -23,45 +23,46 @@ args = parser.parse_args() - path = os.path.dirname(os.path.abspath(__file__)) experiment_name = 'IrisSVC' mlflow.set_experiment(experiment_name) - with mlflow.start_run() as run: + DATASET = 'data/iris.csv' + TARGET_LABELED_DATASET = 'data/labeled_iris.csv' + TARGET_COLUMN = 'species' + IRIS_STATISTICS = '/tmp/stats.tfdv' - # import some data to play with - iris = datasets.load_iris() - - dataset = os.path.join(path, 'data/iris_data.joblib') + with mlflow.start_run() as run: + dataset = pd.read_csv(DATASET) + dataset[TARGET_COLUMN] = LabelEncoder().fit_transform(dataset[TARGET_COLUMN]) + dataset.to_csv(TARGET_LABELED_DATASET, index=False) - joblib.dump(iris, dataset) - # with open(dataset, 'w') as data_f: - # json.dump(iris, data_f) + statistics = tfdv.generate_statistics_from_dataframe(dataset.drop(TARGET_COLUMN, axis=1)) - mlflow.log_artifact(dataset) + with open(IRIS_STATISTICS, 'wb') as out_stats: + out_stats.write(statistics.SerializeToString()) - iris_X = iris.data - iris_y = iris.target + mlflow.log_artifact(DATASET) + mlflow.log_artifact(TARGET_LABELED_DATASET) + mlflow.log_artifact(IRIS_STATISTICS) - indices = np.random.permutation(len(iris_X)) + train, test = train_test_split(dataset, test_size=0.2, 
random_state=42) - mlflow.log_param('indices', indices.tolist()) + X_train = train.drop(TARGET_COLUMN, axis=1).astype('float32') + y_train = train[TARGET_COLUMN].astype('int32') - iris_X_train = iris_X[indices[:-10]] - iris_y_train = iris_y[indices[:-10]] - iris_X_test = iris_X[indices[-10:]] - iris_y_test = iris_y[indices[-10:]] + X_test = test.drop(TARGET_COLUMN, axis=1).astype('float32') + y_test = test[TARGET_COLUMN].astype('int32') svc_model = SVC(C=args.C, kernel=args.kernel) # Create an instance of Logistic Regression Classifier and fit the data. - svc_model.fit(iris_X_train, iris_y_train) + svc_model.fit(X_train, y_train) mlflow.log_params(svc_model.get_params()) - prediction = svc_model.predict(iris_X_test) + prediction = svc_model.predict(X_test) - f1 = f1_score(iris_y_test, prediction, average='macro') + f1 = f1_score(y_test, prediction, average='macro') mlflow.log_metric('f1', f1) diff --git a/services/projects/examples/requirements.txt b/services/projects/examples/requirements.txt index 5bd1d5b..92e36b2 100644 --- a/services/projects/examples/requirements.txt +++ b/services/projects/examples/requirements.txt @@ -1,8 +1,10 @@ +apache-beam[interactive]==2.19.0 google-cloud-storage==1.20.0 joblib==0.13.2 jupyter==1.0.0 -mlflow==1.4.0 +mlflow==1.6.0 numpy==1.17.0 python-dotenv==0.10.5 requests==2.22.0 -scikit-learn==0.20.4 \ No newline at end of file +scikit-learn==0.20.4 +tensorflow_data_validation==0.21.5 diff --git a/services/projects/src/app.py b/services/projects/src/app.py index 1b21d3f..fe71af4 100644 --- a/services/projects/src/app.py +++ b/services/projects/src/app.py @@ -5,30 +5,21 @@ from fastapi import FastAPI from http import HTTPStatus import json -import os import psutil -from starlette.middleware.cors import CORSMiddleware +import psycopg2 from starlette.responses import JSONResponse, Response from starlette.requests import Request -from starlette.templating import Jinja2Templates -import sqlite3 from common.utils import build_error_response -from projects.src import config -from projects.src.project_management import BadProjectNameError, \ - ProjectAlreadyExistsError, ProjectIsAlreadyRunningError, ProjectNotFoundError, \ - BadSQLiteTableSchema +from projects.src.config import Config +from projects.src.project_management import ProjectsDBSchema, BadProjectNameError, \ + ProjectAlreadyExistsError, ProjectIsAlreadyRunningError, ProjectNotFoundError from projects.src.routers import misc, projects, artifacts, registered_models, experiments, \ runs, deployments from projects.src.routers.utils import RegisteredModelNotFoundError app = FastAPI() # pylint: disable=invalid-name app.setup() -app.add_middleware( - CORSMiddleware, - allow_origins=['*'], - allow_methods=['GET', 'POST', 'PUT', 'DELETE'] -) app.include_router(projects.router) app.include_router(experiments.router) @@ -38,39 +29,42 @@ app.include_router(artifacts.router) app.include_router(deployments.router) -templates = Jinja2Templates(directory='src/templates') # pylint: disable=invalid-name + +conf = Config() +logger = conf.get_logger(__name__) @app.on_event('startup') def init() -> None: - """Init on application startup""" + """Init on application startup, create DB if not exists, check active projects""" try: - config.set_logger() - projects_db = os.path.join(config.WORKSPACE, config.PROJECTS_DB_NAME) - - if os.path.exists(projects_db): - - conn = sqlite3.connect(projects_db) - cursor = conn.cursor() - cursor.execute(f'SELECT id, pid FROM {config.PROJECTS_TABLE}') - - terminated_projects = [] - for 
project_id, pid in cursor.fetchall(): - if not psutil.pid_exists(pid): - terminated_projects.append(project_id) - - seq = ','.join(['?'] * len(terminated_projects)) - cursor.execute( - f'UPDATE {config.PROJECTS_TABLE} SET pid = -1 WHERE id in ({seq})', - terminated_projects - ) - conn.commit() - conn.close() - - except BadSQLiteTableSchema as e: # pylint: disable=invalid-name - config.DB_SCHEMA['status'] = 'fail' - config.DB_SCHEMA['message'] = str(e) + # TODO: add .create_if_not_exist() + ProjectsDBSchema() + + # find gost projects (check PIDs) set '-1' if processed stopped|not exists (default) + # TODO: wrap in method + connection = psycopg2.connect( + database=conf.get('PROJECTS_DB_NAME'), + host=conf.get('DB_HOST'), + port=conf.get('DB_PORT'), + user=conf.get('DB_USER'), + password=conf.get('DB_PASSWORD') + ) + cursor = connection.cursor() + cursor.execute(f'SELECT id, pid FROM {ProjectsDBSchema.PROJECTS_TABLE}') + + for project_id, pid in cursor.fetchall(): + if not psutil.pid_exists(pid): + cursor.execute( + f'UPDATE {ProjectsDBSchema.PROJECTS_TABLE} SET pid = -1 WHERE id = %s', + (project_id,) + ) + connection.commit() + connection.close() + + except Exception as e: # pylint: disable=invalid-name + logger.error(e, exc_info=True) @app.middleware('http') @@ -84,8 +78,10 @@ async def before_and_after_request(request: Request, call_next) -> Response: """ # pylint: disable=too-many-locals,invalid-name,broad-except - if config.DB_SCHEMA['status'] == 'fail': - return JSONResponse(config.DB_SCHEMA, HTTPStatus.INTERNAL_SERVER_ERROR) + # check DB schema is valid + # TODO (Alex): rewrite + # if config.DB_SCHEMA['status'] == 'fail': + # return JSONResponse(config.DB_SCHEMA, HTTPStatus.INTERNAL_SERVER_ERROR) try: response = await call_next(request) @@ -100,17 +96,13 @@ async def before_and_after_request(request: Request, call_next) -> Response: return build_error_response(HTTPStatus.CONFLICT, e) except Exception as e: + logger.error(e, exc_info=True) return build_error_response(HTTPStatus.INTERNAL_SERVER_ERROR, e) ip = request.headers.get('X-Forwarded-For', request.client.host) host = request.client.host.split(':', 1)[0] query_params = dict(request.query_params) - # ISSUE: it's problematic to get body in middleware, links: - # https://github.com/encode/starlette/issues/495, - # https://github.com/tiangolo/fastapi/issues/394 - # body = dict(await request.body()) - log_params = [ ('method', request.method), ('path', request.url.path), @@ -118,11 +110,10 @@ async def before_and_after_request(request: Request, call_next) -> Response: ('ip', ip), ('host', host), ('params', query_params) - # ('body', body) ] line = json.dumps(dict(log_params)) - config.logger.info(line) + logger.debug(line) final_response = response if response.headers.get('content-type') == 'application/json': @@ -151,7 +142,6 @@ async def before_and_after_request(request: Request, call_next) -> Response: else: final_response = JSONResponse(json_content) - final_response.headers['Access-Control-Allow-Origin'] = '*' final_response.status_code = response.status_code return final_response diff --git a/services/projects/src/config.py b/services/projects/src/config.py index 221db69..296b22b 100644 --- a/services/projects/src/config.py +++ b/services/projects/src/config.py @@ -5,60 +5,71 @@ import logging import os -import time from logging.handlers import RotatingFileHandler +import time -from projects.src.utils import get_external_ip - +from common.utils import validate_env_var -HOST_MODE = os.getenv('HOST_MODE', 'local') -HOST_IP = 
'0.0.0.0'
-if HOST_MODE == 'remote':
-    HOST_IP = get_external_ip()
+class Config:
 
-WORKSPACE = os.getenv('WORKSPACE')
+    def __init__(self):
 
-# Check if WORKSPACE is set
-if WORKSPACE is None:
-    raise EnvironmentError('Failed because env var WORKSPACE is not set')
+        self._load_env_vars()
+        self._check_env_vars()
 
-PROJECTS_DB_NAME = 'projects.db'
-PROJECTS_TABLE = 'project'
+    def get(self, env_var):
+        return self.env_vars.get(env_var)
 
-LAST_MLFLOW_UI_URL = None
+    def get_logger(self, name):
 
-DB_SCHEMA = {
-    'status': 'ok',
-    'message': ''
-}
+        logger = logging.getLogger(name)
+        workspace = self.get('WORKSPACE')
+        os.makedirs(workspace, exist_ok=True)
+        LOGFILE = os.path.join(workspace, 'projects.log')
+        LOGLEVEL = os.getenv('LOGLEVEL', 'INFO').upper()
 
-logger = logging.getLogger(__name__)
+        if LOGLEVEL not in logging._nameToLevel.keys():  # pylint: disable=protected-access
+            LOGLEVEL = 'INFO'
 
+        f_handler = RotatingFileHandler(LOGFILE, mode='a')
+        f_handler.setLevel(LOGLEVEL)
 
-def create_workspace_dir() -> None:
-    """Create workspace directory"""
+        f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        f_format.converter = time.gmtime
+        f_handler.setFormatter(f_format)
 
-    os.makedirs(WORKSPACE, exist_ok=True)
+        logger.setLevel(LOGLEVEL)
+        logger.addHandler(f_handler)
 
+        return logger
 
-def set_logger() -> None:
-    """Set logger"""
+    def _load_env_vars(self):
 
-    create_workspace_dir()
+        self.env_vars = {
+            'HOST_IP': os.getenv('HOST_IP', '0.0.0.0'),
+            'WORKSPACE': os.getenv('WORKSPACE'),
+            'ARTIFACT_STORE': os.getenv('ARTIFACT_STORE'),
+            'TRACKING_SERVER_PORTS': os.getenv('TRACKING_SERVER_PORTS'),
+            'TRACKING_SERVER_WORKERS': os.getenv('TRACKING_SERVER_WORKERS', 1),
+            'PROJECTS_DB_NAME': os.getenv('PROJECTS_DB_NAME'),
+            'DB_HOST': os.getenv('DB_HOST'),
+            'DB_PORT': os.getenv('DB_PORT'),
+            'DB_USER': os.getenv('POSTGRES_USER'),
+            'DB_PASSWORD': os.getenv('POSTGRES_PASSWORD')
+        }
 
-    LOGFILE = os.path.join(WORKSPACE, 'server.log')
-    LOGLEVEL = os.environ.get('LOGLEVEL', 'INFO').upper()
+    def _check_env_vars(self):
+        """Check if all required environment variables are set.
+ Raises: + EnvironmentError: if some required environment variable is not set + InvalidEnvVarValueError: if some environment variable is invalid + """ - if LOGLEVEL not in logging._nameToLevel.keys(): # pylint: disable=protected-access - LOGLEVEL = 'INFO' + for name, value in self.env_vars.items(): - f_handler = RotatingFileHandler(LOGFILE, mode='a') - f_handler.setLevel(LOGLEVEL) + if value is None: + raise EnvironmentError(f'Failed because {name} env var is not set') - f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') - f_format.converter = time.gmtime - f_handler.setFormatter(f_format) + validate_env_var(name, str(value)) - logger.setLevel(LOGLEVEL) - logger.addHandler(f_handler) diff --git a/services/projects/src/project_management.py b/services/projects/src/project_management.py index a865697..2d721f3 100644 --- a/services/projects/src/project_management.py +++ b/services/projects/src/project_management.py @@ -7,14 +7,16 @@ import os import psutil +import psycopg2 +import psycopg2.errors +from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT import shutil -import sqlite3 import subprocess from typing import Any, Dict, List, Set, Text from common.types import StrEnum from common.utils import get_rfc3339_time, kill, is_remote -from projects.src import config +from projects.src.config import Config from projects.src.utils import process_stat @@ -50,6 +52,73 @@ class ProjectStatus(StrEnum): ARCHIVED = 'archived' +class ProjectsDBSchema: + + CONFIG = Config() + DB_NAME = CONFIG.get('PROJECTS_DB_NAME') + PROJECTS_TABLE = 'project' + + def __init__(self): + + self._create_db() + + self._connection = psycopg2.connect( + database=self.CONFIG.get('PROJECTS_DB_NAME'), + host=self.CONFIG.get('DB_HOST'), + port=self.CONFIG.get('DB_PORT'), + user=self.CONFIG.get('DB_USER'), + password=self.CONFIG.get('DB_PASSWORD') + ) + self._cursor = self._connection.cursor() + + self.create_projects_table() + + def create_projects_table(self): + + schema = { + 'id': 'SERIAL PRIMARY KEY ', + 'name': 'TEXT UNIQUE', + 'description': 'TEXT DEFAULT NULL', + 'port': 'INTEGER UNIQUE', + 'path': 'TEXT', + 'archived': 'INT', + 'created_at': 'TEXT', + 'pid': 'INT' # subprocess IF for mlflow tracking server + } + + self._create_table(self.PROJECTS_TABLE, schema) + + def _create_db(self): + + connection = psycopg2.connect( + host=self.CONFIG.get('DB_HOST'), + port=self.CONFIG.get('DB_PORT'), + user=self.CONFIG.get('DB_USER'), + password=self.CONFIG.get('DB_PASSWORD') + ) + connection.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) + cursor = connection.cursor() + + try: + cursor.execute(f'CREATE DATABASE {self.DB_NAME};') + except psycopg2.errors.DuplicateDatabase: + pass + + connection.close() + + def _create_table(self, table_name: Text, table_schema: Dict): + + columns_description = ', '.join([ + col_name + ' ' + col_type for col_name, col_type in table_schema.items() + ]) + + self._cursor.execute( + f'CREATE TABLE IF NOT EXISTS {table_name} ({columns_description})' + ) + + self._connection.commit() + + class ProjectManager: """Project manager. Allows to create, run, terminate and delete projects. 
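For orientation, a minimal usage sketch of the new PostgreSQL-backed project registry introduced above (illustrative only; it assumes the DB_HOST, DB_PORT, POSTGRES_USER, POSTGRES_PASSWORD, PROJECTS_DB_NAME, WORKSPACE, ARTIFACT_STORE and TRACKING_SERVER_PORTS variables are set as in the service environment):

```python
from projects.src.project_management import ProjectsDBSchema, ProjectManager

ProjectsDBSchema()                      # creates the database and `project` table if missing
manager = ProjectManager()              # opens its own psycopg2 connection via Config

project_id = manager.create_project('demo', 'example project')
manager.run(project_id)                 # starts an mlflow tracking server subprocess
print(manager.get_project(project_id))  # project dict, including mlflowUri and status
manager.terminate(project_id)           # kills the server process and resets pid to -1
```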
@@ -57,36 +126,25 @@ class ProjectManager: # pylint: disable=too-many-instance-attributes def __init__(self): - self._WORKSPACE = config.WORKSPACE - self._PROJECTS_DB_NAME = config.PROJECTS_DB_NAME + self.CONFIG = Config() + self._WORKSPACE = self.CONFIG.get('WORKSPACE') + self._PROJECTS_DB_NAME = self.CONFIG.get('PROJECTS_DB_NAME') self._TRACKING_DB = os.path.join(self._WORKSPACE, self._PROJECTS_DB_NAME) - self._PROJECTS_TABLE = config.PROJECTS_TABLE - - self._ARTIFACTS_STORE = os.getenv('ARTIFACT_STORE') + self._ARTIFACTS_STORE = self.CONFIG.get('ARTIFACT_STORE') if self._ARTIFACTS_STORE is None: raise EnvironmentError('Failed because env var ARTIFACT_STORE is not set') self._ports_range = self._get_ports_range() - self._conn = sqlite3.connect(self._TRACKING_DB, check_same_thread=False) - self._cursor = self._conn.cursor() - self._schema = { - 'id': 'INTEGER UNIQUE', - 'name': 'TEXT UNIQUE', - 'description': 'TEXT DEFAULT ""', - 'port': 'INTEGER UNIQUE', - 'path': 'TEXT', - 'archived': 'INTEGER', - 'created_at': 'TIMESTAMP', - 'pid': 'INTEGER' - } - columns_description = ', '.join([ - col_name + ' ' + col_type for col_name, col_type in self._schema.items() - ]) - self._cursor.execute( - f'CREATE TABLE IF NOT EXISTS {self._PROJECTS_TABLE} ({columns_description})') - self._check_tracking_db_schema() + self._connection = psycopg2.connect( + database=self.CONFIG.get('PROJECTS_DB_NAME'), + host=self.CONFIG.get('DB_HOST'), + port=self.CONFIG.get('DB_PORT'), + user=self.CONFIG.get('DB_USER'), + password=self.CONFIG.get('DB_PASSWORD') + ) + self._cursor = self._connection.cursor() def create_project(self, name: Text, description: Text = '') -> int: """Create new project: create project folder in workspace @@ -113,18 +171,29 @@ def create_project(self, name: Text, description: Text = '') -> int: raise ProjectAlreadyExistsError( f'Project "{name}" already exists {additional_error_msg}') - self._cursor.execute(f'SELECT IFNULL(MAX(id), 0) FROM {self._PROJECTS_TABLE}') - project_id = self._cursor.fetchone()[0] + 1 + port = self._get_free_port() + + self._cursor.execute( + f'INSERT INTO {ProjectsDBSchema.PROJECTS_TABLE} ' + f'(name, description, port, archived, created_at, pid) ' + f'VALUES (%s,%s,%s,%s,%s,%s) ' + f'RETURNING id', + (name, description, port, 0, get_rfc3339_time(), -1) + ) + + project_id = self._cursor.fetchone()[0] + print('project_id:', project_id) project_path = os.path.join(self._WORKSPACE, str(project_id)) os.makedirs(project_path, exist_ok=True) - port = self._get_free_port() + self._cursor.execute( - f'INSERT INTO {self._PROJECTS_TABLE} ' - f'(id, name, description, port, path, archived, created_at, pid) ' - f'VALUES (?,?,?,?,?,?,?,?)', - (project_id, name, description, port, project_path, 0, get_rfc3339_time(), -1) + f'UPDATE {ProjectsDBSchema.PROJECTS_TABLE} ' + f'SET path = %s ' + f'WHERE id = {project_id}', + (project_path, ) ) - self._conn.commit() + + self._connection.commit() return project_id @@ -155,12 +224,12 @@ def delete_project(self, id: int) -> None: if not self._project_id_exists(id): raise ProjectNotFoundError(f'Project with ID {id} not found') - self._cursor.execute(f'SELECT path FROM {self._PROJECTS_TABLE} WHERE id = {id}') + self._cursor.execute(f'SELECT path FROM {ProjectsDBSchema.PROJECTS_TABLE} WHERE id = {id}') project_path = self._cursor.fetchone()[0] shutil.rmtree(project_path, ignore_errors=True) - self._cursor.execute(f'DELETE from {self._PROJECTS_TABLE} WHERE id = {id}') - self._conn.commit() + self._cursor.execute(f'DELETE from 
{ProjectsDBSchema.PROJECTS_TABLE} WHERE id = {id}') + self._connection.commit() def list_projects(self) -> List[Dict]: """Get list of existed projects. @@ -182,7 +251,7 @@ def list_projects(self) -> List[Dict]: ] """ - self._cursor.execute(f'SELECT * FROM {self._PROJECTS_TABLE}') + self._cursor.execute(f'SELECT * FROM {ProjectsDBSchema.PROJECTS_TABLE}') projects = [] for rec in self._cursor.fetchall(): @@ -201,7 +270,7 @@ def list_projects(self) -> List[Dict]: 'name': name, 'description': description, 'status': str(status), - 'mlflowUri': f'http://{config.HOST_IP}:{port}', + 'mlflowUri': f'http://{self.CONFIG.get("HOST_IP")}:{port}', 'createdBy': 0, 'createdAt': created_at, 'path': path @@ -254,7 +323,7 @@ def get_project_by_name(self, name: Text) -> Dict: if not self._project_name_exists(name): raise ProjectNotFoundError(f'Project with ID {id} not found') - self._cursor.execute(f'SELECT * FROM {self._PROJECTS_TABLE} WHERE name = "{name}"') + self._cursor.execute(f'SELECT * FROM {ProjectsDBSchema.PROJECTS_TABLE} WHERE name = \'{name}\'') rec = self._cursor.fetchone() project = { 'id': rec[0], @@ -267,7 +336,7 @@ def get_project_by_name(self, name: Text) -> Dict: return project - def get_tracking_uri(self, project_id: int) -> Text: + def get_internal_tracking_uri(self, project_id: int) -> Text: """Get tracking uri by project id. Args: project_id {int}: project id @@ -275,7 +344,7 @@ def get_tracking_uri(self, project_id: int) -> Text: Text: MLflow tracking server uri for the project """ - return self.get_project(project_id).get('mlflowUri') + return self.get_project(project_id).get('mlflowUri').replace('https', 'http') def archive(self, project_id: int) -> None: """Archive project. @@ -317,25 +386,31 @@ def run(self, project_id: int) -> bool: else: default_artifact_root = os.path.join(self._WORKSPACE, str(project_id), self._ARTIFACTS_STORE) - project_log = os.path.join(path, 'project.log') + access_log = os.path.join(path, 'access.log') + errors_log = os.path.join(path, 'errors.log') + mlflow_stdout_log = os.path.join(path, 'mlflow_stdout.log') + tracking_server_process = subprocess.Popen( [ + f'GUNICORN_CMD_ARGS="--access-logfile={access_log} --error-logfile={errors_log} ' + f'--log-level=debug" ' f'mlflow server ' f'--backend-store-uri {project_mlflow_db} ' f'--default-artifact-root {default_artifact_root} ' f'--host 0.0.0.0 --port {port} ' - f'2>&1 | tee -a {project_log}' + f'--workers {self.CONFIG.get("TRACKING_SERVER_WORKERS")} ' + f'2>&1 | tee -a {mlflow_stdout_log}' ], shell=True ) self._cursor.execute( - f'UPDATE {self._PROJECTS_TABLE} ' - f'SET pid = ? ' + f'UPDATE {ProjectsDBSchema.PROJECTS_TABLE} ' + f'SET pid = %s ' f'WHERE id = {project_id}', (tracking_server_process.pid,) ) - self._conn.commit() + self._connection.commit() return tracking_server_process.poll() is None @@ -345,7 +420,7 @@ def terminate(self, project_id: int) -> None: project_id {int}: project id """ - self._cursor.execute(f'SELECT pid FROM {self._PROJECTS_TABLE} WHERE id = {project_id}') + self._cursor.execute(f'SELECT pid FROM {ProjectsDBSchema.PROJECTS_TABLE} WHERE id = {project_id}') project_pid = self._cursor.fetchone() if project_pid: @@ -356,12 +431,12 @@ def terminate(self, project_id: int) -> None: kill(pid) self._cursor.execute( - f'UPDATE {self._PROJECTS_TABLE} ' - f'SET pid = ? 
' + f'UPDATE {ProjectsDBSchema.PROJECTS_TABLE} ' + f'SET pid = %s ' f'WHERE id = {project_id}', (-1,) ) - self._conn.commit() + self._connection.commit() def running_projects_stat(self) -> List[Dict]: """Get statistics by running projects. @@ -377,7 +452,7 @@ def running_projects_stat(self) -> List[Dict]: stat = [] - self._cursor.execute(f'SELECT id, pid FROM {self._PROJECTS_TABLE} WHERE pid <> -1') + self._cursor.execute(f'SELECT id, pid FROM {ProjectsDBSchema.PROJECTS_TABLE} WHERE pid <> -1') rows = self._cursor.fetchall() for project_id, pid in rows: @@ -390,31 +465,17 @@ def running_projects_stat(self) -> List[Dict]: return stat - def _check_tracking_db_schema(self): - - self._cursor.execute('PRAGMA table_info(project);') - columns = [row[1] for row in self._cursor.fetchall()] - missing_columns = set(self._schema.keys()).difference(set(columns)) - - if len(missing_columns) != 0: - raise BadSQLiteTableSchema( - f'Bad (obsolete) table {self._PROJECTS_TABLE} ' - f'schema, missing columns: {missing_columns}. ' - f'Remove content of {self._WORKSPACE}/ and rerun application' - ) - - @staticmethod - def _get_ports_range() -> Set: + def _get_ports_range(self) -> Set: """Get available ports range. Returns: Set: set of port numbers """ - ports_range = os.getenv('MLFLOW_TRACKING_SERVERS_PORTS_RANGE') + ports_range = self.CONFIG.get('TRACKING_SERVER_PORTS') if ports_range is None: raise EnvironmentError( - 'Failed because env var MLFLOW_TRACKING_SERVERS_PORTS_RANGE is not set') + 'Failed because env var TRACKING_SERVER_PORTS is not set') start_port, end_port = list(map(int, ports_range.split('-'))) @@ -431,7 +492,7 @@ def _get_free_port(self) -> int: int: free port """ - self._cursor.execute(f'SELECT port FROM {self._PROJECTS_TABLE}') + self._cursor.execute(f'SELECT port FROM {ProjectsDBSchema.PROJECTS_TABLE}') assigned_ports = {record[0] for record in self._cursor.fetchall()} free_ports = self._ports_range.difference(assigned_ports) @@ -449,7 +510,7 @@ def _project_name_exists(self, name: Text) -> bool: bool: True if project name exists, False otherwise """ - self._cursor.execute(f'SELECT * FROM {self._PROJECTS_TABLE} WHERE name = "{name}"') + self._cursor.execute(f'SELECT * FROM {ProjectsDBSchema.PROJECTS_TABLE} WHERE name = \'{name}\'') project = self._cursor.fetchone() return project is not None @@ -462,7 +523,7 @@ def _project_id_exists(self, id: int) -> bool: bool: True if project id exists, False otherwise """ - self._cursor.execute(f'SELECT * FROM {self._PROJECTS_TABLE} WHERE id = {id}') + self._cursor.execute(f'SELECT * FROM {ProjectsDBSchema.PROJECTS_TABLE} WHERE id = {id}') records = self._cursor.fetchall() return len(records) != 0 @@ -493,10 +554,10 @@ def _set_archived_status(self, project_id: int, archive: bool = True) -> None: raise ProjectNotFoundError(f'Project with ID {project_id} not found') self._cursor.execute( - f'UPDATE {self._PROJECTS_TABLE} SET archived = ? WHERE id = {project_id}', + f'UPDATE {ProjectsDBSchema.PROJECTS_TABLE} SET archived = %s WHERE id = {project_id}', (1 if archive is True else 0,) ) - self._conn.commit() + self._connection.commit() def _update_project_field(self, id: int, field: Text, value: Any) -> None: """Update project field. @@ -510,12 +571,12 @@ def _update_project_field(self, id: int, field: Text, value: Any) -> None: raise ProjectNotFoundError(f'Project with ID {id} not found') self._cursor.execute( - f'UPDATE {self._PROJECTS_TABLE} ' - f'SET {field} = ?' 
+ f'UPDATE {ProjectsDBSchema.PROJECTS_TABLE} ' + f'SET {field} = %s' f'WHERE id = {id}', (value,) ) - self._conn.commit() + self._connection.commit() def _get_pid(self, project_id: int) -> int: """ @@ -526,9 +587,10 @@ def _get_pid(self, project_id: int) -> int: int: pid """ - pid = self._cursor.execute( - f'SELECT pid FROM {self._PROJECTS_TABLE} WHERE id = {project_id}' - ).fetchone() + self._cursor.execute( + f'SELECT pid FROM {ProjectsDBSchema.PROJECTS_TABLE} WHERE id = {project_id}' + ) + pid = self._cursor.fetchone() if pid: return pid[0] diff --git a/services/projects/src/routers/artifacts.py b/services/projects/src/routers/artifacts.py index 44bcbaa..ff262d3 100644 --- a/services/projects/src/routers/artifacts.py +++ b/services/projects/src/routers/artifacts.py @@ -7,11 +7,13 @@ import os import requests from starlette.responses import JSONResponse +from starlette.requests import Request from typing import Dict, Text from common.types import StrEnum from common.utils import error_response, is_model from projects.src.project_management import ProjectManager +from projects.src.utils import log_request router = APIRouter() # pylint: disable=invalid-name @@ -49,7 +51,7 @@ def get_artifact_type(root_uri: Text, artifact: Dict) -> ArtifactType: @router.get('/artifacts', tags=['artifacts']) -def list_artifacts(project_id: int, run_id: Text) -> JSONResponse: +def list_artifacts(request: Request, project_id: int, run_id: Text) -> JSONResponse: """Get artifacts list. Args: project_id {int}: project id @@ -58,8 +60,13 @@ def list_artifacts(project_id: int, run_id: Text) -> JSONResponse: starlette.responses.JSONResponse """ + log_request(request, { + 'project_id': project_id, + 'run_id': run_id + }) + project_manager = ProjectManager() - url = project_manager.get_tracking_uri(project_id) + url = project_manager.get_internal_tracking_uri(project_id) runs_resp = requests.get( url=f'{url}/api/2.0/preview/mlflow/artifacts/list?run_id={run_id}' ) diff --git a/services/projects/src/routers/deployments.py b/services/projects/src/routers/deployments.py index cba458f..b468162 100644 --- a/services/projects/src/routers/deployments.py +++ b/services/projects/src/routers/deployments.py @@ -5,16 +5,19 @@ from fastapi import APIRouter, Form import requests from starlette.responses import JSONResponse, Response +from starlette.requests import Request from typing import Text from projects.src.routers.utils import get_model_version_uri +from projects.src.utils import log_request router = APIRouter() # pylint: disable=invalid-name @router.post('/deployments', tags=['deployments']) -def create_deployment(project_id: int, model_id: Text, version: Text, type: Text) -> JSONResponse: +def create_deployment(request: Request, project_id: int, + model_id: Text, version: Text, type: Text) -> JSONResponse: """Create deployment. 
Args: project_id {int}: project id @@ -26,6 +29,13 @@ def create_deployment(project_id: int, model_id: Text, version: Text, type: Text """ # pylint: disable=redefined-builtin + log_request(request, { + 'project_id': project_id, + 'model_id': model_id, + 'version': version, + 'type': type + }) + model_uri = get_model_version_uri(project_id, model_id, version) deploy_resp = requests.post( url='http://deploy:9000/deployments', @@ -42,20 +52,23 @@ def create_deployment(project_id: int, model_id: Text, version: Text, type: Text @router.put('/deployments/{deployment_id}/run', tags=['deployments']) -def run_deployment(deployment_id: int) -> JSONResponse: +def run_deployment(request: Request, deployment_id: int) -> JSONResponse: """Run deployment. Args: deployment_id {int}: deployment id Returns: starlette.responses.JSONResponse """ + log_request(request, { + 'deployment_id': deployment_id + }) deploy_resp = requests.put(f'http://deploy:9000/deployments/{deployment_id}/run') return JSONResponse(deploy_resp.json(), deploy_resp.status_code) @router.put('/deployments/{deployment_id}/stop', tags=['deployments']) -def stop_deployment(deployment_id: int) -> JSONResponse: +def stop_deployment(request: Request, deployment_id: int) -> JSONResponse: """Stop deployment. Args: deployment_id {int}: deployment id @@ -63,12 +76,16 @@ def stop_deployment(deployment_id: int) -> JSONResponse: starlette.responses.JSONResponse """ + log_request(request, { + 'deployment_id': deployment_id + }) + deploy_resp = requests.put(f'http://deploy:9000/deployments/{deployment_id}/stop') return JSONResponse(deploy_resp.json(), deploy_resp.status_code) @router.post('/deployments/{deployment_id}/predict', tags=['deployments']) -def predict(deployment_id: int, data: Text = Form(...)) -> JSONResponse: +def predict(request: Request, deployment_id: int, data: Text = Form(...)) -> JSONResponse: """Predict data on deployment. Args: deployment_id {int}: deployment id @@ -77,6 +94,11 @@ def predict(deployment_id: int, data: Text = Form(...)) -> JSONResponse: starlette.responses.JSONResponse """ + log_request(request, { + 'deployment_id': deployment_id, + 'data': data + }) + deploy_resp = requests.post( url=f'http://deploy:9000/deployments/{deployment_id}/predict', data={'data': data} @@ -85,18 +107,19 @@ def predict(deployment_id: int, data: Text = Form(...)) -> JSONResponse: @router.get('/deployments', tags=['deployments']) -def list_deployments() -> JSONResponse: +def list_deployments(request: Request) -> JSONResponse: """Get deployments list. Returns: starlette.responses.JSONResponse """ + log_request(request) deployments = requests.get('http://deploy:9000/deployments').json() return JSONResponse(deployments) @router.get('/deployments/{deployment_id}', tags=['deployments']) -def get_deployment(deployment_id: int) -> JSONResponse: +def get_deployment(request: Request, deployment_id: int) -> JSONResponse: """Get deployment. Args: deployment_id {int}: deployment id @@ -104,12 +127,14 @@ def get_deployment(deployment_id: int) -> JSONResponse: starlette.responses.JSONResponse """ + log_request(request) + deploy_resp = requests.get(f'http://deploy:9000/deployments/{deployment_id}') return JSONResponse(deploy_resp.json(), deploy_resp.status_code) @router.delete('/deployments/{deployment_id}', tags=['deployments']) -def delete_deployment(deployment_id: int) -> JSONResponse: +def delete_deployment(request: Request, deployment_id: int) -> JSONResponse: """Delete deployment (mark deployment as deleted). 
     Args:
         deployment_id {int}: deployment id
@@ -117,12 +142,16 @@
         starlette.responses.JSONResponse
     """
 
+    log_request(request, {
+        'deployment_id': deployment_id
+    })
+
     deploy_resp = requests.delete(f'http://deploy:9000/deployments/{deployment_id}')
 
     return JSONResponse(deploy_resp.json(), deploy_resp.status_code)
 
 
 @router.get('/deployments/{deployment_id}/ping')
-def ping(deployment_id: int) -> Response:
+def ping(request: Request, deployment_id: int) -> Response:
     """Ping deployment.
     Args:
         deployment_id {int}: deployment id
@@ -130,5 +159,39 @@
         starlette.responses.JSONResponse
     """
 
+    log_request(request)
+
     deploy_resp = requests.get(f'http://deploy:9000/deployments/{deployment_id}/ping')
 
     return JSONResponse(deploy_resp.text, status_code=deploy_resp.status_code)
+
+
+@router.get('/deployments/{deployment_id}/schema')
+def get_deployment_data_schema(request: Request, deployment_id: int) -> JSONResponse:
+    """Get deployment data schema.
+    Args:
+        deployment_id {int}: deployment id
+    Returns:
+        starlette.responses.JSONResponse
+    """
+
+    log_request(request)
+
+    deploy_resp = requests.get(f'http://deploy:9000/deployments/{deployment_id}/schema')
+
+    return JSONResponse(deploy_resp.json(), status_code=deploy_resp.status_code)
+
+
+@router.get('/deployments/{deployment_id}/validation-report')
+def get_validation_report(
+        request: Request, deployment_id: int,
+        timestamp_from: float,
+        timestamp_to: float) -> JSONResponse:
+
+    log_request(request)
+
+    deploy_resp = requests.get(
+        f'http://deploy:9000/deployments/{deployment_id}/validation-report?'
+        f'timestamp_from={timestamp_from}&timestamp_to={timestamp_to}'
+    )
+
+    return JSONResponse(deploy_resp.json(), status_code=deploy_resp.status_code)
diff --git a/services/projects/src/routers/experiments.py b/services/projects/src/routers/experiments.py
index 0596a96..fb0c783 100644
--- a/services/projects/src/routers/experiments.py
+++ b/services/projects/src/routers/experiments.py
@@ -6,25 +6,28 @@
 from http import HTTPStatus
 import requests
 from starlette.responses import JSONResponse, RedirectResponse
+from starlette.requests import Request
 from typing import Text
 
+from common.utils import error_response, get_utc_timestamp
 from projects.src.project_management import ProjectManager
-from projects.src.utils import get_utc_timestamp
-from common.utils import error_response
+from projects.src.utils import log_request
 
 
 router = APIRouter()  # pylint: disable=invalid-name
 
 
 @router.get('/experiments', tags=['experiments'])
-def list_experiments(project_id: int) -> JSONResponse:
+def list_experiments(request: Request, project_id: int) -> JSONResponse:
     """Get experiments list.
Args: project_id {int}: project id Returns: starlette.responses.JSONResponse """ + log_request(request) + project_manager = ProjectManager() - url = project_manager.get_tracking_uri(project_id) + url = project_manager.get_internal_tracking_uri(project_id) resp = requests.get( url=f'{url}/api/2.0/preview/mlflow/experiments/list' ) @@ -67,6 +70,7 @@ def list_experiments(project_id: int) -> JSONResponse: @router.post('/experiments', tags=['experiments']) def create_experiment( + request: Request, project_id: int, user_id: Text = Form(''), name: Text = Form(''), @@ -82,8 +86,15 @@ def create_experiment( starlette.responses.JSONResponse """ + log_request(request, { + 'project_id': project_id, + 'user_id': user_id, + 'name': name, + 'description': description + }) + project_manager = ProjectManager() - url = project_manager.get_tracking_uri(project_id) + url = project_manager.get_internal_tracking_uri(project_id) creation_resp = requests.post( url=f'{url}/api/2.0/preview/mlflow/experiments/create', json={'name': name} @@ -132,7 +143,7 @@ def create_experiment( @router.get('/experiments/{experiment_id}', tags=['experiments']) -def get_experiment(experiment_id: Text, project_id: int) -> JSONResponse: +def get_experiment(request: Request, experiment_id: Text, project_id: int) -> JSONResponse: """Get experiment. Args: experiment_id {Text}: experiment id @@ -141,8 +152,10 @@ def get_experiment(experiment_id: Text, project_id: int) -> JSONResponse: starlette.responses.JSONResponse """ + log_request(request) + project_manager = ProjectManager() - url = project_manager.get_tracking_uri(project_id) + url = project_manager.get_internal_tracking_uri(project_id) experiment_resp = requests.get( url=f'{url}/api/2.0/preview/mlflow/experiments/get?experiment_id={experiment_id}' ) @@ -183,7 +196,7 @@ def get_experiment(experiment_id: Text, project_id: int) -> JSONResponse: @router.delete('/experiments/{experiment_id}', tags=['experiments']) -def delete_experiment(experiment_id: Text, project_id: int) -> JSONResponse: +def delete_experiment(request: Request, experiment_id: Text, project_id: int) -> JSONResponse: """Delete experiment. Args: experiment_id {Text}: experiment id @@ -192,8 +205,13 @@ def delete_experiment(experiment_id: Text, project_id: int) -> JSONResponse: starlette.responses.JSONResponse """ + log_request(request, { + 'project_id': project_id, + 'experiment_id': experiment_id + }) + project_manager = ProjectManager() - url = project_manager.get_tracking_uri(project_id) + url = project_manager.get_internal_tracking_uri(project_id) experiment_resp = requests.post( url=f'{url}/api/2.0/preview/mlflow/experiments/delete', json={'experiment_id': experiment_id} diff --git a/services/projects/src/routers/misc.py b/services/projects/src/routers/misc.py index e86b3dc..23fc453 100644 --- a/services/projects/src/routers/misc.py +++ b/services/projects/src/routers/misc.py @@ -5,22 +5,25 @@ from fastapi import APIRouter from starlette.responses import JSONResponse +from starlette.requests import Request from typing import Text from projects.src.project_management import ProjectManager -from projects.src.utils import system_stat +from projects.src.utils import system_stat, log_request router = APIRouter() # pylint: disable=invalid-name @router.get('/stat', tags=['miscellaneous']) -def stat() -> JSONResponse: +def stat(request: Request) -> JSONResponse: """Get statistics of resources usage. 
Returns: starlette.responses.JSONResponse """ + log_request(request) + project_manager = ProjectManager() return JSONResponse(dict( @@ -30,10 +33,12 @@ def stat() -> JSONResponse: @router.get('/healthcheck', tags=['miscellaneous']) -def healthcheck() -> Text: +def healthcheck(request: Request) -> Text: """Get healthcheck. Returns: Text: OK """ + log_request(request) + return 'OK' diff --git a/services/projects/src/routers/projects.py b/services/projects/src/routers/projects.py index 87819e3..0f4ea8d 100644 --- a/services/projects/src/routers/projects.py +++ b/services/projects/src/routers/projects.py @@ -6,28 +6,34 @@ from fastapi import APIRouter, Form from http import HTTPStatus import requests -from starlette.responses import JSONResponse, Response +from starlette.responses import JSONResponse +from starlette.requests import Request from typing import Text -from projects.src.project_management import ProjectManager from common.utils import error_response +from projects.src.project_management import ProjectManager +from projects.src.utils import log_request router = APIRouter() # pylint: disable=invalid-name @router.get('/projects', tags=['projects']) -def list_projects() -> JSONResponse: +def list_projects(request: Request) -> JSONResponse: """Get projects list. Returns: starlette.responses.JSONResponse """ + + log_request(request) + project_manager = ProjectManager() projects = project_manager.list_projects() return JSONResponse(projects) @router.post('/projects', tags=['projects']) -def create_project(name: Text = Form(...), description: Text = Form('')) -> JSONResponse: +def create_project(request: Request, name: Text = Form(...), + description: Text = Form('')) -> JSONResponse: """Create project. Args: name {Text}: project name @@ -36,6 +42,11 @@ def create_project(name: Text = Form(...), description: Text = Form('')) -> JSON starlette.responses.JSONResponse """ + log_request(request, { + 'name': name, + 'description': description + }) + project_manager = ProjectManager() project_id = project_manager.create_project(name, description) project = project_manager.get_project(project_id) @@ -44,7 +55,7 @@ def create_project(name: Text = Form(...), description: Text = Form('')) -> JSON @router.get('/projects/{project_id}', tags=['projects']) -def get_project(project_id: int) -> JSONResponse: # pylint: disable=invalid-name,redefined-builtin +def get_project(request: Request, project_id: int) -> JSONResponse: # pylint: disable=invalid-name,redefined-builtin """Get project. Args: project_id {int}: project id @@ -52,13 +63,15 @@ def get_project(project_id: int) -> JSONResponse: # pylint: disable=invalid-nam starlette.responses.JSONResponse """ + log_request(request) + project_manager = ProjectManager() project = project_manager.get_project(project_id) return JSONResponse(project) @router.put('/projects/{project_id}', tags=['projects']) -def update_project(project_id: int, name: Text = Form(None), +def update_project(request: Request, project_id: int, name: Text = Form(None), description: Text = Form(None)) -> JSONResponse: """Update project. 
Args: @@ -69,6 +82,12 @@ def update_project(project_id: int, name: Text = Form(None), starlette.responses.JSONResponse """ + log_request(request, { + 'project_id': project_id, + 'name': name, + 'description': description + }) + project_manager = ProjectManager() if name is not None: project_manager.update_project_name(project_id, name) @@ -82,7 +101,7 @@ def update_project(project_id: int, name: Text = Form(None), @router.delete('/projects/{project_id}', tags=['projects']) -def delete_project(project_id: int) -> JSONResponse: # pylint: disable=invalid-name,redefined-builtin +def delete_project(request: Request, project_id: int) -> JSONResponse: # pylint: disable=invalid-name,redefined-builtin """Delete project. Args: project_id {int}: project id @@ -90,6 +109,10 @@ def delete_project(project_id: int) -> JSONResponse: # pylint: disable=invalid- starlette.responses.JSONResponse """ + log_request(request, { + 'project_id': project_id + }) + project_manager = ProjectManager() project = project_manager.get_project(project_id) project_manager.terminate(project_id) @@ -99,7 +122,7 @@ def delete_project(project_id: int) -> JSONResponse: # pylint: disable=invalid- @router.get('/projects/{project_id}/healthcheck', tags=['projects']) -def project_healthcheck(project_id: int) -> JSONResponse: # pylint: disable=invalid-name,redefined-builtin +def project_healthcheck(request: Request, project_id: int) -> JSONResponse: # pylint: disable=invalid-name,redefined-builtin """Get project healthcheck (check if project's tracking server process was started). Args: project_id {int}: project id @@ -107,6 +130,8 @@ def project_healthcheck(project_id: int) -> JSONResponse: # pylint: disable=inv starlette.responses.JSONResponse """ + log_request(request) + project_manager = ProjectManager() project = project_manager.get_project(project_id) is_running = project_manager._is_running(project_id) @@ -118,7 +143,7 @@ def project_healthcheck(project_id: int) -> JSONResponse: # pylint: disable=inv @router.put('/projects/{project_id}/run', tags=['projects']) -def run_project(project_id: int) -> JSONResponse: # pylint: disable=invalid-name,redefined-builtin +def run_project(request: Request, project_id: int) -> JSONResponse: # pylint: disable=invalid-name,redefined-builtin """Run project's tracking server. Args: project_id {int}: project id @@ -126,6 +151,10 @@ def run_project(project_id: int) -> JSONResponse: # pylint: disable=invalid-nam starlette.responses.JSONResponse """ + log_request(request, { + 'project_id': project_id + }) + project_manager = ProjectManager() running = project_manager.run(project_id) @@ -140,7 +169,7 @@ def run_project(project_id: int) -> JSONResponse: # pylint: disable=invalid-nam @router.put('/projects/{project_id}/terminate', tags=['projects']) -def terminate_project(project_id: int) -> JSONResponse: # pylint: disable=invalid-name,redefined-builtin +def terminate_project(request: Request, project_id: int) -> JSONResponse: # pylint: disable=invalid-name,redefined-builtin """Terminate project's tracking server. 
Args: project_id {int}: project id @@ -148,6 +177,10 @@ def terminate_project(project_id: int) -> JSONResponse: # pylint: disable=inval starlette.responses.JSONResponse """ + log_request(request, { + 'project_id': project_id + }) + project_manager = ProjectManager() project_manager.terminate(project_id) project = project_manager.get_project(project_id) @@ -156,7 +189,7 @@ def terminate_project(project_id: int) -> JSONResponse: # pylint: disable=inval @router.put('/projects/{project_id}/archive', tags=['projects']) -def archive(project_id: int) -> JSONResponse: # pylint: disable=invalid-name,redefined-builtin +def archive(request: Request, project_id: int) -> JSONResponse: # pylint: disable=invalid-name,redefined-builtin """Archive project. Args: id {int}: project id @@ -164,6 +197,10 @@ def archive(project_id: int) -> JSONResponse: # pylint: disable=invalid-name,re starlette.responses.JSONResponse """ + log_request(request, { + 'project_id': project_id + }) + project_manager = ProjectManager() project_manager.terminate(project_id) project_manager.archive(project_id) @@ -173,7 +210,7 @@ def archive(project_id: int) -> JSONResponse: # pylint: disable=invalid-name,re @router.put('/projects/{project_id}/restore', tags=['projects']) -def restore(project_id: int) -> JSONResponse: # pylint: disable=invalid-name,redefined-builtin +def restore(request: Request, project_id: int) -> JSONResponse: # pylint: disable=invalid-name,redefined-builtin """Restore project. Args: project_id {int}: project id @@ -181,6 +218,10 @@ def restore(project_id: int) -> JSONResponse: # pylint: disable=invalid-name,re starlette.responses.JSONResponse """ + log_request(request, { + 'project_id': project_id + }) + project_manager = ProjectManager() project_manager.restore(project_id) project = project_manager.get_project(project_id) @@ -189,7 +230,7 @@ def restore(project_id: int) -> JSONResponse: # pylint: disable=invalid-name,re @router.get('/projects/{project_id}/ping', tags=['projects']) -def ping(project_id: int) -> JSONResponse: +def ping(request: Request, project_id: int) -> JSONResponse: """Ping project's tracking server. 
Args: project_id {int}: project id @@ -197,8 +238,10 @@ def ping(project_id: int) -> JSONResponse: starlette.responses.JSONResponse """ + log_request(request) + project_manager = ProjectManager() - url = project_manager.get_tracking_uri(project_id) + url = project_manager.get_internal_tracking_uri(project_id) project = project_manager.get_project(project_id) try: diff --git a/services/projects/src/routers/registered_models.py b/services/projects/src/routers/registered_models.py index 4512ae8..0b2f5e8 100644 --- a/services/projects/src/routers/registered_models.py +++ b/services/projects/src/routers/registered_models.py @@ -7,19 +7,21 @@ from http import HTTPStatus import requests from starlette.responses import JSONResponse +from starlette.requests import Request from typing import Optional, Text +from common.utils import error_response, is_model, ModelDoesNotExistError from projects.src.project_management import ProjectManager from projects.src.routers.utils import get_model_versions, filter_model_versions, \ check_if_project_and_model_exist -from common.utils import error_response, is_model, ModelDoesNotExistError +from projects.src.utils import log_request router = APIRouter() # pylint: disable=invalid-name @router.get('/registered-models', tags=['registered-models']) -def list_models(project_id: int) -> JSONResponse: +def list_models(request: Request, project_id: int) -> JSONResponse: """Get models list. Args: project_id {int}: project id @@ -27,8 +29,10 @@ def list_models(project_id: int) -> JSONResponse: starlette.responses.JSONResponse """ + log_request(request) + project_manager = ProjectManager() - url = project_manager.get_tracking_uri(project_id) + url = project_manager.get_internal_tracking_uri(project_id) resp = requests.get(f'{url}/api/2.0/preview/mlflow/registered-models/list') registered_models = [] @@ -46,6 +50,7 @@ def list_models(project_id: int) -> JSONResponse: @router.post('/registered-models', tags=['registered-models']) def register_model( + request: Request, project_id: int, name: Text = Form(...), source: Text = Form(...), @@ -61,11 +66,18 @@ def register_model( starlette.responses.JSONResponse """ + log_request(request, { + 'project_id': project_id, + 'name': name, + 'source': source, + 'run_id': run_id + }) + project_manager = ProjectManager() if not is_model(source): raise ModelDoesNotExistError(f'Model {source} does not exist or is not MLflow model') - url = project_manager.get_tracking_uri(project_id) + url = project_manager.get_internal_tracking_uri(project_id) requests.post( url=f'{url}/api/2.0/preview/mlflow/registered-models/create', json={'name': name} @@ -94,7 +106,7 @@ def register_model( @router.get('/registered-models/{model_id}', tags=['registered-models']) -def get_model(model_id: Text, project_id: int) -> JSONResponse: +def get_model(request: Request, model_id: Text, project_id: int) -> JSONResponse: """Get model. 
Args: @@ -104,8 +116,10 @@ def get_model(model_id: Text, project_id: int) -> JSONResponse: starlette.responses.JSONResponse """ + log_request(request) + project_manager = ProjectManager() - url = project_manager.get_tracking_uri(project_id) + url = project_manager.get_internal_tracking_uri(project_id) model_resp = requests.post( url=f'{url}/api/2.0/preview/mlflow/registered-models/get-details', json={'registered_model': {'name': model_id}} @@ -129,7 +143,7 @@ def get_model(model_id: Text, project_id: int) -> JSONResponse: @router.delete('/registered-models/{model_id}', tags=['registered-models']) -def delete_model(model_id: Text, project_id: int) -> JSONResponse: +def delete_model(request: Request, model_id: Text, project_id: int) -> JSONResponse: """Delete model. Args: model_id {Text}: model id (name) @@ -138,8 +152,13 @@ def delete_model(model_id: Text, project_id: int) -> JSONResponse: starlette.responses.JSONResponse """ + log_request(request, { + 'project_id': project_id, + 'model_id': model_id + }) + project_manager = ProjectManager() - url = project_manager.get_tracking_uri(project_id) + url = project_manager.get_internal_tracking_uri(project_id) model_resp = requests.delete( url=f'{url}/api/2.0/preview/mlflow/registered-models/delete', json={'registered_model': {'name': model_id}} @@ -155,7 +174,8 @@ def delete_model(model_id: Text, project_id: int) -> JSONResponse: @router.get('/model-versions', tags=['model-versions']) -def list_model_versions(project_id: int, model_id: Optional[Text] = None) -> JSONResponse: +def list_model_versions(request: Request, project_id: int, + model_id: Optional[Text] = None) -> JSONResponse: """Get model versions list. Args: project_id {int}: project id @@ -164,6 +184,8 @@ def list_model_versions(project_id: int, model_id: Optional[Text] = None) -> JSO starlette.responses.JSONResponse """ + log_request(request) + model_versions = get_model_versions(project_id) if model_id is not None: @@ -193,7 +215,8 @@ def list_model_versions(project_id: int, model_id: Optional[Text] = None) -> JSO @router.get('/model-versions/{version}', tags=['model-versions']) -def get_model_version(version: Text, project_id: int, model_id: Text) -> JSONResponse: +def get_model_version(request: Request, version: Text, project_id: int, + model_id: Text) -> JSONResponse: """Get model versions list. Args: project_id {int}: project id @@ -202,6 +225,8 @@ def get_model_version(version: Text, project_id: int, model_id: Text) -> JSONRes starlette.responses.JSONResponse """ + log_request(request) + check_if_project_and_model_exist(project_id, model_id) model_versions = filter_model_versions(get_model_versions(project_id), model_id) diff --git a/services/projects/src/routers/runs.py b/services/projects/src/routers/runs.py index 70abfb3..a13b432 100644 --- a/services/projects/src/routers/runs.py +++ b/services/projects/src/routers/runs.py @@ -6,17 +6,19 @@ from http import HTTPStatus import requests from starlette.responses import JSONResponse +from starlette.requests import Request from typing import Text -from projects.src.project_management import ProjectManager from common.utils import error_response +from projects.src.project_management import ProjectManager +from projects.src.utils import log_request router = APIRouter() # pylint: disable=invalid-name @router.get('/runs', tags=['runs']) -def list_runs(project_id: int, experiment_id: Text) -> JSONResponse: +def list_runs(request: Request, project_id: int, experiment_id: Text) -> JSONResponse: """Get runs list. 
Args: project_id {int}: project id @@ -25,8 +27,10 @@ def list_runs(project_id: int, experiment_id: Text) -> JSONResponse: starlette.responses.JSONResponse """ + log_request(request) + project_manager = ProjectManager() - url = project_manager.get_tracking_uri(project_id) + url = project_manager.get_internal_tracking_uri(project_id) resp = requests.post( url=f'{url}/api/2.0/preview/mlflow/runs/search', json={'experiment_ids': [experiment_id]} @@ -47,7 +51,7 @@ def list_runs(project_id: int, experiment_id: Text) -> JSONResponse: @router.get('/runs/{run_id}', tags=['runs']) -def get_run(run_id: Text, project_id: int) -> JSONResponse: +def get_run(request: Request, run_id: Text, project_id: int) -> JSONResponse: """Get run. Args: run_id {Text}: run id @@ -56,8 +60,10 @@ def get_run(run_id: Text, project_id: int) -> JSONResponse: starlette.responses.JSONResponse """ + log_request(request) + project_manager = ProjectManager() - url = project_manager.get_tracking_uri(project_id) + url = project_manager.get_internal_tracking_uri(project_id) resp = requests.get( url=f'{url}/api/2.0/preview/mlflow/runs/get?run_id={run_id}', ) @@ -74,7 +80,7 @@ def get_run(run_id: Text, project_id: int) -> JSONResponse: @router.delete('/runs/{run_id}', tags=['runs']) -def delete_run(run_id: Text, project_id: int) -> JSONResponse: +def delete_run(request: Request, run_id: Text, project_id: int) -> JSONResponse: """Delete run. Args: run_id {Text}: run id @@ -83,8 +89,13 @@ def delete_run(run_id: Text, project_id: int) -> JSONResponse: starlette.responses.JSONResponse """ + log_request(request, { + 'project_id': project_id, + 'run_id': run_id + }) + project_manager = ProjectManager() - url = project_manager.get_tracking_uri(project_id) + url = project_manager.get_internal_tracking_uri(project_id) resp = requests.post( url=f'{url}/api/2.0/preview/mlflow/runs/delete', json={'run_id': run_id} diff --git a/services/projects/src/routers/utils.py b/services/projects/src/routers/utils.py index e5a5821..6cfc969 100644 --- a/services/projects/src/routers/utils.py +++ b/services/projects/src/routers/utils.py @@ -22,7 +22,7 @@ def get_model_versions(project_id: int) -> List[Dict]: """ project_manager = ProjectManager() - tracking_uri = project_manager.get_tracking_uri(project_id) + tracking_uri = project_manager.get_internal_tracking_uri(project_id) model_versions_resp = requests.get( url=f'{tracking_uri}/api/2.0/preview/mlflow/model-versions/search' ) @@ -76,7 +76,7 @@ def check_if_project_and_model_exist(project_id: int, model_id: Text) -> None: """ project_manager = ProjectManager() - url = project_manager.get_tracking_uri(project_id) + url = project_manager.get_internal_tracking_uri(project_id) if url is None: raise ProjectNotFoundError(f'Project with ID {project_id} not found') @@ -101,7 +101,7 @@ def get_model_version_uri(project_id: int, model_id: Text, version: Text) -> Tex """ project_manager = ProjectManager() - url = project_manager.get_tracking_uri(project_id) + url = project_manager.get_internal_tracking_uri(project_id) model_resp = requests.post( url=f'{url}/api/2.0/preview/mlflow/registered-models/get-details', json={'registered_model': {'name': model_id}} diff --git a/services/projects/src/utils.py b/services/projects/src/utils.py index 135d42f..c207da3 100644 --- a/services/projects/src/utils.py +++ b/services/projects/src/utils.py @@ -2,10 +2,16 @@ # pylint: disable=wrong-import-order -import datetime +import json import psutil -import requests -from typing import Dict, Text +from starlette.requests import 
Request +from typing import Dict + +from projects.src.config import Config + + +conf = Config() +logger = conf.get_logger(__name__) def bytes2mb(byte_number: int) -> int: @@ -109,20 +115,20 @@ def process_stat(pid: int) -> Dict: } -def get_utc_timestamp() -> Text: - """Get utc timestamp. - Returns: - Text: utc timestamp - """ - - return str(datetime.datetime.utcnow().timestamp()) +def log_request(request: Request, body_params: Dict = None): + ip = request.headers.get('X-Forwarded-For', request.client.host) + host = request.client.host.split(':', 1)[0] + query_params = dict(request.query_params) -def get_external_ip() -> Text: - """Get external ip address. - Returns: - Text: ip address - """ + log_params = [ + ('method', request.method), + ('path', request.url.path), + ('ip', ip), + ('host', host), + ('params', query_params), + ('body', body_params or {}) + ] - resp = requests.get('https://api.ipify.org/?format=json') - return resp.json().get('ip') \ No newline at end of file + line = json.dumps(dict(log_params)) + logger.debug(line) \ No newline at end of file diff --git a/services/projects/startup.sh b/services/projects/startup.sh new file mode 100755 index 0000000..29b20db --- /dev/null +++ b/services/projects/startup.sh @@ -0,0 +1,7 @@ +sudo chown user $WORKSPACE + +while ! pg_isready -h $DB_HOST -p $DB_PORT; do + echo "Wait for db server..." && sleep 1s; +done; + +uvicorn --host 0.0.0.0 --port 8080 src.app:app --reload \ No newline at end of file diff --git a/services/projects/tests/test_app.py b/services/projects/tests/integration/test_app.py similarity index 99% rename from services/projects/tests/test_app.py rename to services/projects/tests/integration/test_app.py index dc0fc1d..ebd8e19 100644 --- a/services/projects/tests/test_app.py +++ b/services/projects/tests/integration/test_app.py @@ -12,7 +12,6 @@ @pytest.fixture(scope='module') def client(): - # config.WORKSPACE = 'tmp_ws' app.init() return TestClient(app.app) @@ -49,7 +48,7 @@ def wait_loading(url: Text, timeout: int = 10): def teardown_module(): - shutil.rmtree(config.WORKSPACE, ignore_errors=True) + shutil.rmtree(config.Config().get('WORKSPACE'), ignore_errors=True) # /projects (GET) - list project diff --git a/services/projects/tests/test.sh b/services/projects/tests/test.sh index 98ed8ef..7cb3aa2 100755 --- a/services/projects/tests/test.sh +++ b/services/projects/tests/test.sh @@ -1,10 +1,64 @@ -#!/usr/bin/env bash +#!/usr/bin/env sh # Run tests in docker + +realpath() { + OURPWD=$PWD + cd "$(dirname "$1")" + LINK=$(readlink "$(basename "$1")") + while [ "$LINK" ]; do + cd "$(dirname "$LINK")" + LINK=$(readlink "$(basename "$1")") + done + REALPATH="$PWD/$(basename "$1")" + cd "$OURPWD" + echo "$REALPATH" +} + KEYS=$1 +CURDIR=$(dirname $(realpath "$0")) + +NETWORK=mlpanel-test-projects-network +PROJECTS_DB_NAME=projects_db +POSTGRES_USER=user +POSTGRES_PASSWORD=passw +DB_HOST=mlpanel-database-test +DB_PORT=5432 + +docker network rm $NETWORK +docker network create -d bridge --internal $NETWORK -docker run -v $(pwd):/home/projects \ - -v $(pwd)/../../common:/home/common \ +echo "Up PostgreSQL..." +docker run \ + -d \ + --network=$NETWORK \ + --name=$DB_HOST \ + -e POSTGRES_PASSWORD=passw \ + -e POSTGRES_USER=user \ + -e PGDATA=/var/lib/postgresql/data/pgdata \ + --rm \ + --expose $DB_PORT \ + postgres:12.2 + +echo "Wait for postgres server..." +docker exec \ + $DB_HOST \ + /bin/bash -c \ + 'while ! pg_isready; do sleep 1; done;' + +echo "Run tests..." 
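# Illustrative note (assumption, based on the KEYS=$1 assignment above): any quoted pytest
# arguments are forwarded into the test container below, e.g.
#   ./test.sh "-v -k create_project"
# would run only the matching integration tests.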
+docker run --network=$NETWORK \ + -v $CURDIR/..:/home/projects \ + -v $CURDIR/../../../common:/home/common \ -e WORKSPACE=/home/workspace \ -e ARTIFACT_STORE=mlruns \ - -e MLFLOW_TRACKING_SERVERS_PORTS_RANGE=5000-5020 \ - mlpanel-projects /bin/bash -c "pytest $KEYS tests/" \ No newline at end of file + -e TRACKING_SERVER_PORTS=5000-5100 \ + -e PROJECTS_DB_NAME=$PROJECTS_DB_NAME \ + -e POSTGRES_USER=$POSTGRES_USER \ + -e POSTGRES_PASSWORD=$POSTGRES_PASSWORD \ + -e DB_HOST=$DB_HOST \ + -e DB_PORT=$DB_PORT \ + mlrepa/mlpanel-base-projects:latest \ + /bin/bash -c "pytest $KEYS tests/integration" + + +docker stop $DB_HOST \ No newline at end of file diff --git a/services/projects/tests/test_project_management.py b/services/projects/tests/test_project_management.py deleted file mode 100644 index e68cf95..0000000 --- a/services/projects/tests/test_project_management.py +++ /dev/null @@ -1,107 +0,0 @@ -import os -import pytest -import shutil -import sqlite3 - -from projects.src import config -from projects.src.project_management import ProjectManager, BadProjectNameError, \ - ProjectAlreadyExistsError, ProjectNotFoundError - - -class TestTrackingServerManager: - - def setup_method(self): - - os.makedirs(config.WORKSPACE, exist_ok=True) - self.tsm = ProjectManager() - - def teardown_method(self): - - shutil.rmtree(config.WORKSPACE, ignore_errors=True) - - def test_init(self): - - assert os.path.exists(os.path.join(config.WORKSPACE, config.PROJECTS_DB_NAME)) is True - - def test_create_project(self): - - project = 'New Project' - self.tsm.create_project(project) - - conn = sqlite3.connect(os.path.join(config.WORKSPACE, config.PROJECTS_DB_NAME)) - cur = conn.cursor() - cur.execute('SELECT * FROM project') - - project_id, project_name, project_description, port, path, archived, created_at, pid = cur.fetchone() - - assert project_id == 1 - assert project_name == project - assert project_description == '' - assert isinstance(port, int) - assert path == os.path.join(config.WORKSPACE, '1') - assert os.path.exists(path) is True - assert archived == 0 - assert pid == -1 - - conn.close() - - def test_create_project_already_exists(self): - - project = 'New Project' - self.tsm.create_project(project) - - with pytest.raises(ProjectAlreadyExistsError): - self.tsm.create_project(project) - - @pytest.mark.parametrize( - 'name', - [1, None, object] - ) - def test_create_project_bad_name(self, name): - - with pytest.raises(BadProjectNameError): - self.tsm.create_project(name) - - def test_list_projects_no_projects(self): - - assert self.tsm.list_projects() == list() - - def test_list_projects(self): - - projects = ['proj1', 'proj2', 'proj3'] - - for proj in projects: - self.tsm.create_project(proj) - - assert set(projects) == set([p.get('name') for p in self.tsm.list_projects()]) - - def test_get_project_no_projects(self): - - with pytest.raises(ProjectNotFoundError): - assert self.tsm.get_project(id=0) - - def test_archive(self): - - project_name = 'New Project' - project_id = self.tsm.create_project(project_name) - self.tsm.archive(project_id) - project_info = self.tsm.get_project(project_id) - - assert project_info['status'] == 'archived' - - def test_run_terminate_server(self): - - project_name = 'Running Project' - - project_id = self.tsm.create_project(project_name) - self.tsm.run(project_id) - - assert self.tsm._is_running(project_id) is True - - self.tsm.terminate(project_id) - - assert self.tsm._is_running(project_id) is False - - - - diff --git a/services/ui/Dockerfile b/services/ui/Dockerfile deleted 
file mode 100644 index 489f5be..0000000 --- a/services/ui/Dockerfile +++ /dev/null @@ -1,20 +0,0 @@ -FROM ubuntu:16.04 - -# Install NodeJS -RUN apt-get update && \ - apt-get install -y git curl python3-software-properties && \ - curl -sL https://deb.nodesource.com/setup_12.x | bash - && \ - apt-get install -y nodejs - -# Clone UI repository -RUN cd /home && \ - git clone https://github.com/mlrepa/mlpanel-ui.git && \ - cd mlpanel-ui && \ - git checkout $MLPANEL_UI_BRANCH - -WORKDIR /home/mlpanel-ui - -RUN npm install - -# At each run check updates -CMD git fetch && git checkout $MLPANEL_UI_BRANCH && git pull origin $MLPANEL_UI_BRANCH && npm start diff --git a/start.sh b/start.sh new file mode 100755 index 0000000..56232d2 --- /dev/null +++ b/start.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +source functions.sh +handle_help start +check_input_edition + +if [ 'base' == $EDITION ]; then + export $(cat config/base/.env | grep "^[^#;]") + docker-compose -f config/base/docker-compose.yaml up +else + echo "Edition $EDITION undefined" +fi; + diff --git a/stop.sh b/stop.sh new file mode 100755 index 0000000..943b462 --- /dev/null +++ b/stop.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +source functions.sh +handle_help stop +check_input_edition + +if [ 'base' == "$EDITION" ]; then + export $(cat config/base/.env | grep "^[^#;]") + docker-compose -f config/base/docker-compose.yaml down +else + echo "Edition $EDITION undefined" +fi; \ No newline at end of file diff --git a/swagger.yaml b/swagger.yaml index 146979c..f8a1325 100644 --- a/swagger.yaml +++ b/swagger.yaml @@ -7,63 +7,50 @@ info: email: apiteam@swagger.io license: name: Apache 2.0 - url: http://www.apache.org/licenses/LICENSE-2.0.html + url: https://www.apache.org/licenses/LICENSE-2.0.html version: 1.0.0 +security: + - bearerAuth: [] servers: - - url: http://localhost:8080/ + - url: http://localhost:8089/ + - url: https://localhost:8089/ + paths: - /login: - post: - summary: Login - requestBody: - content: - multipart/form-data: - schema: - required: - - password - - username - properties: - username: - type: string - description: username - password: - type: string - description: password - required: true - responses: - 302: - description: Redirect to /index - content: {} - /logout: - get: - summary: Logout - responses: - 302: - description: Logout and redirect to /index - content: {} - /register: + /token: post: - summary: Register new user + summary: Get access token requestBody: content: multipart/form-data: schema: required: - - password - - username + - client_id + - client_secret properties: - username: + client_id: type: string - description: username - password: + description: Client id + client_secret: type: string - description: password + description: Client secret required: true responses: - 302: - description: 'Redirect: registered=>/index; username is busy || to short - password=>/registration' - content: {} + 200: + description: Access token + content: + application/json: + schema: + type: object + properties: + access_token: + description: Access token + type: string + 401: + description: Access denied + 403: + description: Forbidden + 500: + $ref: '#/components/responses/InternalServerError' /projects: get: @@ -1149,8 +1136,50 @@ paths: - data properties: data: - type: string - example: '[[1,2,3,4]]' + type: object + properties: + schema: + type: object + properties: + fields: + type: array + items: + description: Column names and types + type: object + properties: + name: + description: Column name + type: string + type: + 
description: Column type + type: string + data: + type: array + items: + type: object + example: + { + 'data': + { + 'schema': { + 'fields': [ + {'name': 'sepal_length', 'type': 'number'}, + {'name': 'sepal_width', 'type': 'number'}, + {'name': 'petal_length', 'type': 'number'}, + {'name': 'petal_width', 'type': 'number'} + ] + }, + 'data': [ + { + 'sepal_length': 6.1, + 'sepal_width': 2.8, + 'petal_length': 4.7, + 'petal_width': 1.2 + } + ] + } + } + required: true responses: 200: @@ -1163,6 +1192,15 @@ paths: prediction: description: Prediction result type: string + 400: + description: Bad schema + content: + application/json: + schema: + type: object + properties: + anomalies: + $ref: '#/components/schemas/DataAnomalies' 404: $ref: '#/components/responses/NotFound' 500: @@ -1190,6 +1228,84 @@ paths: 500: $ref: '#/components/responses/InternalServerError' + /deployments/{deployment_id}/schema: + get: + tags: + - deployments + summary: Get deployment schema + parameters: + - name: deployment_id + in: path + description: Deployment ID + required: true + schema: + type: integer + responses: + 200: + description: Tensorflow data validation statistics converted to dictionary + 404: + $ref: '#/components/responses/NotFound' + 500: + $ref: '#/components/responses/InternalServerError' + + /deployments/{deployment_id}/validation-report: + get: + tags: + - deployments + summary: Get validation reports for specified period + parameters: + - name: deployment_id + in: path + description: Deployment ID + required: true + schema: + type: integer + - name: timestamp_from + in: query + description: Timestamp of period start + required: true + schema: + type: number + format: float + - name: timestamp_to + in: query + description: Timestamp of period end + required: true + schema: + type: number + format: float + responses: + 200: + description: Validation report + content: + application/json: + schema: + type: object + properties: + timestamp_from: + description: Timestamp of period start + type: number + format: float + timestamp_to: + description: Timestamp of period end + type: number + format: float + model_statistics: + description: TFDV statistics for model train data + type: object + incoming_data_statistics: + description: TFDV statistics for data incoming to predict for specified period + type: object + anomalies_detected: + description: Anomalies detected flag + type: boolean + anomalies_info: + $ref: '#/components/schemas/DataAnomalies' + 404: + $ref: '#/components/responses/NotFound' + 500: + $ref: '#/components/responses/InternalServerError' + /artifacts: get: tags: @@ -1267,6 +1383,12 @@ paths: components: + securitySchemes: + bearerAuth: # arbitrary name for the security scheme + type: http + scheme: bearer + bearerFormat: JWT + responses: NotFound: description: The specified resource was not found @@ -1589,6 +1711,45 @@ components: - code - message + DataAnomalies: + type: object + properties: + : + type: object + properties: + description: + description: Anomaly description + type: string + severity: + description: Anomaly severity + type: string + shortDescription: + description: Anomaly short description + type: string + reason: + type: array + items: + type: object + properties: + type: + description: Error type + type: string + shortDescription: + description: Anomaly short description + type: string + description: + description: Anomaly description + type: string + path: + type: object + properties: + step: + type: string + example: + + + + CPU: type: "object" properties: