[CELERY] Integration
StanGirard committed Jul 23, 2020
2 parents 1fb107e + 4855f38 commit 8146eb6
Showing 42 changed files with 831 additions and 318 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -4,3 +4,4 @@ visited.db
test.py
.idea/*
.env
dump.rdb
22 changes: 13 additions & 9 deletions README.md
@@ -29,7 +29,10 @@ I've grown tired of SEO agencies making us pay hundreds of euros for simple tool

## Installation

-You need **Python3**
+You need:
+- **Python3**
+- **[Redis Server](https://redis.io/topics/quickstart)**


```Bash
git clone https://github.com/StanGirard/SEOToolkit
@@ -42,24 +45,25 @@ Then install dependencies
pip install -r requirements.txt
```

-or you can use Docker
+## Running

+### Flask
```Bash
-docker pull stangirard/osat:latest
+python3 run.py
```

-## Running
-
+### Redis Server
```Bash
-flask run
+redis-server
```

-or with docker
-
+### Celery Worker
```Bash
-docker run -d -p 5000:5000 stangirard/osat:latest
+celery worker -A celery_worker.celery --loglevel=info
```



## Dashboard

You can access the dashboard by going to [localhost:5000](http://localhost:5000)
5 changes: 5 additions & 0 deletions celery_worker.py
@@ -0,0 +1,5 @@
from toolkit import celery
from toolkit.factory import create_app
from toolkit.celery_utils import init_celery
app = create_app()
init_celery(celery, app)
4 changes: 4 additions & 0 deletions config.py
@@ -15,6 +15,10 @@ class Config:
    FLASK_APP = environ.get('FLASK_APP', "SEOToolkit")
    FLASK_ENV = environ.get('FLASK_ENV', 'development')
    GOOGLE_API_KEY = environ.get('GOOGLE_API_KEY', "None")

    # Celery
    CELERY_BROKER_URL = environ.get('CELERY_BROKER_URL','redis://localhost:6379/0')
    CELERY_RESULT_BACKEND = environ.get('CELERY_RESULT_BACKEND','redis://localhost:6379/0')

    # Database
    SQLALCHEMY_DATABASE_URI = environ.get("SQLALCHEMY_DATABASE_URI", "sqlite:///database.db")
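Both the broker and the result backend default to database 0 of a local Redis instance; these are the keys that `init_celery` later carries across with `celery.conf.update(app.config)`. A minimal sketch of overriding them through the environment before `Config` is imported (the alternate database number is an arbitrary example, not something this commit configures):

```Python
# Hypothetical override; assumes config.py sits at the project root as in this diff.
from os import environ

environ.setdefault("CELERY_BROKER_URL", "redis://localhost:6379/1")
environ.setdefault("CELERY_RESULT_BACKEND", "redis://localhost:6379/1")

from config import Config  # environ.get runs when the class body is executed

print(Config.CELERY_BROKER_URL)      # redis://localhost:6379/1
print(Config.CELERY_RESULT_BACKEND)  # redis://localhost:6379/1
```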
30 changes: 16 additions & 14 deletions requirements.txt
@@ -1,18 +1,20 @@
SQLAlchemy==1.3.13
matplotlib==3.1.2
inscriptis==0.0.4.1.1
seaborn==0.10.1
numpy==1.18.1
beautifulsoup4==4.9.1
bokeh==2.0.2
requests==2.20.0
nltk==3.4.5
Flask_SQLAlchemy==2.4.1
pandas==1.0.1
celery==4.4.6
Flask_SQLAlchemy==2.4.3
Flask==1.1.1
networkx==2.4
scipy==1.4.0
google==2.0.3
gensim==3.8.1
beautifulsoup4==4.9.1
python-dotenv==0.13.0
google==2.0.3
inscriptis==0.0.4.1.1
matplotlib==3.1.2
networkx==2.4
nltk==3.4.5
numpy==1.18.1
pandas==1.0.1
python-dotenv==0.14.0
requests==2.20.0
redis==3.5.3
scikit_learn==0.23.1
scipy==1.4.0
seaborn==0.10.1
SQLAlchemy==1.3.18
6 changes: 6 additions & 0 deletions run.py
@@ -0,0 +1,6 @@
from toolkit import factory
import toolkit

if __name__ == "__main__":
    app = factory.create_app(celery=toolkit.celery)
    app.run(host='0.0.0.0')
5 changes: 5 additions & 0 deletions run.sh
@@ -0,0 +1,5 @@
redis-server &
python3 run.py &
celery worker -A celery_worker.celery --loglevel=info --pool=solo


17 changes: 6 additions & 11 deletions toolkit/__init__.py
@@ -1,15 +1,10 @@
-from flask import Flask
+from celery import Celery
from flask_sqlalchemy import SQLAlchemy
dbAlchemy = SQLAlchemy()

+def make_celery(app_name=__name__):
+    backend = "redis://localhost:6379/0"
+    broker = backend.replace("0", "1")
+    return Celery(app_name, backend=backend, broker=broker)

-def create_app():
-    """Construct the core application."""
-    app = Flask(__name__)
-    app.config.from_object('config.Config')
-    dbAlchemy.init_app(app)
-
-    with app.app_context():
-        import toolkit.routes # Import routes
-        dbAlchemy.create_all() # Create sql tables for our data models
-        return app
+celery = make_celery()
8 changes: 8 additions & 0 deletions toolkit/celery_utils.py
@@ -0,0 +1,8 @@
def init_celery(celery, app):
    celery.conf.update(app.config)
    TaskBase = celery.Task
    class ContextTask(TaskBase):
        def __call__(self, *args, **kwargs):
            with app.app_context():
                return TaskBase.__call__(self, *args, **kwargs)
    celery.Task = ContextTask
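`init_celery` copies the Flask configuration onto the Celery instance and swaps in a `ContextTask` base class, so every task body runs inside `app.app_context()`. A minimal sketch (not part of this commit; the task is hypothetical) of why that matters for tasks that touch the database, assuming `init_celery(celery, app)` has already run as in `celery_worker.py`:

```Python
from toolkit import celery, dbAlchemy as db
from toolkit.models import Audit

@celery.task(name="CountAudits")  # illustrative task, not one defined by this commit
def count_audits():
    # ContextTask wraps this call in app.app_context(), so the SQLAlchemy
    # session bound to the Flask app can be used without extra setup.
    return db.session.query(Audit).count()
```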
14 changes: 14 additions & 0 deletions toolkit/celeryapp/__init__.py
@@ -0,0 +1,14 @@
from celery import Celery

def make_celery(app):
    celery = Celery(app.import_name, backend=app.config['CELERY_RESULT_BACKEND'],
                    broker=app.config['CELERY_BROKER_URL'])
    celery.conf.update(app.config)
    TaskBase = celery.Task
    class ContextTask(TaskBase):
        abstract = True
        def __call__(self, *args, **kwargs):
            with app.app_context():
                return TaskBase.__call__(self, *args, **kwargs)
    celery.Task = ContextTask
    return celery
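This factory variant builds a Celery instance straight from an already-configured Flask app, reading the broker and backend from the `CELERY_*` keys added to `config.py`. A hedged usage sketch (the commit itself wires the worker through `toolkit.celery` and `init_celery` instead):

```Python
from toolkit.factory import create_app   # factory module referenced by celery_worker.py
from toolkit.celeryapp import make_celery

app = create_app()
celery = make_celery(app)
# celery.conf now carries CELERY_BROKER_URL / CELERY_RESULT_BACKEND from Config,
# and tasks declared on this instance run inside app.app_context() via ContextTask.
```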
88 changes: 88 additions & 0 deletions toolkit/celeryapp/tasks.py
@@ -0,0 +1,88 @@
import math
import time
from datetime import datetime
import json

from celery.signals import task_prerun, worker_process_init
from flask import current_app as app
from sqlalchemy import update
from toolkit import celery
from toolkit import dbAlchemy as db
from toolkit.controller.graphs.core import generate_interactive_graph
from toolkit.controller.keywords.core import get_query_results
from toolkit.controller.seo.audit import get_all_links_website
from toolkit.controller.seo.headers import find_all_headers_url
from toolkit.controller.seo.images import find_all_images
from toolkit.controller.seo.lighthouse import audit_google_lighthouse_full
from toolkit.controller.seo.links import find_all_links
from toolkit.controller.serp.core import query_domain_serp
from toolkit.models import Audit, LighthouseScore

# @task_prerun.connect
# def celery_prerun(*args, **kwargs):
# #print g
# print("Launching Celery App")

@celery.task(bind=True,name="Lighthouse")
def LighthouseAudit(self,url):
    new_score = LighthouseScore(
        url = url,status_job="RUNNING",task_id=str(self.request.id), accessibility=0,pwa=0,seo=0, best_practices=0,performance=0, begin_date=datetime.now()
    )
    db.session.add(new_score)
    db.session.commit()
    value = audit_google_lighthouse_full(url)
    accessibility = int(math.floor(value["lighthouseResult"]["categories"]["accessibility"]["score"] * 100))
    seo = int(math.floor(value["lighthouseResult"]["categories"]["seo"]["score"] * 100))
    pwa = int(math.floor(value["lighthouseResult"]["categories"]["pwa"]["score"] * 100))
    best_practices = int(math.floor(value["lighthouseResult"]["categories"]["best-practices"]["score"] * 100))
    performance = int(math.floor(value["lighthouseResult"]["categories"]["performance"]["score"] * 100))
    conn = db.engine.connect()
    smt = update(LighthouseScore).where(LighthouseScore.url == url).values(accessibility=accessibility,pwa=pwa,seo=seo, best_practices=best_practices,performance=performance, status_job="FINISHED")
    conn.execute(smt)
    return {'url': url, 'status': 'Task completed!'}

@celery.task(bind=True,name="Graphs")
def GraphsGenerate(self,domain):
    result = generate_interactive_graph(domain,str(self.request.id), False, 500)
    return {'url': domain, 'status': 'Task completed!'}

@celery.task(bind=True,name="SerpRank")
def SerpRank(self,query, domain, lang, tld):
    result = query_domain_serp(query, domain, lang, tld, str(self.request.id))
    return {'url': domain, 'status': 'Task completed!'}

@celery.task(bind=True,name="Keywords")
def KeywordsGet(self,query):
    result = get_query_results(query, str(self.request.id))
    return {'url': query, 'status': 'Task completed!'}

@celery.task(bind=True,name="Extract")
def Extractor(self,extract_type, url):
    new_audit = Audit(
        url=url, result=None, type_audit=extract_type,status_job="RUNNING",task_id=str(self.request.id), begin_date=datetime.now()
    )
    db.session.add(new_audit)
    db.session.commit()
    if extract_type == "Headers":
        value = find_all_headers_url(url)
        conn = db.engine.connect()
        smt = update(Audit).where(Audit.url == url).where(Audit.type_audit == extract_type).values(result=json.dumps(value), status_job="FINISHED")
        conn.execute(smt)
    if extract_type == "Links":
        value = find_all_links(url)
        conn = db.engine.connect()
        smt = update(Audit).where(Audit.url == url).where(Audit.type_audit == extract_type).values(result=json.dumps(value), status_job="FINISHED")
        conn.execute(smt)
    if extract_type == "Links_Website":
        value = get_all_links_website(url)
        conn = db.engine.connect()
        smt = update(Audit).where(Audit.url == url).where(Audit.type_audit == extract_type).values(result=json.dumps(value), status_job="FINISHED")
        conn.execute(smt)
    if extract_type == "Images":
        print("hello")
        value = find_all_images(url)
        conn = db.engine.connect()
        smt = update(Audit).where(Audit.url == url).where(Audit.type_audit == extract_type).values(result=json.dumps(value), status_job="FINISHED")
        conn.execute(smt)

    return {'url': url,"Extract": extract_type, 'status': 'Task completed!'}
80 changes: 50 additions & 30 deletions toolkit/controller/graphs/core.py
@@ -1,22 +1,26 @@
import logging
import math
import urllib
import urllib.parse
from datetime import datetime, timedelta

from bokeh.embed import components
import urllib.parse
import networkx as nx
import seaborn as sns
from bokeh.embed import components
from bokeh.layouts import row
from bokeh.models import (BoxZoomTool, Circle, ColorBar, ColumnDataSource,
DataTable, HoverTool, MultiLine, Range1d, ResetTool,
TableColumn)
from bokeh.models.graphs import NodesAndLinkedEdges
from bokeh.palettes import Spectral4, Spectral6, Spectral8
from bokeh.plotting import figure, from_networkx
from bokeh.models import (BoxZoomTool, Circle, HoverTool,
MultiLine, Range1d, ResetTool, ColorBar,
ColumnDataSource, DataTable, TableColumn)
from bokeh.transform import linear_cmap
from bokeh.palettes import Spectral4, Spectral8, Spectral6
from bokeh.models.graphs import NodesAndLinkedEdges
from bokeh.layouts import row
from flask import render_template

from flask import render_template, request
from sqlalchemy import update
from toolkit import dbAlchemy as db
from toolkit.lib.api_tools import generate_answer
from toolkit.lib.http_tools import request_parse, request_status_code
import seaborn as sns
import logging
from toolkit.models import Graphs

palette = sns.color_palette("hls", 99)
pal_hex_lst = palette.as_hex()
@@ -153,33 +157,49 @@ def update_or_insert_graph_in_db(conn, urls, maximum, update=False):
return render_template("graphs/bokeh.jinja2", script=script, div=div, domain=domain, template="Flask", time=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))


def generate_interactive_graph(conn, urls, relaunch, maxi_urls):




def update_or_insert_graph_in_db( urls, maximum, updating=False):
plot, domain = generate_graph_internal_link_interactive(urls, maximum)
script, div = components(plot)
conn = db.engine.connect()
smt = update(Graphs).where(Graphs.urls == urls).values(script= script,
div = div, begin_date=datetime.now(), status_job="FINISHED")
conn.execute(smt)
return render_template("graphs/bokeh.jinja2", script=script, div=div, domain=domain, template="Flask", time=datetime.now())

def generate_interactive_graph(urls, relaunch,task, maxi_urls):
if urls is None:
return "Empty Url paramaters"
maximum_urls = 500
if maxi_urls is not None:
maximum_urls = int(maxi_urls)
stopped, already_exists = graphs.check_status_url(conn, urls, "FINISHED")

if stopped == True:

# If not first time
if already_exists:
query_result = graphs.select_visited(conn, urls)
urls_exists = Graphs.query.filter(Graphs.urls == urls).count()
if urls_exists > 0:
stopped = Graphs.query.filter(Graphs.urls == urls and Graphs.status_job == "RUNNING").first()
if stopped.status_job == "FINISHED":
query_result = Graphs.query.filter(Graphs.urls == urls and Graphs.status_job == "RUNNING").first()
# ALREADY VISITED IN THE LAST 24 HOURS

if datetime.strptime(query_result[0][2], '%m/%d/%Y, %H:%M:%S') + timedelta(hours=24) > datetime.now() and relaunch != "True":
return render_template("graphs/bokeh.jinja2", script=query_result[0][3], div=query_result[0][4], domain=urllib.parse.urlparse(query_result[0][1]).netloc, template="Flask", time=datetime.strptime(query_result[0][2], '%m/%d/%Y, %H:%M:%S'))
if query_result.begin_date + timedelta(hours=24) > datetime.now() and relaunch != "True":
return render_template("graphs/bokeh.jinja2", script=query_result.script, div=query_result.div, domain=urllib.parse.urlparse(query_result.urls).netloc, template="Flask", time=query_result.begin_date)

# More than 24 hours or parameter redo is True
if (datetime.strptime(query_result[0][2], '%m/%d/%Y, %H:%M:%S') + timedelta(hours=24) < datetime.now() or relaunch == "True"):
graphs.update_running_db(conn, ("RUNNING", urls))
return update_or_insert_graph_in_db(conn, urls, maximum_urls, True)
if query_result.begin_date + timedelta(hours=24) < datetime.now() or relaunch == "True":
conn = db.engine.connect()
smt = update(Graphs).where(Graphs.urls == urls).values(status_job="RUNNING")
conn.execute(smt)
return update_or_insert_graph_in_db(urls, maximum_urls, True)

# If first time
else:
graphs.insert_url_db(conn, (urls, datetime.now().strftime(
"%m/%d/%Y, %H:%M:%S"), "", "", "RUNNING"))
return update_or_insert_graph_in_db(conn, urls, maximum_urls)
return {"error": "You graph is being generated. Please wait"}

else:
return "JOB IS ALREADY RUNNING. PLEASE WAIT AND REFRESH."
new_graph = Graphs(
urls = urls, script="", div="", status_job = "RUNNING",task_id=task, begin_date=datetime.now()
)
db.session.add(new_graph)
db.session.commit()
return update_or_insert_graph_in_db(urls, maximum_urls)
30 changes: 30 additions & 0 deletions toolkit/controller/keywords/core.py
@@ -0,0 +1,30 @@
import json
from datetime import datetime

from sqlalchemy import update

from toolkit import dbAlchemy as db
from toolkit.controller.analysis.keywords import generate_results
from toolkit.models import Keywords
from toolkit.lib.api_tools import generate_answer


def get_query_results(query,task, redo=False):
    check_exist = Keywords.query.filter(Keywords.query_text==query).count()
    if check_exist > 0:
        result = Keywords.query.filter(Keywords.query_text==query).first()
        if result.status_job == "RUNNING":
            return {"error": "query is already running, please wait and then refresh"}
        elif result.status_job == "FINISHED":
            return json.loads(result.results)
    else:
        new_keywords = Keywords(query_text=query, results="",
            status_job="RUNNING",task_id=task,begin_date=datetime.now())
        db.session.add(new_keywords)
        db.session.commit()
        results = generate_results(query, 20)
        conn = db.engine.connect()
        smt = update(Keywords).where(Keywords.query_text==query).values(results=json.dumps(results), status_job="FINISHED")
        conn.execute(smt)
        return results
    return "error"
