
Commit

Merge pull request #2869 from chaoss/augur_0.76.1
Augur 0.76.1 Release
sgoggins authored Jul 23, 2024
2 parents 6bf35a5 + 0210296 commit 5483169
Showing 21 changed files with 438 additions and 306 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -9,6 +9,7 @@ augur_export_env.sh
!docker.config.json
config.yml
reports.yml
+*.pid

node_modules/
.idea/
36 changes: 3 additions & 33 deletions .pylintrc
@@ -3,16 +3,6 @@
# go here to check pylint codes if not explained
#https://vald-phoenix.github.io/pylint-errors/

-#doc string checkers
-#enable=C0112,C0114,C0115,C0116
-
-# checks for black listed names being used
-#enable=C0102
-
-#refactoring checker
-#enable=R
-
disable=E0611,E1101,W1203,R0801,W0614,W0611,C0411,C0103,C0301,C0303,C0304,C0305,W0311,E0401,C0116


# Analyse import fallback blocks. This can be used to support both Python 2 and
@@ -150,29 +140,9 @@ confidence=HIGH,
INFERENCE_FAILURE,
UNDEFINED

-# Disable the message, report, category or checker with the given id(s). You
-# can either give multiple identifiers separated by comma (,) or put this
-# option multiple times (only on the command line, not in the configuration
-# file where it should appear only once). You can also use "--disable=all" to
-# disable everything first and then re-enable specific checks. For example, if
-# you want to run only the similarities checker, you can use "--disable=all
-# --enable=similarities". If you want to run only the classes checker, but have
-# no Warning level messages displayed, use "--disable=all --enable=classes
-# --disable=W".
-disable=raw-checker-failed,
-        bad-inline-option,
-        locally-disabled,
-        file-ignored,
-        suppressed-message,
-        useless-suppression,
-        deprecated-pragma,
-        use-symbolic-message-instead
-
-# Enable the message, report, category or checker with the given id(s). You can
-# either give multiple identifier separated by comma (,) or put this option
-# multiple time (only on the command line, not in the configuration file where
-# it should appear only once). See also the "--disable" option for examples.
-enable=c-extension-no-member
+# Only enable specific messages
+disable=all
+enable=unused-import,redefined-outer-name,E1206,E1205,E0704,E0107,E4702,E1101,E0211,E0213,E0103,E1133,E1120,E3102,E0602,E1123,E0001,W0702,W1404,W0706,W0101,W0120,W0718,R1737,R1705,R1720,R1724,R1723,R0401,R1701,C1802,C0200,C0501,C0201,W1001,E1102,R0923


[LOGGING]
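The lint policy is inverted here: instead of maintaining a short deny-list, the new .pylintrc disables everything and re-enables a curated set of checks. A minimal sketch of exercising the same pattern programmatically, assuming pylint is installed; the target filename and the trimmed check list are illustrative, not from this commit:

```python
# Run pylint with the "disable everything, re-enable a few" pattern the new
# .pylintrc uses. Run(..., exit=False) returns control to the caller instead
# of calling sys.exit().
from pylint.lint import Run

results = Run(
    [
        "--disable=all",
        "--enable=unused-import,redefined-outer-name,E0602",  # trimmed list for illustration
        "some_module.py",  # placeholder target
    ],
    exit=False,
)
print(results.linter.stats.global_note)  # overall score on pylint's 0-10 scale
```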
4 changes: 2 additions & 2 deletions README.md
@@ -1,4 +1,4 @@
-# Augur NEW Release v0.76.0
+# Augur NEW Release v0.76.1

Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else!
The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io
@@ -10,7 +10,7 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o
## NEW RELEASE ALERT!
### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md)

-Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.0
+Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.1

- The `main` branch is a stable version of our new architecture, which features:
- Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks.
9 changes: 8 additions & 1 deletion augur/api/routes/util.py
@@ -5,13 +5,20 @@
import sqlalchemy as s
import pandas as pd
import json
-from flask import Response, current_app
+from flask import Response, current_app, jsonify

from augur.application.db.lib import get_value
from augur.application.logs import AugurLogger

logger = AugurLogger("augur").get_logger()

@app.route("/api")
def get_api_version():
return jsonify({
"status": "up",
"route": AUGUR_API_VERSION
})

@app.route('/{}/repo-groups'.format(AUGUR_API_VERSION))
def get_all_repo_groups(): #TODO: make this name automatic - wrapper?
repoGroupsSQL = s.sql.text("""
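The new /api route is a liveness probe: it reports that the server is up and which versioned API root to use. A quick sketch of calling it, assuming a local instance on port 5000; the port and the exact AUGUR_API_VERSION value are assumptions, not part of this diff:

```python
# Probe the liveness endpoint added in augur/api/routes/util.py.
import requests

response = requests.get("http://localhost:5000/api", timeout=5)
response.raise_for_status()
payload = response.json()
print(payload["status"])  # expected: "up"
print(payload["route"])   # the versioned API root, e.g. "api/unstable"
```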
29 changes: 24 additions & 5 deletions augur/application/cli/backend.py
@@ -10,9 +10,10 @@
import logging
import psutil
import signal
-from redis.exceptions import ConnectionError as RedisConnectionError
import uuid
import traceback
+import requests
+from redis.exceptions import ConnectionError as RedisConnectionError
from urllib.parse import urlparse

from augur.tasks.start_tasks import augur_collection_monitor, create_collection_status_records
@@ -38,14 +39,17 @@ def cli(ctx):
@cli.command("start")
@click.option("--disable-collection", is_flag=True, default=False, help="Turns off data collection workers")
@click.option("--development", is_flag=True, default=False, help="Enable development mode, implies --disable-collection")
@click.option("--pidfile", default="main.pid", help="File to store the controlling process ID in")
@click.option('--port')
@test_connection
@test_db_connection
@with_database
@click.pass_context
-def start(ctx, disable_collection, development, port):
+def start(ctx, disable_collection, development, pidfile, port):
"""Start Augur's backend server."""

with open(pidfile, "w") as pidfile:
pidfile.write(str(os.getpid()))

    try:
        if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1":
            raise_open_file_limit(100000)
Expand Down Expand Up @@ -75,9 +79,25 @@ def start(ctx, disable_collection, development, port):
    gunicorn_command = f"gunicorn -c {gunicorn_location} -b {host}:{port} augur.api.server:app --log-file gunicorn.log"
    server = subprocess.Popen(gunicorn_command.split(" "))

-    time.sleep(3)
+    logger.info("awaiting Gunicorn start")
+    while not server.poll():
+        try:
+            api_response = requests.get(f"http://{host}:{port}/api")
+        except requests.exceptions.ConnectionError as e:
+            time.sleep(0.5)
+            continue
+
+        if not api_response.ok:
+            logger.critical("Gunicorn failed to start or was not reachable. Exiting")
+            exit(247)
+        break
+    else:
+        logger.critical("Gunicorn was shut down abnormally. Exiting")
+        exit(247)
+
    logger.info('Gunicorn webserver started...')
    logger.info(f'Augur is running at: {"http" if development else "https"}://{host}:{port}')
+    logger.info(f"The API is available at '{api_response.json()['route']}'")

processes = start_celery_worker_processes(float(worker_vmem_cap), disable_collection)

@@ -91,7 +111,6 @@ def start(ctx, disable_collection, development, port):
    celery_beat_process = subprocess.Popen(celery_command.split(" "))

    if not disable_collection:
-
        with DatabaseSession(logger, engine=ctx.obj.engine) as session:

            clean_collection_status(session)
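With this change, `augur backend start` records its PID in a pidfile and, rather than sleeping a fixed 3 seconds, polls the new /api route until Gunicorn answers, exiting with code 247 if the server dies or responds abnormally. A standalone sketch of that poll-until-ready pattern; the helper name and the explicit timeout are illustrative additions, since the committed loop bounds itself on server.poll() rather than a clock:

```python
# Poll a URL until it answers 2xx, or give up after a deadline.
import time
import requests

def wait_until_ready(url: str, timeout: float = 30.0, interval: float = 0.5) -> bool:
    """Return True once `url` answers with a 2xx response, False on timeout."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            if requests.get(url, timeout=interval).ok:
                return True
        except requests.exceptions.ConnectionError:
            pass  # socket not accepting connections yet
        time.sleep(interval)
    return False

if not wait_until_ready("http://127.0.0.1:5000/api"):
    raise SystemExit(247)  # mirrors the exit code used in this commit
```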
4 changes: 2 additions & 2 deletions augur/application/db/lib.py
@@ -249,15 +249,15 @@ def bulk_insert_dicts(logger, data: Union[List[dict], dict], table, natural_keys
        data = [data]

    else:
-        logger.info("Data must be a list or a dict")
+        logger.error("Data must be a list or a dict")
        return None

    if len(data) == 0:
        # self.logger.info("Gave no data to insert, returning...")
        return None

    if isinstance(data[0], dict) is False:
-        logger.info("Must be list of dicts")
+        logger.error("Must be list of dicts")
        return None

    # remove any duplicate data
72 changes: 35 additions & 37 deletions augur/tasks/git/dependency_tasks/core.py
@@ -7,6 +7,7 @@
from augur.tasks.util.worker_util import parse_json_from_subprocess_call
from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path
from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth
+from augur.tasks.util.metadata_exception import MetadataException


def generate_deps_data(logger, repo_git):
@@ -94,50 +95,47 @@ def generate_scorecard(logger, repo_git):

    try:
        required_output = parse_json_from_subprocess_call(logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard)
-    except Exception as e:
-        logger.error(f"Could not parse required output! Error: {e}")
-        raise e
-
-    # end
-
-    logger.info('adding to database...')
-    logger.debug(f"output: {required_output}")
+        logger.info('adding to database...')
+        logger.debug(f"output: {required_output}")

-    if not required_output['checks']:
-        logger.info('No scorecard checks found!')
-        return
-
-    #Store the overall score first
-    to_insert = []
-    overall_deps_scorecard = {
-        'repo_id': repo_id,
-        'name': 'OSSF_SCORECARD_AGGREGATE_SCORE',
-        'scorecard_check_details': required_output['repo'],
-        'score': required_output['score'],
-        'tool_source': 'scorecard_model',
-        'tool_version': '0.43.9',
-        'data_source': 'Git',
-        'data_collection_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
-    }
-    to_insert.append(overall_deps_scorecard)
-    # bulk_insert_dicts(overall_deps_scorecard, RepoDepsScorecard, ["repo_id","name"])
-
-    #Store misc data from scorecard in json field.
-    for check in required_output['checks']:
-        repo_deps_scorecard = {
+        if not required_output['checks']:
+            logger.info('No scorecard checks found!')
+            return
+
+        #Store the overall score first
+        to_insert = []
+        overall_deps_scorecard = {
            'repo_id': repo_id,
-            'name': check['name'],
-            'scorecard_check_details': check,
-            'score': check['score'],
+            'name': 'OSSF_SCORECARD_AGGREGATE_SCORE',
+            'scorecard_check_details': required_output['repo'],
+            'score': required_output['score'],
            'tool_source': 'scorecard_model',
            'tool_version': '0.43.9',
            'data_source': 'Git',
            'data_collection_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
        }
-        to_insert.append(repo_deps_scorecard)
-
-    bulk_insert_dicts(logger, to_insert, RepoDepsScorecard, ["repo_id","name"])
-
-    logger.info(f"Done generating scorecard for repo {repo_id} from path {path}")
+        to_insert.append(overall_deps_scorecard)
+        # bulk_insert_dicts(overall_deps_scorecard, RepoDepsScorecard, ["repo_id","name"])
+
+        #Store misc data from scorecard in json field.
+        for check in required_output['checks']:
+            repo_deps_scorecard = {
+                'repo_id': repo_id,
+                'name': check['name'],
+                'scorecard_check_details': check,
+                'score': check['score'],
+                'tool_source': 'scorecard_model',
+                'tool_version': '0.43.9',
+                'data_source': 'Git',
+                'data_collection_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
+            }
+            to_insert.append(repo_deps_scorecard)
+
+        bulk_insert_dicts(logger, to_insert, RepoDepsScorecard, ["repo_id","name"])
+
+        logger.info(f"Done generating scorecard for repo {repo_id} from path {path}")
+
+    except Exception as e:
+
+        raise MetadataException(e, f"required_output: {required_output}")

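The scorecard ingestion is now wrapped so any failure is re-raised as a MetadataException that carries the raw scorecard output alongside the original error. The class definition lives in augur/tasks/util/metadata_exception.py, which this page does not show; a plausible sketch inferred from the two-argument call site above, not the actual augur source:

```python
# Assumed shape of MetadataException, inferred from the call site
# MetadataException(e, f"required_output: {required_output}").
class MetadataException(Exception):
    """Wraps an original exception together with contextual metadata."""

    def __init__(self, original_exception: Exception, metadata: str):
        self.original_exception = original_exception
        self.metadata = metadata
        super().__init__(f"{original_exception} | context: {metadata}")
```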