diff --git a/.gitignore b/.gitignore index 46e492cd47..8b8fb107ac 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ augur_export_env.sh !docker.config.json config.yml reports.yml +*.pid node_modules/ .idea/ diff --git a/.pylintrc b/.pylintrc index aec2f59d4c..c319333602 100644 --- a/.pylintrc +++ b/.pylintrc @@ -3,16 +3,6 @@ # go here to check pylint codes if not explained #https://vald-phoenix.github.io/pylint-errors/ -#doc string checkers -#enable=C0112,C0114,C0115,C0116 - -# checks for black listed names being used -#enable=C0102 - -#refactoring checker -#enable=R - -disable=E0611,E1101,W1203,R0801,W0614,W0611,C0411,C0103,C0301,C0303,C0304,C0305,W0311,E0401,C0116 # Analyse import fallback blocks. This can be used to support both Python 2 and @@ -150,29 +140,9 @@ confidence=HIGH, INFERENCE_FAILURE, UNDEFINED -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once). You can also use "--disable=all" to -# disable everything first and then re-enable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use "--disable=all --enable=classes -# --disable=W". -disable=raw-checker-failed, - bad-inline-option, - locally-disabled, - file-ignored, - suppressed-message, - useless-suppression, - deprecated-pragma, - use-symbolic-message-instead - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. -enable=c-extension-no-member +# Only enable specific messages +disable=all +enable=unused-import,redefined-outer-name,E1206,E1205,E0704,E0107,E4702,E1101,E0211,E0213,E0103,E1133,E1120,E3102,E0602,E1123,E0001,W0702,W1404,W0706,W0101,W0120,W0718,R1737,R1705,R1720,R1724,R1723,R0401,R1701,C1802,C0200,C0501,C0201,W1001,E1102,R0923 [LOGGING] diff --git a/README.md b/README.md index e5dde81d04..b5b55526dc 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Augur NEW Release v0.76.0 +# Augur NEW Release v0.76.1 Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else! The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io @@ -10,7 +10,7 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o ## NEW RELEASE ALERT! ### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md) -Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.0 +Augur is now releasing a dramatically improved new version to the main branch. 
It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.1 - The `main` branch is a stable version of our new architecture, which features: - Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks. diff --git a/augur/api/routes/util.py b/augur/api/routes/util.py index 457afaf6ed..16d3c4db1d 100644 --- a/augur/api/routes/util.py +++ b/augur/api/routes/util.py @@ -5,13 +5,20 @@ import sqlalchemy as s import pandas as pd import json -from flask import Response, current_app +from flask import Response, current_app, jsonify from augur.application.db.lib import get_value from augur.application.logs import AugurLogger logger = AugurLogger("augur").get_logger() +@app.route("/api") +def get_api_version(): + return jsonify({ + "status": "up", + "route": AUGUR_API_VERSION + }) + @app.route('/{}/repo-groups'.format(AUGUR_API_VERSION)) def get_all_repo_groups(): #TODO: make this name automatic - wrapper? repoGroupsSQL = s.sql.text(""" diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index ab8810a5a0..4a6df4d9d4 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -10,9 +10,10 @@ import logging import psutil import signal -from redis.exceptions import ConnectionError as RedisConnectionError import uuid import traceback +import requests +from redis.exceptions import ConnectionError as RedisConnectionError from urllib.parse import urlparse from augur.tasks.start_tasks import augur_collection_monitor, create_collection_status_records @@ -38,14 +39,17 @@ def cli(ctx): @cli.command("start") @click.option("--disable-collection", is_flag=True, default=False, help="Turns off data collection workers") @click.option("--development", is_flag=True, default=False, help="Enable development mode, implies --disable-collection") +@click.option("--pidfile", default="main.pid", help="File to store the controlling process ID in") @click.option('--port') @test_connection @test_db_connection @with_database @click.pass_context -def start(ctx, disable_collection, development, port): +def start(ctx, disable_collection, development, pidfile, port): """Start Augur's backend server.""" - + with open(pidfile, "w") as pidfile: + pidfile.write(str(os.getpid())) + try: if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": raise_open_file_limit(100000) @@ -75,9 +79,25 @@ def start(ctx, disable_collection, development, port): gunicorn_command = f"gunicorn -c {gunicorn_location} -b {host}:{port} augur.api.server:app --log-file gunicorn.log" server = subprocess.Popen(gunicorn_command.split(" ")) - time.sleep(3) + logger.info("awaiting Gunicorn start") + while not server.poll(): + try: + api_response = requests.get(f"http://{host}:{port}/api") + except requests.exceptions.ConnectionError as e: + time.sleep(0.5) + continue + + if not api_response.ok: + logger.critical("Gunicorn failed to start or was not reachable. Exiting") + exit(247) + break + else: + logger.critical("Gunicorn was shut down abnormally. 
Exiting") + exit(247) + logger.info('Gunicorn webserver started...') logger.info(f'Augur is running at: {"http" if development else "https"}://{host}:{port}') + logger.info(f"The API is available at '{api_response.json()['route']}'") processes = start_celery_worker_processes(float(worker_vmem_cap), disable_collection) @@ -91,7 +111,6 @@ def start(ctx, disable_collection, development, port): celery_beat_process = subprocess.Popen(celery_command.split(" ")) if not disable_collection: - with DatabaseSession(logger, engine=ctx.obj.engine) as session: clean_collection_status(session) diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index 7fb5ce0598..8db495c764 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -249,7 +249,7 @@ def bulk_insert_dicts(logger, data: Union[List[dict], dict], table, natural_keys data = [data] else: - logger.info("Data must be a list or a dict") + logger.error("Data must be a list or a dict") return None if len(data) == 0: @@ -257,7 +257,7 @@ def bulk_insert_dicts(logger, data: Union[List[dict], dict], table, natural_keys return None if isinstance(data[0], dict) is False: - logger.info("Must be list of dicts") + logger.error("Must be list of dicts") return None # remove any duplicate data diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py index 76c1b15098..8d60901ede 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/augur/tasks/git/dependency_tasks/core.py @@ -7,6 +7,7 @@ from augur.tasks.util.worker_util import parse_json_from_subprocess_call from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from augur.tasks.util.metadata_exception import MetadataException def generate_deps_data(logger, repo_git): @@ -94,50 +95,47 @@ def generate_scorecard(logger, repo_git): try: required_output = parse_json_from_subprocess_call(logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) - except Exception as e: - logger.error(f"Could not parse required output! Error: {e}") - raise e - - # end - logger.info('adding to database...') - logger.debug(f"output: {required_output}") + logger.info('adding to database...') + logger.debug(f"output: {required_output}") - if not required_output['checks']: - logger.info('No scorecard checks found!') - return - - #Store the overall score first - to_insert = [] - overall_deps_scorecard = { - 'repo_id': repo_id, - 'name': 'OSSF_SCORECARD_AGGREGATE_SCORE', - 'scorecard_check_details': required_output['repo'], - 'score': required_output['score'], - 'tool_source': 'scorecard_model', - 'tool_version': '0.43.9', - 'data_source': 'Git', - 'data_collection_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') - } - to_insert.append(overall_deps_scorecard) - # bulk_insert_dicts(overall_deps_scorecard, RepoDepsScorecard, ["repo_id","name"]) - - #Store misc data from scorecard in json field. 
- for check in required_output['checks']: - repo_deps_scorecard = { + if not required_output['checks']: + logger.info('No scorecard checks found!') + return + + #Store the overall score first + to_insert = [] + overall_deps_scorecard = { 'repo_id': repo_id, - 'name': check['name'], - 'scorecard_check_details': check, - 'score': check['score'], + 'name': 'OSSF_SCORECARD_AGGREGATE_SCORE', + 'scorecard_check_details': required_output['repo'], + 'score': required_output['score'], 'tool_source': 'scorecard_model', 'tool_version': '0.43.9', 'data_source': 'Git', 'data_collection_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') } - to_insert.append(repo_deps_scorecard) - - bulk_insert_dicts(logger, to_insert, RepoDepsScorecard, ["repo_id","name"]) - - logger.info(f"Done generating scorecard for repo {repo_id} from path {path}") + to_insert.append(overall_deps_scorecard) + # bulk_insert_dicts(overall_deps_scorecard, RepoDepsScorecard, ["repo_id","name"]) + #Store misc data from scorecard in json field. + for check in required_output['checks']: + repo_deps_scorecard = { + 'repo_id': repo_id, + 'name': check['name'], + 'scorecard_check_details': check, + 'score': check['score'], + 'tool_source': 'scorecard_model', + 'tool_version': '0.43.9', + 'data_source': 'Git', + 'data_collection_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') + } + to_insert.append(repo_deps_scorecard) + + bulk_insert_dicts(logger, to_insert, RepoDepsScorecard, ["repo_id","name"]) + + logger.info(f"Done generating scorecard for repo {repo_id} from path {path}") + except Exception as e: + + raise MetadataException(e, f"required_output: {required_output}") diff --git a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py index 49ff2dc14b..e898f0b6df 100644 --- a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py +++ b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py @@ -25,8 +25,6 @@ # Returns None on failure. # NOTE: This function is being deprecated in favor of retrieve_dict_from_endpoint def request_dict_from_endpoint(logger, session, url, timeout_wait=10): - #logger.info(f"Hitting endpoint: {url}") - attempts = 0 response_data = None success = False @@ -35,7 +33,7 @@ def request_dict_from_endpoint(logger, session, url, timeout_wait=10): try: response = hit_api(session.oauths, url, logger) except TimeoutError: - logger.info( + logger.warning( f"User data request for enriching contributor data failed with {attempts} attempts! Trying again...") time.sleep(timeout_wait) continue @@ -56,20 +54,19 @@ def request_dict_from_endpoint(logger, session, url, timeout_wait=10): #If we get an error message that's not None if err and err != GithubApiResult.SUCCESS: attempts += 1 - logger.info(f"err: {err}") + logger.warning(f"err: {err}") continue - #logger.info(f"Returned dict: {response_data}") success = True break elif type(response_data) == list: logger.warning("Wrong type returned, trying again...") - logger.info(f"Returned list: {response_data}") + logger.debug(f"Returned list: {response_data}") elif type(response_data) == str: - logger.info( + logger.warning( f"Warning! 
page_data was string: {response_data}") if "" in response_data: - logger.info("HTML was returned, trying again...\n") + logger.warning("HTML was returned, trying again...\n") elif len(response_data) == 0: logger.warning("Empty string, trying again...\n") else: @@ -95,7 +92,6 @@ def request_dict_from_endpoint(logger, session, url, timeout_wait=10): def create_endpoint_from_email(email): - #self.logger.info(f"Trying to resolve contributor from email: {email}") # Note: I added "+type:user" to avoid having user owned organizations be returned # Also stopped splitting per note above. url = 'https://api.github.com/search/users?q={}+in:email+type:user'.format( @@ -106,7 +102,7 @@ def create_endpoint_from_email(email): def create_endpoint_from_commit_sha(logger, commit_sha, repo_id): - logger.info( + logger.debug( f"Trying to create endpoint from commit hash: {commit_sha}") # https://api.github.com/repos/chaoss/augur/commits/53b0cc122ac9ecc1588d76759dc2e8e437f45b48 @@ -119,23 +115,19 @@ def create_endpoint_from_commit_sha(logger, commit_sha, repo_id): raise KeyError # Else put into a more readable local var - #logger.info(f"Result: {result}") split_git = result.repo_git.split('/') repo_name_and_org = split_git[-2] + "/" + result.repo_name url = "https://api.github.com/repos/" + repo_name_and_org + "/commits/" + commit_sha - logger.info(f"Url: {url}") + logger.debug(f"Commit Hash URL: {url}") return url # Try to construct the best url to ping GitHub's API for a username given a full name. def create_endpoint_from_name(contributor): - #self.logger.info( - # f"Trying to resolve contributor from name: {contributor}") - # Try to get the 'names' field if 'commit_name' field is not present in contributor data. name_field = 'cmt_author_name' if 'commit_name' in contributor else 'name' @@ -159,20 +151,19 @@ def insert_alias(logger, contributor, email): contributor_table_data = get_contributors_by_github_user_id(contributor["gh_user_id"]) - # self.logger.info(f"Contributor query: {contributor_table_data}") # Handle potential failures if len(contributor_table_data) == 1: - logger.info( + logger.debug( f"cntrb_id {contributor_table_data[0].cntrb_id} found in database and assigned to enriched data") elif len(contributor_table_data) == 0: logger.error("Couldn't find contributor in database. Something has gone very wrong. Augur ran into a contributor whose login can be found in the contributor's table, but cannot be retrieved via the user_id that was gotten using the same login.") raise LookupError else: - logger.info( + logger.warning( f"There are more than one contributors in the table with gh_user_id={contributor['gh_user_id']}") - #logger.info(f"Creating alias for email: {email}") + logger.debug(f"Creating alias for email: {email}") #logger.info(f"{contributor_table_data} has type {type(contributor_table_data)}") # Insert a new alias that corresponds to where the contributor was found @@ -215,8 +206,6 @@ def resolve_if_login_existing(logger, contributor): return True # If not found, return false - logger.info( - f"Contributor not found in contributors table but can be added. 
Adding...") return False """ No longer used after orm upsert implement @@ -278,7 +267,6 @@ def fetch_username_from_email(logger, auth, commit): # email = commit['email_raw'] if 'email_raw' in commit else commit['email_raw'] if len(commit['email_raw']) <= 2: - logger.info("Email less than two characters") return login_json # Don't bother with emails that are blank or less than 2 characters try: @@ -293,9 +281,9 @@ def fetch_username_from_email(logger, auth, commit): # Check if the email result got anything, if it failed try a name search. if login_json is None or 'total_count' not in login_json or login_json['total_count'] == 0: - logger.info( + logger.warning( f"Could not resolve the username from {commit['email_raw']}") - logger.info(f"email api url {url}") + logger.debug(f"email api url {url}") return None else: @@ -321,7 +309,7 @@ def get_login_with_supplemental_data(logger, auth, commit_data): "email": commit_data['email_raw'], "name": commit_data['name'], } - logger.info(f"Inserting data to unresolved: {unresolved}") + logger.debug(f"Inserting data to unresolved: {unresolved}") try: @@ -331,12 +319,12 @@ def get_login_with_supplemental_data(logger, auth, commit_data): logger.error( f"Could not create new unresolved email {unresolved['email']}. Error: {e}") - logger.info( + logger.warning( "Could not resolve the username from the email. Trying a name only search...") try: url = create_endpoint_from_name(commit_data) except Exception as e: - logger.info( + logger.warning( f"Couldn't resolve name url with given data. Reason: {e}") return None @@ -344,11 +332,11 @@ def get_login_with_supplemental_data(logger, auth, commit_data): # total_count is the count of username's found by the endpoint. if login_json is None or 'total_count' not in login_json: - logger.info( + logger.error( "Search query returned an empty response, moving on...\n") return None if login_json['total_count'] == 0: - logger.info( + logger.error( "Search query did not return any results, adding commit's table remains null...\n") return None @@ -380,7 +368,7 @@ def get_login_with_commit_hash(logger, auth, commit_data, repo_id): login_json, _ = retrieve_dict_from_endpoint(logger, auth, url)#request_dict_from_endpoint(session,url) if login_json is None or 'sha' not in login_json: - logger.info(f"Search query returned empty data. Moving on. Data: {login_json}") + logger.debug(f"Search query returned empty data. Moving on. 
Data: {login_json}") return None try: diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index c2a221037f..fdf1f5da81 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -25,7 +25,7 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id alias_table_data = get_contributor_aliases_by_email(email) if len(alias_table_data) >= 1: # Move on if email resolved - logger.info( + logger.debug( f"Email {email} has been resolved earlier.") continue @@ -37,7 +37,7 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id if len(unresolved_query_result) >= 1: - logger.info(f"Commit data with email {email} has been unresolved in the past, skipping...") + logger.debug(f"Commit data with email {email} has been unresolved in the past, skipping...") continue login = None @@ -57,7 +57,7 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id login = get_login_with_commit_hash(logger, auth, contributor, repo_id) if login == None or login == "": - logger.info("Failed to get login from commit hash") + logger.warning("Failed to get login from commit hash") # Try to get the login from supplemental data if not found with the commit hash login = get_login_with_supplemental_data(logger, auth,contributor) @@ -133,9 +133,9 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id # Update alias after insertion. Insertion needs to happen first so we can get the autoincrementkey insert_alias(logger, cntrb, emailFromCommitData) except LookupError as e: - logger.info( + logger.error( ''.join(traceback.format_exception(None, e, e.__traceback__))) - logger.info( + logger.error( f"Contributor id not able to be found in database despite the user_id existing. Something very wrong is happening. Error: {e}") return @@ -152,12 +152,12 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id WHERE email='{}' """.format(escapedEmail)) - logger.info(f"Updating now resolved email {email}") + logger.debug(f"Updating now resolved email {email}") try: execute_sql(query) except Exception as e: - logger.info( + logger.error( f"Deleting now resolved email failed with error: {e}") raise e @@ -288,10 +288,6 @@ def insert_facade_contributors(self, repo_git): AND commits.repo_id = :repo_id """).bindparams(repo_id=repo_id) - #self.logger.info("DEBUG: got passed the sql statement declaration") - # Get a list of dicts that contain the emails and cntrb_id's of commits that appear in the contributor's table. 
- #existing_cntrb_emails = json.loads(pd.read_sql(resolve_email_to_cntrb_id_sql, self.db, params={ - # 'repo_id': repo_id}).to_json(orient="records")) result = execute_sql(resolve_email_to_cntrb_id_sql) existing_cntrb_emails = [dict(row) for row in result.mappings()] @@ -299,6 +295,5 @@ def insert_facade_contributors(self, repo_git): print(existing_cntrb_emails) link_commits_to_contributor(logger, facade_helper,list(existing_cntrb_emails)) - logger.info("Done with inserting and updating facade contributors") return diff --git a/augur/tasks/github/messages.py b/augur/tasks/github/messages.py index d47107c163..86fbe054fa 100644 --- a/augur/tasks/github/messages.py +++ b/augur/tasks/github/messages.py @@ -4,7 +4,7 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.data_parse import * -from augur.tasks.github.util.github_data_access import GithubDataAccess +from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import get_owner_repo @@ -97,7 +97,11 @@ def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger all_data = [] for comment_url in comment_urls: - messages = list(github_data_access.paginate_resource(comment_url)) + try: + messages = list(github_data_access.paginate_resource(comment_url)) + except UrlNotFoundException as e: + logger.warning(e) + continue all_data += messages diff --git a/augur/tasks/github/pull_requests/files_model/core.py b/augur/tasks/github/pull_requests/files_model/core.py index 983ac67595..badc86cd38 100644 --- a/augur/tasks/github/pull_requests/files_model/core.py +++ b/augur/tasks/github/pull_requests/files_model/core.py @@ -1,9 +1,7 @@ import sqlalchemy as s -import httpx -from augur.tasks.github.util.gh_graphql_entities import GraphQlPageCollection +from augur.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess, NotFoundException, InvalidDataException from augur.application.db.models import * from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.lib import bulk_insert_dicts, execute_sql from augur.application.db.util import execute_session_query from augur.application.db.lib import get_secondary_data_last_collected, get_updated_prs @@ -38,6 +36,8 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth, full_collection repo = execute_session_query(query, 'one') owner, name = get_owner_repo(repo.repo_git) + github_graphql_data_access = GithubGraphQlDataAccess(key_auth, logger) + pr_file_rows = [] logger.info(f"Getting pull request files for repo: {repo.repo_git}") for index, pr_info in enumerate(pr_numbers): @@ -67,29 +67,32 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth, full_collection } """ - values = ("repository", "pullRequest", "files") + values = ["repository", "pullRequest", "files"] params = { 'owner': owner, 'repo': name, 'pr_number': pr_info['pr_src_number'], - 'values': values } try: - file_collection = GraphQlPageCollection(query, key_auth, logger, bind=params) + for pr_file in github_graphql_data_access.paginate_resource(query, params, values): + + if not pr_file or 'path' not in pr_file: + continue + + data = { + 'pull_request_id': pr_info['pull_request_id'], + 'pr_file_additions': pr_file['additions'] if 'additions' in pr_file 
else None, + 'pr_file_deletions': pr_file['deletions'] if 'deletions' in pr_file else None, + 'pr_file_path': pr_file['path'], + 'data_source': 'GitHub API', + 'repo_id': repo.repo_id, + } - pr_file_rows += [{ - 'pull_request_id': pr_info['pull_request_id'], - 'pr_file_additions': pr_file['additions'] if 'additions' in pr_file else None, - 'pr_file_deletions': pr_file['deletions'] if 'deletions' in pr_file else None, - 'pr_file_path': pr_file['path'], - 'data_source': 'GitHub API', - 'repo_id': repo.repo_id, - } for pr_file in file_collection if pr_file and 'path' in pr_file] - except httpx.RequestError as e: - logger.error(f"An error occurred while requesting data from the GitHub API: {e}") - except httpx.HTTPStatusError as e: - logger.error(f"HTTP error occurred while requesting data from the GitHub API: {e}") + pr_file_rows.append(data) + except (NotFoundException, InvalidDataException) as e: + logger.warning(e) + continue if len(pr_file_rows) > 0: # Execute a bulk upsert with sqlalchemy diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index dd31f726f4..c581ceb357 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -5,8 +5,7 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask, AugurSecondaryRepoCollectionTask from augur.application.db.data_parse import * -from augur.tasks.github.util.github_data_access import GithubDataAccess -from augur.tasks.github.util.github_paginator import GithubPaginator +from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Contributor, Repo @@ -28,7 +27,6 @@ def collect_pull_requests(repo_git: str, full_collection: bool) -> int: logger = logging.getLogger(collect_pull_requests.__name__) with GithubTaskManifest(logger) as manifest: - #with GithubTaskManifest() as manifest: augur_db = manifest.augur_db @@ -50,18 +48,18 @@ def collect_pull_requests(repo_git: str, full_collection: bool) -> int: all_data.append(pr) if len(all_data) >= 1000: - process_pull_requests(all_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) + process_pull_requests(all_data, f"{owner}/{repo}: Github Pr task", repo_id, logger, augur_db) total_count += len(all_data) all_data.clear() if len(all_data): - process_pull_requests(all_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) + process_pull_requests(all_data, f"{owner}/{repo}: Github Pr task", repo_id, logger, augur_db) total_count += len(all_data) if total_count > 0: return total_count else: - logger.info(f"{owner}/{repo} has no pull requests") + logger.debug(f"{owner}/{repo} has no pull requests") return 0 @@ -72,7 +70,7 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth, since): #-> Generator[ owner, repo = get_owner_repo(repo_git) - logger.info(f"Collecting pull requests for {owner}/{repo}") + logger.debug(f"Collecting pull requests for {owner}/{repo}") url = f"https://api.github.com/repos/{owner}/{repo}/pulls?state=all&direction=desc&sort=updated" @@ -80,7 +78,7 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth, since): #-> Generator[ num_pages = 
github_data_access.get_resource_page_count(url) - logger.info(f"{owner}/{repo}: Retrieving {num_pages} pages of pull requests") + logger.debug(f"{owner}/{repo}: Retrieving {num_pages} pages of pull requests") # returns a generator so this method can be used by doing for x in retrieve_all_pr_data() @@ -228,7 +226,7 @@ def collect_pull_request_review_comments(repo_git: str) -> None: review_msg_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/comments" logger = logging.getLogger(collect_pull_request_review_comments.__name__) - logger.info(f"Collecting pull request review comments for {owner}/{repo}") + logger.debug(f"Collecting pull request review comments for {owner}/{repo}") repo_id = get_repo_by_repo_git(repo_git).repo_id @@ -245,23 +243,9 @@ def collect_pull_request_review_comments(repo_git: str) -> None: data_source = "Github API" key_auth = GithubRandomKeyAuth(logger) - pr_review_messages = GithubPaginator(review_msg_url, key_auth, logger) - num_pages = pr_review_messages.get_num_pages() - - all_raw_pr_review_messages = [] - for page_data, page in pr_review_messages.iter_pages(): - - if page_data is None: - break - - if len(page_data) == 0: - logger.debug(f"{owner}/{repo} Pr Review Messages Page {page} contains no data...returning") - logger.info(f"{owner}/{repo} Pr Review Messages Page {page} of {num_pages}") - break - - logger.info(f"{owner}/{repo} Pr Review Messages Page {page} of {num_pages}") + github_data_access = GithubDataAccess(key_auth, logger) - all_raw_pr_review_messages += page_data + all_raw_pr_review_messages = list(github_data_access.paginate_resource(review_msg_url)) contributors = [] for comment in all_raw_pr_review_messages: @@ -278,8 +262,7 @@ def collect_pull_request_review_comments(repo_git: str) -> None: pr_review_msg_mapping_data = {} pr_review_comments_len = len(all_raw_pr_review_messages) - logger.info(f"{owner}/{repo}: Pr review comments len: {pr_review_comments_len}") - for index, comment in enumerate(all_raw_pr_review_messages): + for comment in all_raw_pr_review_messages: # pull_request_review_id is required to map it to the correct pr review if not comment["pull_request_review_id"]: @@ -319,9 +302,7 @@ def collect_pull_request_review_comments(repo_git: str) -> None: try: augur_pr_review_id = pr_review_id_mapping[github_pr_review_id] except KeyError: - logger.info(f"{owner}/{repo}: Could not find related pr review") - logger.info(f"{owner}/{repo}: We were searching for pr review with id: {github_pr_review_id}") - logger.info("Skipping") + logger.warning(f"{owner}/{repo}: Could not find related pr review. 
We were searching for pr review with id: {github_pr_review_id}") continue pr_review_message_ref = extract_pr_review_message_ref_data(comment, augur_pr_review_id, github_pr_review_id, repo_id, tool_version, data_source) @@ -330,7 +311,8 @@ def collect_pull_request_review_comments(repo_git: str) -> None: logger.info(f"Inserting {len(pr_review_message_ref_insert_data)} pr review refs") pr_comment_ref_natural_keys = ["pr_review_msg_src_id"] - bulk_insert_dicts(logger, pr_review_message_ref_insert_data, PullRequestReviewMessageRef, pr_comment_ref_natural_keys) + pr_review_msg_ref_string_columns = ["pr_review_msg_diff_hunk"] + bulk_insert_dicts(logger, pr_review_message_ref_insert_data, PullRequestReviewMessageRef, pr_comment_ref_natural_keys, string_fields=pr_review_msg_ref_string_columns) @@ -364,50 +346,34 @@ def collect_pull_request_reviews(repo_git: str, full_collection: bool) -> None: pr_count = len(prs) + github_data_access = GithubDataAccess(manifest.key_auth, logger) + all_pr_reviews = {} for index, pr in enumerate(prs): pr_number = pr.pr_src_number pull_request_id = pr.pull_request_id - logger.info(f"{owner}/{repo} Collecting Pr Reviews for pr {index + 1} of {pr_count}") + logger.debug(f"{owner}/{repo} Collecting Pr Reviews for pr {index + 1} of {pr_count}") pr_review_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/reviews" - pr_reviews = [] - pr_reviews_generator = GithubPaginator(pr_review_url, manifest.key_auth, logger) - for page_data, page in pr_reviews_generator.iter_pages(): - - if page_data is None: - break - - if len(page_data) == 0: - break - - if isinstance(page_data, list): - page_data = [ - element.decode('utf-8').replace('\x00', ' ') if isinstance(element, bytes) else element - for element in page_data - ] - logger.info(f"NUL characters were found in PR Reviews and replaced with spaces.") - elif isinstance(page_data, bytes): - page_data = page_data.decode('utf-8').replace('\x00', ' ') - logger.info(f"NUL characters were found in PR Reviews and replaced with spaces.") - - - pr_reviews.extend(page_data) - + try: + pr_reviews = list(github_data_access.paginate_resource(pr_review_url)) + except UrlNotFoundException as e: + logger.warning(e) + continue + if pr_reviews: all_pr_reviews[pull_request_id] = pr_reviews if not list(all_pr_reviews.keys()): - logger.info(f"{owner}/{repo} No pr reviews for repo") + logger.debug(f"{owner}/{repo} No pr reviews for repo") return contributors = [] - for pull_request_id in all_pr_reviews.keys(): + for pull_request_id, reviews in all_pr_reviews.items(): - reviews = all_pr_reviews[pull_request_id] for review in reviews: contributor = process_pull_request_review_contributor(review, tool_source, tool_version, data_source) if contributor: @@ -418,9 +384,8 @@ def collect_pull_request_reviews(repo_git: str, full_collection: bool) -> None: pr_reviews = [] - for pull_request_id in all_pr_reviews.keys(): + for pull_request_id, reviews in all_pr_reviews.items(): - reviews = all_pr_reviews[pull_request_id] for review in reviews: if "cntrb_id" in review: diff --git a/augur/tasks/github/repo_info/core.py b/augur/tasks/github/repo_info/core.py index 0cf6705fca..fbdae02026 100644 --- a/augur/tasks/github/repo_info/core.py +++ b/augur/tasks/github/repo_info/core.py @@ -2,6 +2,7 @@ import json import sqlalchemy as s from augur.tasks.github.util.github_paginator import GithubPaginator +from augur.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess from augur.tasks.github.util.github_paginator import hit_api from 
augur.tasks.github.util.util import get_owner_repo from augur.tasks.github.util.gh_graphql_entities import request_graphql_dict @@ -98,123 +99,74 @@ def repo_info_model(key_auth, repo_orm_obj, logger): owner, repo = get_owner_repo(repo_orm_obj.repo_git) - url = 'https://api.github.com/graphql' - - query = """ - { - repository(owner:"%s", name:"%s"){ - updatedAt - hasIssuesEnabled - issues(states:OPEN) { + query = """query($repo: String!, $owner: String!) { + repository(name: $repo, owner: $owner) { + updatedAt + hasIssuesEnabled + issues(states: OPEN) { totalCount - } - hasWikiEnabled - forkCount - defaultBranchRef { + } + hasWikiEnabled + forkCount + defaultBranchRef { name - } - watchers { + } + watchers { totalCount - } - id - licenseInfo { + } + id + licenseInfo { name url - } - stargazers { + } + stargazers { totalCount - } - codeOfConduct { + } + codeOfConduct { name url - } - issue_count: issues { + } + issue_count: issues { totalCount - } - issues_closed: issues(states:CLOSED) { + } + issues_closed: issues(states: CLOSED) { totalCount - } - pr_count: pullRequests { + } + pr_count: pullRequests { totalCount - } - pr_open: pullRequests(states: OPEN) { + } + pr_open: pullRequests(states: OPEN) { totalCount - } - pr_closed: pullRequests(states: CLOSED) { + } + pr_closed: pullRequests(states: CLOSED) { totalCount - } - pr_merged: pullRequests(states: MERGED) { + } + pr_merged: pullRequests(states: MERGED) { totalCount - } - defaultBranchRef { + } + defaultBranchRef { target { ... on Commit { - history { - totalCount - } + history { + totalCount } + } + } } } - } - } - - """ % (owner, repo) - - ############################## - # { - # repository(owner: "chaoss", name: "augur") { - # updatedAt - # hasIssuesEnabled - # issues(states: OPEN) { - # totalCount - # } - # hasWikiEnabled - # forkCount - # defaultBranchRef { - # name - # } - # watchers { - # totalCount - # } - # id - # licenseInfo { - # name - # url - # } - # stargazers { - # totalCount - # } - # codeOfConduct { - # name - # url - # } - # issue_count: issues { - # totalCount - # } - # issues_closed: issues(states: CLOSED) { - # totalCount - # } - # pr_count: pullRequests { - # totalCount - # } - # pr_open: pullRequests(states: OPEN) { - # totalCount - # } - # pr_closed: pullRequests(states: CLOSED) { - # totalCount - # } - # pr_merged: pullRequests(states: MERGED) { - # totalCount - # } - # stargazerCount - # } - # } + } + """ + + github_graphql_data_access = GithubGraphQlDataAccess(key_auth, logger) - try: - data = grab_repo_info_from_graphql_endpoint(key_auth, logger, query) - except Exception as e: - logger.error(f"Could not grab info for repo {repo_orm_obj.repo_id}") - raise e + variables = { + "owner": owner, + "repo": repo + } + + result_keys = ["repository"] + + data = github_graphql_data_access.get_resource(query, variables, result_keys) # Get committers count info that requires seperate endpoint committers_count = query_committers_count(key_auth, logger, owner, repo) diff --git a/augur/tasks/github/util/github_graphql_data_access.py b/augur/tasks/github/util/github_graphql_data_access.py new file mode 100644 index 0000000000..7d8c6851e1 --- /dev/null +++ b/augur/tasks/github/util/github_graphql_data_access.py @@ -0,0 +1,221 @@ +import logging +import time +import httpx +from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception, RetryError + +URL = "https://api.github.com/graphql" + +class RatelimitException(Exception): + + def __init__(self, response, message="Github Rate limit exceeded") -> None: + + 
self.response = response + + super().__init__(message) + +class NotFoundException(Exception): + pass + +class InvalidDataException(Exception): + pass + +class GithubGraphQlDataAccess: + + + def __init__(self, key_manager, logger: logging.Logger): + + self.logger = logger + self.key_manager = key_manager + + def get_resource(self, query, variables, result_keys): + + result_json = self.make_request_with_retries(query, variables).json() + + data = self.__extract_data_section(result_keys, result_json) + + return data + + + def paginate_resource(self, query, variables, result_keys): + + params = { + "numRecords" : 100, + "cursor" : None + } + params.update(variables) + + result_json = self.make_request_with_retries(query, params).json() + + data = self.__extract_data_section(result_keys, result_json) + + if self.__get_total_count(data) == 0: + return + + yield from self.__extract_raw_data_into_list(data) + + while self.__has_next_page(data): + params["cursor"] = self.__get_next_page_cursor(data) + + result_json = self.make_request_with_retries(query, params).json() + + data = self.__extract_data_section(result_keys, result_json) + + yield from self.__extract_raw_data_into_list(data) + + def make_request(self, query, variables, timeout=40): + + with httpx.Client() as client: + + json_dict = { + 'query' : query + } + + if variables: + json_dict['variables'] = variables + + response = client.post(url=URL,auth=self.key_manager,json=json_dict, timeout=timeout) + + response.raise_for_status() + + json_response = response.json() + if "errors" in json_response and len(json_response["errors"]) > 0: + errors = json_response["errors"] + + not_found_error = self.__find_first_error_of_type(errors, "NOT_FOUND") + + if not_found_error: + message = not_found_error.get("message", "Resource not found.") + raise NotFoundException(f"Could not find: {message}") + + raise Exception(f"Github Graphql Data Access Errors: {errors}") + + return response + + + def make_request_with_retries(self, query, variables, timeout=100): + """ What this method does: + 1. Catches RetryError and re-raises the last exception thrown by the final retry attempt + """ + + try: + return self.__make_request_with_retries(query, variables, timeout) + except RetryError as e: + raise e.last_attempt.exception() + + @retry(stop=stop_after_attempt(10), wait=wait_fixed(5), retry=retry_if_exception(lambda exc: not isinstance(exc, NotFoundException))) + def __make_request_with_retries(self, query, variables, timeout=40): + """ What this method does: + 1. Retries 10 times + 2. Waits 5 seconds between retries + 3. Does not retry NotFoundException + 4. 
Catches RatelimitException and waits before raising exception + """ + + try: + return self.make_request(query, variables, timeout) + except RatelimitException as e: + self.__handle_github_ratelimit_response(e.response) + raise e + + def __handle_github_ratelimit_response(self, response): + + headers = response.headers + + if "Retry-After" in headers: + + retry_after = int(headers["Retry-After"]) + self.logger.info( + f'\n\n\n\nSleeping for {retry_after} seconds due to secondary rate limit issue.\n\n\n\n') + time.sleep(retry_after) + + elif "X-RateLimit-Remaining" in headers and int(headers["X-RateLimit-Remaining"]) == 0: + current_epoch = int(time.time()) + epoch_when_key_resets = int(headers["X-RateLimit-Reset"]) + key_reset_time = epoch_when_key_resets - current_epoch + + if key_reset_time < 0: + self.logger.error(f"Key reset time was less than 0 setting it to 0.\nThe current epoch is {current_epoch} and the epoch that the key resets at is {epoch_when_key_resets}") + key_reset_time = 0 + + self.logger.info(f"\n\n\nAPI rate limit exceeded. Sleeping until the key resets ({key_reset_time} seconds)") + time.sleep(key_reset_time) + else: + time.sleep(60) + + def __extract_data_section(self, keys, json_response): + + if not json_response: + raise Exception(f"Empty data returned. Data: {json_response}") + + if 'data' not in json_response: + raise Exception(f"Error: 'data' key missing from response. Response: {json_response}") + + core = json_response['data'] + + # iterate deeper into the json_response object until we get to the desired data + for value in keys: + + if core is None: + raise Exception(f"Error: 'core' is None when trying to index by {value}. Response: {json_response}") + + core = core[value] + + if core is None: + raise InvalidDataException(f"Error: The data section is null. Unable to process") + + return core + + def __extract_raw_data_into_list(self, data): + + if 'edges' not in data: + raise Exception(f"Error: 'edges' key not present in data. Data {data}") + + data_list = [] + for edge in data['edges']: + + if 'node' not in edge: + raise Exception(f"Error: 'node' key not present in data. Data {data}") + + data_list.append(edge['node']) + + return data_list + + def __has_next_page(self, data): + + if 'pageInfo' not in data: + raise Exception(f"Error: 'pageInfo' key not present in data. Data {data}") + + if 'hasNextPage' not in data['pageInfo']: + raise Exception(f"Error: 'hasNextPage' key not present in data. Data {data}") + + if not isinstance(data['pageInfo']['hasNextPage'], bool): + raise Exception(f"Error: pageInfo.hasNextPage is not a bool. Data {data}") + + return data['pageInfo']['hasNextPage'] + + def __get_next_page_cursor(self, data): + + if 'pageInfo' not in data: + raise Exception(f"Error: 'pageInfo' key not present in data. Data {data}") + + if 'endCursor' not in data['pageInfo']: + raise Exception(f"Error: 'endCursor' key not present in data. Data {data}") + + return data['pageInfo']['endCursor'] + + def __get_total_count(self, data): + + if 'totalCount' not in data: + raise Exception(f"Error: totalCount key not found in data. Data: {data}") + + if data["totalCount"] is None: + raise Exception(f"Error: totalCount is null. Data: {data}") + + try: + return int(data["totalCount"]) + except ValueError as exc: + raise Exception(f"Error: totalCount is not an integer. 
Data: {data}") from exc + + def __find_first_error_of_type(self, errors, type): + + return next((error for error in errors if error.get("type").lower() == type.lower()), None) diff --git a/augur/tasks/util/metadata_exception.py b/augur/tasks/util/metadata_exception.py new file mode 100644 index 0000000000..a861badac9 --- /dev/null +++ b/augur/tasks/util/metadata_exception.py @@ -0,0 +1,6 @@ +class MetadataException(Exception): + def __init__(self, original_exception, additional_metadata): + self.original_exception = original_exception + self.additional_metadata = additional_metadata + + super().__init__(f"{str(self.original_exception)} | Additional metadata: {self.additional_metadata}") diff --git a/augur/tasks/util/random_key_auth.py b/augur/tasks/util/random_key_auth.py index c9f865db9e..f2fea35b36 100644 --- a/augur/tasks/util/random_key_auth.py +++ b/augur/tasks/util/random_key_auth.py @@ -33,7 +33,7 @@ def auth_flow(self, request: Request) -> Generator[Request, Response, None]: if self.list_of_keys: key_value = choice(self.list_of_keys) - self.logger.info(f'Key value used in request: {key_value}') + self.logger.debug(f'Key value used in request: {key_value}') # formats the key string into a format GitHub will accept if self.key_format: diff --git a/augur/tasks/util/worker_util.py b/augur/tasks/util/worker_util.py index 51b0109faa..5ec2e6eebc 100644 --- a/augur/tasks/util/worker_util.py +++ b/augur/tasks/util/worker_util.py @@ -11,6 +11,9 @@ import json import subprocess +from augur.tasks.util.metadata_exception import MetadataException + + def create_grouped_task_load(*args,processes=8,dataList=[],task=None): if not dataList or not task: @@ -141,7 +144,7 @@ def parse_json_from_subprocess_call(logger, subprocess_arr, cwd=None): required_output = {} except json.decoder.JSONDecodeError as e: logger.error(f"Could not parse required output! 
\n output: {output} \n Error: {e}") - raise e + raise MetadataException(e, f"output : {output}") return required_output diff --git a/docker/backend/Dockerfile b/docker/backend/Dockerfile index 9676b40ce5..892a140ac0 100644 --- a/docker/backend/Dockerfile +++ b/docker/backend/Dockerfile @@ -2,7 +2,7 @@ FROM python:3.10-bookworm LABEL maintainer="outdoors@acm.org" -LABEL version="0.71.0" +LABEL version="0.76.1" ENV DEBIAN_FRONTEND=noninteractive diff --git a/docker/database/Dockerfile b/docker/database/Dockerfile index df88b16c1e..c384d0f306 100644 --- a/docker/database/Dockerfile +++ b/docker/database/Dockerfile @@ -2,7 +2,7 @@ FROM postgres:14 LABEL maintainer="outdoors@acm.org" -LABEL version="0.71.0" +LABEL version="0.76.1" ENV POSTGRES_DB "test" ENV POSTGRES_USER "augur" diff --git a/docker/rabbitmq/Dockerfile b/docker/rabbitmq/Dockerfile index 266bec64a5..6b020e6412 100644 --- a/docker/rabbitmq/Dockerfile +++ b/docker/rabbitmq/Dockerfile @@ -1,7 +1,7 @@ FROM rabbitmq:3.12-management-alpine LABEL maintainer="574/augur@simplelogin.com" -LABEL version="0.71.0" +LABEL version="0.76.1" ARG RABBIT_MQ_DEFAULT_USER=augur ARG RABBIT_MQ_DEFAULT_PASSWORD=password123 diff --git a/metadata.py b/metadata.py index d7b4ac37a2..5cc02b504c 100644 --- a/metadata.py +++ b/metadata.py @@ -5,8 +5,8 @@ __short_description__ = "Python 3 package for free/libre and open-source software community metrics, models & data collection" -__version__ = "0.76.0" -__release__ = "v0.76.0 (England's Favorite Traitors)" +__version__ = "0.76.1" +__release__ = "v0.76.1 (For the Good of the Data)" __license__ = "MIT" __copyright__ = "University of Missouri, University of Nebraska-Omaha, CHAOSS, Brian Warner & Augurlabs 2024"
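The hunks above replace the old `GraphQlPageCollection` and `GithubPaginator` call sites with the new `GithubGraphQlDataAccess` helper. The sketch below shows how a caller such as `pull_request_files_model` drives `paginate_resource`; the query text, the `collect_pr_file_paths` helper, and the logger setup are illustrative assumptions rather than code from this patch — only the class, its exceptions, the `GithubRandomKeyAuth` constructor, and the `["repository", "pullRequest", "files"]` key path appear in the diff itself.

```python
import logging

from augur.tasks.github.util.github_graphql_data_access import (
    GithubGraphQlDataAccess,
    InvalidDataException,
    NotFoundException,
)
from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth

logger = logging.getLogger(__name__)

# $numRecords and $cursor are injected by paginate_resource (100 records per page,
# cursor starts at None); the remaining variables are supplied by the caller below.
PR_FILES_QUERY = """
query ($owner: String!, $repo: String!, $pr_number: Int!, $numRecords: Int!, $cursor: String) {
    repository(owner: $owner, name: $repo) {
        pullRequest(number: $pr_number) {
            files(first: $numRecords, after: $cursor) {
                edges {
                    node {
                        path
                        additions
                        deletions
                    }
                }
                totalCount
                pageInfo {
                    hasNextPage
                    endCursor
                }
            }
        }
    }
}
"""

def collect_pr_file_paths(owner: str, repo: str, pr_number: int) -> list:
    """Hypothetical helper: return the changed file paths for one pull request."""
    key_auth = GithubRandomKeyAuth(logger)
    data_access = GithubGraphQlDataAccess(key_auth, logger)

    variables = {"owner": owner, "repo": repo, "pr_number": pr_number}
    result_keys = ["repository", "pullRequest", "files"]  # path to the paginated connection

    try:
        # paginate_resource yields each `node` dict and follows pageInfo.endCursor internally
        return [pr_file["path"] for pr_file in data_access.paginate_resource(PR_FILES_QUERY, variables, result_keys)]
    except (NotFoundException, InvalidDataException) as e:
        logger.warning(e)
        return []
```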
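The start command's boot sequence also changes: `augur backend start` now writes its PID to the file given by `--pidfile` and polls the new `/api` route (added in `augur/api/routes/util.py`) until Gunicorn answers, instead of sleeping for a fixed three seconds. Below is a minimal sketch of that readiness-check pattern; the `wait_for_gunicorn` name, its parameters, and the `RuntimeError` are assumptions for illustration, and the sketch omits the subprocess monitoring and `exit(247)` handling the real CLI performs.

```python
import os
import time

import requests

def wait_for_gunicorn(host: str, port: int, pidfile: str = "main.pid", poll_interval: float = 0.5) -> dict:
    """Record the controlling PID, then poll /api until Gunicorn starts answering."""
    with open(pidfile, "w") as f:
        f.write(str(os.getpid()))

    while True:
        try:
            api_response = requests.get(f"http://{host}:{port}/api")
        except requests.exceptions.ConnectionError:
            time.sleep(poll_interval)  # Gunicorn is not listening yet; try again shortly
            continue

        if not api_response.ok:
            raise RuntimeError("Gunicorn is reachable but /api did not return a 2xx response")

        # Expected payload from the new route: {"status": "up", "route": AUGUR_API_VERSION}
        return api_response.json()
```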