diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 218e1bed254..b8402e84ba3 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -5,11 +5,13 @@ stages: test_ci: stage: test - image: ubuntu:18.04 + image: hub.meri.dev/test-docker/test:latest # only: # - setup-ci before_script: - - apt update && apt install -y openssh-client wget libarchive-dev libcurl4-openssl-dev git python3.7 python3-pip + - apt update && apt install -y libarchive-dev #libcurl4-openssl-dev + - apt install -y zlib1g-dev libicu-dev libcurl3 libcurl-openssl1.0-dev + - apt install -y build-essential cmake libssl-dev pkg-config cmake - wget http://131.123.42.38/lmcrs/beta/srcML-Ubuntu18.04.deb - dpkg -i srcML-Ubuntu18.04.deb - mkdir -p ~/.ssh @@ -22,11 +24,9 @@ test_ci: - export LC_ALL=C.UTF-8 - export LANG=C.UTF-8 script: - - apt-get update - git config --global user.email "merico@meri.co" - git config --global user.name "merico" - pip3 install pipenv - - echo -e "machine gitlab.com\nlogin ${GITLAB_USER}\npassword ${GITLAB_PASSWD}" > ~/.netrc - git clone https://gitlab.com/persper/code-analytics.git && cd code-analytics #&& git checkout ${CI_COMMIT_REF_NAME} diff --git a/Pipfile b/Pipfile index 19b18641463..dbdd78f493b 100644 --- a/Pipfile +++ b/Pipfile @@ -25,6 +25,7 @@ aenum = "*" pytest-cov = "*" gitpython = "*" sphinx = "*" +python-louvain = "*" [dev-packages] diff --git a/README.md b/README.md index 5cebce8dea4..77d7eed63db 100644 --- a/README.md +++ b/README.md @@ -15,13 +15,13 @@ The following procedure is tested on Ubuntu 16.04 LTS. Download and install Python 3.6+: . Also, create a symbolic link from `python3` to `python` since some scripts reply on it. -``` +```sh sudo ln -s /usr/bin/python3 /usr/bin/python ``` 2. Install python dependencies (we recommend to use pipenv) -```bash +```sh pipenv install ``` @@ -29,7 +29,7 @@ pipenv install In order to uset the `--indent-heuristic` option of `git diff`, we require git version >= 2.11. 
Use the following commands to upgrade: -```bash +```sh sudo add-apt-repository ppa:git-core/ppa -y sudo apt-get update sudo apt-get install git -y @@ -40,12 +40,12 @@ git --version Add the following line to your `~/.bashrc` file. -``` +```sh export PYTHONPATH=$PYTHONPATH:/path/to/dir ``` To update your path for the remainder of the session. -``` +```sh source ~/.bashrc ``` @@ -55,14 +55,21 @@ Please download from [here](https://www.srcml.org/#download) and follow the [ins srcML also needs `libarchive-dev` and `libcurl4-openssl-dev`. Install them with the following commands: -```bash +```sh sudo apt install libarchive-dev sudo apt install libcurl4-openssl-dev ``` 6. Check setup correctness -```bash +As the test process will create Git repositories, set up your global Git user name and email before testing: +```sh +git config --global user.email "you@example.com" +git config --global user.name "Your Name" +``` + +Run the test process: +```sh pipenv run pytest test/test_analytics ``` diff --git a/notebooks/demo.ipynb b/notebooks/demo.ipynb new file mode 100644 index 00000000000..48ddf3760fb --- /dev/null +++ b/notebooks/demo.ipynb @@ -0,0 +1,72 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import deps\n", + "import os\n", + "from persper.analytics.c import CGraphServer\n", + "from persper.analytics.analyzer2 import Analyzer\n", + "from persper.analytics.graph_server import C_FILENAME_REGEXES\n", + "from persper.util.path import root_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# configure your project\n", + "repo_path = os.path.join(root_path, 'repos/')\n", + "\n", + "# configure alpha for devrank\n", + "alpha = 0.5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# start analysis and show commit devrank values\n", + "az = Analyzer(repo_path, 
CGraphServer(C_FILENAME_REGEXES))\n", + "await az.analyze()\n", + "ccgraph = az.graph\n", + "ccgraph.commit_devranks(alpha)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "code-analytics-8iDyuztf", + "language": "python", + "name": "code-analytics-8idyuztf" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/persper/analytics/analyzer2.py b/persper/analytics/analyzer2.py index ecfc398e850..189af7f2899 100644 --- a/persper/analytics/analyzer2.py +++ b/persper/analytics/analyzer2.py @@ -2,6 +2,7 @@ import collections.abc import logging import re +import sys import time from abc import ABC from typing import List, Optional, Set, Union, Dict @@ -22,7 +23,8 @@ def __init__(self, repositoryRoot: str, graphServer: GraphServer, firstParentOnly: bool = False, commit_classifier: Optional[CommitClassifier] = None, skip_rewind_diff: bool = False, - monolithic_commit_lines_threshold: int = 5000): + monolithic_commit_lines_threshold: int = 5000, + monolithic_file_bytes_threshold: int = 200000): # skip_rewind_diff will skip diff, but rewind commit start/end will still be notified to the GraphServer. 
self._repositoryRoot = repositoryRoot self._graphServer = graphServer @@ -37,6 +39,8 @@ def __init__(self, repositoryRoot: str, graphServer: GraphServer, self._clf_results: Dict[str, List[float]] = {} self._skip_rewind_diff = skip_rewind_diff self._monolithic_commit_lines_threshold = monolithic_commit_lines_threshold + self._monolithic_file_bytes_threshold = monolithic_file_bytes_threshold + self._call_commit_graph = None def __getstate__(self): state = self.__dict__.copy() @@ -107,8 +111,22 @@ def firstParentOnly(self, value: bool): @property def graph(self): - return self._graphServer.get_graph() - + # When starting the analysis,set self._call_commit_graph to None, we can ensure that the graph is the latest call commit graph version. + if self._call_commit_graph is None: + # retry 10 times when get graph from graph server + for i in range(10): + try: + ccg = self._graphServer.get_graph() + if ccg is not None: + break + except Exception: + logging.info('get graph failed:{}'.format(i)) + time.sleep(1) + continue + else: + raise Exception('get graph is failed') + self._call_commit_graph = ccg + return self._call_commit_graph @property def visitedCommits(self) -> Set[str]: """ @@ -137,7 +155,18 @@ def compute_project_complexity(self, r_n: int, r_e: int): """ return self.graph.eval_project_complexity(r_n, r_e) + def compute_modularity(self): + """Compute modularity score based on function graph. + + Returns + ------- + modularity : float + The modularity score of this graph. + """ + return self.graph.compute_modularity() + async def analyze(self, maxAnalyzedCommits=None, suppressStdOutLogs=False): + self._call_commit_graph = None commitSpec = self._terminalCommit if self._originCommit: commitSpec = self._originCommit.hexsha + ".." + self._terminalCommit.hexsha @@ -173,7 +202,7 @@ def printCommitStatus(level, status: str): else: expectedParentCommit = None message = None - if not commit.parents: + if len(commit.parents) == 0: message = "Going forward (initial commit)." 
expectedParentCommit = None else: @@ -232,18 +261,22 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C # commit classification if self._commit_classifier and commit.hexsha not in self._clf_results: - prob = self._commit_classifier.predict(commit, diff_index) + prob = self._commit_classifier.predict(commit, diff_index, self._repo) self._clf_results[commit.hexsha] = prob - # t2: update_graph time + # t2: update_graph + git diff traversing time t2 = time.monotonic() + # t2a: get_contents time + t2a = 0 + # t2b: update_graph time + t2b = 0 if diff_index: for diff in diff_index: old_fname, new_fname = _get_fnames(diff) - # apply filter + # apply file-level filter # if a file comes into/goes from our view, we will set corresponding old_fname/new_fname to None, # as if the file is introduced/removed in this commit. - # However, the diff will keep its original, no matter if the file has been filtered in/out. + # However, the diff will not change, regardless of whether the file has been filtered out or not. 
if old_fname and not self._graphServer.filter_file(old_fname): old_fname = None if new_fname and not self._graphServer.filter_file(new_fname): @@ -254,17 +287,25 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C old_src = new_src = None + t2a0 = time.monotonic() if old_fname: old_src = get_contents(self._repo, parentCommit, old_fname) + if self._file_is_too_large(old_fname, old_src): + continue if new_fname: new_src = get_contents(self._repo, commit, new_fname) + if self._file_is_too_large(new_fname, new_src): + continue + t2a += time.monotonic() - t2a0 + t2b0 = time.monotonic() if old_src or new_src: result = self._graphServer.update_graph( old_fname, old_src, new_fname, new_src, diff.diff) if asyncio.iscoroutine(result): await result + t2b += time.monotonic() - t2b0 t2 = time.monotonic() - t2 # t3: end_commit time @@ -275,24 +316,39 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C t3 = time.monotonic() - t3 self._observer.onAfterCommit(self, commit, seekingMode) t0 = time.monotonic() - t0 - _logger.info("t0 = %.2f, t1 = %.2f, t2 = %.2f, t3 = %.2f", - t0, t1, t2, t3) + _logger.info("t0 = %.2f, t1 = %.2f, t2 = %.2f, t2a = %.2f, t2b = %.2f, t3 = %.2f", + t0, t1, t2, t2a, t2b, t3) assert self._graphServer.get_workspace_commit_hexsha() == commit.hexsha, \ "GraphServer.get_workspace_commit_hexsha should be return the hexsha seen in last start_commit." 
def _filter_monolithic_commit(self, commit: Commit, seeking_mode: CommitSeekingMode) -> CommitSeekingMode: # filter monolithic commit - if seeking_mode == CommitSeekingMode.NormalForward and len(commit.parents) == 1: + # hot fix: enable filter_monolithic_commit on first commit + if seeking_mode == CommitSeekingMode.NormalForward and len(commit.parents) <= 1: changed_lines = 0 files = commit.stats.files for fname in files: if self._graphServer.filter_file(fname): changed_lines += files[fname]['lines'] + print('_filter_monolithic_commit commit:', commit.hexsha, 'changed_lines:', changed_lines) if changed_lines > self._monolithic_commit_lines_threshold: # enforce using CommitSeekingMode.MergeCommit to update graph without updating node history + print('_filter_monolithic_commit set CommitSeekingMode to MergeCommit') return CommitSeekingMode.MergeCommit return seeking_mode + def _file_is_too_large(self, fname, file_content): + # Flag a monolithic file by the size sys.getsizeof reports for its content + # Returns True if that size is over the threshold + file_size = sys.getsizeof(file_content) + too_large = file_size > self._monolithic_file_bytes_threshold + if too_large: + message = 'WARNING: file too large;' + else: + message = 'OK: file normal size;' + print(message, fname, str(file_size / 1000) + 'kB') + return too_large + def _get_fnames(diff: Diff): if diff.new_file: diff --git a/persper/analytics/call_commit_graph.py b/persper/analytics/call_commit_graph.py index 0c83fc928c6..48fdf6c2dc2 100644 --- a/persper/analytics/call_commit_graph.py +++ b/persper/analytics/call_commit_graph.py @@ -3,13 +3,19 @@ ==================================== CallCommitGraph stores all relevant analysis results """ +import logging +import community import networkx as nx from networkx.readwrite import json_graph +from typing import Union, Set, List, Dict, Optional + from persper.analytics.devrank import devrank from persper.analytics.score import normalize -from typing import Union, Set, List, Dict, Optional from 
persper.analytics.complexity import eval_project_complexity +_logger = logging.getLogger(__name__) + + class CommitIdGenerators: @staticmethod def fromOrdinal(ordinal: int, hexsha: str, message: str): @@ -50,6 +56,7 @@ def _to_networkx_format(graph_data: Dict) -> Dict: def reset(self): """Reset all internal states""" self._digraph = self._new_graph() + self._digraph.degree() def _new_graph(self): """Create a new nx.DiGraph for underlying storage @@ -97,10 +104,16 @@ def _next_cindex(self): # TODO: remove the default value of files def add_node(self, node: str, files: Union[Set[str], List[str]] = []): + if node is None: + _logger.error("Argument node is None in add_node.") + return self._digraph.add_node(node, size=None, history={}, files=set(files)) # add_node must be called on source and target first def add_edge(self, source, target): + if source is None or target is None: + _logger.error("Argument source or target is None in add_edge.") + return if source not in self._digraph: raise ValueError("Error: caller %s does not exist in call-commit graph." 
% source) if target not in self._digraph: @@ -134,9 +147,15 @@ def update_node_history_accurate(self, node, fstat): # read/write access to node history are thourgh this function def _get_node_history(self, node: str) -> Dict[str, Dict[str, int]]: + if node is None: + _logger.error("Argument node is None in _get_node_history.") + return {} return self._digraph.nodes[node]['history'] def update_node_files(self, node: str, new_files: Union[Set[str], List[str]]): + if node is None: + _logger.error("Argument node is None in update_node_files") + return self._digraph.nodes[node]['files'] = set(new_files) # TODO: provide other options for computing a node's size @@ -148,21 +167,24 @@ def _set_all_nodes_size(self, black_set=None): """ for node in self.nodes(): node_history = self._get_node_history(node) - if black_set is not None: - size = 0 - for cid, chist in node_history.items(): - sha = self.commits()[cid]['hexsha'] - if sha not in black_set: - size += (chist['adds'] + chist['dels']) - else: - size = sum([chist['adds'] + chist['dels'] for chist in node_history.values()]) - + size = 0 + for cid, chist in node_history.items(): + sha = self.commits()[cid]['hexsha'] + if black_set is not None and sha in black_set: + continue + if 'added_units' in chist.keys() and 'removed_units' in chist.keys(): + size += (chist['added_units'] + chist['removed_units']) + else: + size += (chist['adds'] + chist['dels']) # set default size to 1 to avoid zero division error if size == 0: size = 1 self._set_node_size(node, size) def _set_node_size(self, node, size): + if node is None: + _logger.error("Argument node is None in _set_node_size.") + # set node size even if it is None since we'd like to suppress the error self._digraph.nodes[node]['size'] = size def _set_all_edges_weight(self): @@ -180,12 +202,17 @@ def eval_project_complexity(self, r_n: float, r_e: float): """ return eval_project_complexity(self._digraph, r_n, r_e) + def _remove_invalid_nodes(self): + if None in self.nodes(): + 
self._digraph.remove_node(None) + def function_devranks(self, alpha, black_set=None): """ Args: alpha - A float between 0 and 1, commonly set to 0.85 black_set - A set of commit hexshas to be blacklisted """ + self._remove_invalid_nodes() self._set_all_nodes_size(black_set=black_set) return devrank(self._digraph, 'size', alpha=alpha) @@ -206,7 +233,10 @@ def commit_devranks(self, alpha, black_set=None): continue for cid, chist in history.items(): - csize = chist['adds'] + chist['dels'] + if 'added_units' in chist.keys() and 'removed_units' in chist.keys(): + csize = (chist['added_units'] + chist['removed_units']) + else: + csize = (chist['adds'] + chist['dels']) sha = self.commits()[cid]['hexsha'] if black_set is None or sha not in black_set: dr = (csize / size) * func_devranks[func] @@ -238,3 +268,32 @@ def developer_devranks(self, alpha, black_set=None): else: developer_devranks[email] = commit_devranks[sha] return developer_devranks + + def compute_modularity(self): + """Compute modularity score based on function graph. + + Returns + ------- + modularity : float + The modularity score of this graph. + """ + # Check the number of edges + if len(self.edges()) == 0: + return 0. 
+ + # Construct non directed graph + graph = nx.Graph() + for node in self.nodes(): + if node is not None: + graph.add_node(node) + for (source, target) in self.edges(): + if source is not None and target is not None: + graph.add_edge(source, target) + # Compute the partition of the graph nodes + partition = community.best_partition(graph) + # Compute modularity + modularity = community.modularity(partition, graph) + # Normalize [0, 1] to [0, 100] + modularity = modularity * 100 + + return modularity diff --git a/persper/analytics/call_graph/c.py b/persper/analytics/call_graph/c.py index 7ff8bbda713..c252e4fc931 100644 --- a/persper/analytics/call_graph/c.py +++ b/persper/analytics/call_graph/c.py @@ -18,6 +18,11 @@ class NotFunctionCallError(UnexpectedASTError): pass +class UnexpectedCallNodeError(UnexpectedASTError): + """Raise when failed to parse a function call's callee name""" + pass + + def _handle_function(func_node): """Extract name and range from a node @@ -122,17 +127,39 @@ def _handle_call(call_node): Case 2: function call from struct variable Example: tty->write(tty) + Case 3: function call in a chain + Example: (*mi).second.empty() + Raises: NotFunctionCallError + UnexpectedCallNodeError """ name_node = call_node.find('srcml:name', ns) if name_node is None: # Case 1 raise NotFunctionCallError() + + def last_sub_name_node(node): + name_lst = node.findall('srcml:name', ns) + if len(name_lst) > 0: + return name_lst[-1] + else: + raise UnexpectedCallNodeError() + + # Case 2 & 3 callee_name = name_node.text - if callee_name is None: - # Case 2 - callee_name = name_node[-1].text + # DEBUG + # print_flag = False + # if callee_name is None: + # print_flag = True + # from persper.analytics.call_graph.utils import transform_node_to_src + # print(transform_node_to_src(name_node)) + while callee_name is None: + name_node = last_sub_name_node(name_node) + callee_name = name_node.text + # DEBUG + # if print_flag: + # print(callee_name) return callee_name @@ -168,6 
+195,9 @@ def update_graph(ccgraph, ast_list, change_stats, new_fname_to_old_fname): except NotFunctionCallError as e: # do not print error since we expect this to happen a lot continue + except UnexpectedCallNodeError as e: + print(type(e).__name__, e.args) + continue if callee_name not in ccgraph: # Pass [] to files argument since we don't know diff --git a/persper/analytics/commit_classifier.py b/persper/analytics/commit_classifier.py index 7e3a5cb58f7..b4bac5d4d1e 100644 --- a/persper/analytics/commit_classifier.py +++ b/persper/analytics/commit_classifier.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from git import Commit, DiffIndex +from git import Commit, DiffIndex, Repo class CommitClassifier(ABC): @@ -8,13 +8,14 @@ class CommitClassifier(ABC): """ @abstractmethod - def predict(self, commit: Commit, diff_index: DiffIndex): + def predict(self, commit: Commit, diff_index: DiffIndex, repo: Repo): """ Args: commit: A gitpython's Commit object. diff_index: A gitpython's DiffIndex object. It is a list of Diff object, each containing the diff information between a pair of old/new source files. + repo: A gitpython's Repo object. 
Returns: diff --git a/persper/analytics/git_tools.py b/persper/analytics/git_tools.py index 01f25e7a8ef..2080a3d0589 100644 --- a/persper/analytics/git_tools.py +++ b/persper/analytics/git_tools.py @@ -16,7 +16,10 @@ def diff_with_first_parent(repo: Repo, commit: Commit): def diff_with_commit(repo: Repo, current_commit: Commit, base_commit_sha: str): - if not base_commit_sha: + # about git.NULL_TREE: https://github.com/gitpython-developers/GitPython/blob/master/git/diff.py#L87 + if current_commit is None: + current_commit = git.NULL_TREE + if base_commit_sha is None: base_commit = repo.tree(EMPTY_TREE_SHA) else: base_commit = repo.commit(base_commit_sha) diff --git a/persper/analytics/graph_server.py b/persper/analytics/graph_server.py index 406c3f39942..049c721166b 100644 --- a/persper/analytics/graph_server.py +++ b/persper/analytics/graph_server.py @@ -6,13 +6,15 @@ from persper.analytics.call_commit_graph import CallCommitGraph JS_FILENAME_REGEXES = [ - r'.+\.js$', + r'.+\.(js|vue|ts|tsx)$', r'^(?!dist/).+', r'^(?!test(s)?/).+', r'^(?!spec/).+', r'^(?!build/).+', r'^(?!bin/).+', - r'^(?!doc(s)?/).+' + r'^(?!doc(s)?/).+', + r'.*(?