diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 218e1bed254..b8402e84ba3 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -5,11 +5,13 @@ stages:
test_ci:
stage: test
- image: ubuntu:18.04
+ image: hub.meri.dev/test-docker/test:latest
# only:
# - setup-ci
before_script:
- - apt update && apt install -y openssh-client wget libarchive-dev libcurl4-openssl-dev git python3.7 python3-pip
+ - apt update && apt install -y libarchive-dev #libcurl4-openssl-dev
+ - apt install -y zlib1g-dev libicu-dev libcurl3 libcurl-openssl1.0-dev
+ - apt install -y build-essential cmake libssl-dev pkg-config
- wget http://131.123.42.38/lmcrs/beta/srcML-Ubuntu18.04.deb
- dpkg -i srcML-Ubuntu18.04.deb
- mkdir -p ~/.ssh
@@ -22,11 +24,9 @@ test_ci:
- export LC_ALL=C.UTF-8
- export LANG=C.UTF-8
script:
- - apt-get update
- git config --global user.email "merico@meri.co"
- git config --global user.name "merico"
- pip3 install pipenv
-
- echo -e "machine gitlab.com\nlogin ${GITLAB_USER}\npassword ${GITLAB_PASSWD}" > ~/.netrc
- git clone https://gitlab.com/persper/code-analytics.git && cd code-analytics
#&& git checkout ${CI_COMMIT_REF_NAME}
diff --git a/Pipfile b/Pipfile
index 19b18641463..dbdd78f493b 100644
--- a/Pipfile
+++ b/Pipfile
@@ -25,6 +25,7 @@ aenum = "*"
pytest-cov = "*"
gitpython = "*"
sphinx = "*"
+python-louvain = "*"
[dev-packages]
diff --git a/README.md b/README.md
index 5cebce8dea4..77d7eed63db 100644
--- a/README.md
+++ b/README.md
@@ -15,13 +15,13 @@ The following procedure is tested on Ubuntu 16.04 LTS.
Download and install Python 3.6+: .
Also, create a symbolic link from `python3` to `python` since some scripts reply on it.
-```
+```sh
sudo ln -s /usr/bin/python3 /usr/bin/python
```
2. Install python dependencies (we recommend to use pipenv)
-```bash
+```sh
pipenv install
```
@@ -29,7 +29,7 @@ pipenv install
In order to uset the `--indent-heuristic` option of `git diff`, we require git version >= 2.11. Use the following commands to upgrade:
-```bash
+```sh
sudo add-apt-repository ppa:git-core/ppa -y
sudo apt-get update
sudo apt-get install git -y
@@ -40,12 +40,12 @@ git --version
Add the following line to your `~/.bashrc` file.
-```
+```sh
export PYTHONPATH=$PYTHONPATH:/path/to/dir
```
To update your path for the remainder of the session.
-```
+```sh
source ~/.bashrc
```
@@ -55,14 +55,21 @@ Please download from [here](https://www.srcml.org/#download) and follow the [ins
srcML also needs `libarchive-dev` and `libcurl4-openssl-dev`. Install them with the following commands:
-```bash
+```sh
sudo apt install libarchive-dev
sudo apt install libcurl4-openssl-dev
```
6. Check setup correctness
-```bash
+As the test process will create Git repositories, set up your global Git user name and email before testing:
+```sh
+git config --global user.email "you@example.com"
+git config --global user.name "Your Name"
+```
+
+Run the test process:
+```sh
pipenv run pytest test/test_analytics
```
diff --git a/notebooks/demo.ipynb b/notebooks/demo.ipynb
new file mode 100644
index 00000000000..48ddf3760fb
--- /dev/null
+++ b/notebooks/demo.ipynb
@@ -0,0 +1,72 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import deps\n",
+ "import os\n",
+ "from persper.analytics.c import CGraphServer\n",
+ "from persper.analytics.analyzer2 import Analyzer\n",
+ "from persper.analytics.graph_server import C_FILENAME_REGEXES\n",
+ "from persper.util.path import root_path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# configure your project\n",
+ "repo_path = os.path.join(root_path, 'repos/')\n",
+ "\n",
+ "# configure alpha for devrank\n",
+ "alpha = 0.5"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# start analysis and show commit devrank values\n",
+ "az = Analyzer(repo_path, CGraphServer(C_FILENAME_REGEXES))\n",
+ "await az.analyze()\n",
+ "ccgraph = az.graph\n",
+ "ccgraph.commit_devranks(alpha)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "code-analytics-8iDyuztf",
+ "language": "python",
+ "name": "code-analytics-8idyuztf"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/persper/analytics/analyzer2.py b/persper/analytics/analyzer2.py
index ecfc398e850..189af7f2899 100644
--- a/persper/analytics/analyzer2.py
+++ b/persper/analytics/analyzer2.py
@@ -2,6 +2,7 @@
import collections.abc
import logging
import re
+import sys
import time
from abc import ABC
from typing import List, Optional, Set, Union, Dict
@@ -22,7 +23,8 @@ def __init__(self, repositoryRoot: str, graphServer: GraphServer,
firstParentOnly: bool = False,
commit_classifier: Optional[CommitClassifier] = None,
skip_rewind_diff: bool = False,
- monolithic_commit_lines_threshold: int = 5000):
+ monolithic_commit_lines_threshold: int = 5000,
+ monolithic_file_bytes_threshold: int = 200000):
# skip_rewind_diff will skip diff, but rewind commit start/end will still be notified to the GraphServer.
self._repositoryRoot = repositoryRoot
self._graphServer = graphServer
@@ -37,6 +39,8 @@ def __init__(self, repositoryRoot: str, graphServer: GraphServer,
self._clf_results: Dict[str, List[float]] = {}
self._skip_rewind_diff = skip_rewind_diff
self._monolithic_commit_lines_threshold = monolithic_commit_lines_threshold
+ self._monolithic_file_bytes_threshold = monolithic_file_bytes_threshold
+ self._call_commit_graph = None
def __getstate__(self):
state = self.__dict__.copy()
@@ -107,8 +111,22 @@ def firstParentOnly(self, value: bool):
@property
def graph(self):
- return self._graphServer.get_graph()
-
+ # When starting an analysis, set self._call_commit_graph to None so that this property always returns the latest call-commit graph.
+ if self._call_commit_graph is None:
+ # retry 10 times when get graph from graph server
+ for i in range(10):
+ try:
+ ccg = self._graphServer.get_graph()
+ if ccg is not None:
+ break
+ except Exception:
+ logging.info('get graph failed:{}'.format(i))
+ time.sleep(1)
+ continue
+ else:
+ raise Exception('get graph is failed')
+ self._call_commit_graph = ccg
+ return self._call_commit_graph
@property
def visitedCommits(self) -> Set[str]:
"""
@@ -137,7 +155,18 @@ def compute_project_complexity(self, r_n: int, r_e: int):
"""
return self.graph.eval_project_complexity(r_n, r_e)
+ def compute_modularity(self):
+ """Compute modularity score based on function graph.
+
+ Returns
+ -------
+ modularity : float
+ The modularity score of this graph.
+ """
+ return self.graph.compute_modularity()
+
async def analyze(self, maxAnalyzedCommits=None, suppressStdOutLogs=False):
+ self._call_commit_graph = None
commitSpec = self._terminalCommit
if self._originCommit:
commitSpec = self._originCommit.hexsha + ".." + self._terminalCommit.hexsha
@@ -173,7 +202,7 @@ def printCommitStatus(level, status: str):
else:
expectedParentCommit = None
message = None
- if not commit.parents:
+ if len(commit.parents) == 0:
message = "Going forward (initial commit)."
expectedParentCommit = None
else:
@@ -232,18 +261,22 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C
# commit classification
if self._commit_classifier and commit.hexsha not in self._clf_results:
- prob = self._commit_classifier.predict(commit, diff_index)
+ prob = self._commit_classifier.predict(commit, diff_index, self._repo)
self._clf_results[commit.hexsha] = prob
- # t2: update_graph time
+ # t2: update_graph + git diff traversing time
t2 = time.monotonic()
+ # t2a: get_contents time
+ t2a = 0
+ # t2b: update_graph time
+ t2b = 0
if diff_index:
for diff in diff_index:
old_fname, new_fname = _get_fnames(diff)
- # apply filter
+ # apply file-level filter
# if a file comes into/goes from our view, we will set corresponding old_fname/new_fname to None,
# as if the file is introduced/removed in this commit.
- # However, the diff will keep its original, no matter if the file has been filtered in/out.
+ # However, the diff will not change, regardless of whether the file has been filtered out or not.
if old_fname and not self._graphServer.filter_file(old_fname):
old_fname = None
if new_fname and not self._graphServer.filter_file(new_fname):
@@ -254,17 +287,25 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C
old_src = new_src = None
+ t2a0 = time.monotonic()
if old_fname:
old_src = get_contents(self._repo, parentCommit, old_fname)
+ if self._file_is_too_large(old_fname, old_src):
+ continue
if new_fname:
new_src = get_contents(self._repo, commit, new_fname)
+ if self._file_is_too_large(new_fname, new_src):
+ continue
+ t2a += time.monotonic() - t2a0
+ t2b0 = time.monotonic()
if old_src or new_src:
result = self._graphServer.update_graph(
old_fname, old_src, new_fname, new_src, diff.diff)
if asyncio.iscoroutine(result):
await result
+ t2b += time.monotonic() - t2b0
t2 = time.monotonic() - t2
# t3: end_commit time
@@ -275,24 +316,39 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C
t3 = time.monotonic() - t3
self._observer.onAfterCommit(self, commit, seekingMode)
t0 = time.monotonic() - t0
- _logger.info("t0 = %.2f, t1 = %.2f, t2 = %.2f, t3 = %.2f",
- t0, t1, t2, t3)
+ _logger.info("t0 = %.2f, t1 = %.2f, t2 = %.2f, t2a = %.2f, t2b = %.2f, t3 = %.2f",
+ t0, t1, t2, t2a, t2b, t3)
assert self._graphServer.get_workspace_commit_hexsha() == commit.hexsha, \
"GraphServer.get_workspace_commit_hexsha should be return the hexsha seen in last start_commit."
def _filter_monolithic_commit(self, commit: Commit, seeking_mode: CommitSeekingMode) -> CommitSeekingMode:
# filter monolithic commit
- if seeking_mode == CommitSeekingMode.NormalForward and len(commit.parents) == 1:
+ # hot fix: enable filter_monolithic_commit on first commit
+ if seeking_mode == CommitSeekingMode.NormalForward and len(commit.parents) <= 1:
changed_lines = 0
files = commit.stats.files
for fname in files:
if self._graphServer.filter_file(fname):
changed_lines += files[fname]['lines']
+ print('_filter_monolithic_commit commit:', commit.hexsha, 'changed_lines:', changed_lines)
if changed_lines > self._monolithic_commit_lines_threshold:
# enforce using CommitSeekingMode.MergeCommit to update graph without updating node history
+ print('_filter_monolithic_commit set CommitSeekingMode to MergeCommit')
return CommitSeekingMode.MergeCommit
return seeking_mode
+ def _file_is_too_large(self, fname, file_content):
+ # Filter monolithic file by its byte size
+ # Returns True if the file size exceeds the threshold
+ file_size = sys.getsizeof(file_content)
+ too_large = file_size > self._monolithic_file_bytes_threshold
+ if too_large:
+ message = 'WARNING: file too large;'
+ else:
+ message = 'OK: file normal size;'
+ print(message, fname, str(file_size / 1000) + 'kB')
+ return too_large
+
def _get_fnames(diff: Diff):
if diff.new_file:
diff --git a/persper/analytics/call_commit_graph.py b/persper/analytics/call_commit_graph.py
index 0c83fc928c6..48fdf6c2dc2 100644
--- a/persper/analytics/call_commit_graph.py
+++ b/persper/analytics/call_commit_graph.py
@@ -3,13 +3,19 @@
====================================
CallCommitGraph stores all relevant analysis results
"""
+import logging
+import community
import networkx as nx
from networkx.readwrite import json_graph
+from typing import Union, Set, List, Dict, Optional
+
from persper.analytics.devrank import devrank
from persper.analytics.score import normalize
-from typing import Union, Set, List, Dict, Optional
from persper.analytics.complexity import eval_project_complexity
+_logger = logging.getLogger(__name__)
+
+
class CommitIdGenerators:
@staticmethod
def fromOrdinal(ordinal: int, hexsha: str, message: str):
@@ -50,6 +56,7 @@ def _to_networkx_format(graph_data: Dict) -> Dict:
def reset(self):
"""Reset all internal states"""
self._digraph = self._new_graph()
+ self._digraph.degree()
def _new_graph(self):
"""Create a new nx.DiGraph for underlying storage
@@ -97,10 +104,16 @@ def _next_cindex(self):
# TODO: remove the default value of files
def add_node(self, node: str, files: Union[Set[str], List[str]] = []):
+ if node is None:
+ _logger.error("Argument node is None in add_node.")
+ return
self._digraph.add_node(node, size=None, history={}, files=set(files))
# add_node must be called on source and target first
def add_edge(self, source, target):
+ if source is None or target is None:
+ _logger.error("Argument source or target is None in add_edge.")
+ return
if source not in self._digraph:
raise ValueError("Error: caller %s does not exist in call-commit graph." % source)
if target not in self._digraph:
@@ -134,9 +147,15 @@ def update_node_history_accurate(self, node, fstat):
# read/write access to node history are thourgh this function
def _get_node_history(self, node: str) -> Dict[str, Dict[str, int]]:
+ if node is None:
+ _logger.error("Argument node is None in _get_node_history.")
+ return {}
return self._digraph.nodes[node]['history']
def update_node_files(self, node: str, new_files: Union[Set[str], List[str]]):
+ if node is None:
+ _logger.error("Argument node is None in update_node_files")
+ return
self._digraph.nodes[node]['files'] = set(new_files)
# TODO: provide other options for computing a node's size
@@ -148,21 +167,24 @@ def _set_all_nodes_size(self, black_set=None):
"""
for node in self.nodes():
node_history = self._get_node_history(node)
- if black_set is not None:
- size = 0
- for cid, chist in node_history.items():
- sha = self.commits()[cid]['hexsha']
- if sha not in black_set:
- size += (chist['adds'] + chist['dels'])
- else:
- size = sum([chist['adds'] + chist['dels'] for chist in node_history.values()])
-
+ size = 0
+ for cid, chist in node_history.items():
+ sha = self.commits()[cid]['hexsha']
+ if black_set is not None and sha in black_set:
+ continue
+ if 'added_units' in chist.keys() and 'removed_units' in chist.keys():
+ size += (chist['added_units'] + chist['removed_units'])
+ else:
+ size += (chist['adds'] + chist['dels'])
# set default size to 1 to avoid zero division error
if size == 0:
size = 1
self._set_node_size(node, size)
def _set_node_size(self, node, size):
+ if node is None:
+ _logger.error("Argument node is None in _set_node_size.")
+ # set node size even if it is None since we'd like to suppress the error
self._digraph.nodes[node]['size'] = size
def _set_all_edges_weight(self):
@@ -180,12 +202,17 @@ def eval_project_complexity(self, r_n: float, r_e: float):
"""
return eval_project_complexity(self._digraph, r_n, r_e)
+ def _remove_invalid_nodes(self):
+ if None in self.nodes():
+ self._digraph.remove_node(None)
+
def function_devranks(self, alpha, black_set=None):
"""
Args:
alpha - A float between 0 and 1, commonly set to 0.85
black_set - A set of commit hexshas to be blacklisted
"""
+ self._remove_invalid_nodes()
self._set_all_nodes_size(black_set=black_set)
return devrank(self._digraph, 'size', alpha=alpha)
@@ -206,7 +233,10 @@ def commit_devranks(self, alpha, black_set=None):
continue
for cid, chist in history.items():
- csize = chist['adds'] + chist['dels']
+ if 'added_units' in chist.keys() and 'removed_units' in chist.keys():
+ csize = (chist['added_units'] + chist['removed_units'])
+ else:
+ csize = (chist['adds'] + chist['dels'])
sha = self.commits()[cid]['hexsha']
if black_set is None or sha not in black_set:
dr = (csize / size) * func_devranks[func]
@@ -238,3 +268,32 @@ def developer_devranks(self, alpha, black_set=None):
else:
developer_devranks[email] = commit_devranks[sha]
return developer_devranks
+
+ def compute_modularity(self):
+ """Compute modularity score based on function graph.
+
+ Returns
+ -------
+ modularity : float
+ The modularity score of this graph.
+ """
+ # Check the number of edges
+ if len(self.edges()) == 0:
+ return 0.
+
+ # Construct non directed graph
+ graph = nx.Graph()
+ for node in self.nodes():
+ if node is not None:
+ graph.add_node(node)
+ for (source, target) in self.edges():
+ if source is not None and target is not None:
+ graph.add_edge(source, target)
+ # Compute the partition of the graph nodes
+ partition = community.best_partition(graph)
+ # Compute modularity
+ modularity = community.modularity(partition, graph)
+ # Normalize [0, 1] to [0, 100]
+ modularity = modularity * 100
+
+ return modularity
diff --git a/persper/analytics/call_graph/c.py b/persper/analytics/call_graph/c.py
index 7ff8bbda713..c252e4fc931 100644
--- a/persper/analytics/call_graph/c.py
+++ b/persper/analytics/call_graph/c.py
@@ -18,6 +18,11 @@ class NotFunctionCallError(UnexpectedASTError):
pass
+class UnexpectedCallNodeError(UnexpectedASTError):
+ """Raised when a function call's callee name cannot be parsed"""
+ pass
+
+
def _handle_function(func_node):
"""Extract name and range from a node
@@ -122,17 +127,39 @@ def _handle_call(call_node):
Case 2: function call from struct variable
Example: tty->write(tty)
+ Case 3: function call in a chain
+ Example: (*mi).second.empty()
+
Raises:
NotFunctionCallError
+ UnexpectedCallNodeError
"""
name_node = call_node.find('srcml:name', ns)
if name_node is None:
# Case 1
raise NotFunctionCallError()
+
+ def last_sub_name_node(node):
+ name_lst = node.findall('srcml:name', ns)
+ if len(name_lst) > 0:
+ return name_lst[-1]
+ else:
+ raise UnexpectedCallNodeError()
+
+ # Case 2 & 3
callee_name = name_node.text
- if callee_name is None:
- # Case 2
- callee_name = name_node[-1].text
+ # DEBUG
+ # print_flag = False
+ # if callee_name is None:
+ # print_flag = True
+ # from persper.analytics.call_graph.utils import transform_node_to_src
+ # print(transform_node_to_src(name_node))
+ while callee_name is None:
+ name_node = last_sub_name_node(name_node)
+ callee_name = name_node.text
+ # DEBUG
+ # if print_flag:
+ # print(callee_name)
return callee_name
@@ -168,6 +195,9 @@ def update_graph(ccgraph, ast_list, change_stats, new_fname_to_old_fname):
except NotFunctionCallError as e:
# do not print error since we expect this to happen a lot
continue
+ except UnexpectedCallNodeError as e:
+ print(type(e).__name__, e.args)
+ continue
if callee_name not in ccgraph:
# Pass [] to files argument since we don't know
diff --git a/persper/analytics/commit_classifier.py b/persper/analytics/commit_classifier.py
index 7e3a5cb58f7..b4bac5d4d1e 100644
--- a/persper/analytics/commit_classifier.py
+++ b/persper/analytics/commit_classifier.py
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
-from git import Commit, DiffIndex
+from git import Commit, DiffIndex, Repo
class CommitClassifier(ABC):
@@ -8,13 +8,14 @@ class CommitClassifier(ABC):
"""
@abstractmethod
- def predict(self, commit: Commit, diff_index: DiffIndex):
+ def predict(self, commit: Commit, diff_index: DiffIndex, repo: Repo):
"""
Args:
commit: A gitpython's Commit object.
diff_index: A gitpython's DiffIndex object.
It is a list of Diff object, each containing the
diff information between a pair of old/new source files.
+ repo: A gitpython's Repo object.
Returns:
diff --git a/persper/analytics/git_tools.py b/persper/analytics/git_tools.py
index 01f25e7a8ef..2080a3d0589 100644
--- a/persper/analytics/git_tools.py
+++ b/persper/analytics/git_tools.py
@@ -16,7 +16,10 @@ def diff_with_first_parent(repo: Repo, commit: Commit):
def diff_with_commit(repo: Repo, current_commit: Commit, base_commit_sha: str):
- if not base_commit_sha:
+ # about git.NULL_TREE: https://github.com/gitpython-developers/GitPython/blob/master/git/diff.py#L87
+ if current_commit is None:
+ current_commit = git.NULL_TREE
+ if base_commit_sha is None:
base_commit = repo.tree(EMPTY_TREE_SHA)
else:
base_commit = repo.commit(base_commit_sha)
diff --git a/persper/analytics/graph_server.py b/persper/analytics/graph_server.py
index 406c3f39942..049c721166b 100644
--- a/persper/analytics/graph_server.py
+++ b/persper/analytics/graph_server.py
@@ -6,13 +6,15 @@
from persper.analytics.call_commit_graph import CallCommitGraph
JS_FILENAME_REGEXES = [
- r'.+\.js$',
+ r'.+\.(js|vue|ts|tsx)$',
r'^(?!dist/).+',
r'^(?!test(s)?/).+',
r'^(?!spec/).+',
r'^(?!build/).+',
r'^(?!bin/).+',
- r'^(?!doc(s)?/).+'
+ r'^(?!doc(s)?/).+',
+ r'.*(?