Skip to content

Commit

Permalink
v-0.3
Browse files Browse the repository at this point in the history
Merge branch 'staging/ca-0.5.1' into 'master'

See merge request persper/code-analytics!132
  • Loading branch information
hezyin committed Jun 26, 2019
2 parents ce38e4d + cd2f8fb commit 2c72de2
Show file tree
Hide file tree
Showing 17 changed files with 443 additions and 57 deletions.
8 changes: 4 additions & 4 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@ stages:

test_ci:
stage: test
image: ubuntu:18.04
image: hub.meri.dev/test-docker/test:latest
# only:
# - setup-ci
before_script:
- apt update && apt install -y openssh-client wget libarchive-dev libcurl4-openssl-dev git python3.7 python3-pip
- apt update && apt install -y libarchive-dev #libcurl4-openssl-dev
- apt install -y zlib1g-dev libicu-dev libcurl3 libcurl-openssl1.0-dev
- apt install -y build-essential cmake libssl-dev pkg-config cmake
- wget http://131.123.42.38/lmcrs/beta/srcML-Ubuntu18.04.deb
- dpkg -i srcML-Ubuntu18.04.deb
- mkdir -p ~/.ssh
Expand All @@ -22,11 +24,9 @@ test_ci:
- export LC_ALL=C.UTF-8
- export LANG=C.UTF-8
script:
- apt-get update
- git config --global user.email "[email protected]"
- git config --global user.name "merico"
- pip3 install pipenv

- echo -e "machine gitlab.com\nlogin ${GITLAB_USER}\npassword ${GITLAB_PASSWD}" > ~/.netrc
- git clone https://gitlab.com/persper/code-analytics.git && cd code-analytics
#&& git checkout ${CI_COMMIT_REF_NAME}
Expand Down
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ aenum = "*"
pytest-cov = "*"
gitpython = "*"
sphinx = "*"
python-louvain = "*"

[dev-packages]

Expand Down
21 changes: 14 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,21 @@ The following procedure is tested on Ubuntu 16.04 LTS.
Download and install Python 3.6+: <https://www.python.org/downloads/>.

Also, create a symbolic link from `python3` to `python` since some scripts rely on it.
```
```sh
sudo ln -s /usr/bin/python3 /usr/bin/python
```

2. Install Python dependencies (we recommend using pipenv)

```bash
```sh
pipenv install
```

3. Update git

In order to use the `--indent-heuristic` option of `git diff`, we require git version >= 2.11. Use the following commands to upgrade:

```bash
```sh
sudo add-apt-repository ppa:git-core/ppa -y
sudo apt-get update
sudo apt-get install git -y
Expand All @@ -40,12 +40,12 @@ git --version

Add the following line to your `~/.bashrc` file.

```
```sh
export PYTHONPATH=$PYTHONPATH:/path/to/dir
```

To update your path for the remainder of the session.
```
```sh
source ~/.bashrc
```

Expand All @@ -55,14 +55,21 @@ Please download from [here](https://www.srcml.org/#download) and follow the [ins

srcML also needs `libarchive-dev` and `libcurl4-openssl-dev`. Install them with the following commands:

```bash
```sh
sudo apt install libarchive-dev
sudo apt install libcurl4-openssl-dev
```

6. Check setup correctness

```bash
As the test process will create Git repositories, set up your global Git user name and email before testing:
```sh
git config --global user.email "[email protected]"
git config --global user.name "Your Name"
```

Run the test process:
```sh
pipenv run pytest test/test_analytics
```

Expand Down
72 changes: 72 additions & 0 deletions notebooks/demo.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# import deps\n",
"import os\n",
"from persper.analytics.c import CGraphServer\n",
"from persper.analytics.analyzer2 import Analyzer\n",
"from persper.analytics.graph_server import C_FILENAME_REGEXES\n",
"from persper.util.path import root_path"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# configure your project\n",
"repo_path = os.path.join(root_path, 'repos/<your_repo_name>')\n",
"\n",
"# configure alpha for devrank\n",
"alpha = 0.5"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# start analysis and show commit devrank values\n",
"az = Analyzer(repo_path, CGraphServer(C_FILENAME_REGEXES))\n",
"await az.analyze()\n",
"ccgraph = az.graph\n",
"ccgraph.commit_devranks(alpha)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "code-analytics-8iDyuztf",
"language": "python",
"name": "code-analytics-8idyuztf"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
78 changes: 67 additions & 11 deletions persper/analytics/analyzer2.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import collections.abc
import logging
import re
import sys
import time
from abc import ABC
from typing import List, Optional, Set, Union, Dict
Expand All @@ -22,7 +23,8 @@ def __init__(self, repositoryRoot: str, graphServer: GraphServer,
firstParentOnly: bool = False,
commit_classifier: Optional[CommitClassifier] = None,
skip_rewind_diff: bool = False,
monolithic_commit_lines_threshold: int = 5000):
monolithic_commit_lines_threshold: int = 5000,
monolithic_file_bytes_threshold: int = 200000):
# skip_rewind_diff will skip diff, but rewind commit start/end will still be notified to the GraphServer.
self._repositoryRoot = repositoryRoot
self._graphServer = graphServer
Expand All @@ -37,6 +39,8 @@ def __init__(self, repositoryRoot: str, graphServer: GraphServer,
self._clf_results: Dict[str, List[float]] = {}
self._skip_rewind_diff = skip_rewind_diff
self._monolithic_commit_lines_threshold = monolithic_commit_lines_threshold
self._monolithic_file_bytes_threshold = monolithic_file_bytes_threshold
self._call_commit_graph = None

def __getstate__(self):
state = self.__dict__.copy()
Expand Down Expand Up @@ -107,8 +111,22 @@ def firstParentOnly(self, value: bool):

@property
def graph(self):
    """Return the latest call-commit graph from the graph server.

    The graph is cached in ``self._call_commit_graph``; ``analyze()``
    resets the cache to ``None`` so the first access after an analysis
    run fetches a fresh graph. Fetching retries up to 10 times, one
    second apart, because the graph server may fail transiently.

    Raises
    ------
    Exception
        If no graph could be obtained after all retries.
    """
    if self._call_commit_graph is None:
        ccg = None
        # Retry up to 10 times when getting the graph from the graph server.
        for attempt in range(10):
            try:
                ccg = self._graphServer.get_graph()
                if ccg is not None:
                    break
            except Exception:
                logging.info('get graph failed:{}'.format(attempt))
                time.sleep(1)
        if ccg is None:
            raise Exception('failed to get graph from graph server')
        self._call_commit_graph = ccg
    return self._call_commit_graph
@property
def visitedCommits(self) -> Set[str]:
"""
Expand Down Expand Up @@ -137,7 +155,18 @@ def compute_project_complexity(self, r_n: int, r_e: int):
"""
return self.graph.eval_project_complexity(r_n, r_e)

def compute_modularity(self):
    """Compute the modularity score of the current function graph.

    Returns
    -------
    modularity : float
        The modularity score of this graph.
    """
    current_graph = self.graph
    return current_graph.compute_modularity()

async def analyze(self, maxAnalyzedCommits=None, suppressStdOutLogs=False):
self._call_commit_graph = None
commitSpec = self._terminalCommit
if self._originCommit:
commitSpec = self._originCommit.hexsha + ".." + self._terminalCommit.hexsha
Expand Down Expand Up @@ -173,7 +202,7 @@ def printCommitStatus(level, status: str):
else:
expectedParentCommit = None
message = None
if not commit.parents:
if len(commit.parents) == 0:
message = "Going forward (initial commit)."
expectedParentCommit = None
else:
Expand Down Expand Up @@ -232,18 +261,22 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C

# commit classification
if self._commit_classifier and commit.hexsha not in self._clf_results:
prob = self._commit_classifier.predict(commit, diff_index)
prob = self._commit_classifier.predict(commit, diff_index, self._repo)
self._clf_results[commit.hexsha] = prob

# t2: update_graph time
# t2: update_graph + git diff traversing time
t2 = time.monotonic()
# t2a: get_contents time
t2a = 0
# t2a: update_graph time
t2b = 0
if diff_index:
for diff in diff_index:
old_fname, new_fname = _get_fnames(diff)
# apply filter
# apply file-level filter
# if a file comes into/goes from our view, we will set corresponding old_fname/new_fname to None,
# as if the file is introduced/removed in this commit.
# However, the diff will keep its original, no matter if the file has been filtered in/out.
# However, the diff will not change, regardless of whether the file has been filtered out or not.
if old_fname and not self._graphServer.filter_file(old_fname):
old_fname = None
if new_fname and not self._graphServer.filter_file(new_fname):
Expand All @@ -254,17 +287,25 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C

old_src = new_src = None

t2a0 = time.monotonic()
if old_fname:
old_src = get_contents(self._repo, parentCommit, old_fname)
if self._file_is_too_large(old_fname, old_src):
continue

if new_fname:
new_src = get_contents(self._repo, commit, new_fname)
if self._file_is_too_large(new_fname, new_src):
continue
t2a += time.monotonic() - t2a0

t2b0 = time.monotonic()
if old_src or new_src:
result = self._graphServer.update_graph(
old_fname, old_src, new_fname, new_src, diff.diff)
if asyncio.iscoroutine(result):
await result
t2b += time.monotonic() - t2b0
t2 = time.monotonic() - t2

# t3: end_commit time
Expand All @@ -275,24 +316,39 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C
t3 = time.monotonic() - t3
self._observer.onAfterCommit(self, commit, seekingMode)
t0 = time.monotonic() - t0
_logger.info("t0 = %.2f, t1 = %.2f, t2 = %.2f, t3 = %.2f",
t0, t1, t2, t3)
_logger.info("t0 = %.2f, t1 = %.2f, t2 = %.2f, t2a = %.2f, t2b = %.2f, t3 = %.2f",
t0, t1, t2, t2a, t2b, t3)
assert self._graphServer.get_workspace_commit_hexsha() == commit.hexsha, \
"GraphServer.get_workspace_commit_hexsha should be return the hexsha seen in last start_commit."

def _filter_monolithic_commit(self, commit: Commit, seeking_mode: CommitSeekingMode) -> CommitSeekingMode:
    """Downgrade the seeking mode for monolithic commits.

    A commit whose changed-line count (summed over files accepted by
    the graph server's file filter) exceeds
    ``self._monolithic_commit_lines_threshold`` is analyzed in
    ``CommitSeekingMode.MergeCommit`` so the graph is updated without
    updating node history.

    Only applies to normal forward commits with at most one parent;
    the ``<= 1`` check deliberately includes the initial commit
    (hot fix: enable the filter on the first commit too).
    """
    if seeking_mode == CommitSeekingMode.NormalForward and len(commit.parents) <= 1:
        changed_lines = 0
        files = commit.stats.files
        for fname in files:
            # Count only files the graph server cares about.
            if self._graphServer.filter_file(fname):
                changed_lines += files[fname]['lines']
        print('_filter_monolithic_commit commit:', commit.hexsha, 'changed_lines:', changed_lines)
        if changed_lines > self._monolithic_commit_lines_threshold:
            # Enforce CommitSeekingMode.MergeCommit to update the graph
            # without updating node history.
            print('_filter_monolithic_commit set CommitSeekingMode to MergeCommit')
            return CommitSeekingMode.MergeCommit
    return seeking_mode

def _file_is_too_large(self, fname, file_content):
# Filter monolithic file by its byte size
# Returns True if under the threshold
file_size = sys.getsizeof(file_content)
too_large = file_size > self._monolithic_file_bytes_threshold
if too_large:
message = 'WARNING: file too large;'
else:
message = 'OK: file normal size;'
print(message, fname, str(file_size / 1000) + 'kB')
return too_large


def _get_fnames(diff: Diff):
if diff.new_file:
Expand Down
Loading

0 comments on commit 2c72de2

Please sign in to comment.