Skip to content

Commit

Permalink
v-0.2
Browse files Browse the repository at this point in the history
Merge branch 'develop' into 'master'

See merge request persper/code-analytics!89
  • Loading branch information
hezyin committed May 26, 2019
2 parents b992df4 + 36c1571 commit ce38e4d
Show file tree
Hide file tree
Showing 36 changed files with 1,290 additions and 311 deletions.
37 changes: 37 additions & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
stages:
  - build
  - test


test_ci:
  stage: test
  image: ubuntu:18.04
  # only:
  #   - setup-ci
  before_script:
    - apt update && apt install -y openssh-client wget libarchive-dev libcurl4-openssl-dev git python3.7 python3-pip
    - wget http://131.123.42.38/lmcrs/beta/srcML-Ubuntu18.04.deb
    - dpkg -i srcML-Ubuntu18.04.deb
    - mkdir -p ~/.ssh
    - echo "${DEPLOY_KEY}" | tr -d '\r' > ~/.ssh/id_rsa
    - chmod 600 ~/.ssh/id_rsa
    - eval "$(ssh-agent -s)"
    - ssh-keyscan -H "gitlab.com" >> ~/.ssh/known_hosts
    - chmod 644 ~/.ssh/known_hosts
    # BUG FIX: 'set VAR=...' is Windows cmd syntax; in the POSIX shell GitLab CI
    # runners use, it assigns positional parameters and the variable is never
    # exported. Use 'export' so srcML's shared libraries are found at runtime.
    - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
    - export LC_ALL=C.UTF-8
    - export LANG=C.UTF-8
  script:
    - apt-get update
    - git config --global user.email "[email protected]"
    - git config --global user.name "merico"
    - pip3 install pipenv
    - echo -e "machine gitlab.com\nlogin ${GITLAB_USER}\npassword ${GITLAB_PASSWD}" > ~/.netrc
    - git clone https://gitlab.com/persper/code-analytics.git && cd code-analytics
    # && git checkout ${CI_COMMIT_REF_NAME}
    - export PYTHONPATH=$PYTHONPATH:/root/code-analytics
    - pipenv install --python 3.7
    - pipenv run pytest -s test/test_analytics
    - pipenv run pytest -s test/test_analytics2
    - echo "Done"
95 changes: 65 additions & 30 deletions persper/analytics/analyzer2.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import re
import time
from abc import ABC
from typing import List, Optional, Set, Union
from typing import List, Optional, Set, Union, Dict

from git import Commit, Diff, DiffIndex, Repo

Expand All @@ -20,7 +20,10 @@ class Analyzer:
def __init__(self, repositoryRoot: str, graphServer: GraphServer,
terminalCommit: str = 'HEAD',
firstParentOnly: bool = False,
commit_classifier: Optional[CommitClassifier] = None):
commit_classifier: Optional[CommitClassifier] = None,
skip_rewind_diff: bool = False,
monolithic_commit_lines_threshold: int = 5000):
# skip_rewind_diff will skip diff, but rewind commit start/end will still be notified to the GraphServer.
self._repositoryRoot = repositoryRoot
self._graphServer = graphServer
self._repo = Repo(repositoryRoot)
Expand All @@ -32,6 +35,8 @@ def __init__(self, repositoryRoot: str, graphServer: GraphServer,
self._observer: AnalyzerObserver = emptyAnalyzerObserver
self._commit_classifier = commit_classifier
self._clf_results: Dict[str, List[float]] = {}
self._skip_rewind_diff = skip_rewind_diff
self._monolithic_commit_lines_threshold = monolithic_commit_lines_threshold

def __getstate__(self):
state = self.__dict__.copy()
Expand Down Expand Up @@ -123,6 +128,15 @@ def compute_commit_scores(self, alpha: float, label_weights: List[float],
top_one=top_one,
additive=additive)

def compute_project_complexity(self, r_n: float, r_e: float):
    """
    Evaluates project complexity of the underlying call-commit graph.

    Delegates to CallCommitGraph.eval_project_complexity; annotations widened
    from int to float for consistency with that method's declared signature
    (backward-compatible: int arguments are still accepted).

    params
    r_n: The conversion factor from node count to logic units.
    r_e: The conversion factor from edge count to logic units.
    """
    return self.graph.eval_project_complexity(r_n, r_e)

async def analyze(self, maxAnalyzedCommits=None, suppressStdOutLogs=False):
commitSpec = self._terminalCommit
if self._originCommit:
Expand Down Expand Up @@ -195,6 +209,9 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C
if type(commit) != Commit:
commit = self._repo.commit(commit)

# filter monolithic commit
seekingMode = self._filter_monolithic_commit(commit, seekingMode)

# t0: Total time usage
t0 = time.monotonic()
self._observer.onBeforeCommit(self, commit, seekingMode)
Expand All @@ -207,7 +224,11 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C
await result

t1 = time.monotonic() - t1
diff_index = diff_with_commit(self._repo, commit, parentCommit)
diff_index = None
if self._skip_rewind_diff and seekingMode == CommitSeekingMode.Rewind:
_logger.info("Skipped diff for rewinding commit.")
else:
diff_index = diff_with_commit(self._repo, commit, parentCommit)

# commit classification
if self._commit_classifier and commit.hexsha not in self._clf_results:
Expand All @@ -216,33 +237,34 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C

# t2: update_graph time
t2 = time.monotonic()
for diff in diff_index:
old_fname, new_fname = _get_fnames(diff)
# apply filter
# if a file comes into/goes from our view, we will set corresponding old_fname/new_fname to None,
# as if the file is introduced/removed in this commit.
# However, the diff will keep its original, no matter if the file has been filtered in/out.
if old_fname and not self._graphServer.filter_file(old_fname):
old_fname = None
if new_fname and not self._graphServer.filter_file(new_fname):
new_fname = None
if not old_fname and not new_fname:
# no modification
continue

old_src = new_src = None

if old_fname:
old_src = get_contents(self._repo, parentCommit, old_fname)

if new_fname:
new_src = get_contents(self._repo, commit, new_fname)

if old_src or new_src:
result = self._graphServer.update_graph(
old_fname, old_src, new_fname, new_src, diff.diff)
if asyncio.iscoroutine(result):
await result
if diff_index:
for diff in diff_index:
old_fname, new_fname = _get_fnames(diff)
# apply filter
# if a file comes into/goes from our view, we will set corresponding old_fname/new_fname to None,
# as if the file is introduced/removed in this commit.
# However, the diff will keep its original, no matter if the file has been filtered in/out.
if old_fname and not self._graphServer.filter_file(old_fname):
old_fname = None
if new_fname and not self._graphServer.filter_file(new_fname):
new_fname = None
if not old_fname and not new_fname:
# no modification
continue

old_src = new_src = None

if old_fname:
old_src = get_contents(self._repo, parentCommit, old_fname)

if new_fname:
new_src = get_contents(self._repo, commit, new_fname)

if old_src or new_src:
result = self._graphServer.update_graph(
old_fname, old_src, new_fname, new_src, diff.diff)
if asyncio.iscoroutine(result):
await result
t2 = time.monotonic() - t2

# t3: end_commit time
Expand All @@ -258,6 +280,19 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C
assert self._graphServer.get_workspace_commit_hexsha() == commit.hexsha, \
"GraphServer.get_workspace_commit_hexsha should be return the hexsha seen in last start_commit."

def _filter_monolithic_commit(self, commit: Commit, seeking_mode: CommitSeekingMode) -> CommitSeekingMode:
    """Downgrade oversized normal commits to merge-commit seeking mode.

    For a NormalForward, single-parent commit whose filtered changed-line
    count exceeds the monolithic threshold, return
    CommitSeekingMode.MergeCommit so the graph is updated without touching
    node history; otherwise return seeking_mode unchanged.
    """
    # Only plain forward commits with exactly one parent are candidates.
    if seeking_mode != CommitSeekingMode.NormalForward or len(commit.parents) != 1:
        return seeking_mode
    file_stats = commit.stats.files
    # Count changed lines only in files the graph server cares about.
    total_changed = sum(
        file_stats[fname]['lines']
        for fname in file_stats
        if self._graphServer.filter_file(fname)
    )
    if total_changed > self._monolithic_commit_lines_threshold:
        # Enforce MergeCommit mode: graph structure is updated, node
        # history is not, so a monolithic commit earns no dev credit.
        return CommitSeekingMode.MergeCommit
    return seeking_mode


def _get_fnames(diff: Diff):
if diff.new_file:
Expand Down
12 changes: 7 additions & 5 deletions persper/analytics/c.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from persper.analytics.call_commit_graph import CallCommitGraph


def function_change_stats(old_ast, new_ast, patch, patch_parser, ranges_func):
def function_change_stats(old_ast, old_src, new_ast, new_src, patch, patch_parser, ranges_func):
"""
Parse old/new source files and extract the change info for all functions
"""
Expand All @@ -19,19 +19,21 @@ def function_change_stats(old_ast, new_ast, patch, patch_parser, ranges_func):

if old_ast is not None:
forward_stats = get_changed_functions(
*ranges_func(old_ast), adds, dels, separate=True)
*ranges_func(old_ast), adds, dels, old_src, new_src, separate=True)

if new_ast is not None:
inv_adds, inv_dels = inverse_diff(adds, dels)
bckward_stats = get_changed_functions(
*ranges_func(new_ast), inv_adds, inv_dels, separate=True)
*ranges_func(new_ast), inv_adds, inv_dels, new_src, old_src, separate=True)

# merge forward and backward stats
for func, fstat in bckward_stats.items():
if func not in forward_stats:
forward_stats[func] = {
'adds': fstat['dels'],
'dels': fstat['adds']
'dels': fstat['adds'],
'added_units': fstat['removed_units'],
'removed_units': fstat['added_units']
}

return forward_stats
Expand Down Expand Up @@ -85,7 +87,7 @@ def update_graph(self, old_filename, old_src, new_filename, new_src, patch):
# Compatible with both the old and the new Analyzer
change_stats = {}
if self._seeking_mode != CommitSeekingMode.MergeCommit:
change_stats = function_change_stats(old_ast, new_ast, patch,
change_stats = function_change_stats(old_ast, old_src, new_ast, new_src, patch,
self._parse_patch,
get_func_ranges_c)

Expand Down
24 changes: 23 additions & 1 deletion persper/analytics/call_commit_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from persper.analytics.devrank import devrank
from persper.analytics.score import normalize
from typing import Union, Set, List, Dict, Optional

from persper.analytics.complexity import eval_project_complexity

class CommitIdGenerators:
@staticmethod
Expand Down Expand Up @@ -119,6 +119,19 @@ def update_node_history(self, node, num_adds, num_dels):
else:
node_history[self._current_commit_id] = {'adds': num_adds, 'dels': num_dels}

def update_node_history_accurate(self, node, fstat):
    """Accumulate a node's per-commit change stats, including logic units.

    A commit might update a node's history more than once when a single
    FunctionNode corresponds to more than one actual function, so stats
    for the current commit are summed rather than overwritten.
    """
    tracked_keys = ('adds', 'dels', 'added_units', 'removed_units')
    node_history = self._get_node_history(node)
    entry = node_history.get(self._current_commit_id)
    if entry is None:
        # First update for this commit: record a fresh stats entry.
        node_history[self._current_commit_id] = {k: fstat[k] for k in tracked_keys}
    else:
        # Repeat update within the same commit: add onto the existing entry.
        for k in tracked_keys:
            entry[k] += fstat[k]

# read/write access to node history are thourgh this function
def _get_node_history(self, node: str) -> Dict[str, Dict[str, int]]:
return self._digraph.nodes[node]['history']
Expand Down Expand Up @@ -158,6 +171,15 @@ def _set_all_edges_weight(self):
for nbr, datadict in self._digraph.pred[node].items():
datadict['weight'] = self._digraph.nodes[node]['size']

def eval_project_complexity(self, r_n: float, r_e: float):
    """Evaluate the project's complexity from this call-commit graph.

    params
    r_n: The conversion factor from node count to logic units.
    r_e: The conversion factor from edge count to logic units.
    """
    # Delegate to the module-level implementation over the raw digraph.
    complexity = eval_project_complexity(self._digraph, r_n, r_e)
    return complexity

def function_devranks(self, alpha, black_set=None):
"""
Args:
Expand Down
3 changes: 2 additions & 1 deletion persper/analytics/call_graph/c.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,8 @@ def update_graph(ccgraph, ast_list, change_stats, new_fname_to_old_fname):
if func not in ccgraph:
print("%s in change_stats but not in ccgraph" % func)
continue
ccgraph.update_node_history(func, fstat['adds'], fstat['dels'])
ccgraph.update_node_history_accurate(func, fstat)
# ccgraph.update_node_history(func, fstat['adds'], fstat['dels'])


def get_func_ranges_c(root):
Expand Down
36 changes: 36 additions & 0 deletions persper/analytics/complexity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import logging
from typing import Dict, List

import numpy as np
from networkx import DiGraph

_logger = logging.getLogger(__file__)


def eval_project_complexity(G: DiGraph, r_n: float, r_e: float):
    """
    Evaluates project complexity from the specified bare call commit graph.

    params
    G:   Call-commit graph whose nodes carry a "history" dict of per-commit
         change stats.
    r_n: The conversion factor from node count to logic units.
    r_e: The conversion factor from edge count to logic units.

    remarks
    The formula is
    complexity = sum_by_node(added_units + removed_units) + r_n*len(nodes) + r_e*len(edges)

    History entries written before logic-unit tracking lack the
    "added_units"/"removed_units" keys; for those, LOC counts
    ("adds"/"dels") are used as a fallback. The fallback is decided per
    entry (the original decided once from the first entry seen, which
    raised KeyError on graphs with mixed old/new history entries), and the
    warning is logged only once.
    """
    logical_units = 0
    warned = False
    for _, data in G.nodes(data=True):
        for entry in data["history"].values():
            if "added_units" in entry:
                logical_units += entry["added_units"] + entry["removed_units"]
            else:
                if not warned:
                    _logger.warning(
                        "Will use LOC instead of logic units to measure complexity.")
                    warned = True
                logical_units += entry["adds"] + entry["dels"]
    complexity = logical_units + r_n * len(G.nodes) + r_e * len(G.edges)
    return complexity
Loading

0 comments on commit ce38e4d

Please sign in to comment.