Skip to content

Commit

Permalink
Merge pull request #11 from Persper/blacklist-commits
Browse files Browse the repository at this point in the history
Provide the option to black list commits
  • Loading branch information
zhengxu001 authored Jan 9, 2019
2 parents bf2070e + ef1208c commit c70e8f2
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 33 deletions.
99 changes: 69 additions & 30 deletions persper/analytics/call_commit_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,47 +52,76 @@ def __contains__(self, node):
return node in self._digraph

def add_node(self, node):
self._digraph.add_node(node, size=0, history={})
self._digraph.add_node(node, size=None, history={})

# add_node must be called on source and target first
def add_edge(self, source, target):
self._digraph.add_edge(source, target,
addedBy=self._cur_cindex(),
weight=self._digraph.nodes[target]['size'])
weight=None)

def update_node_history(self, node, size):
# Use current commit index
cc_idx = self._cur_cindex()
node_history = self._digraph.nodes[node]['history']
node_history = self._get_node_history(node)
# A commit might update a node's history more than once
if cc_idx in node_history:
node_history[cc_idx] += size
else:
node_history[cc_idx] = size
self._update_node_size(node, size)
self._update_ingoing_weight(node)
self._check_history_match_size(node)

# node's size is automatically updated when history is updated
def _update_node_size(self, node, size):
self._digraph.nodes[node]['size'] += size

# edge's weight is automatically updated when history is updated
# needs to be called after _update_node_size
def _update_ingoing_weight(self, node):
for nbr, datadict in self._digraph.pred[node].items():
datadict['weight'] = self._digraph.nodes[node]['size']

def _check_history_match_size(self, node):
assert(sum(self._digraph.nodes[node]['history'].values()) ==
self._digraph.nodes[node]['size'])

def function_devranks(self, alpha):
# read/write access to node history goes through this function
def _get_node_history(self, node):
return self._digraph.nodes[node]['history']

def _set_all_nodes_size(self, black_set=None):
    """Recompute and store the size of every node in the graph.

    Node size is currently defined as the total number of edited lines
    recorded in the node's history. History entries that belong to
    blacklisted commits are left out of the total.

    Args:
        black_set - A set of commit hexshas to be blacklisted; None
            (the default) or an empty set counts every commit
    """
    # The commit list is only needed to map a commit index to its hexsha.
    # It is assumed not to change during this pass, so fetch it once
    # instead of once per history entry.
    commits = self.commits() if black_set else None

    for node in self.nodes():
        node_history = self._get_node_history(node)
        if black_set:
            size = sum(csize for cindex, csize in node_history.items()
                       if commits[cindex]['hexsha'] not in black_set)
        else:
            size = sum(node_history.values())

        # set default size to 1 to avoid zero division error
        if size == 0:
            size = 1
        self._set_node_size(node, size)

def _set_node_size(self, node, size):
self._digraph.nodes[node]['size'] = size

def _set_all_edges_weight(self):
    """Refresh all node sizes, then copy them onto the incoming edges.

    Every edge pointing at a node gets that node's 'size' attribute as
    its 'weight' attribute.
    """
    # Sizes must be recomputed first because weights are copied from them.
    self._set_all_nodes_size()
    for node in self.nodes():
        # Same value for every edge into this node, so read it once.
        size = self._digraph.nodes[node]['size']
        for datadict in self._digraph.pred[node].values():
            datadict['weight'] = size

def function_devranks(self, alpha, black_set=None):
"""
Args:
alpha - A float between 0 and 1, commonly set to 0.85
black_set - A set of commit hexshas to be blacklisted
"""
self._set_all_nodes_size(black_set=black_set)
return devrank(self._digraph, 'size', alpha=alpha)

def commit_devranks(self, alpha):
def commit_devranks(self, alpha, black_set=None):
"""
Args:
alpha - A float between 0 and 1, commonly set to 0.85
black_set - A set of commit hexshas to be blacklisted
"""
commit_devranks = {}
func_devranks = self.function_devranks(alpha)
func_devranks = self.function_devranks(alpha, black_set=black_set)

for func, data in self.nodes(data=True):
size = data['size']
Expand All @@ -103,21 +132,31 @@ def commit_devranks(self, alpha):

for cindex, csize in history.items():
sha = self.commits()[cindex]['hexsha']
dr = (csize / size) * func_devranks[func]
if sha in commit_devranks:
commit_devranks[sha] += dr
else:
commit_devranks[sha] = dr
if black_set is None or sha not in black_set:
dr = (csize / size) * func_devranks[func]
if sha in commit_devranks:
commit_devranks[sha] += dr
else:
commit_devranks[sha] = dr

return commit_devranks

def developer_devranks(self, alpha):
def developer_devranks(self, alpha, black_set=None):
"""
Args:
alpha - A float between 0 and 1, commonly set to 0.85
black_set - A set of commit hexshas to be blacklisted
"""
developer_devranks = {}
commit_devranks = self.commit_devranks(alpha)
commit_devranks = self.commit_devranks(alpha, black_set=black_set)

for commit in self.commits():
sha = commit['hexsha']
email = commit['authorEmail']

if sha not in commit_devranks:
continue

if email in developer_devranks:
developer_devranks[email] += commit_devranks[sha]
else:
Expand Down
3 changes: 0 additions & 3 deletions test/test_analytics/test_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from persper.analytics.analyzer import Analyzer
from persper.analytics.graph_server import C_FILENAME_REGEXES
from persper.util.path import root_path
from .util import assert_size_match_history


@pytest.fixture(scope='module')
Expand Down Expand Up @@ -49,9 +48,7 @@ def test_az_basic(az):

commits = ccgraph.commits()
for func, data in ccgraph.nodes(data=True):
size = data['size']
history = data['history']
assert_size_match_history(size, history)

for cindex, csize in history.items():
commit_message = commits[cindex]['message']
Expand Down
33 changes: 33 additions & 0 deletions test/test_analytics/test_call_commit_graph.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
import os
import shutil
import subprocess
from math import isclose
from git import Repo
from persper.analytics.call_commit_graph import CallCommitGraph
from persper.analytics.cpp import CPPGraphServer
from persper.analytics.analyzer import Analyzer
from persper.analytics.graph_server import CPP_FILENAME_REGEXES
from persper.util.path import root_path


def test_call_commit_graph():
Expand Down Expand Up @@ -84,3 +92,28 @@ def test_call_commit_graph():
assert(isclose(commit_drs3[third_commit['hexsha']], 0.454, rel_tol=1e-2))
assert(isclose(dev_drs3[first_commit['authorEmail']], 0.798, rel_tol=1e-2))
assert(isclose(dev_drs3[second_commit['authorEmail']], 0.201, rel_tol=1e-2))


def test_black_set():
    """
    Check that a blacklisted commit is left out of the commit devranks.

    The CRLF commit: https://github.com/bitcoin/bitcoin/commit/0a61b0df1224a5470bcddab302bc199ca5a9e356
    Its parent: https://github.com/bitcoin/bitcoin/commit/5b721607b1057df4dfe97f80d235ed372312f398
    Its grandparent: https://github.com/bitcoin/bitcoin/commit/2ef9cfa5b81877b1023f2fcb82f5a638b1eb013c
    Its great grandparent: https://github.com/bitcoin/bitcoin/commit/7d7797b141dbd4ed9db1dda94684beb3395c2534
    """
    repo_path = os.path.join(root_path, 'repos/bitcoin')
    bitcoin_url = 'https://github.com/bitcoin/bitcoin'
    # Clone only when there is no local checkout yet.
    if not os.path.exists(repo_path):
        Repo.clone_from(bitcoin_url, repo_path)
    az = Analyzer(repo_path, CPPGraphServer(CPP_FILENAME_REGEXES))
    crlf_sha = '0a61b0df1224a5470bcddab302bc199ca5a9e356'
    ggparent_sha = '7d7797b141dbd4ed9db1dda94684beb3395c2534'
    # Revision range from the great grandparent up to the CRLF commit.
    rev = ggparent_sha + '..' + crlf_sha
    az.analyze(rev=rev)
    ccgraph = az.get_graph()
    devdict = ccgraph.commit_devranks(0.85)
    devdict2 = ccgraph.commit_devranks(0.85, black_set={crlf_sha})
    assert(len(devdict) == 3)
    assert(len(devdict2) == 2)
    assert(crlf_sha in devdict)
    assert(crlf_sha not in devdict2)

0 comments on commit c70e8f2

Please sign in to comment.