From 8d4665cb4f3c0c5b10b19b90b24911aa64ef59fe Mon Sep 17 00:00:00 2001 From: xinyan Date: Sat, 11 May 2019 15:39:10 +0800 Subject: [PATCH 01/39] Add lifecycle methods in IGraphServer. --- .../abstractions/callcommitgraph.py | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/persper/analytics2/abstractions/callcommitgraph.py b/persper/analytics2/abstractions/callcommitgraph.py index 551a82a0b60..1f07eed72b1 100644 --- a/persper/analytics2/abstractions/callcommitgraph.py +++ b/persper/analytics2/abstractions/callcommitgraph.py @@ -293,8 +293,31 @@ class ICallCommitGraph(IReadOnlyCallCommitGraph, IWriteOnlyCallCommitGraph): class IGraphServer(ABC): """ - Provides basic functionality to trigger the commit analysis on graph server. + Provides basic functionality to trigger the commit analysis on graph server. + remarks + The call sequence: + * start + * update_graph + * update_graph + * ... + * update_graph + * stop """ + @abstractmethod + def start(self) -> None: + """ + When implemented, starts the graph server and get ready for commit analysis, if applicable. + This includes starting the graph server process, preparing workspace folder, etc. + """ + pass + + @abstractmethod + def stop(self) -> None: + """ + When implemented, stops the graph server and do necessary cleanup, if applicable. + """ + pass + @abstractmethod def update_graph(self, commit: ICommitInfo) -> None: """ From 2fc33f73a33d48238aea103799cf98dcd258892e Mon Sep 17 00:00:00 2001 From: Yang Zhikai Date: Sun, 12 May 2019 16:10:06 +0000 Subject: [PATCH 02/39] Squashed commits: e308d117 Merge remote-tracking branch 'origin/develop' into memory-ccg 9c5486bd delete comment code 6d4b6d06 update _ensure_node_exists method 3c674261 update ccg update-node-files edae8ef4 Merge branch 'hurthwell-refactor' into 'develop' 8d4665cb Add lifecycle methods in IGraphServer. 
b70339ce update ccg reuse method to construct ccg 1ec3c5e2 pass memory-ccg a08dbabb Merge branch 'master' into memory-ccg d29d45ee update memory-ccg 04d7c094 modify memory-ccg update_node_history 1532bf1c fix bugs f59d4fc8 modify memory-ccg bugs 5f62ab17 fix some exists problems 07c80d1d add memory-ccg test case and modify node data struture a28e6365 update ccg-memory node iterator 9ca1008c update memory-ccg cdbeeafc update ccg' a935aee1 update ccg-abstraction 874238b9 update memoryccg 866c61a8 update memoryccg ad711f3d implement basic momory-ccg ed3195bd implement basic momory-ccg bb0b4dfb modify memory-ccg bugs 3756f5f4 fix some exists problems 30c86fbd add memory-ccg test case and modify node data struture cccd1e77 Merge branch 'master' into memory-ccg 7cab91ab update ccg-memory node iterator 40aa8323 update memory-ccg cce89ea3 update ccg' 5e7d7641 Merge branch 'hurthwell-refactor' into memory-ccg 6ab9fb22 update ccg-abstraction 6cc9ece0 Merge remote-tracking branch 'origin/hurthwell-refactor' into memory-ccg c1dded46 update memoryccg d996ee1f update memoryccg 717f68a4 implement basic momory-ccg 04f1a682 implement basic momory-ccg --- .../abstractions/callcommitgraph.py | 6 +- persper/analytics2/memorycallcommitgraph.py | 149 ++++++++++++++++++ test/analytics2/__init__.py | 0 test/analytics2/abstractions/__init__.py | 0 .../abstractions/callcommitgraph.py | 29 ++-- test/analytics2/callcommitgraph.py | 14 -- test/analytics2/test_callcommitgraph.py | 13 ++ 7 files changed, 182 insertions(+), 29 deletions(-) create mode 100644 persper/analytics2/memorycallcommitgraph.py create mode 100644 test/analytics2/__init__.py create mode 100644 test/analytics2/abstractions/__init__.py delete mode 100644 test/analytics2/callcommitgraph.py create mode 100644 test/analytics2/test_callcommitgraph.py diff --git a/persper/analytics2/abstractions/callcommitgraph.py b/persper/analytics2/abstractions/callcommitgraph.py index 1f07eed72b1..fb438d2ae20 100644 --- a/persper/analytics2/abstractions/callcommitgraph.py +++ b/persper/analytics2/abstractions/callcommitgraph.py @@ -12,6 +12,10 @@ class NodeId(NamedTuple): name: str language: str + def __eq__(self, other): + return self.name == other.name and self.language == other.language + + class NodeHistoryItem: """ @@ -247,7 +251,7 @@ def update_node_history(self, node_id: NodeId, commit_hexsha: str, added_lines: pass @abstractmethod - def update_node_files(self, node_id: NodeId, files: Iterable[str] = None) -> None: + def update_node_files(self, node_id: NodeId, commit_hexsha: str, files: Iterable[str] = None) -> None: """ Sets or replaces the list of files that contains this node in the latest commit. Note that this method will replace the whole file list of the specified node. 
diff --git a/persper/analytics2/memorycallcommitgraph.py b/persper/analytics2/memorycallcommitgraph.py new file mode 100644 index 00000000000..7c18d9ade4d --- /dev/null +++ b/persper/analytics2/memorycallcommitgraph.py @@ -0,0 +1,149 @@ +from persper.analytics2.abstractions.callcommitgraph import * +from persper.analytics2.abstractions.repository import * +import sys +import logging +from collections import defaultdict + + +class MemoryCallCommitGraph(ICallCommitGraph): + def __init__(self, graph_data: dict=None): + self._nodes_dict = {} + self._edges_dict = {} + self._commits = {} + self._from_edges = defaultdict(list) + self._to_edges = defaultdict(list) + if graph_data: + for i in graph_data["nodes"]: + nodeid = NodeId(i["id"]['name'], i["id"]['language']) + for commit_id, history in i['history'].items: + self.update_node_history(nodeid, commit_id, history['adds'], history['dels']) + files = [] + for file in i["files"]: + files.append(file) + self.update_node_files(nodeid, files) + for i in graph_data["edges"]: + from_id = NodeId(i['from_id']["name"], i['from_id']["language"]) + to_id = NodeId(i['to_id']["name"], i['to_id']["language"]) + self.add_edge(from_id, to_id, i["added_by"]) + + for i in graph_data["commits"]: + self.add_commit(i["hex_sha"], Commit(i["hex_sha"], i["author_email"], i['author_name'], + i['author_date'], i['committer_email'], + i['committer_name'], i['commit_date'], + i['message'], i['parent']) ) + + def _ensure_node_exists(self, node_id: NodeId, commit_hexsha: str) -> None: + if node_id not in self._nodes_dict: + self._nodes_dict[node_id] = Node(node_id, added_by=commit_hexsha) + assert self._nodes_dict[node_id].added_by + + def get_node(self, id: NodeId) -> Node: + return self._nodes_dict.get(id, None) + + def get_nodes_count(self, name: str = None, language: str = None, + from_id: NodeId = None, to_id: NodeId = None) -> int: + base_set = self._nodes_dict.values() + if name is None and language is None and from_id is None and to_id is None: + return len(base_set) + count = 0 + for node in base_set: + if name is not None and node.node_id.name != name: + continue + if language is not None and node.node_id.language != language: + continue + if from_id is not None and node in self._from_edges[from_id]: + continue + if to_id is not None and node in self._to_edges[to_id]: + continue + count += 1 + return count + + def get_edge(self, from_id: NodeId, to_id: NodeId) -> Edge: + return self._edges_dict[(from_id, to_id)] + + def get_edges_count(self, from_name: str = None, from_language: str = None, to_name: str = None, + to_language: str = None) -> int: + base_set = self._edges_dict.values() + if from_name is None and from_language is None and to_name is None and to_language is None: + return len(base_set) + count = 0 + for edge in base_set: + if from_name is not None and edge.from_id.name != from_name: + continue + if to_name is not None and edge.to_id.name != to_name: + continue + if from_language is not None and edge.from_id.language != from_language: + continue + if to_language is not None and edge.to_id.language != to_language: + continue + count += 1 + return count + + def enum_edges(self, from_name: str = None, from_language: str = None, to_name: str = None, to_language: str = None) -> Iterable[Edge]: + base_set = self._edges_dict.values() + for edge in base_set: + if from_name is not None and edge.from_id.name != from_name: + continue + if to_name is not None and edge.to_id.name != to_name: + continue + if from_language is not None and edge.from_id.language != 
from_language: + continue + if to_language is not None and edge.to_id.language != to_language: + continue + yield edge + + def enum_nodes(self, name: str = None, language: str = None, from_id: NodeId = None, to_id: NodeId = None) -> Iterable[Node]: + base_set = self._nodes_dict.values() + for node in base_set: + if name is not None and node.name != name: + continue + if language is not None and node.language != language: + continue + if from_id is not None and node in self._from_edges[from_id]: + continue + if to_id is not None and node in self._to_edges[to_id]: + continue + yield node + + def enum_commits(self) -> Iterable[Commit]: + for commit in self._commits.values(): + yield commit + + def add_node(self, id: NodeId, node: Node) -> None: + self._nodes_dict[id] = node + + def update_node_history(self, node_id: NodeId, commit_hexsha: str, + added_lines: int = 0, removed_lines: int = 0) -> None: + self._ensure_node_exists(node_id, commit_hexsha) + for historyitem in self._nodes_dict[node_id].history: + if historyitem.hexsha == commit_hexsha: + self._nodes_dict[node_id].history = [NodeHistoryItem(commit_hexsha, + added_lines, removed_lines)] + return + self._nodes_dict[node_id].history.append(NodeHistoryItem(commit_hexsha, + added_lines, removed_lines)) + + def update_node_files(self, node_id: NodeId, commit_hexsha: str, + files: Iterable[str] = None) -> None: + self._ensure_node_exists(node_id, commit_hexsha) + self._nodes_dict[node_id].files = files + + def add_edge(self, from_id: NodeId, to_id: NodeId, commit_hexsha: str) -> None: + edge = Edge(from_id, to_id, commit_hexsha) + self._edges_dict[(from_id, to_id)] = edge + self._from_edges[from_id].append(to_id) + self._to_edges[to_id].append(from_id) + + def flush(self) -> None: + pass + + def add_commit(self, hex_sha: str, author_email: str, author_name: str, author_date: str, + committer_email: str, committer_name: str, commit_date: str, message: str) -> None: + self._commits[hex_sha] = Commit(hex_sha, author_email, author_name, + author_date, committer_email, committer_name, commit_date, message) + + def get_commit(self, hex_sha: str) -> Commit: + return self._commits[hex_sha] + + def update_commit(self, commit: Commit) -> None: + self._commits[commit.hexsha] = commit diff --git a/test/analytics2/__init__.py b/test/analytics2/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/test/analytics2/abstractions/__init__.py b/test/analytics2/abstractions/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/test/analytics2/abstractions/callcommitgraph.py b/test/analytics2/abstractions/callcommitgraph.py index 7b8d4f9c2ab..bf1e51f86ac 100644 --- a/test/analytics2/abstractions/callcommitgraph.py +++ b/test/analytics2/abstractions/callcommitgraph.py @@ -31,7 +31,7 @@ def create_dummy_commit(message: str = None, parents: Iterable[str] = None): def test_call_commit_graph(ccg: ICallCommitGraph): - #assert ccg + # assert ccg # commits commit1 = create_dummy_commit() commit2 = create_dummy_commit() @@ -59,12 +59,13 @@ def test_call_commit_graph(ccg: ICallCommitGraph): cppFiles = ["MyClass.h", "MyClass.cpp"] csFiles = ["MyClass.cs"] javaFiles = ["MyClass.java"] - ccg.update_node_files(cppnode1, added_files=cppFiles) - ccg.update_node_files(cppnode2, added_files=cppFiles) - ccg.update_node_files(cppnode3, added_files=cppFiles) - ccg.update_node_files(csnode2, added_files=csFiles) - ccg.update_node_files(csnode3, added_files=csFiles) - ccg.update_node_files(javanode1, added_files=javaFiles) + 
ccg.update_node_files(cppnode1, commit1.hexsha, files=cppFiles) + ccg.update_node_files(cppnode2, commit1.hexsha, files=cppFiles) + ccg.update_node_files(cppnode3, commit1.hexsha, files=cppFiles) + ccg.update_node_files(csnode1, commit2.hexsha, files=csFiles) + ccg.update_node_files(csnode2, commit2.hexsha, files=csFiles) + ccg.update_node_files(csnode3, commit2.hexsha, files=csFiles) + ccg.update_node_files(javanode1, commit3.hexsha, files=javaFiles) ccg.update_node_history(cppnode1, commit1.hexsha, 10, 0) # 10 will be overwritten ccg.update_node_history(cppnode1, commit1.hexsha, 20, -10) @@ -72,6 +73,7 @@ def test_call_commit_graph(ccg: ICallCommitGraph): ccg.update_node_history(cppnode3, commit1.hexsha, 10, 0) ccg.update_node_history(csnode2, commit2.hexsha, 5, 0) ccg.update_node_history(csnode3, commit2.hexsha, 4, 0) + ccg.update_node_history(csnode1, commit2.hexsha, 4, 0) ccg.add_edge(cppnode2, cppnode1, commit1.hexsha) ccg.add_edge(cppnode3, cppnode1, commit1.hexsha) # csnode1 is implicitly added @@ -79,7 +81,6 @@ def test_call_commit_graph(ccg: ICallCommitGraph): ccg.add_edge(csnode2, csnode1, commit1.hexsha) ccg.add_edge(csnode3, csnode2, commit1.hexsha) ccg.flush() - assert ccg.get_nodes_count() == 7 assert ccg.get_nodes_count(name=csnode2.name) == 2 assert ccg.get_nodes_count(name=csnode2.name, language=csnode2.language) == 1 @@ -91,7 +92,7 @@ def test_call_commit_graph(ccg: ICallCommitGraph): assert ccg.get_nodes_count(language="non_existent") == 0 assert ccg.get_edges_count() == 5 assert ccg.get_edges_count(from_language="cs") == 3 - assert ccg.get_edges_count(to_language="cpp") == 4 + assert ccg.get_edges_count(to_language="cpp") == 3 assert ccg.get_edges_count(from_language="cs", to_language="cpp") == 1 assert ccg.get_edges_count(to_name=cppnode1.name) == 3 @@ -101,15 +102,15 @@ def assertNode(node_id, added_by, files): assert node.node_id == node_id assert node.added_by == added_by assert set(node.files) == set(files) + return node - assert ccg.get_node(NodeId("non_existent", "cpp")) == None - assert ccg.get_node(NodeId(cppnode1.name, "non_existent")) == None + assert ccg.get_node(NodeId("non_existent", "cpp")) is None + assert ccg.get_node(NodeId(cppnode1.name, "non_existent")) is None assertNode(cppnode1, added_by=commit1.hexsha, files=cppFiles) assertNode(cppnode2, added_by=commit1.hexsha, files=cppFiles) assertNode(cppnode3, added_by=commit1.hexsha, files=cppFiles) - assertNode(csnode1, added_by=commit3.hexsha, files=csFiles) + assertNode(csnode1, added_by=commit2.hexsha, files=csFiles) assertNode(csnode2, added_by=commit2.hexsha, files=csFiles) assertNode(csnode3, added_by=commit2.hexsha, files=csFiles) - # javanode1 is not connected nor has node history, so it shouldn't have added_by. 
- assertNode(javanode1, added_by=None, files=javaFiles) + assertNode(javanode1, added_by=commit3.hexsha, files=javaFiles) diff --git a/test/analytics2/callcommitgraph.py b/test/analytics2/callcommitgraph.py deleted file mode 100644 index dab597a1b19..00000000000 --- a/test/analytics2/callcommitgraph.py +++ /dev/null @@ -1,14 +0,0 @@ -import os.path -import subprocess -import test.analytics2.abstractions.callcommitgraph as ccghelper - -# TODO import your call commit graph implementation(s) -# from persper.analytics2.callcommitgraph import InMemoryCallCommitGraph -from persper.util.path import root_path - - -def test_in_memory_call_commit_graph(): - ccg = None - # TODO create an instance for testing - #ccg = InMemoryCallCommitGraph() - ccghelper.test_call_commit_graph(ccg) diff --git a/test/analytics2/test_callcommitgraph.py b/test/analytics2/test_callcommitgraph.py new file mode 100644 index 00000000000..575ea9e187a --- /dev/null +++ b/test/analytics2/test_callcommitgraph.py @@ -0,0 +1,13 @@ +import subprocess +import pytest +from ..analytics2.abstractions import callcommitgraph as ccghelper + +# TODO import your call commit graph implementation(s) +from persper.analytics2.memorycallcommitgraph import MemoryCallCommitGraph +from persper.util.path import root_path + + +def test_memory_call_commit_graph(): + ccg = MemoryCallCommitGraph() + ccghelper.test_call_commit_graph(ccg) + From d16a6b0f41148b85482b9acbf4fa1565a52dcb6d Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 8 May 2019 23:43:18 +0800 Subject: [PATCH 03/39] Rename IRepositoryHistoryProvider -> ICommitRepository. --- persper/analytics2/abstractions/repository.py | 2 +- persper/analytics2/repository.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/persper/analytics2/abstractions/repository.py b/persper/analytics2/abstractions/repository.py index 91437f5d048..28f4599a2a6 100644 --- a/persper/analytics2/abstractions/repository.py +++ b/persper/analytics2/abstractions/repository.py @@ -208,7 +208,7 @@ def __repr__(self): type(self).__name__, self.old_file, self.new_file, str(self.operation)) -class IRepositoryHistoryProvider(ABC): +class ICommitRepository(ABC): """ Provides functionality for accessing commit history of a specified commit. """ diff --git a/persper/analytics2/repository.py b/persper/analytics2/repository.py index 44820376424..a66b6e1a4d9 100644 --- a/persper/analytics2/repository.py +++ b/persper/analytics2/repository.py @@ -6,16 +6,18 @@ from git import Blob, Commit, Diff, DiffIndex, Repo -from persper.analytics2.abstractions.repository import ( - FileDiffOperation, ICommitInfo, IFileDiff, IFileInfo, - IRepositoryHistoryProvider, IWorkspaceFileFilter) +from persper.analytics2.abstractions.repository import (FileDiffOperation, + ICommitInfo, + ICommitRepository, + IFileDiff, IFileInfo, + IWorkspaceFileFilter) _logger = logging.getLogger(__name__) EMPTY_TREE_SHA = '4b825dc642cb6eb9a060e54bf8d69288fbee4904' -class GitRepository(IRepositoryHistoryProvider): +class GitRepository(ICommitRepository): def __init__(self, repo_path: str, first_parent_only: bool = False): """ params From 618f2671f0d89736932f2c987221a7e61a342421 Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 8 May 2019 23:49:51 +0800 Subject: [PATCH 04/39] Add skeleton for MetaAnalyzer. 
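Intended wiring, as a rough sketch (the two analyzer objects below are placeholders for ICommitAnalyzer / IPostAnalyzer implementations, and analyze() is still a stub in this commit):

    from persper.analytics2.metaanalyzer import MetaAnalyzer
    from persper.analytics2.repository import GitRepository

    repo = GitRepository("/path/to/repo")            # an ICommitRepository implementation
    ma = MetaAnalyzer(repo,
                      commit_analyzers=[my_commit_analyzer],
                      post_analyzers=[my_post_analyzer],
                      origin_commit=None, terminal_commit="HEAD")
    ma.analyze(max_commits=100)                      # intended to walk origin..terminal, at most 100 commits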
--- persper/analytics2/metaanalyzer.py | 52 ++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 persper/analytics2/metaanalyzer.py diff --git a/persper/analytics2/metaanalyzer.py b/persper/analytics2/metaanalyzer.py new file mode 100644 index 00000000000..ea2dac265d1 --- /dev/null +++ b/persper/analytics2/metaanalyzer.py @@ -0,0 +1,52 @@ +from typing import Iterable + +from persper.analytics2.abstractions.analyzers import ICommitAnalyzer, IPostAnalyzer +from persper.analytics2.abstractions.repository import ICommitRepository + + +class MetaAnalyzer(): + """ + Coordinates `ICommitAnalyzer` and `IPostAnalyzer` implementation, doing analysis through the commit history. + """ + def __init__(self, history_provider: ICommitRepository, + commit_analyzers: Iterable[ICommitAnalyzer], post_analyzers: Iterable[IPostAnalyzer], + origin_commit: str = None, terminal_commit: str = "HEAD", + first_parent_only: bool = False, + analyzed_commits: Iterable[str] = None): + if not isinstance(history_provider, ICommitRepository): + raise ValueError("Expect ICommitRepository instance for history_provider.") + # do necessary defensive copies + self._history_provider = history_provider + self._commit_analyzers = list(commit_analyzers) + self._post_analyzers = list(post_analyzers) + self._origin_commit = origin_commit + self._terminal_commit = terminal_commit + self._first_parent_only = first_parent_only + self._analyzed_commits = set(analyzed_commits) if analyzed_commits else set() + + @property + def origin_commit(self): + return self._origin_commit + + @origin_commit.setter + def origin_commit(self, value: str): + self._origin_commit = value + + @property + def terminal_commit(self): + return self._terminal_commit + + @terminal_commit.setter + def terminal_commit(self, value: str): + self._terminal_commit = value + + @property + def first_parent_only(self): + return self._first_parent_only + + @first_parent_only.setter + def first_parent_only(self, value: bool): + self._first_parent_only = value + + def analyze(self, max_commits: int = 100): + pass From ba46358d734babde2ef822b4e09902f34b312c0c Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 8 May 2019 23:54:10 +0800 Subject: [PATCH 05/39] Remove MetaAnalyzer.first_parent_only because we actually cannot control this. --- persper/analytics2/metaanalyzer.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/persper/analytics2/metaanalyzer.py b/persper/analytics2/metaanalyzer.py index ea2dac265d1..e61bc9a3614 100644 --- a/persper/analytics2/metaanalyzer.py +++ b/persper/analytics2/metaanalyzer.py @@ -7,11 +7,13 @@ class MetaAnalyzer(): """ Coordinates `ICommitAnalyzer` and `IPostAnalyzer` implementation, doing analysis through the commit history. + params + origin_commit, terminal_commit: See `ICommitRepository.enum_commits` for details. 
""" + def __init__(self, history_provider: ICommitRepository, commit_analyzers: Iterable[ICommitAnalyzer], post_analyzers: Iterable[IPostAnalyzer], origin_commit: str = None, terminal_commit: str = "HEAD", - first_parent_only: bool = False, analyzed_commits: Iterable[str] = None): if not isinstance(history_provider, ICommitRepository): raise ValueError("Expect ICommitRepository instance for history_provider.") @@ -21,7 +23,6 @@ def __init__(self, history_provider: ICommitRepository, self._post_analyzers = list(post_analyzers) self._origin_commit = origin_commit self._terminal_commit = terminal_commit - self._first_parent_only = first_parent_only self._analyzed_commits = set(analyzed_commits) if analyzed_commits else set() @property @@ -40,13 +41,5 @@ def terminal_commit(self): def terminal_commit(self, value: str): self._terminal_commit = value - @property - def first_parent_only(self): - return self._first_parent_only - - @first_parent_only.setter - def first_parent_only(self, value: bool): - self._first_parent_only = value - def analyze(self, max_commits: int = 100): pass From 88340026cf1e9bdafa44fda1ce6369ba7025a446 Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 8 May 2019 23:55:33 +0800 Subject: [PATCH 06/39] Add pep8 config for analytics2 module. Extend max-line-length. --- persper/analytics2/setup.cfg | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 persper/analytics2/setup.cfg diff --git a/persper/analytics2/setup.cfg b/persper/analytics2/setup.cfg new file mode 100644 index 00000000000..68859ad034c --- /dev/null +++ b/persper/analytics2/setup.cfg @@ -0,0 +1,2 @@ +[pep8] +max-line-length = 120 From 1788e8ef41775be5d807878af4ad0debd3e9a54e Mon Sep 17 00:00:00 2001 From: xinyan Date: Fri, 10 May 2019 00:32:44 +0800 Subject: [PATCH 07/39] Implement MetaAnalyzer.analyze. --- persper/analytics2/metaanalyzer.py | 97 +++++++++++++++++++++++++++--- 1 file changed, 90 insertions(+), 7 deletions(-) diff --git a/persper/analytics2/metaanalyzer.py b/persper/analytics2/metaanalyzer.py index e61bc9a3614..75f99d2c9a1 100644 --- a/persper/analytics2/metaanalyzer.py +++ b/persper/analytics2/metaanalyzer.py @@ -1,7 +1,16 @@ +import logging +import re +import traceback from typing import Iterable -from persper.analytics2.abstractions.analyzers import ICommitAnalyzer, IPostAnalyzer -from persper.analytics2.abstractions.repository import ICommitRepository +from persper.analytics2.abstractions.analyzers import ( + AnalysisStatus, CommitAnalysisStopReason, ICommitAnalyzer, IPostAnalyzer) +from persper.analytics2.abstractions.repository import (ICommitInfo, + ICommitRepository, + repr_hexsha) + +_logger = logging.getLogger(__file__) +_whitespace_re = re.compile(r"\s+") class MetaAnalyzer(): @@ -11,14 +20,14 @@ class MetaAnalyzer(): origin_commit, terminal_commit: See `ICommitRepository.enum_commits` for details. 
""" - def __init__(self, history_provider: ICommitRepository, + def __init__(self, repository: ICommitRepository, commit_analyzers: Iterable[ICommitAnalyzer], post_analyzers: Iterable[IPostAnalyzer], origin_commit: str = None, terminal_commit: str = "HEAD", analyzed_commits: Iterable[str] = None): - if not isinstance(history_provider, ICommitRepository): - raise ValueError("Expect ICommitRepository instance for history_provider.") + if not isinstance(repository, ICommitRepository): + raise ValueError("Expect ICommitRepository instance for repository.") # do necessary defensive copies - self._history_provider = history_provider + self._repository = repository self._commit_analyzers = list(commit_analyzers) self._post_analyzers = list(post_analyzers) self._origin_commit = origin_commit @@ -42,4 +51,78 @@ def terminal_commit(self, value: str): self._terminal_commit = value def analyze(self, max_commits: int = 100): - pass + _logger.info("Start analyzing: %s..%s, max_commits=%d .", + self._origin_commit, self._terminal_commit, max_commits) + analyzedCommits = [] + currentSkippedCommits = 0 + currentSkippedFirstCommit = None + currentSkippedLastCommit = None + stopReason = CommitAnalysisStopReason.ReachedTerminalCommit + lastCommitRef = None + # XXX determine whether we need to add this into AnalysisStatus + failedAnalyzer = None + failedAnalyzerException = None + for commit in self._repository.enum_commits(self._origin_commit, self._terminal_commit): + assert isinstance(commit, ICommitInfo) + if len(analyzedCommits) >= max_commits: + _logger.info("Max analyzed commits reached.") + stopReason = CommitAnalysisStopReason.ReachedMaximumCommits + break + lastCommitRef = commit.hexsha + # Skip commits we have already analyzed previously + if lastCommitRef in self._analyzed_commits: + currentSkippedLastCommit = lastCommitRef + if currentSkippedFirstCommit == None: + currentSkippedCommits = 0 + currentSkippedFirstCommit = currentSkippedLastCommit + currentSkippedCommits += 1 + continue + if currentSkippedFirstCommit != None: + _logger.info("Skipped %s analyzed commits: %s..%s .", + currentSkippedCommits, currentSkippedFirstCommit, currentSkippedLastCommit) + currentSkippedFirstCommit = None + # Analyze commit + if _logger.getEffectiveLevel <= logging.INFO: + briefMessage = commit.message + trimmed = len(briefMessage) > 50 + briefMessage = re.sub(_whitespace_re, " ", briefMessage[:60])[:47] + if trimmed: + briefMessage += "..." 
+ _logger.info("Analyzing commit [%s]: %s", repr_hexsha(lastCommitRef), briefMessage) + analyzer = None + analyzerIndex = 0 + try: + for analyzer in self._commit_analyzers: + assert isinstance(analyzer, ICommitAnalyzer) + _logger.debug("Analyzing with [%d]: %s .", analyzerIndex, analyzer) + analyzer.analyze(commit) + analyzerIndex += 1 + if _logger.getEffectiveLevel <= logging.DEBUG: + _logger.debug("Finished analyzing commit [%s].", repr_hexsha(lastCommitRef)) + except Exception as ex: + _logger.error("Failed to analyze commit [%s] with analyzer [%d][%s].\n%s", + lastCommitRef, analyzerIndex, analyzer, ex) + failedAnalyzer = analyzer + failedAnalyzerException = ex + stopReason = CommitAnalysisStopReason.FatalError + break + analyzedCommits.append(lastCommitRef) + self._analyzed_commits.add(lastCommitRef) + # Post analysis + if self._post_analyzers: + analyzer = None + analyzerIndex = 0 + status = AnalysisStatus(stop_reason=stopReason, exception=failedAnalyzerException, + origin_commit_ref=self._origin_commit, terminal_commit_ref=self._terminal_commit, + analyzed_commits_ref=analyzedCommits, last_commit_ref=lastCommitRef) + try: + for analyzer in self._post_analyzers: + assert isinstance(analyzer, IPostAnalyzer) + _logger.debug("Post-analyzing with [%d]: %s .", analyzerIndex, analyzer) + analyzer.analyze(status) + analyzerIndex += 1 + except Exception as ex: + _logger.error("Failed during post-analysis with analyzer [%d][%s].\n%s", + analyzerIndex, analyzer, ex) + # We can do nothing about it. Crash the caller. + raise From 9fa881c6d9479f5fb97688ad8c90a8ddf7b0e95b Mon Sep 17 00:00:00 2001 From: xinyan Date: Fri, 10 May 2019 00:40:05 +0800 Subject: [PATCH 08/39] Add profiling log for MetaAnalyzer. --- persper/analytics2/metaanalyzer.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/persper/analytics2/metaanalyzer.py b/persper/analytics2/metaanalyzer.py index 75f99d2c9a1..3b86183ee66 100644 --- a/persper/analytics2/metaanalyzer.py +++ b/persper/analytics2/metaanalyzer.py @@ -1,6 +1,7 @@ import logging import re import traceback +from time import monotonic from typing import Iterable from persper.analytics2.abstractions.analyzers import ( @@ -62,7 +63,13 @@ def analyze(self, max_commits: int = 100): # XXX determine whether we need to add this into AnalysisStatus failedAnalyzer = None failedAnalyzerException = None + t0 = monotonic() + tAnalyzer = None + analyzerEllapsedTime = 0 for commit in self._repository.enum_commits(self._origin_commit, self._terminal_commit): + if tAnalyzer is not None: + analyzerEllapsedTime += monotonic() - tAnalyzer + tAnalyzer = monotonic() assert isinstance(commit, ICommitInfo) if len(analyzedCommits) >= max_commits: _logger.info("Max analyzed commits reached.") @@ -108,7 +115,13 @@ def analyze(self, max_commits: int = 100): break analyzedCommits.append(lastCommitRef) self._analyzed_commits.add(lastCommitRef) + if tAnalyzer is not None: + analyzerEllapsedTime += monotonic() - tAnalyzer + _logger.info("Analyzed %d commits in %.2fs, analyzer exclusive %.2fs.", + len(analyzedCommits), monotonic() - t0, analyzerEllapsedTime) # Post analysis + t0 = monotonic() + _logger.info("Start post-analyzing: %s..%s .", self._origin_commit, self._terminal_commit) if self._post_analyzers: analyzer = None analyzerIndex = 0 @@ -121,6 +134,7 @@ def analyze(self, max_commits: int = 100): _logger.debug("Post-analyzing with [%d]: %s .", analyzerIndex, analyzer) analyzer.analyze(status) analyzerIndex += 1 + _logger.info("Finished post-analyzing in %.2fs.", monotonic() - 
t0) except Exception as ex: _logger.error("Failed during post-analysis with analyzer [%d][%s].\n%s", analyzerIndex, analyzer, ex) From f884d2abb4b7d42245fb590e4d7f1868df9bbee4 Mon Sep 17 00:00:00 2001 From: xinyan Date: Fri, 10 May 2019 00:49:11 +0800 Subject: [PATCH 09/39] Put traceback rather than exception message in log. --- persper/analytics2/metaanalyzer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/persper/analytics2/metaanalyzer.py b/persper/analytics2/metaanalyzer.py index 3b86183ee66..0fd434a24e2 100644 --- a/persper/analytics2/metaanalyzer.py +++ b/persper/analytics2/metaanalyzer.py @@ -108,7 +108,7 @@ def analyze(self, max_commits: int = 100): _logger.debug("Finished analyzing commit [%s].", repr_hexsha(lastCommitRef)) except Exception as ex: _logger.error("Failed to analyze commit [%s] with analyzer [%d][%s].\n%s", - lastCommitRef, analyzerIndex, analyzer, ex) + lastCommitRef, analyzerIndex, analyzer, traceback.format_exc()) failedAnalyzer = analyzer failedAnalyzerException = ex stopReason = CommitAnalysisStopReason.FatalError @@ -137,6 +137,6 @@ def analyze(self, max_commits: int = 100): _logger.info("Finished post-analyzing in %.2fs.", monotonic() - t0) except Exception as ex: _logger.error("Failed during post-analysis with analyzer [%d][%s].\n%s", - analyzerIndex, analyzer, ex) + analyzerIndex, analyzer, traceback.format_exc()) # We can do nothing about it. Crash the caller. raise From 344c9fef316dcebacbb0d47cc53d315486d38c6d Mon Sep 17 00:00:00 2001 From: xinyan Date: Sat, 11 May 2019 15:58:21 +0800 Subject: [PATCH 10/39] Implement CallCommitGraphAnalyzer. --- persper/analytics2/devrank.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/persper/analytics2/devrank.py b/persper/analytics2/devrank.py index fff0e602ea3..646bbbd31be 100644 --- a/persper/analytics2/devrank.py +++ b/persper/analytics2/devrank.py @@ -1,23 +1,35 @@ +import logging +from time import monotonic from typing import Iterable from persper.analytics2.abstractions.analyzers import (AnalysisStatus, ICommitAnalyzer, IPostAnalyzer) -from persper.analytics2.abstractions.callcommitgraph import (IGraphServer, - IReadOnlyCallCommitGraph, - IWriteOnlyCallCommitGraph) +from persper.analytics2.abstractions.callcommitgraph import ( + IGraphServer, IReadOnlyCallCommitGraph, IWriteOnlyCallCommitGraph) from persper.analytics2.abstractions.repository import ICommitInfo +_logger = logging.getLogger(__file__) + class CallCommitGraphAnalyzer(ICommitAnalyzer): def __init__(self, graph_servers: Iterable[IGraphServer], call_commit_graph: IWriteOnlyCallCommitGraph): assert graph_servers assert call_commit_graph self._graph_servers = list(graph_servers) + # We only need this for flushing. + # We actually can flush the graph at a later stage. self._call_commit_graph = call_commit_graph def analyze(self, commit: ICommitInfo): - raise NotImplementedError() + for gs in self._graph_servers: + t0 = monotonic() + _logger.info("Analyzing %s with %s...", commit, gs) + assert isinstance(gs, IGraphServer) + _logger.info("%s finished in %.2fs.", gs, monotonic() - t0) + t0 = monotonic() + self._call_commit_graph.flush() + _logger.info("Call commit graph flush used %.2fs.", monotonic() - t0) class DevRankAnalyzer(IPostAnalyzer): From 1ad6d86341c4b4c56efd9ee9975e249c9eb767b9 Mon Sep 17 00:00:00 2001 From: xinyan Date: Sat, 11 May 2019 16:35:32 +0800 Subject: [PATCH 11/39] Add unit test for MetaAnalyzer. Fixed bugs in MetaAnalyzer to get it works. 
Let MetaAnalyzer.analyze return analysis status. --- persper/analytics2/metaanalyzer.py | 42 ++++++------- test/analytics2/abstractions/repository.py | 9 ++- test/analytics2/metaanalyzer.py | 71 ++++++++++++++++++++++ 3 files changed, 98 insertions(+), 24 deletions(-) create mode 100644 test/analytics2/metaanalyzer.py diff --git a/persper/analytics2/metaanalyzer.py b/persper/analytics2/metaanalyzer.py index 0fd434a24e2..3343e922103 100644 --- a/persper/analytics2/metaanalyzer.py +++ b/persper/analytics2/metaanalyzer.py @@ -51,7 +51,7 @@ def terminal_commit(self): def terminal_commit(self, value: str): self._terminal_commit = value - def analyze(self, max_commits: int = 100): + def analyze(self, max_commits: int = 100) -> AnalysisStatus: _logger.info("Start analyzing: %s..%s, max_commits=%d .", self._origin_commit, self._terminal_commit, max_commits) analyzedCommits = [] @@ -89,7 +89,7 @@ def analyze(self, max_commits: int = 100): currentSkippedCommits, currentSkippedFirstCommit, currentSkippedLastCommit) currentSkippedFirstCommit = None # Analyze commit - if _logger.getEffectiveLevel <= logging.INFO: + if _logger.getEffectiveLevel() <= logging.INFO: briefMessage = commit.message trimmed = len(briefMessage) > 50 briefMessage = re.sub(_whitespace_re, " ", briefMessage[:60])[:47] @@ -104,7 +104,7 @@ def analyze(self, max_commits: int = 100): _logger.debug("Analyzing with [%d]: %s .", analyzerIndex, analyzer) analyzer.analyze(commit) analyzerIndex += 1 - if _logger.getEffectiveLevel <= logging.DEBUG: + if _logger.getEffectiveLevel() <= logging.DEBUG: _logger.debug("Finished analyzing commit [%s].", repr_hexsha(lastCommitRef)) except Exception as ex: _logger.error("Failed to analyze commit [%s] with analyzer [%d][%s].\n%s", @@ -120,23 +120,23 @@ def analyze(self, max_commits: int = 100): _logger.info("Analyzed %d commits in %.2fs, analyzer exclusive %.2fs.", len(analyzedCommits), monotonic() - t0, analyzerEllapsedTime) # Post analysis + status = AnalysisStatus(stop_reason=stopReason, exception=failedAnalyzerException, + origin_commit_ref=self._origin_commit, terminal_commit_ref=self._terminal_commit, + analyzed_commits_ref=analyzedCommits, last_commit_ref=lastCommitRef) t0 = monotonic() _logger.info("Start post-analyzing: %s..%s .", self._origin_commit, self._terminal_commit) - if self._post_analyzers: - analyzer = None - analyzerIndex = 0 - status = AnalysisStatus(stop_reason=stopReason, exception=failedAnalyzerException, - origin_commit_ref=self._origin_commit, terminal_commit_ref=self._terminal_commit, - analyzed_commits_ref=analyzedCommits, last_commit_ref=lastCommitRef) - try: - for analyzer in self._post_analyzers: - assert isinstance(analyzer, IPostAnalyzer) - _logger.debug("Post-analyzing with [%d]: %s .", analyzerIndex, analyzer) - analyzer.analyze(status) - analyzerIndex += 1 - _logger.info("Finished post-analyzing in %.2fs.", monotonic() - t0) - except Exception as ex: - _logger.error("Failed during post-analysis with analyzer [%d][%s].\n%s", - analyzerIndex, analyzer, traceback.format_exc()) - # We can do nothing about it. Crash the caller. 
- raise + analyzer = None + analyzerIndex = 0 + try: + for analyzer in self._post_analyzers: + assert isinstance(analyzer, IPostAnalyzer) + _logger.debug("Post-analyzing with [%d]: %s .", analyzerIndex, analyzer) + analyzer.analyze(status) + analyzerIndex += 1 + _logger.info("Finished post-analyzing in %.2fs.", monotonic() - t0) + except Exception as ex: + _logger.error("Failed during post-analysis with analyzer [%d][%s].\n%s", + analyzerIndex, analyzer, traceback.format_exc()) + # We can do nothing about it. Crash the caller. + raise + return status diff --git a/test/analytics2/abstractions/repository.py b/test/analytics2/abstractions/repository.py index 85905d5e331..a676dc3e2e6 100644 --- a/test/analytics2/abstractions/repository.py +++ b/test/analytics2/abstractions/repository.py @@ -3,13 +3,15 @@ from itertools import islice from random import randint -from persper.analytics2.abstractions.repository import ( - FileDiffOperation, ICommitInfo, IFileDiff, IRepositoryHistoryProvider) +from persper.analytics2.abstractions.repository import (FileDiffOperation, + ICommitInfo, + ICommitRepository, + IFileDiff) _logger = logging.getLogger(__file__) -def test_repository_history_provider(rhp: IRepositoryHistoryProvider): +def test_repository_history_provider(rhp: ICommitRepository): assert rhp # We enumerate from the beginning commits = list(islice(rhp.enum_commits(None, "HEAD"), 1000)) @@ -22,6 +24,7 @@ def test_repository_history_provider(rhp: IRepositoryHistoryProvider): seenCommits = set() for c in commits: assert isinstance(c, ICommitInfo) + assert isinstance(c.hexsha, str) # We should see every commit only once assert c.hexsha not in seenCommits seenCommits.add(c.hexsha) diff --git a/test/analytics2/metaanalyzer.py b/test/analytics2/metaanalyzer.py new file mode 100644 index 00000000000..cd36f0314b6 --- /dev/null +++ b/test/analytics2/metaanalyzer.py @@ -0,0 +1,71 @@ +import logging +from itertools import islice +from test.analytics2.repository import prepare_repository + +from persper.analytics2.abstractions.analyzers import ( + AnalysisStatus, CommitAnalysisStopReason, ICommitAnalyzer, IPostAnalyzer) +from persper.analytics2.abstractions.repository import ICommitInfo +from persper.analytics2.metaanalyzer import MetaAnalyzer +from persper.analytics2.repository import GitRepository + +_logger = logging.getLogger(__file__) + + +class DummyCommitAnalyzer(ICommitAnalyzer): + def __init__(self, raiseExceptionAtIndex=-1): + self.analyzedCommits = [] + self._raiseExceptionAtIndex = raiseExceptionAtIndex + + def analyze(self, commit: ICommitInfo) -> None: + assert commit + index = self.analyzedCommits + print("Current commit #{0}, hexsha {1}", index, commit.hexsha) + if index == self._raiseExceptionAtIndex: + raise Exception("Raised exception at commit #{0}.".format(index)) + self.analyzedCommits.append(commit.hexsha) + + +class DummyPostAnalyzer(IPostAnalyzer): + def __init__(self): + self.status = None + + def analyze(self, status: AnalysisStatus) -> None: + self.status = status + + +def test_meta_analyzer(): + repoPath = prepare_repository("test_feature_branch") + repo = GitRepository(repoPath) + ca = DummyCommitAnalyzer() + pa = DummyPostAnalyzer() + ma = MetaAnalyzer(repo, [ca], [pa], origin_commit=None, terminal_commit="HEAD", analyzed_commits=()) + status = ma.analyze(100) + assert status == pa.status + + commits = [c.hexsha for c in islice(repo.enum_commits(None, "HEAD"), 101)] + if len(commits) <= 100: + assert pa.status.stop_reason == CommitAnalysisStopReason.ReachedTerminalCommit + 
else: + assert pa.status.stop_reason == CommitAnalysisStopReason.ReachedMaximumCommits + commits = commits[:100] + assert ca.analyzedCommits == commits + assert status.analyzed_commits_ref == commits + assert status.origin_commit_ref == None + assert status.terminal_commit_ref == "HEAD" + assert status.last_commit_ref == commits[-1] + assert status.exception == None + + if len(commits) < 2: + _logger.warning("Skipped exception test because it needs repository have at least 2 commits.") + exceptionIndex = len(commits)//2 + ca = DummyCommitAnalyzer(raiseExceptionAtIndex=exceptionIndex) + pa = DummyPostAnalyzer() + ma = MetaAnalyzer(repo, [ca], [pa], origin_commit=None, terminal_commit="HEAD", analyzed_commits=()) + status = ma.analyze(100) + assert status == pa.status + assert status.stop_reason == CommitAnalysisStopReason.FatalError + assert isinstance(status.exception, Exception) + assert status.analyzed_commits_ref == commits[:exceptionIndex] + assert status.origin_commit_ref == None + assert status.terminal_commit_ref == "HEAD" + assert status.last_commit_ref == commits[exceptionIndex] From 26eee25abca20221f9ac5c9df9efcba4c72dca23 Mon Sep 17 00:00:00 2001 From: xinyan Date: Tue, 14 May 2019 21:39:45 +0800 Subject: [PATCH 12/39] Add missing update_graph call in CallCommitGraphAnalyzer. --- persper/analytics2/devrank.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/persper/analytics2/devrank.py b/persper/analytics2/devrank.py index 646bbbd31be..bb7b93f67ac 100644 --- a/persper/analytics2/devrank.py +++ b/persper/analytics2/devrank.py @@ -22,10 +22,12 @@ def __init__(self, graph_servers: Iterable[IGraphServer], call_commit_graph: IWr self._call_commit_graph = call_commit_graph def analyze(self, commit: ICommitInfo): + assert commit for gs in self._graph_servers: t0 = monotonic() _logger.info("Analyzing %s with %s...", commit, gs) assert isinstance(gs, IGraphServer) + gs.update_graph(commit) _logger.info("%s finished in %.2fs.", gs, monotonic() - t0) t0 = monotonic() self._call_commit_graph.flush() @@ -38,4 +40,5 @@ def __init__(self, call_commit_graph: IReadOnlyCallCommitGraph): self._call_commit_graph = call_commit_graph def analyze(self, status: AnalysisStatus): + # TODO put analysis code here. pass From 93d726fa0586e058b24f07123dc70f184bee4547 Mon Sep 17 00:00:00 2001 From: xinyan Date: Tue, 14 May 2019 21:48:36 +0800 Subject: [PATCH 13/39] Add docs for analyzer invocation order in MetaAnalyzer. --- persper/analytics2/metaanalyzer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/persper/analytics2/metaanalyzer.py b/persper/analytics2/metaanalyzer.py index 3343e922103..5230bf79ae2 100644 --- a/persper/analytics2/metaanalyzer.py +++ b/persper/analytics2/metaanalyzer.py @@ -17,14 +17,18 @@ class MetaAnalyzer(): """ Coordinates `ICommitAnalyzer` and `IPostAnalyzer` implementation, doing analysis through the commit history. - params - origin_commit, terminal_commit: See `ICommitRepository.enum_commits` for details. """ def __init__(self, repository: ICommitRepository, commit_analyzers: Iterable[ICommitAnalyzer], post_analyzers: Iterable[IPostAnalyzer], origin_commit: str = None, terminal_commit: str = "HEAD", analyzed_commits: Iterable[str] = None): + """ + params + commit_analyzers: a list of commit analyzers. They will be invoked sequentially in each commit. + post_analyzers: a list of post analyzers. They will be invoked sequentially after the analysis ends successfully or in fault. 
+ origin_commit, terminal_commit: see `ICommitRepository.enum_commits` for details. + """ if not isinstance(repository, ICommitRepository): raise ValueError("Expect ICommitRepository instance for repository.") # do necessary defensive copies From 97d218aa9c9bf2501ed7103e7e6411e12806d0b1 Mon Sep 17 00:00:00 2001 From: xinyan Date: Mon, 13 May 2019 22:10:56 +0800 Subject: [PATCH 14/39] Use explicit member imports instead of wildcard imports. --- persper/analytics2/memorycallcommitgraph.py | 34 +++++++++++++-------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/persper/analytics2/memorycallcommitgraph.py b/persper/analytics2/memorycallcommitgraph.py index 7c18d9ade4d..023bfad6664 100644 --- a/persper/analytics2/memorycallcommitgraph.py +++ b/persper/analytics2/memorycallcommitgraph.py @@ -1,12 +1,18 @@ -from persper.analytics2.abstractions.callcommitgraph import * -from persper.analytics2.abstractions.repository import * -import sys import logging +import sys from collections import defaultdict +from persper.analytics2.abstractions.callcommitgraph import (Commit, Edge, + ICallCommitGraph, + Node, + NodeHistoryItem, + NodeId) +from persper.analytics2.abstractions.repository import ( + ICommitInfo, IRepositoryHistoryProvider) +from typing import Iterable class MemoryCallCommitGraph(ICallCommitGraph): - def __init__(self, graph_data: dict=None): + def __init__(self, graph_data: dict = None): self._nodes_dict = {} self._edges_dict = {} self._commits = {} @@ -16,21 +22,23 @@ def __init__(self, graph_data: dict=None): for i in graph_data["nodes"]: nodeid = NodeId(i["id"]['name'], i["id"]['language']) for commit_id, history in i['history'].items: - self.update_node_history(nodeid, commit_id, history['adds'], history['dels']) + self.update_node_history( + nodeid, commit_id, history['adds'], history['dels']) files = [] for file in i["files"]: files.append(file) self.update_node_files(nodeid, files) for i in graph_data["edges"]: - from_id = NodeId(i['from_id']["name"], i['from_id']["language"]) + from_id = NodeId(i['from_id']["name"], + i['from_id']["language"]) to_id = NodeId(i['to_id']["name"], i['to_id']["language"]) self.add_edge(from_id, to_id, i["added_by"]) for i in graph_data["commits"]: self.add_commit(i["hex_sha"], Commit(i["hex_sha"], i["author_email"], i['author_name'], - i['author_date'], i['committer_email'], - i['committer_name'], i['commit_date'], - i['message'], i['parent']) ) + i['author_date'], i['committer_email'], + i['committer_name'], i['commit_date'], + i['message'], i['parent'])) def _ensure_node_exists(self, node_id: NodeId, commit_hexsha: str) -> None: if node_id not in self._nodes_dict: @@ -118,13 +126,13 @@ def update_node_history(self, node_id: NodeId, commit_hexsha: str, for historyitem in self._nodes_dict[node_id].history: if historyitem.hexsha == commit_hexsha: self._nodes_dict[node_id].history = [NodeHistoryItem(commit_hexsha, - added_lines, removed_lines)] + added_lines, removed_lines)] return self._nodes_dict[node_id].history.append(NodeHistoryItem(commit_hexsha, - added_lines, removed_lines)) + added_lines, removed_lines)) def update_node_files(self, node_id: NodeId, commit_hexsha: str, - files: Iterable[str] = None) -> None: + files: Iterable[str] = None) -> None: self._ensure_node_exists(node_id, commit_hexsha) self._nodes_dict[node_id].files = files @@ -140,7 +148,7 @@ def flush(self) -> None: def add_commit(self, hex_sha: str, author_email: str, author_name: str, author_date: str, committer_email: str, committer_name: str, commit_date: str, 
message: str) -> None: self._commits[hex_sha] = Commit(hex_sha, author_email, author_name, - author_date, committer_email, committer_name, commit_date, message) + author_date, committer_email, committer_name, commit_date, message) def get_commit(self, hex_sha: str) -> Commit: return self._commits[hex_sha] From e37c18db6bc9dfaf0a4fced43ad788f37754ffd6 Mon Sep 17 00:00:00 2001 From: xinyan Date: Mon, 13 May 2019 23:31:00 +0800 Subject: [PATCH 15/39] Refactor memory ccg deserialization logic. --- persper/analytics2/memorycallcommitgraph.py | 105 +++++++++++++------- 1 file changed, 68 insertions(+), 37 deletions(-) diff --git a/persper/analytics2/memorycallcommitgraph.py b/persper/analytics2/memorycallcommitgraph.py index 023bfad6664..158e92800f8 100644 --- a/persper/analytics2/memorycallcommitgraph.py +++ b/persper/analytics2/memorycallcommitgraph.py @@ -1,44 +1,78 @@ +import json import logging import sys from collections import defaultdict +from typing import Iterable, TextIO from persper.analytics2.abstractions.callcommitgraph import (Commit, Edge, ICallCommitGraph, Node, NodeHistoryItem, NodeId) -from persper.analytics2.abstractions.repository import ( - ICommitInfo, IRepositoryHistoryProvider) -from typing import Iterable +from persper.analytics2.abstractions.repository import (ICommitInfo, + ICommitRepository) + + +def serialize_node_id(d: NodeId) -> dict: + return {"name": d.name, "language": d.language} + + +def deserialize_node_id(d: dict) -> NodeId: + return NodeId(d["name"], d["language"]) + + +def deserialize_node_history_item(d: dict) -> NodeHistoryItem: + return NodeHistoryItem(hexsha=d["hexsha"], added_lines=d["added_lines"], removed_lines=d["removed_lines"]) + + +def deserialize_node(d: dict) -> NodeId: + return Node(node_id=deserialize_node_id(d["id"]), added_by=d["added_by"], + history=[deserialize_node_history_item(i) for i in d["history"]], + files=list(d["files"])) + + +def deserialize_edge(d: dict) -> Edge: + return Edge(from_id=deserialize_node_id(d["from_id"]), to_id=d["to_id"], added_by=d["added_by"]) + + +def deserialize_commit(d: dict) -> Commit: + return Commit(d["hex_sha"], + d["author_email"], d['author_name'], d['author_date'], + d['committer_email'], d['committer_name'], d['commit_date'], + d['message'], d['parents']) + class MemoryCallCommitGraph(ICallCommitGraph): - def __init__(self, graph_data: dict = None): + def __init__(self): self._nodes_dict = {} self._edges_dict = {} self._commits = {} self._from_edges = defaultdict(list) self._to_edges = defaultdict(list) - if graph_data: - for i in graph_data["nodes"]: - nodeid = NodeId(i["id"]['name'], i["id"]['language']) - for commit_id, history in i['history'].items: - self.update_node_history( - nodeid, commit_id, history['adds'], history['dels']) - files = [] - for file in i["files"]: - files.append(file) - self.update_node_files(nodeid, files) - for i in graph_data["edges"]: - from_id = NodeId(i['from_id']["name"], - i['from_id']["language"]) - to_id = NodeId(i['to_id']["name"], i['to_id']["language"]) - self.add_edge(from_id, to_id, i["added_by"]) - - for i in graph_data["commits"]: - self.add_commit(i["hex_sha"], Commit(i["hex_sha"], i["author_email"], i['author_name'], - i['author_date'], i['committer_email'], - i['committer_name'], i['commit_date'], - i['message'], i['parent'])) + + @staticmethod + def deserialize_dict(graph_data: dict) -> "MemoryCallCommitGraph": + graph = MemoryCallCommitGraph() + for nd in graph_data["nodes"]: + node = deserialize_node(nd) + graph._add_node_direct(node) + for ed in 
graph_data["edges"]: + edge = deserialize_edge(ed) + graph._add_edge_direct(edge) + for cd in graph_data["commits"]: + commit = deserialize_commit(cd) + graph.update_commit(commit) + return graph + + @staticmethod + def load_from(fp: TextIO) -> "MemoryCallCommitGraph": + d = json.load(fp) + return MemoryCallCommitGraph.deserialize_dict(d) + + @staticmethod + def load(json_content: str) -> "MemoryCallCommitGraph": + d = json.loads(json_content) + return MemoryCallCommitGraph.deserialize_dict(d) def _ensure_node_exists(self, node_id: NodeId, commit_hexsha: str) -> None: if node_id not in self._nodes_dict: @@ -117,8 +151,8 @@ def enum_commits(self) -> Iterable[Commit]: for commit in self._commits.values(): yield commit - def add_node(self, id: NodeId, node: Node) -> None: - self._nodes_dict[id] = node + def _add_node_direct(self, node: Node) -> None: + self._nodes_dict[node.id] = node def update_node_history(self, node_id: NodeId, commit_hexsha: str, added_lines: int = 0, removed_lines: int = 0) -> None: @@ -137,21 +171,18 @@ def update_node_files(self, node_id: NodeId, commit_hexsha: str, self._nodes_dict[node_id].files = files def add_edge(self, from_id: NodeId, to_id: NodeId, commit_hexsha: str) -> None: - edge = Edge(from_id, to_id, commit_hexsha) - self._edges_dict[(from_id, to_id)] = edge - self._from_edges[from_id].append(to_id) - self._to_edges[to_id].append(from_id) + self._add_edge_direct(Edge(from_id, to_id, commit_hexsha)) + + def _add_edge_direct(self, edge: Edge) -> None: + self._edges_dict[(edge.from_id, edge.to_id)] = edge + self._from_edges[edge.from_id].append(edge.to_id) + self._to_edges[edge.to_id].append(edge.from_id) def flush(self) -> None: pass - def add_commit(self, hex_sha: str, author_email: str, author_name: str, author_date: str, - committer_email: str, committer_name: str, commit_date: str, message: str) -> None: - self._commits[hex_sha] = Commit(hex_sha, author_email, author_name, - author_date, committer_email, committer_name, commit_date, message) - def get_commit(self, hex_sha: str) -> Commit: - return self._commits[hex_sha] + return self._commits.get(hex_sha, None) def update_commit(self, commit: Commit) -> None: self._commits[commit.hexsha] = commit From 368eadc1bd24e434c5834b2297d06dcae22173b0 Mon Sep 17 00:00:00 2001 From: xinyan Date: Tue, 14 May 2019 22:57:24 +0800 Subject: [PATCH 16/39] Add serialization support in test_memory_call_commit_graph. Add test for serialization/deserialization. Bug fix in MemoryCallCommitGraph to make it pass the test. 
--- persper/analytics2/memorycallcommitgraph.py | 58 ++++++++++++++++--- .../abstractions/callcommitgraph.py | 17 ++++++ test/analytics2/test_callcommitgraph.py | 8 ++- 3 files changed, 72 insertions(+), 11 deletions(-) diff --git a/persper/analytics2/memorycallcommitgraph.py b/persper/analytics2/memorycallcommitgraph.py index 158e92800f8..17824688110 100644 --- a/persper/analytics2/memorycallcommitgraph.py +++ b/persper/analytics2/memorycallcommitgraph.py @@ -2,8 +2,11 @@ import logging import sys from collections import defaultdict +from datetime import datetime from typing import Iterable, TextIO +import pytz + from persper.analytics2.abstractions.callcommitgraph import (Commit, Edge, ICallCommitGraph, Node, @@ -13,12 +16,34 @@ ICommitRepository) -def serialize_node_id(d: NodeId) -> dict: - return {"name": d.name, "language": d.language} +def serialize_node_id(o: NodeId) -> tuple: + return (o.name, o.language) + + +def serialize_node_history_item(o: NodeHistoryItem) -> dict: + return {"hexsha": o.hexsha, "added_lines": o.added_lines, "removed_lines": o.removed_lines} + + +def serialize_node(o: Node) -> dict: + return {"id": o.node_id, "added_by": o.added_by, + "history": [serialize_node_history_item(h) for h in o.history], + "files": list(o.files)} + + +def serialize_edge(o: Edge) -> dict: + return {"from_id": serialize_node_id(o.from_id), "to_id": serialize_node_id(o.to_id), + "added_by": o.added_by} -def deserialize_node_id(d: dict) -> NodeId: - return NodeId(d["name"], d["language"]) +def serialize_commit(o: Commit) -> dict: + return {"hex_sha": o.hexsha, + "author_email": o.author_email, "author_name": o.author_name, "authored_time": str(o.authored_time), + "committer_email": o.committer_email, "committer_name": o.committer_name, "committed_time": str(o.committed_time), + "message": o.message, "parents": o.parents} + + +def deserialize_node_id(t: tuple) -> NodeId: + return NodeId(t[0], t[1]) def deserialize_node_history_item(d: dict) -> NodeHistoryItem: @@ -32,13 +57,13 @@ def deserialize_node(d: dict) -> NodeId: def deserialize_edge(d: dict) -> Edge: - return Edge(from_id=deserialize_node_id(d["from_id"]), to_id=d["to_id"], added_by=d["added_by"]) + return Edge(from_id=deserialize_node_id(d["from_id"]), to_id=deserialize_node_id(d["to_id"]), added_by=d["added_by"]) def deserialize_commit(d: dict) -> Commit: return Commit(d["hex_sha"], - d["author_email"], d['author_name'], d['author_date'], - d['committer_email'], d['committer_name'], d['commit_date'], + d["author_email"], d['author_name'], datetime.fromisoformat(d['authored_time']), + d['committer_email'], d['committer_name'], datetime.fromisoformat(d['committed_time']), d['message'], d['parents']) @@ -70,10 +95,24 @@ def load_from(fp: TextIO) -> "MemoryCallCommitGraph": return MemoryCallCommitGraph.deserialize_dict(d) @staticmethod - def load(json_content: str) -> "MemoryCallCommitGraph": + def deserialize(json_content: str) -> "MemoryCallCommitGraph": d = json.loads(json_content) return MemoryCallCommitGraph.deserialize_dict(d) + def serialize_dict(self) -> dict: + return { + "nodes": [serialize_node(n) for n in self._nodes_dict.values()], + "edges": [serialize_edge(n) for n in self._edges_dict.values()], + "commits": [serialize_commit(n) for n in self._commits.values()], + } + + def save_to(self, fp: TextIO): + d = self.serialize_dict() + json.dump(d, fp) + + def serialize(self) -> str: + return json.dumps(self.serialize_dict()) + def _ensure_node_exists(self, node_id: NodeId, commit_hexsha: str) -> None: if node_id not in 
self._nodes_dict: self._nodes_dict[node_id] = Node(node_id, added_by=commit_hexsha) @@ -152,7 +191,7 @@ def enum_commits(self) -> Iterable[Commit]: yield commit def _add_node_direct(self, node: Node) -> None: - self._nodes_dict[node.id] = node + self._nodes_dict[node.node_id] = node def update_node_history(self, node_id: NodeId, commit_hexsha: str, added_lines: int = 0, removed_lines: int = 0) -> None: @@ -174,6 +213,7 @@ def add_edge(self, from_id: NodeId, to_id: NodeId, commit_hexsha: str) -> None: self._add_edge_direct(Edge(from_id, to_id, commit_hexsha)) def _add_edge_direct(self, edge: Edge) -> None: + self._edges_dict[(edge.from_id, edge.to_id)] = edge self._from_edges[edge.from_id].append(edge.to_id) self._to_edges[edge.to_id].append(edge.from_id) diff --git a/test/analytics2/abstractions/callcommitgraph.py b/test/analytics2/abstractions/callcommitgraph.py index bf1e51f86ac..a538adb39f5 100644 --- a/test/analytics2/abstractions/callcommitgraph.py +++ b/test/analytics2/abstractions/callcommitgraph.py @@ -5,6 +5,7 @@ from persper.analytics2.abstractions.callcommitgraph import (Commit, Edge, ICallCommitGraph, + IReadOnlyCallCommitGraph, Node, NodeId) @@ -114,3 +115,19 @@ def assertNode(node_id, added_by, files): assertNode(csnode2, added_by=commit2.hexsha, files=csFiles) assertNode(csnode3, added_by=commit2.hexsha, files=csFiles) assertNode(javanode1, added_by=commit3.hexsha, files=javaFiles) + + +def assert_graph_same(expected: IReadOnlyCallCommitGraph, actual: IReadOnlyCallCommitGraph, compare_hexsha: bool = True): + for n1 in expected.enum_nodes(): + n2 = actual.get_node(n1.node_id) + assert n2, "Node missing: {0}".format(n1.node_id) + assert n1.node_id == n2.node_id + if compare_hexsha: + assert n1.added_by == n2.added_by + else: + c1 = expected.get_commit(n1.added_by) + c2 = actual.get_commit(n2.added_by) + assert c1 + assert c2 + assert c1.message == c2.message + # TODO add more assertions diff --git a/test/analytics2/test_callcommitgraph.py b/test/analytics2/test_callcommitgraph.py index 575ea9e187a..91c6d0ef638 100644 --- a/test/analytics2/test_callcommitgraph.py +++ b/test/analytics2/test_callcommitgraph.py @@ -1,6 +1,6 @@ import subprocess import pytest -from ..analytics2.abstractions import callcommitgraph as ccghelper +import test.analytics2.abstractions.callcommitgraph as ccghelper # TODO import your call commit graph implementation(s) from persper.analytics2.memorycallcommitgraph import MemoryCallCommitGraph @@ -10,4 +10,8 @@ def test_memory_call_commit_graph(): ccg = MemoryCallCommitGraph() ccghelper.test_call_commit_graph(ccg) - + serialized = ccg.serialize() + print("Serialized:", serialized) + assert isinstance(serialized, str) + ccg2 = MemoryCallCommitGraph.deserialize(serialized) + ccghelper.assert_graph_same(ccg, ccg2) From da50ed180b58ea50e82cf065c69a153e5bade58d Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 15 May 2019 23:17:03 +0800 Subject: [PATCH 17/39] Add more tests in assert_graph_same. 
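assert_graph_same can now compare commits either strictly by hexsha (the default) or loosely by commit message, e.g. for graphs rebuilt from repositories whose hashes differ. Rough usage sketch (the two graph variables are assumed to be populated elsewhere):

    from test.analytics2.abstractions.callcommitgraph import (
        assert_graph_same, commit_assertion_by_comment)

    assert_graph_same(expected_ccg, actual_ccg)      # default: commit_assertion_by_hexsha
    assert_graph_same(expected_ccg, actual_ccg,
                      commit_assertion=commit_assertion_by_comment)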
--- .../abstractions/callcommitgraph.py | 78 +++++++++++++++---- 1 file changed, 64 insertions(+), 14 deletions(-) diff --git a/test/analytics2/abstractions/callcommitgraph.py b/test/analytics2/abstractions/callcommitgraph.py index a538adb39f5..22c16e4a75b 100644 --- a/test/analytics2/abstractions/callcommitgraph.py +++ b/test/analytics2/abstractions/callcommitgraph.py @@ -3,10 +3,9 @@ from random import randint from typing import Iterable -from persper.analytics2.abstractions.callcommitgraph import (Commit, Edge, - ICallCommitGraph, - IReadOnlyCallCommitGraph, - Node, NodeId) +from persper.analytics2.abstractions.callcommitgraph import ( + Commit, Edge, ICallCommitGraph, IReadOnlyCallCommitGraph, Node, + NodeHistoryItem, NodeId) def commit_equals(x: Commit, y: Commit): @@ -117,17 +116,68 @@ def assertNode(node_id, added_by, files): assertNode(javanode1, added_by=commit3.hexsha, files=javaFiles) -def assert_graph_same(expected: IReadOnlyCallCommitGraph, actual: IReadOnlyCallCommitGraph, compare_hexsha: bool = True): +def commit_assertion_skip(expectedGraph, actualGraph, expectedHexsha, actualHexsha): + pass + + +def commit_assertion_by_hexsha(expectedGraph, actualGraph, expectedHexsha, actualHexsha): + assert expectedHexsha == actualHexsha, "Commits are not the same by hexsha." + + +def commit_assertion_by_comment(expectedGraph, actualGraph, expectedHexsha, actualHexsha): + c1 = expectedGraph.get_commit(expectedHexsha) + c2 = actualGraph.get_commit(actualHexsha) + assert c1, "Expected-side of commit is missing." + assert c2, "Actual-side of commit is missing." + assert c1.message == c2.message, "Commits are not the same by commit message." + + +def assert_graph_same(expected: IReadOnlyCallCommitGraph, actual: IReadOnlyCallCommitGraph, + commit_assertion=commit_assertion_by_hexsha): + def assertCommitEqual(expectedHexsha, actualHexsha): + return commit_assertion(expected, actual, expectedHexsha, actualHexsha) for n1 in expected.enum_nodes(): n2 = actual.get_node(n1.node_id) assert n2, "Node missing: {0}".format(n1.node_id) assert n1.node_id == n2.node_id - if compare_hexsha: - assert n1.added_by == n2.added_by - else: - c1 = expected.get_commit(n1.added_by) - c2 = actual.get_commit(n2.added_by) - assert c1 - assert c2 - assert c1.message == c2.message - # TODO add more assertions + assertCommitEqual(n1.added_by, n2.added_by) + keyExtractor = None + if commit_assertion == commit_assertion_by_hexsha: + # Make autopep8 happy. 
+ def f(h): + return h.hexsha + keyExtractor = f + elif commit_assertion == commit_assertion_by_comment: + def f(h): + return h.message + keyExtractor = f + if keyExtractor: + d1 = dict((keyExtractor, h) for h in n1.history) + d2 = dict((keyExtractor, h) for h in n2.history) + for k, h1 in d1: + h2 = d2.get(k, None) + assert isinstance(h1, NodeHistoryItem) + assert h2, "Commit history {0} missing for node {1}.".format(h1, n1.node_id) + assert isinstance(h2, NodeHistoryItem) + assert h1.added_lines == h2.added_lines, "In commit: {0}".format(h1) + assert h1.removed_lines == h2.removed_lines, "In commit: {0}".format(h1) + if len(d1) < len(d2): + # there are extra node history + for k, h2 in d2: + h1 = d1.get(k, None) + assert h2, "Extra commit history {0} for node {1}.".format(h1, n1.node_id) + assert set(n1.files) == set(n2.files) + if expected.get_nodes_count() < actual.get_nodes_count(): + # there are extra nodes + for n2 in actual.enum_nodes(): + n1 = expected.get_node(n2.node_id) + assert n1, "Extra node: {0}".format(n2.node_id) + for b1 in expected.enum_edges(): + b2 = actual.get_edge(b1.from_id, b1.to_id) + assert b2, "Edge missing: {0} -> {1}".format(b1.from_id, b1.to_id) + assertCommitEqual(b1.added_by, b2.added_by) + if expected.get_edges_count() < actual.get_edges_count(): + # there are extra edges + for n2 in actual.enum_edges(): + n1 = expected.get_edge(n2.from_id, n2.to_id) + assert n1, "Extra edge: {0} -> {1}".format(b2.from_id, b2.to_id) From 52b6b4bc71a58fe31e253cf2f668c08e36833cab Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 15 May 2019 23:26:04 +0800 Subject: [PATCH 18/39] Reorganize test_analytics2. Make file names compatible with pytest. --- test/analytics2/abstractions/__init__.py | 0 .../__init__.py | 0 .../helpers/__init__.py} | 0 .../helpers}/callcommitgraph.py | 0 .../helpers}/repository.py | 0 test/{analytics2 => test_analytics2}/setup.cfg | 0 .../test_callcommitgraph.py | 2 +- .../test_metaanalyzer.py} | 2 +- .../test_repository.py} | 2 +- test/test_analytics2/utilities.py | 18 ++++++++++++++++++ 10 files changed, 21 insertions(+), 3 deletions(-) delete mode 100644 test/analytics2/abstractions/__init__.py rename test/{analytics2 => test_analytics2}/__init__.py (100%) rename test/{analytics2/utilities.py => test_analytics2/helpers/__init__.py} (100%) rename test/{analytics2/abstractions => test_analytics2/helpers}/callcommitgraph.py (100%) rename test/{analytics2/abstractions => test_analytics2/helpers}/repository.py (100%) rename test/{analytics2 => test_analytics2}/setup.cfg (100%) rename test/{analytics2 => test_analytics2}/test_callcommitgraph.py (89%) rename test/{analytics2/metaanalyzer.py => test_analytics2/test_metaanalyzer.py} (98%) rename test/{analytics2/repository.py => test_analytics2/test_repository.py} (93%) create mode 100644 test/test_analytics2/utilities.py diff --git a/test/analytics2/abstractions/__init__.py b/test/analytics2/abstractions/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/test/analytics2/__init__.py b/test/test_analytics2/__init__.py similarity index 100% rename from test/analytics2/__init__.py rename to test/test_analytics2/__init__.py diff --git a/test/analytics2/utilities.py b/test/test_analytics2/helpers/__init__.py similarity index 100% rename from test/analytics2/utilities.py rename to test/test_analytics2/helpers/__init__.py diff --git a/test/analytics2/abstractions/callcommitgraph.py b/test/test_analytics2/helpers/callcommitgraph.py similarity index 100% rename from 
test/analytics2/abstractions/callcommitgraph.py rename to test/test_analytics2/helpers/callcommitgraph.py diff --git a/test/analytics2/abstractions/repository.py b/test/test_analytics2/helpers/repository.py similarity index 100% rename from test/analytics2/abstractions/repository.py rename to test/test_analytics2/helpers/repository.py diff --git a/test/analytics2/setup.cfg b/test/test_analytics2/setup.cfg similarity index 100% rename from test/analytics2/setup.cfg rename to test/test_analytics2/setup.cfg diff --git a/test/analytics2/test_callcommitgraph.py b/test/test_analytics2/test_callcommitgraph.py similarity index 89% rename from test/analytics2/test_callcommitgraph.py rename to test/test_analytics2/test_callcommitgraph.py index 91c6d0ef638..07a9bc3cc67 100644 --- a/test/analytics2/test_callcommitgraph.py +++ b/test/test_analytics2/test_callcommitgraph.py @@ -1,6 +1,6 @@ import subprocess import pytest -import test.analytics2.abstractions.callcommitgraph as ccghelper +import test.analytics2.helpers.callcommitgraph as ccghelper # TODO import your call commit graph implementation(s) from persper.analytics2.memorycallcommitgraph import MemoryCallCommitGraph diff --git a/test/analytics2/metaanalyzer.py b/test/test_analytics2/test_metaanalyzer.py similarity index 98% rename from test/analytics2/metaanalyzer.py rename to test/test_analytics2/test_metaanalyzer.py index cd36f0314b6..7d72caa5a53 100644 --- a/test/analytics2/metaanalyzer.py +++ b/test/test_analytics2/test_metaanalyzer.py @@ -1,6 +1,6 @@ import logging from itertools import islice -from test.analytics2.repository import prepare_repository +from test.analytics2.utilities import prepare_repository from persper.analytics2.abstractions.analyzers import ( AnalysisStatus, CommitAnalysisStopReason, ICommitAnalyzer, IPostAnalyzer) diff --git a/test/analytics2/repository.py b/test/test_analytics2/test_repository.py similarity index 93% rename from test/analytics2/repository.py rename to test/test_analytics2/test_repository.py index 1eac74d43db..de785ecb346 100644 --- a/test/analytics2/repository.py +++ b/test/test_analytics2/test_repository.py @@ -1,6 +1,6 @@ import os.path import subprocess -import test.analytics2.abstractions.repository as repositoryhelper +import test.analytics2.helpers.repository as repositoryhelper from persper.analytics2.repository import GitRepository from persper.util.path import root_path diff --git a/test/test_analytics2/utilities.py b/test/test_analytics2/utilities.py new file mode 100644 index 00000000000..e2a71d525c4 --- /dev/null +++ b/test/test_analytics2/utilities.py @@ -0,0 +1,18 @@ +import os.path +import subprocess +import test.analytics2.helpers.repository as repositoryhelper + +from persper.analytics2.repository import GitRepository +from persper.util.path import root_path + + +def prepare_repository(repo_name: str): + # build the repo first if not exists yet + repo_path = os.path.join(root_path, 'repos/' + repo_name) + script_path = os.path.join(root_path, 'tools/repo_creater/create_repo.py') + test_src_path = os.path.join(root_path, 'test/' + repo_name) + if not os.path.isdir(repo_path): + cmd = '{} {}'.format(script_path, test_src_path) + subprocess.call(cmd, shell=True) + print("Repository path: ", repo_path) + return repo_path From a70445938f1f057c239268305fc7f177a061f68a Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 15 May 2019 23:37:44 +0800 Subject: [PATCH 19/39] Fix module imports in test_analytics2 accordingly. 
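For reference, a sketch of how the relocated helper is intended to be consumed once the imports below are fixed. It mirrors `test_callcommitgraph.py`; only the explicit choice of commit-comparison strategy is added here for illustration.

```python
import test.test_analytics2.helpers.callcommitgraph as ccghelper
from persper.analytics2.memorycallcommitgraph import MemoryCallCommitGraph


def test_round_trip_equivalence():
    ccg = MemoryCallCommitGraph()
    ccghelper.test_call_commit_graph(ccg)         # populate via the shared scenario
    ccg2 = MemoryCallCommitGraph.deserialize(ccg.serialize())
    # Hexshas survive a serialize/deserialize round trip, so the strict comparison
    # applies; commit_assertion_by_comment relaxes the check to commit messages,
    # e.g. for graphs built from separately re-created test repositories.
    ccghelper.assert_graph_same(ccg, ccg2,
                                commit_assertion=ccghelper.commit_assertion_by_hexsha)
```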
--- test/test_analytics2/test_callcommitgraph.py | 2 +- test/test_analytics2/test_metaanalyzer.py | 2 +- test/test_analytics2/test_repository.py | 2 +- test/test_analytics2/utilities.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_analytics2/test_callcommitgraph.py b/test/test_analytics2/test_callcommitgraph.py index 07a9bc3cc67..b2a459f4cc3 100644 --- a/test/test_analytics2/test_callcommitgraph.py +++ b/test/test_analytics2/test_callcommitgraph.py @@ -1,6 +1,6 @@ import subprocess import pytest -import test.analytics2.helpers.callcommitgraph as ccghelper +import test.test_analytics2.helpers.callcommitgraph as ccghelper # TODO import your call commit graph implementation(s) from persper.analytics2.memorycallcommitgraph import MemoryCallCommitGraph diff --git a/test/test_analytics2/test_metaanalyzer.py b/test/test_analytics2/test_metaanalyzer.py index 7d72caa5a53..6aaa61df0f5 100644 --- a/test/test_analytics2/test_metaanalyzer.py +++ b/test/test_analytics2/test_metaanalyzer.py @@ -1,6 +1,6 @@ import logging from itertools import islice -from test.analytics2.utilities import prepare_repository +from test.test_analytics2.utilities import prepare_repository from persper.analytics2.abstractions.analyzers import ( AnalysisStatus, CommitAnalysisStopReason, ICommitAnalyzer, IPostAnalyzer) diff --git a/test/test_analytics2/test_repository.py b/test/test_analytics2/test_repository.py index de785ecb346..89404b90b57 100644 --- a/test/test_analytics2/test_repository.py +++ b/test/test_analytics2/test_repository.py @@ -1,6 +1,6 @@ import os.path import subprocess -import test.analytics2.helpers.repository as repositoryhelper +import test.test_analytics2.helpers.repository as repositoryhelper from persper.analytics2.repository import GitRepository from persper.util.path import root_path diff --git a/test/test_analytics2/utilities.py b/test/test_analytics2/utilities.py index e2a71d525c4..a0f9044b201 100644 --- a/test/test_analytics2/utilities.py +++ b/test/test_analytics2/utilities.py @@ -1,6 +1,6 @@ import os.path import subprocess -import test.analytics2.helpers.repository as repositoryhelper +import test.test_analytics2.helpers.repository as repositoryhelper from persper.analytics2.repository import GitRepository from persper.util.path import root_path From a21716000df8d117d668adc0779659850b9625f1 Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 15 May 2019 23:40:33 +0800 Subject: [PATCH 20/39] Add docs. --- persper/analytics2/memorycallcommitgraph.py | 24 +++++++++++++++++++ .../helpers/callcommitgraph.py | 6 +++++ 2 files changed, 30 insertions(+) diff --git a/persper/analytics2/memorycallcommitgraph.py b/persper/analytics2/memorycallcommitgraph.py index 17824688110..8415cf18340 100644 --- a/persper/analytics2/memorycallcommitgraph.py +++ b/persper/analytics2/memorycallcommitgraph.py @@ -15,6 +15,8 @@ from persper.analytics2.abstractions.repository import (ICommitInfo, ICommitRepository) +# Helper methods for data model <--> dict serialization. + def serialize_node_id(o: NodeId) -> tuple: return (o.name, o.language) @@ -68,6 +70,10 @@ def deserialize_commit(d: dict) -> Commit: class MemoryCallCommitGraph(ICallCommitGraph): + """ + Represents a call commit graph stored in-memory. + """ + def __init__(self): self._nodes_dict = {} self._edges_dict = {} @@ -77,6 +83,9 @@ def __init__(self): @staticmethod def deserialize_dict(graph_data: dict) -> "MemoryCallCommitGraph": + """ + Deserializes a MemoryCallCommitGraph from dict generated by `serialize_dict` method. 
+ """ graph = MemoryCallCommitGraph() for nd in graph_data["nodes"]: node = deserialize_node(nd) @@ -91,15 +100,24 @@ def deserialize_dict(graph_data: dict) -> "MemoryCallCommitGraph": @staticmethod def load_from(fp: TextIO) -> "MemoryCallCommitGraph": + """ + Deserializes a MemoryCallCommitGraph from the specified text IO containing JSON. + """ d = json.load(fp) return MemoryCallCommitGraph.deserialize_dict(d) @staticmethod def deserialize(json_content: str) -> "MemoryCallCommitGraph": + """ + Deserializes a MemoryCallCommitGraph from the specified JSON string. + """ d = json.loads(json_content) return MemoryCallCommitGraph.deserialize_dict(d) def serialize_dict(self) -> dict: + """ + Serializes the call commit graph contained in the current instance into a simple dict. + """ return { "nodes": [serialize_node(n) for n in self._nodes_dict.values()], "edges": [serialize_edge(n) for n in self._edges_dict.values()], @@ -107,10 +125,16 @@ def serialize_dict(self) -> dict: } def save_to(self, fp: TextIO): + """ + Serializes the call commit graph contained in the current instance into the specified text IO as JSON. + """ d = self.serialize_dict() json.dump(d, fp) def serialize(self) -> str: + """ + Serializes the call commit graph contained in the current instance into JSON string. + """ return json.dumps(self.serialize_dict()) def _ensure_node_exists(self, node_id: NodeId, commit_hexsha: str) -> None: diff --git a/test/test_analytics2/helpers/callcommitgraph.py b/test/test_analytics2/helpers/callcommitgraph.py index 22c16e4a75b..9b35f70ea49 100644 --- a/test/test_analytics2/helpers/callcommitgraph.py +++ b/test/test_analytics2/helpers/callcommitgraph.py @@ -134,6 +134,12 @@ def commit_assertion_by_comment(expectedGraph, actualGraph, expectedHexsha, actu def assert_graph_same(expected: IReadOnlyCallCommitGraph, actual: IReadOnlyCallCommitGraph, commit_assertion=commit_assertion_by_hexsha): + """ + Asserts two `IReadOnlyCallCommitGraph` instances contain the equivalent content. + params + commit_assertion: Specifies how to treat two commits as equivalent. You need to choose between + `commit_assertion_skip`, `commit_assertion_by_hexsha`, and `commit_assertion_by_comment`. + """ def assertCommitEqual(expectedHexsha, actualHexsha): return commit_assertion(expected, actual, expectedHexsha, actualHexsha) for n1 in expected.enum_nodes(): From bb09cc6a5c82a642b03cf03db320a9fa0a265082 Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 15 May 2019 23:52:38 +0800 Subject: [PATCH 21/39] Add MetaAnalyzer.load/save_dict methods. --- persper/analytics2/metaanalyzer.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/persper/analytics2/metaanalyzer.py b/persper/analytics2/metaanalyzer.py index 5230bf79ae2..8026a7d5b39 100644 --- a/persper/analytics2/metaanalyzer.py +++ b/persper/analytics2/metaanalyzer.py @@ -28,6 +28,10 @@ def __init__(self, repository: ICommitRepository, commit_analyzers: a list of commit analyzers. They will be invoked sequentially in each commit. post_analyzers: a list of post analyzers. They will be invoked sequentially after the analysis ends successfully or in fault. origin_commit, terminal_commit: see `ICommitRepository.enum_commits` for details. + remarks + You may use `load_state_dict` to load `origin_commit`, `terminal_commit`, and `analyzed_commits` from + dict after instantiating this class. Still you need to inject required services (indicated by required + parameters) so that you can instantiate this class. 
""" if not isinstance(repository, ICommitRepository): raise ValueError("Expect ICommitRepository instance for repository.") @@ -39,6 +43,22 @@ def __init__(self, repository: ICommitRepository, self._terminal_commit = terminal_commit self._analyzed_commits = set(analyzed_commits) if analyzed_commits else set() + def save_state_dict(self) -> dict: + """ + Save the current state into a dict with simple values. + """ + return {"origin_commit": self._origin_commit, + "terminal_commit": self._terminal_commit, + "analyzed_commits": list(self._analyzed_commits)} + + def load_state_dict(self, d: dict): + """ + Load the current state from a dict with simple values. + """ + self.origin_commit = d["origin_commit"] + self.terminal_commit = d["terminal_commit"] + self.analyzed_commits = set(d["analyzed_commits"]) + @property def origin_commit(self): return self._origin_commit @@ -125,8 +145,8 @@ def analyze(self, max_commits: int = 100) -> AnalysisStatus: len(analyzedCommits), monotonic() - t0, analyzerEllapsedTime) # Post analysis status = AnalysisStatus(stop_reason=stopReason, exception=failedAnalyzerException, - origin_commit_ref=self._origin_commit, terminal_commit_ref=self._terminal_commit, - analyzed_commits_ref=analyzedCommits, last_commit_ref=lastCommitRef) + origin_commit_ref=self._origin_commit, terminal_commit_ref=self._terminal_commit, + analyzed_commits_ref=analyzedCommits, last_commit_ref=lastCommitRef) t0 = monotonic() _logger.info("Start post-analyzing: %s..%s .", self._origin_commit, self._terminal_commit) analyzer = None @@ -140,7 +160,7 @@ def analyze(self, max_commits: int = 100) -> AnalysisStatus: _logger.info("Finished post-analyzing in %.2fs.", monotonic() - t0) except Exception as ex: _logger.error("Failed during post-analysis with analyzer [%d][%s].\n%s", - analyzerIndex, analyzer, traceback.format_exc()) + analyzerIndex, analyzer, traceback.format_exc()) # We can do nothing about it. Crash the caller. raise return status From 40b765d36d5b7d416b6edd3c32a6e8926e535f33 Mon Sep 17 00:00:00 2001 From: xinyan Date: Fri, 17 May 2019 22:55:04 +0800 Subject: [PATCH 22/39] Add skip_rewind_diff param in Analyzer constructor. --- persper/analytics/analyzer2.py | 68 +++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/persper/analytics/analyzer2.py b/persper/analytics/analyzer2.py index 8439df5b0c9..bdb890157d4 100644 --- a/persper/analytics/analyzer2.py +++ b/persper/analytics/analyzer2.py @@ -4,7 +4,7 @@ import re import time from abc import ABC -from typing import List, Optional, Set, Union +from typing import List, Optional, Set, Union, Dict from git import Commit, Diff, DiffIndex, Repo @@ -20,7 +20,9 @@ class Analyzer: def __init__(self, repositoryRoot: str, graphServer: GraphServer, terminalCommit: str = 'HEAD', firstParentOnly: bool = False, - commit_classifier: Optional[CommitClassifier] = None): + commit_classifier: Optional[CommitClassifier] = None, + skip_rewind_diff: bool = False): + # skip_rewind_diff will skip diff, but rewind commit start/end will still be notified to the GraphServer. 
self._repositoryRoot = repositoryRoot self._graphServer = graphServer self._repo = Repo(repositoryRoot) @@ -32,6 +34,7 @@ def __init__(self, repositoryRoot: str, graphServer: GraphServer, self._observer: AnalyzerObserver = emptyAnalyzerObserver self._commit_classifier = commit_classifier self._clf_results: Dict[str, List[float]] = {} + self._skip_rewind_diff = skip_rewind_diff def __getstate__(self): state = self.__dict__.copy() @@ -207,7 +210,11 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C await result t1 = time.monotonic() - t1 - diff_index = diff_with_commit(self._repo, commit, parentCommit) + diff_index = None + if self._skip_rewind_diff: + _logger.info("Skipped diff for rewinding commit.") + else: + diff_index = diff_with_commit(self._repo, commit, parentCommit) # commit classification if self._commit_classifier and commit.hexsha not in self._clf_results: @@ -216,33 +223,34 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C # t2: update_graph time t2 = time.monotonic() - for diff in diff_index: - old_fname, new_fname = _get_fnames(diff) - # apply filter - # if a file comes into/goes from our view, we will set corresponding old_fname/new_fname to None, - # as if the file is introduced/removed in this commit. - # However, the diff will keep its original, no matter if the file has been filtered in/out. - if old_fname and not self._graphServer.filter_file(old_fname): - old_fname = None - if new_fname and not self._graphServer.filter_file(new_fname): - new_fname = None - if not old_fname and not new_fname: - # no modification - continue - - old_src = new_src = None - - if old_fname: - old_src = get_contents(self._repo, parentCommit, old_fname) - - if new_fname: - new_src = get_contents(self._repo, commit, new_fname) - - if old_src or new_src: - result = self._graphServer.update_graph( - old_fname, old_src, new_fname, new_src, diff.diff) - if asyncio.iscoroutine(result): - await result + if diff_index: + for diff in diff_index: + old_fname, new_fname = _get_fnames(diff) + # apply filter + # if a file comes into/goes from our view, we will set corresponding old_fname/new_fname to None, + # as if the file is introduced/removed in this commit. + # However, the diff will keep its original, no matter if the file has been filtered in/out. + if old_fname and not self._graphServer.filter_file(old_fname): + old_fname = None + if new_fname and not self._graphServer.filter_file(new_fname): + new_fname = None + if not old_fname and not new_fname: + # no modification + continue + + old_src = new_src = None + + if old_fname: + old_src = get_contents(self._repo, parentCommit, old_fname) + + if new_fname: + new_src = get_contents(self._repo, commit, new_fname) + + if old_src or new_src: + result = self._graphServer.update_graph( + old_fname, old_src, new_fname, new_src, diff.diff) + if asyncio.iscoroutine(result): + await result t2 = time.monotonic() - t2 # t3: end_commit time From a1adac22a102a4f19a8ae75e58b11e01193a70c9 Mon Sep 17 00:00:00 2001 From: xinyan Date: Sat, 18 May 2019 11:07:59 +0800 Subject: [PATCH 23/39] Skip rewind diff only when seekingMode is Rewind. 
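A minimal sketch of how the new flag is meant to be used, assuming the constructor signature above; the repository path is illustrative and the graph-server wiring follows the existing C analyzer tests.

```python
import asyncio

from persper.analytics.analyzer2 import Analyzer
from persper.analytics.c import CGraphServer
from persper.analytics.graph_server import C_FILENAME_REGEXES

az = Analyzer("repos/test_feature_branch", CGraphServer(C_FILENAME_REGEXES),
              skip_rewind_diff=True)
# Rewind commits still notify the graph server via start_commit/end_commit,
# but (with the refinement below) their diffs are neither computed nor replayed.
asyncio.run(az.analyze())
```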
--- persper/analytics/analyzer2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/persper/analytics/analyzer2.py b/persper/analytics/analyzer2.py index bdb890157d4..b56e922b239 100644 --- a/persper/analytics/analyzer2.py +++ b/persper/analytics/analyzer2.py @@ -211,7 +211,7 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C t1 = time.monotonic() - t1 diff_index = None - if self._skip_rewind_diff: + if self._skip_rewind_diff and seekingMode == CommitSeekingMode.Rewind: _logger.info("Skipped diff for rewinding commit.") else: diff_index = diff_with_commit(self._repo, commit, parentCommit) From aa9e9b0fbf15f9949da950b48f1d4619b1e1178d Mon Sep 17 00:00:00 2001 From: YingjieLiu <18706819589@163.com> Date: Mon, 20 May 2019 16:26:43 +0800 Subject: [PATCH 24/39] Count added or removed units for each patch in update_graph --- persper/analytics/c.py | 12 ++--- persper/analytics/call_commit_graph.py | 13 ++++++ persper/analytics/call_graph/c.py | 3 +- persper/analytics/detect_change.py | 62 ++++++++++++++++++++++---- test/test_analytics/test_analyzer_c.py | 46 +++++++++---------- 5 files changed, 99 insertions(+), 37 deletions(-) diff --git a/persper/analytics/c.py b/persper/analytics/c.py index 622a067948c..284caeb6131 100644 --- a/persper/analytics/c.py +++ b/persper/analytics/c.py @@ -8,7 +8,7 @@ from persper.analytics.call_commit_graph import CallCommitGraph -def function_change_stats(old_ast, new_ast, patch, patch_parser, ranges_func): +def function_change_stats(old_ast, old_src, new_ast, new_src, patch, patch_parser, ranges_func): """ Parse old/new source files and extract the change info for all functions """ @@ -19,19 +19,21 @@ def function_change_stats(old_ast, new_ast, patch, patch_parser, ranges_func): if old_ast is not None: forward_stats = get_changed_functions( - *ranges_func(old_ast), adds, dels, separate=True) + *ranges_func(old_ast), adds, dels, old_src, new_src, separate=True) if new_ast is not None: inv_adds, inv_dels = inverse_diff(adds, dels) bckward_stats = get_changed_functions( - *ranges_func(new_ast), inv_adds, inv_dels, separate=True) + *ranges_func(new_ast), inv_adds, inv_dels, new_src, old_src, separate=True) # merge forward and backward stats for func, fstat in bckward_stats.items(): if func not in forward_stats: forward_stats[func] = { 'adds': fstat['dels'], - 'dels': fstat['adds'] + 'dels': fstat['adds'], + 'added_units': fstat['removed_units'], + 'removed_units': fstat['added_units'] } return forward_stats @@ -85,7 +87,7 @@ def update_graph(self, old_filename, old_src, new_filename, new_src, patch): # Compatible with both the old and the new Analyzer change_stats = {} if self._seeking_mode != CommitSeekingMode.MergeCommit: - change_stats = function_change_stats(old_ast, new_ast, patch, + change_stats = function_change_stats(old_ast, old_src, new_ast, new_src, patch, self._parse_patch, get_func_ranges_c) diff --git a/persper/analytics/call_commit_graph.py b/persper/analytics/call_commit_graph.py index 6d3db3bb0f1..bd5de0fc81b 100644 --- a/persper/analytics/call_commit_graph.py +++ b/persper/analytics/call_commit_graph.py @@ -119,6 +119,19 @@ def update_node_history(self, node, num_adds, num_dels): else: node_history[self._current_commit_id] = {'adds': num_adds, 'dels': num_dels} + def update_node_history(self, node, fstat): + node_history = self._get_node_history(node) + # A commit might update a node's history more than once when + # a single FunctionNode corresponds to more than one actual functions + if 
self._current_commit_id in node_history: + node_history[self._current_commit_id]['adds'] += fstat['adds'] + node_history[self._current_commit_id]['dels'] += fstat['dels'] + node_history[self._current_commit_id]['added_units'] += fstat['added_units'] + node_history[self._current_commit_id]['removed_units'] += fstat['removed_units'] + else: + node_history[self._current_commit_id] = {'adds': fstat['adds'], 'dels': fstat['dels'], + 'added_units': fstat['added_units'], 'removed_units': fstat['removed_units']} + # read/write access to node history are thourgh this function def _get_node_history(self, node: str) -> Dict[str, Dict[str, int]]: return self._digraph.nodes[node]['history'] diff --git a/persper/analytics/call_graph/c.py b/persper/analytics/call_graph/c.py index 8563ddd3c30..1ab241b1b7e 100644 --- a/persper/analytics/call_graph/c.py +++ b/persper/analytics/call_graph/c.py @@ -179,7 +179,8 @@ def update_graph(ccgraph, ast_list, change_stats, new_fname_to_old_fname): if func not in ccgraph: print("%s in change_stats but not in ccgraph" % func) continue - ccgraph.update_node_history(func, fstat['adds'], fstat['dels']) + ccgraph.update_node_history(func, fstat) + # ccgraph.update_node_history(func, fstat['adds'], fstat['dels']) def get_func_ranges_c(root): diff --git a/persper/analytics/detect_change.py b/persper/analytics/detect_change.py index 419279cd87a..ceffec7bc39 100644 --- a/persper/analytics/detect_change.py +++ b/persper/analytics/detect_change.py @@ -1,3 +1,5 @@ +import re + def get_intersected_length(a, b): """ >>> get_intersected_length([1, 9], [2, 8]) @@ -16,9 +18,44 @@ def get_intersected_length(a, b): else: return end - start + 1 +def get_add_line_number(additions, deletions): + """ + Get the line number in new src for each added block + Input: + additions = [[7, 31], [27, 3], [44, 1], [50, 2], [70, 1], [77, 2], [99, 2]] + deletions = [[32, 44], [56, 70]] + Output: + [[8, 38], [59, 61], [66, 66], [73, 74], [80, 80], [88, 89], [112, 113]] + ground truth: + https://github.com/basicthinker/Sexain-MemController/commit/f050c6f6dd4b1d3626574b0d23bb41125f7b75ca + """ + add_line_number = [] + del_ptr, num_dels = 0, len(deletions) + add_num, del_num = 0, 0 + for add_range in additions: + while del_ptr < num_dels and deletions[del_ptr][1] <= add_range[0]: + del_num += deletions[del_ptr][1] - deletions[del_ptr][0] + 1 + del_ptr += 1 + start_line = add_range[0]+1+add_num-del_num + tmp_line_number = [start_line, start_line+add_range[1]-1] + add_line_number.append(tmp_line_number) + add_num += add_range[1] + return add_line_number +#need test +def get_units(src_list, line_number_range): + """ + Get the sum of units for each line in line_number_range + """ + units_sum = 0 + p = re.compile(r'\w+') + for i in range(line_number_range[0]-1, line_number_range[1]): + if i >= len(src_list): + break + units_sum += len(p.findall(src_list[i])) + return units_sum def get_changed_functions(func_names, func_ranges, additions, deletions, - separate=False): + old_src, new_src, separate=False): """ Args: func_names: A list of function names, @@ -38,29 +75,38 @@ def get_changed_functions(func_names, func_ranges, additions, deletions, info = {} if (func_names is None or func_ranges is None or - additions is None or deletions is None): + additions is None or deletions is None or + old_src is None or new_src is None): return info - def update_info(fn, num_lines, key): + def update_info(fn, num_lines, num_units, key1, key2): """key should be one of 'adds' or 'dels'.""" if fn in info: - info[fn][key] += 
num_lines + info[fn][key1] += num_lines + info[fn][key2] += num_units else: - info[fn] = {'adds': 0, 'dels': 0} - info[fn][key] = num_lines + info[fn] = {'adds': 0, 'dels': 0, 'added_units': 0, 'removed_units': 0} + info[fn][key1] = num_lines + info[fn][key2] = num_units + + old_src_list = old_src.split('\n') + new_src_list = new_src.split('\n') + add_line_number = get_add_line_number(additions, deletions) add_ptr, del_ptr = 0, 0 num_adds, num_dels = len(additions), len(deletions) for fn, fr in zip(func_names, func_ranges): for i in range(add_ptr, num_adds): if fr[0] <= additions[i][0] < fr[1]: - update_info(fn, additions[i][1], 'adds') + units = get_units(new_src_list, add_line_number[i]) + update_info(fn, additions[i][1], units, 'adds', 'added_units') add_ptr = i + 1 for j in range(del_ptr, num_dels): inter_length = get_intersected_length(fr, deletions[j]) if inter_length > 0: - update_info(fn, inter_length, 'dels') + units = get_units(old_src_list, [max(fr[0],deletions[j][0]), min(fr[1],deletions[j][1])]) + update_info(fn, inter_length, units, 'dels', 'removed_units') del_ptr = j if not separate: diff --git a/test/test_analytics/test_analyzer_c.py b/test/test_analytics/test_analyzer_c.py index 29e5f12af43..378f145eabc 100644 --- a/test/test_analytics/test_analyzer_c.py +++ b/test/test_analytics/test_analyzer_c.py @@ -32,23 +32,23 @@ async def test_analyzer_all_branches(az): history_truth = { 'K': { - 'display': {'adds': 0, 'dels': 5} + 'display': {'adds': 0, 'dels': 5, 'added_units': 0, 'removed_units': 10} }, 'F': { - 'display': {'adds': 14, 'dels': 0}, - 'count': {'adds': 12, 'dels': 0} + 'display': {'adds': 14, 'dels': 0, 'added_units': 23, 'removed_units': 0}, + 'count': {'adds': 12, 'dels': 0, 'added_units': 19, 'removed_units': 0} }, 'E': { - 'append': {'adds': 29, 'dels': 0}, - 'add': {'adds': 11, 'dels': 0} + 'append': {'adds': 29, 'dels': 0, 'added_units': 44, 'removed_units': 0}, + 'add': {'adds': 11, 'dels': 0, 'added_units': 25, 'removed_units': 0} }, 'D': { - 'str_replace': {'adds': 26, 'dels': 0} + 'str_replace': {'adds': 26, 'dels': 0, 'added_units': 76, 'removed_units': 0} }, # TODO: fix \No newline at the end of file 'C': { - 'str_append_chr': {'adds': 30, 'dels': 4}, - 'str_equals': {'adds': 0, 'dels': 1} + 'str_append_chr': {'adds': 30, 'dels': 4, 'added_units': 78, 'removed_units': 21}, + 'str_equals': {'adds': 0, 'dels': 1, 'added_units': 0, 'removed_units': 0} }, # Commit `B` is an example of imperfect diff, # it removes `str_append` and adds a new function `str_append_chr` @@ -56,38 +56,38 @@ async def test_analyzer_all_branches(az): # diff doesn't separate these changes into two chunks # please see here: https://github.com/UltimateBeaver/test_feature_branch/commit/caaac10f604ea7ac759c2147df8fb2b588ee2a27 'B': { - 'str_append': {'adds': 6, 'dels': 3}, - 'str_append_chr': {'adds': 3, 'dels': 2}, - 'str_equals': {'adds': 11, 'dels': 0} + 'str_append': {'adds': 6, 'dels': 3, 'added_units': 29, 'removed_units': 21}, + 'str_append_chr': {'adds': 3, 'dels': 2, 'added_units': 21, 'removed_units': 15}, + 'str_equals': {'adds': 11, 'dels': 0, 'added_units': 27, 'removed_units': 0} }, 'A': { - 'str_append': {'adds': 7, 'dels': 0}, - 'str_len': {'adds': 6, 'dels': 0} + 'str_append': {'adds': 7, 'dels': 0, 'added_units': 29, 'removed_units': 0}, + 'str_len': {'adds': 6, 'dels': 0, 'added_units': 13, 'removed_units': 0} }, # branch J from commit A, merge back through F 'J': { - 'count': {'adds': 12, 'dels': 0}, - 'display': {'adds': 14, 'dels': 0} + 'count': {'adds': 12, 
'dels': 0, 'added_units': 19, 'removed_units': 0}, + 'display': {'adds': 14, 'dels': 0, 'added_units': 23, 'removed_units': 0} }, # TODO: fix \No newline at the end of file # branch G from commit B, merge back through D 'G': { - 'str_equals': {'adds': 0, 'dels': 1}, - 'str_replace': {'adds': 26, 'dels': 0} + 'str_equals': {'adds': 0, 'dels': 1, 'added_units': 0, 'removed_units': 0}, + 'str_replace': {'adds': 26, 'dels': 0, 'added_units': 76, 'removed_units': 0} }, # branch H from commit D, merge back through E 'H': { - 'add': {'adds': 16, 'dels': 0}, - 'append': {'adds': 12, 'dels': 0}, - 'insert': {'adds': 25, 'dels': 0} + 'add': {'adds': 16, 'dels': 0, 'added_units': 31, 'removed_units': 0}, + 'append': {'adds': 12, 'dels': 0, 'added_units': 37, 'removed_units': 0}, + 'insert': {'adds': 25, 'dels': 0, 'added_units': 44, 'removed_units': 0} }, 'I': { - 'add': {'adds': 0, 'dels': 5}, - 'append': {'adds': 26, 'dels': 9}, - 'insert': {'adds': 0, 'dels': 25} + 'add': {'adds': 0, 'dels': 5, 'added_units': 0, 'removed_units': 6}, + 'append': {'adds': 26, 'dels': 9, 'added_units': 40, 'removed_units': 33}, + 'insert': {'adds': 0, 'dels': 25, 'added_units': 0, 'removed_units': 44} }, } From eecd9c39b00f0e54097cf6f6090b32142d18e7d4 Mon Sep 17 00:00:00 2001 From: YingjieLiu <18706819589@163.com> Date: Tue, 21 May 2019 10:25:50 +0800 Subject: [PATCH 25/39] Change name update_node_history to update_node_history_accurate. --- persper/analytics/call_commit_graph.py | 2 +- persper/analytics/call_graph/c.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/persper/analytics/call_commit_graph.py b/persper/analytics/call_commit_graph.py index bd5de0fc81b..652319cb5d3 100644 --- a/persper/analytics/call_commit_graph.py +++ b/persper/analytics/call_commit_graph.py @@ -119,7 +119,7 @@ def update_node_history(self, node, num_adds, num_dels): else: node_history[self._current_commit_id] = {'adds': num_adds, 'dels': num_dels} - def update_node_history(self, node, fstat): + def update_node_history_accurate(self, node, fstat): node_history = self._get_node_history(node) # A commit might update a node's history more than once when # a single FunctionNode corresponds to more than one actual functions diff --git a/persper/analytics/call_graph/c.py b/persper/analytics/call_graph/c.py index 1ab241b1b7e..7ff8bbda713 100644 --- a/persper/analytics/call_graph/c.py +++ b/persper/analytics/call_graph/c.py @@ -179,7 +179,7 @@ def update_graph(ccgraph, ast_list, change_stats, new_fname_to_old_fname): if func not in ccgraph: print("%s in change_stats but not in ccgraph" % func) continue - ccgraph.update_node_history(func, fstat) + ccgraph.update_node_history_accurate(func, fstat) # ccgraph.update_node_history(func, fstat['adds'], fstat['dels']) From d177672907224ac862c96748cb90bc52622070e8 Mon Sep 17 00:00:00 2001 From: YingjieLiu <18706819589@163.com> Date: Wed, 22 May 2019 09:34:43 +0800 Subject: [PATCH 26/39] Update regex; Update update_info; If old_src is None or new_src is None, we still have to count the logic units. 
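Since "logic units" drive the new history fields, a self-contained sketch of the counting rule may help. It mirrors the spirit of `get_units` with the identifier regex adopted below, but it is not the library function itself; the sample lines and expected count are illustrative.

```python
import re

# Token-per-line counting: every identifier-like token on a changed line
# contributes one logic unit.
p = re.compile(r'[\w_][\w\d_]*')

changed_lines = [
    "int str_len(char *s)",     # int, str_len, char, s   -> 4 units
    "{",                        #                         -> 0 units
    "    return strlen(s);",    # return, strlen, s       -> 3 units
]
units = sum(len(p.findall(line)) for line in changed_lines)
assert units == 7
```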
--- persper/analytics/detect_change.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/persper/analytics/detect_change.py b/persper/analytics/detect_change.py index ceffec7bc39..22254e36f18 100644 --- a/persper/analytics/detect_change.py +++ b/persper/analytics/detect_change.py @@ -41,13 +41,14 @@ def get_add_line_number(additions, deletions): add_line_number.append(tmp_line_number) add_num += add_range[1] return add_line_number -#need test + def get_units(src_list, line_number_range): """ Get the sum of units for each line in line_number_range """ units_sum = 0 - p = re.compile(r'\w+') + # p = re.compile(r'\w+') + p = re.compile(r'[\w_][\w\d_]*') for i in range(line_number_range[0]-1, line_number_range[1]): if i >= len(src_list): break @@ -65,6 +66,8 @@ def get_changed_functions(func_names, func_ranges, additions, deletions, in the same order of func_names. additions: A list of pair of integers, deletions: A list of pair of integers, + old_src: Old source files, + new_src: New source files, separate: A boolean flag, if set to True, additions and deletions are reported separately. @@ -75,22 +78,25 @@ def get_changed_functions(func_names, func_ranges, additions, deletions, info = {} if (func_names is None or func_ranges is None or - additions is None or deletions is None or - old_src is None or new_src is None): + additions is None or deletions is None): return info - def update_info(fn, num_lines, num_units, key1, key2): + def update_info(fn, num_lines, num_units, key_lines, key_units): """key should be one of 'adds' or 'dels'.""" if fn in info: - info[fn][key1] += num_lines - info[fn][key2] += num_units + info[fn][key_lines] += num_lines + info[fn][key_units] += num_units else: info[fn] = {'adds': 0, 'dels': 0, 'added_units': 0, 'removed_units': 0} - info[fn][key1] = num_lines - info[fn][key2] = num_units + info[fn][key_lines] = num_lines + info[fn][key_units] = num_units - old_src_list = old_src.split('\n') - new_src_list = new_src.split('\n') + old_src_list = [] + new_src_list = [] + if not old_src is None: + old_src_list = old_src.split('\n') + if not new_src is None: + new_src_list = new_src.split('\n') add_line_number = get_add_line_number(additions, deletions) add_ptr, del_ptr = 0, 0 From b237e35be81042dac6da484a466c862105dab388 Mon Sep 17 00:00:00 2001 From: Yizhe Yuan Date: Mon, 20 May 2019 18:24:11 +0800 Subject: [PATCH 27/39] Filter out monolithic commits --- persper/analytics/analyzer2.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/persper/analytics/analyzer2.py b/persper/analytics/analyzer2.py index b56e922b239..f432c951650 100644 --- a/persper/analytics/analyzer2.py +++ b/persper/analytics/analyzer2.py @@ -21,7 +21,8 @@ def __init__(self, repositoryRoot: str, graphServer: GraphServer, terminalCommit: str = 'HEAD', firstParentOnly: bool = False, commit_classifier: Optional[CommitClassifier] = None, - skip_rewind_diff: bool = False): + skip_rewind_diff: bool = False, + monolithic_commit_lines_threshold: int = 5000): # skip_rewind_diff will skip diff, but rewind commit start/end will still be notified to the GraphServer. 
self._repositoryRoot = repositoryRoot self._graphServer = graphServer @@ -35,6 +36,7 @@ def __init__(self, repositoryRoot: str, graphServer: GraphServer, self._commit_classifier = commit_classifier self._clf_results: Dict[str, List[float]] = {} self._skip_rewind_diff = skip_rewind_diff + self._monolithic_commit_lines_threshold = monolithic_commit_lines_threshold def __getstate__(self): state = self.__dict__.copy() @@ -198,6 +200,9 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C if type(commit) != Commit: commit = self._repo.commit(commit) + # filter monolithic commit + seekingMode = self._filter_monolithic_commit(commit, seekingMode) + # t0: Total time usage t0 = time.monotonic() self._observer.onBeforeCommit(self, commit, seekingMode) @@ -266,6 +271,19 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C assert self._graphServer.get_workspace_commit_hexsha() == commit.hexsha, \ "GraphServer.get_workspace_commit_hexsha should be return the hexsha seen in last start_commit." + def _filter_monolithic_commit(self, commit: Commit, seeking_mode: CommitSeekingMode) -> CommitSeekingMode: + # filter monolithic commit + if seeking_mode == CommitSeekingMode.NormalForward and len(commit.parents) == 1: + changed_lines = 0 + files = commit.stats.files + for fname in files: + if self._graphServer.filter_file(fname): + changed_lines += files[fname]['lines'] + if changed_lines > self._monolithic_commit_lines_threshold: + # enforce using CommitSeekingMode.MergeCommit to update graph without updating node history + return CommitSeekingMode.MergeCommit + return seeking_mode + def _get_fnames(diff: Diff): if diff.new_file: From 2fea1335c3df685c06858898c31cf03f7dc91ec5 Mon Sep 17 00:00:00 2001 From: Yizhe Yuan Date: Fri, 24 May 2019 00:36:33 +0800 Subject: [PATCH 28/39] Add unit test for filter_monolithic_commit --- test/test_analytics/test_analyzer2.py | 69 +++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 test/test_analytics/test_analyzer2.py diff --git a/test/test_analytics/test_analyzer2.py b/test/test_analytics/test_analyzer2.py new file mode 100644 index 00000000000..b7eb532e498 --- /dev/null +++ b/test/test_analytics/test_analyzer2.py @@ -0,0 +1,69 @@ +import os +import pytest +import subprocess +import shutil +from persper.analytics.c import CGraphServer +from persper.analytics.analyzer2 import Analyzer +from persper.analytics.graph_server import C_FILENAME_REGEXES, CommitSeekingMode +from persper.util.path import root_path + + +@pytest.fixture(scope='module') +def az(): + # build the repo first if not exists yet + repo_path = os.path.join(root_path, 'repos/test_feature_branch') + script_path = os.path.join(root_path, 'tools/repo_creater/create_repo.py') + test_src_path = os.path.join(root_path, 'test/test_feature_branch') + + # Always use latest source to create test repo + if os.path.exists(repo_path): + shutil.rmtree(repo_path) + + cmd = '{} {}'.format(script_path, test_src_path) + subprocess.call(cmd, shell=True) + + return Analyzer(repo_path, CGraphServer(C_FILENAME_REGEXES)) + + +def test_analyzer_filter_monolithic_commit(az): + threshold = az._monolithic_commit_lines_threshold + + case_1_files = { + 'main.c': {'lines': threshold + 1}, + } + case_1_commit = MockCommit(case_1_files, 0) + case_1_seeking_mode = az._filter_monolithic_commit(case_1_commit, CommitSeekingMode.NormalForward) + assert case_1_seeking_mode == CommitSeekingMode.NormalForward + + case_2_files = { + 'a.c': {'lines': threshold}, + } 
+ case_2_commit = MockCommit(case_2_files, 1) + case_2_seeking_mode = az._filter_monolithic_commit(case_2_commit, CommitSeekingMode.NormalForward) + assert case_2_seeking_mode == CommitSeekingMode.NormalForward + + case_3_files = { + 'a.c': {'lines': threshold}, + 'b.c': {'lines': 1}, + } + case_3_commit = MockCommit(case_3_files, 1) + case_3_seeking_mode = az._filter_monolithic_commit(case_3_commit, CommitSeekingMode.NormalForward) + assert case_3_seeking_mode == CommitSeekingMode.MergeCommit + + case_4_files = { + 'a.c': {'lines': threshold}, + } + case_4_commit = MockCommit(case_4_files, 2) + case_4_seeking_mode = az._filter_monolithic_commit(case_4_commit, CommitSeekingMode.MergeCommit) + assert case_4_seeking_mode == CommitSeekingMode.MergeCommit + + +class MockCommit: + def __init__(self, files: dict, parent_number: int = 1): + self.stats = MockCommitStats(files) + self.parents = [{}] * parent_number + + +class MockCommitStats: + def __init__(self, files: dict): + self.files = files From 1b4d3f052e4f03f28cd52c1a0e5eb80e953a70a0 Mon Sep 17 00:00:00 2001 From: Hezheng Yin Date: Thu, 23 May 2019 13:56:12 -0700 Subject: [PATCH 29/39] Add comments to filter commit tests --- .../{test_analyzer2.py => test_filter_commit.py} | 8 ++++++++ 1 file changed, 8 insertions(+) rename test/test_analytics/{test_analyzer2.py => test_filter_commit.py} (84%) diff --git a/test/test_analytics/test_analyzer2.py b/test/test_analytics/test_filter_commit.py similarity index 84% rename from test/test_analytics/test_analyzer2.py rename to test/test_analytics/test_filter_commit.py index b7eb532e498..802ffbaddb9 100644 --- a/test/test_analytics/test_analyzer2.py +++ b/test/test_analytics/test_filter_commit.py @@ -28,6 +28,8 @@ def az(): def test_analyzer_filter_monolithic_commit(az): threshold = az._monolithic_commit_lines_threshold + # case 1: changes above threshold, but the commit is the first commit + # expected result: normal forward case_1_files = { 'main.c': {'lines': threshold + 1}, } @@ -35,6 +37,8 @@ def test_analyzer_filter_monolithic_commit(az): case_1_seeking_mode = az._filter_monolithic_commit(case_1_commit, CommitSeekingMode.NormalForward) assert case_1_seeking_mode == CommitSeekingMode.NormalForward + # case 2: changes equal to threshold, the commit has one parent commit + # expected result: normal forward case_2_files = { 'a.c': {'lines': threshold}, } @@ -42,6 +46,8 @@ def test_analyzer_filter_monolithic_commit(az): case_2_seeking_mode = az._filter_monolithic_commit(case_2_commit, CommitSeekingMode.NormalForward) assert case_2_seeking_mode == CommitSeekingMode.NormalForward + # case 3: changes above threshold, the commit has one parent commit + # expected result: merge commit case_3_files = { 'a.c': {'lines': threshold}, 'b.c': {'lines': 1}, @@ -50,6 +56,8 @@ def test_analyzer_filter_monolithic_commit(az): case_3_seeking_mode = az._filter_monolithic_commit(case_3_commit, CommitSeekingMode.NormalForward) assert case_3_seeking_mode == CommitSeekingMode.MergeCommit + # case 4: changes equal to threshold, the commit is a merge commit + # expected result: merge commit case_4_files = { 'a.c': {'lines': threshold}, } From c027f15f9981c6cca85e1fd66b8c9a314228d944 Mon Sep 17 00:00:00 2001 From: xinyan Date: Sat, 18 May 2019 22:47:43 +0800 Subject: [PATCH 30/39] Add preliminary project complexity metric based on LOC, nodes and edges. 
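Before moving on, a hedged sketch of tuning the monolithic-commit threshold introduced above; only the parameter name and its 5000-line default come from the diff, the path and the chosen value are illustrative.

```python
from persper.analytics.analyzer2 import Analyzer
from persper.analytics.c import CGraphServer
from persper.analytics.graph_server import C_FILENAME_REGEXES

# Lower the default so unusually large commits are not credited line-by-line.
az = Analyzer("repos/test_feature_branch", CGraphServer(C_FILENAME_REGEXES),
              monolithic_commit_lines_threshold=2000)
# Single-parent commits whose filtered files change more than 2000 lines are
# analyzed in MergeCommit mode: graph structure is updated, node history is not.
```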
--- persper/analytics/analyzer2.py | 9 +++++++++ persper/analytics/call_commit_graph.py | 11 ++++++++++- persper/analytics/complexity.py | 24 ++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 persper/analytics/complexity.py diff --git a/persper/analytics/analyzer2.py b/persper/analytics/analyzer2.py index f432c951650..ecfc398e850 100644 --- a/persper/analytics/analyzer2.py +++ b/persper/analytics/analyzer2.py @@ -128,6 +128,15 @@ def compute_commit_scores(self, alpha: float, label_weights: List[float], top_one=top_one, additive=additive) + def compute_project_complexity(self, r_n: int, r_e: int): + """ + Evaluates project complexity. + params + r_n: The conversion factor from node count to logic units. + r_e: The conversion factor from edge count to logic units. + """ + return self.graph.eval_project_complexity(r_n, r_e) + async def analyze(self, maxAnalyzedCommits=None, suppressStdOutLogs=False): commitSpec = self._terminalCommit if self._originCommit: diff --git a/persper/analytics/call_commit_graph.py b/persper/analytics/call_commit_graph.py index 652319cb5d3..0c83fc928c6 100644 --- a/persper/analytics/call_commit_graph.py +++ b/persper/analytics/call_commit_graph.py @@ -8,7 +8,7 @@ from persper.analytics.devrank import devrank from persper.analytics.score import normalize from typing import Union, Set, List, Dict, Optional - +from persper.analytics.complexity import eval_project_complexity class CommitIdGenerators: @staticmethod @@ -171,6 +171,15 @@ def _set_all_edges_weight(self): for nbr, datadict in self._digraph.pred[node].items(): datadict['weight'] = self._digraph.nodes[node]['size'] + def eval_project_complexity(self, r_n: float, r_e: float): + """ + Evaluates project complexity. + params + r_n: The conversion factor from node count to logic units. + r_e: The conversion factor from edge count to logic units. + """ + return eval_project_complexity(self._digraph, r_n, r_e) + def function_devranks(self, alpha, black_set=None): """ Args: diff --git a/persper/analytics/complexity.py b/persper/analytics/complexity.py new file mode 100644 index 00000000000..523aeff4b7a --- /dev/null +++ b/persper/analytics/complexity.py @@ -0,0 +1,24 @@ +from typing import Dict, List + +import numpy as np +from networkx import DiGraph + + +def eval_project_complexity(G: DiGraph, r_n: float, r_e: float): + """ + Evaluates project complexity from the specified bare call commit graph. + remarks + The formula is + complexity = sum_by_node(added_units + removed_units) + r_n*len(nodes) + r_e*len(edges) + """ + logical_units = 0 + for _, data in G.nodes(data=True): + added = 0 + removed = 0 + for _, v in data["history"].items(): + # TODO change from LOC to logic units + added += v["adds"] + removed += v["dels"] + logical_units += added + removed + complexity = logical_units + r_n*len(G.nodes) + r_e*len(G.edges) + return complexity From 3ac4854423f9f81555d3bb39bffa32d04e03ec02 Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 22 May 2019 22:20:07 +0800 Subject: [PATCH 31/39] Use logic units to evaluate complexity where possible. 
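A worked instance of the complexity formula above may make the conversion factors concrete; the numbers are purely illustrative.

```python
# complexity = sum_by_node(added_units + removed_units) + r_n*len(nodes) + r_e*len(edges)
logic_units = 100          # total added_units + removed_units across all nodes
nodes, edges = 3, 2
r_n, r_e = 20, 10
complexity = logic_units + r_n * nodes + r_e * edges
assert complexity == 180   # 100 + 60 + 20
```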
--- persper/analytics/complexity.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/persper/analytics/complexity.py b/persper/analytics/complexity.py index 523aeff4b7a..013184b9e1f 100644 --- a/persper/analytics/complexity.py +++ b/persper/analytics/complexity.py @@ -1,8 +1,11 @@ +import logging from typing import Dict, List import numpy as np from networkx import DiGraph +_logger = logging.getLogger(__file__) + def eval_project_complexity(G: DiGraph, r_n: float, r_e: float): """ @@ -12,13 +15,22 @@ def eval_project_complexity(G: DiGraph, r_n: float, r_e: float): complexity = sum_by_node(added_units + removed_units) + r_n*len(nodes) + r_e*len(edges) """ logical_units = 0 + useFallback = None for _, data in G.nodes(data=True): added = 0 removed = 0 for _, v in data["history"].items(): - # TODO change from LOC to logic units - added += v["adds"] - removed += v["dels"] + if useFallback == None: + useFallback = not "added_units" in v + if useFallback: + _logger.warning( + "Will use LOC instead of logic units to measure complexity.") + if useFallback: + added += v["adds"] + removed += v["dels"] + else: + added += v["added_units"] + removed += v["removed_units"] logical_units += added + removed complexity = logical_units + r_n*len(G.nodes) + r_e*len(G.edges) return complexity From 62b00390a17b656d1cd51fa9742c25ff64c2ac09 Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 22 May 2019 23:44:40 +0800 Subject: [PATCH 32/39] Add unit test assertion in test_analyzer_all_branches. --- test/test_analytics/test_analyzer_c.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_analytics/test_analyzer_c.py b/test/test_analytics/test_analyzer_c.py index 378f145eabc..97268c6cc3e 100644 --- a/test/test_analytics/test_analyzer_c.py +++ b/test/test_analytics/test_analyzer_c.py @@ -127,3 +127,4 @@ async def test_analyzer_all_branches(az): ('insert', 'append') ] assert set(ccgraph.edges()) == set(edges_truth) + assert ccgraph.eval_project_complexity(20, 10) == 1157 From d67cd207a9432238a6cb6ce7d598fd0df6156828 Mon Sep 17 00:00:00 2001 From: Yang Zhikai Date: Fri, 24 May 2019 03:41:11 +0000 Subject: [PATCH 33/39] delete test_iterator.py --- test/test_analytics/test_iterator.py | 65 ---------------------------- 1 file changed, 65 deletions(-) delete mode 100644 test/test_analytics/test_iterator.py diff --git a/test/test_analytics/test_iterator.py b/test/test_analytics/test_iterator.py deleted file mode 100644 index 745b879c83e..00000000000 --- a/test/test_analytics/test_iterator.py +++ /dev/null @@ -1,65 +0,0 @@ -import os -import pytest -import pickle -import subprocess -from persper.analytics.iterator import RepoIterator -from persper.util.path import root_path - - -def serialized_messages(commits): - return ' '.join([c.message.strip() for c in commits]) - - -@pytest.fixture(scope='module') -def ri(): - # build the repo first if not exists yet - repo_path = os.path.join(root_path, 'repos/test_processor') - script_path = os.path.join(root_path, 'tools/repo_creater/create_repo.py') - test_src_path = os.path.join(root_path, 'test/test_processor') - if not os.path.isdir(repo_path): - cmd = '{} {}'.format(script_path, test_src_path) - subprocess.call(cmd, shell=True) - - repo_path = os.path.join(root_path, 'repos/test_processor') - ri = RepoIterator(repo_path) - return ri - - -def test_iterator(ri): - commits, branch_commits = ri.iter(from_beginning=True, into_branches=True) - # from A to L - # use `git log --graph` to view ground truth - assert len(ri.visited) == 12 - 
assert len(commits) == 4 - assert len(branch_commits) == 8 - assert serialized_messages(commits) == 'D C B A' - assert serialized_messages(branch_commits) == 'G F E J I H L K' - - -def test_continue_iter(ri): - commits, branch_commits = ri.iter( - from_beginning=True, num_commits=2, into_branches=True) - assert serialized_messages(commits) == 'B A' - assert serialized_messages(branch_commits) == '' - commits2, branch_commits2 = ri.iter( - continue_iter=True, num_commits=2, into_branches=True) - assert serialized_messages(commits2) == 'D C' - assert serialized_messages(branch_commits2) == 'G F E J I H L K' - - -def test_rev(ri): - commits, branch_commits = ri.iter(rev='C', into_branches=True) - assert serialized_messages(commits) == 'C B A' - assert serialized_messages(branch_commits) == '' - commits2, branch_commits2 = ri.iter( - continue_iter=True, end_commit_sha='D', into_branches=True) - assert serialized_messages(commits2) == 'D' - assert serialized_messages(branch_commits2) == 'G F E J I H L K' - - -def test_iter_twice(ri): - commits, branch_commits = ri.iter(from_beginning=True, into_branches=True) - commits2, branch_commits2 = ri.iter( - from_beginning=True, into_branches=True) - assert commits == commits2 - assert branch_commits == branch_commits2 From 9b311913a11b7d2b6ad3d39dbcf20ab6a65212af Mon Sep 17 00:00:00 2001 From: Yang zhikai Date: Fri, 24 May 2019 11:49:39 +0800 Subject: [PATCH 34/39] setup-ci --- .gitlab-ci.yml | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 .gitlab-ci.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 00000000000..6bdf1342a7b --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,45 @@ +stages: + - build + - test + + +test_ci: + stage: test + image: ubuntu:18.04 +# only: +# - setup-ci + before_script: + - apt update && apt install -y openssh-client wget libarchive-dev libcurl4-openssl-dev git python3.7 python3-pip + - wget http://131.123.42.38/lmcrs/beta/srcML-Ubuntu18.04.deb + - dpkg -i srcML-Ubuntu18.04.deb + - mkdir -p ~/.ssh + - echo "${DEPLOY_KEY}" | tr -d '\r' > ~/.ssh/id_rsa + - chmod 600 ~/.ssh/id_rsa + - eval "$(ssh-agent -s)" + - ssh-keyscan -H "gitlab.com" >> ~/.ssh/known_hosts + - chmod 644 ~/.ssh/known_hosts + - set LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib + - export LC_ALL=C.UTF-8 + - export LANG=C.UTF-8 + script: + - apt-get update + - git config --global user.email "merico@meir.co" + - git config --global user.name "merico" + - pip3 install pipenv + + - echo -e "machine gitlab.com\nlogin ${GITLAB_USER}\npassword ${GITLAB_PASSWD}" > ~/.netrc + - git clone https://gitlab.com/persper/code-analytics.git && cd code-analytics + #&& git checkout ${CI_COMMIT_REF_NAME} + - export PYTHONPATH=$PYTHONPATH:/root/code-analytics + - pipenv install --python 3.7 + - pipenv run pytest -s test/test_analytics/test_analyzer_c.py + - pipenv run pytest -s test/test_analytics/test_inverse_diff.py + - pipenv run pytest -s test/test_analytics/test_analyzer.py + - pipenv run pytest -s test/test_analytics/test_call_commit_graph.py + - pipenv run pytest -s test/test_analytics/test_detect_change.py + - pipenv run pytest -s test/test_analytics/test_devrank.py + - pipenv run pytest -s test/test_analytics/test_diff.py + - pipenv run pytest -s test/test_analytics/test_score.py + - pipenv run pytest -s test/test_analytics/test_srcml.py + - pipenv run pytest -s test/test_analytics2 + - echo "Done" From e2132e82608e3f331615025c7d12b607cbbab169 Mon Sep 17 00:00:00 2001 From: Hezheng Yin Date: Fri, 
24 May 2019 13:22:34 -0700 Subject: [PATCH 35/39] Properly ignore lsp tests --- .gitlab-ci.yml | 12 ++---------- test/test_analytics/conftest.py | 2 ++ 2 files changed, 4 insertions(+), 10 deletions(-) create mode 100644 test/test_analytics/conftest.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6bdf1342a7b..218e1bed254 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -23,7 +23,7 @@ test_ci: - export LANG=C.UTF-8 script: - apt-get update - - git config --global user.email "merico@meir.co" + - git config --global user.email "merico@meri.co" - git config --global user.name "merico" - pip3 install pipenv @@ -32,14 +32,6 @@ test_ci: #&& git checkout ${CI_COMMIT_REF_NAME} - export PYTHONPATH=$PYTHONPATH:/root/code-analytics - pipenv install --python 3.7 - - pipenv run pytest -s test/test_analytics/test_analyzer_c.py - - pipenv run pytest -s test/test_analytics/test_inverse_diff.py - - pipenv run pytest -s test/test_analytics/test_analyzer.py - - pipenv run pytest -s test/test_analytics/test_call_commit_graph.py - - pipenv run pytest -s test/test_analytics/test_detect_change.py - - pipenv run pytest -s test/test_analytics/test_devrank.py - - pipenv run pytest -s test/test_analytics/test_diff.py - - pipenv run pytest -s test/test_analytics/test_score.py - - pipenv run pytest -s test/test_analytics/test_srcml.py + - pipenv run pytest -s test/test_analytics - pipenv run pytest -s test/test_analytics2 - echo "Done" diff --git a/test/test_analytics/conftest.py b/test/test_analytics/conftest.py new file mode 100644 index 00000000000..a4c40467487 --- /dev/null +++ b/test/test_analytics/conftest.py @@ -0,0 +1,2 @@ + +collect_ignore = ["test_analyzer_cpp.py", "test_analyzer_lsp_ccls.py"] From fc3ad6147e68f10e603263703004c6d20d78a2ca Mon Sep 17 00:00:00 2001 From: Hezheng Yin Date: Fri, 24 May 2019 13:31:15 -0700 Subject: [PATCH 36/39] Update test readme --- test/README.md | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/test/README.md b/test/README.md index 76e7cf7cb20..88e589fb43b 100644 --- a/test/README.md +++ b/test/README.md @@ -2,12 +2,14 @@ Our recommended way to run tests is through [pytest](https://docs.pytest.org/en/latest/). -Installation with your favorite package manager: +It should have been installed if you have run `pipenv install`. Otherwise, install pytest with your favorite package manager: -``` -pip install -U pytest # pip +```bash +// pip +$ pip install -U pytest -conda install pytest # conda +// or conda +$ conda install pytest ``` ## Run Tests @@ -15,14 +17,26 @@ conda install pytest # conda To run the entire test suite, simply: ``` -cd test -pytest +cd ${root} +pipenv run pytest -s test/ ``` To test a specific module: ``` -pytest .py +pipenv run pytest -s .py ``` -To learn more about how pytest detects tests, follow this [link](https://docs.pytest.org/en/latest/goodpractices.html#goodpractices). \ No newline at end of file +To learn more about how pytest detects tests, follow this [link](https://docs.pytest.org/en/latest/goodpractices.html#goodpractices). + +## Tests that are ignored + +You can ignore certain tests by customizing test collection using `conftest.py`. For details, please see [here](https://docs.pytest.org/en/latest/example/pythoncollection.html#customizing-test-collection). + +Here is a list of tests that are currently ignored: + +1. `test/test_analytics/test_analyzer_cpp.py` +2. 
From 8dbd2703ee7dcd8cc9c2ee5828253d98cc6b48d1 Mon Sep 17 00:00:00 2001
From: xinyan
Date: Thu, 23 May 2019 23:20:11 +0800
Subject: [PATCH 37/39] #46 graphToDict should not modify passed-in CallCommitGraph.

---
 test/test_analytics/utility/graph_baseline.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/test/test_analytics/utility/graph_baseline.py b/test/test_analytics/utility/graph_baseline.py
index 00164fda366..e246f1f33a9 100644
--- a/test/test_analytics/utility/graph_baseline.py
+++ b/test/test_analytics/utility/graph_baseline.py
@@ -24,12 +24,14 @@ def formatEdgeId(u: str, v: str):
 
 
 def graphToDict(ccg: CallCommitGraph):
-    nodes = ccg.nodes(data=True)
-    for name, attr in nodes:
-        if "files" in attr:
-            files = list(attr["files"])
+    nodes = {}
+    for name, attr in ccg.nodes(data=True):
+        attr1 = dict(attr)
+        nodes[name] = attr1
+        if "files" in attr1:
+            files = list(attr1["files"])
             files.sort()
-            attr["files"] = files
+            attr1["files"] = files
     result = {
         "nodes": dict(nodes),
         "edges": dict(((formatEdgeId(u, v), data) for (u, v, data) in ccg.edges(data=True)))

From 3205c5ff3f287b863328a6ad8563994e2c48d8e2 Mon Sep 17 00:00:00 2001
From: YingjieLiu <18706819589@163.com>
Date: Sun, 26 May 2019 11:32:12 +0800
Subject: [PATCH 38/39] Fix test cases

---
 .../patch_test_files/example7_new.c        | 52 +++++++++++++++
 .../patch_test_files/example7_old.c        | 65 +++++++++++++++++++
 test/test_analytics/test_analyzer_cpp.py   |  8 +--
 test/test_analytics/test_detect_change.py  | 22 +++++--
 4 files changed, 138 insertions(+), 9 deletions(-)
 create mode 100644 test/test_analytics/patch_test_files/example7_new.c
 create mode 100644 test/test_analytics/patch_test_files/example7_old.c

diff --git a/test/test_analytics/patch_test_files/example7_new.c b/test/test_analytics/patch_test_files/example7_new.c
new file mode 100644
index 00000000000..9782b3507f9
--- /dev/null
+++ b/test/test_analytics/patch_test_files/example7_new.c
@@ -0,0 +1,52 @@
+/* added in H */
+struct node
+{
+    int data;
+    struct node *next;
+}*head;
+
+/* added in H, edited in I */
+void append(int num)
+{
+    struct node *temp, *prev;
+    temp=head;
+    while(temp!=NULL)
+    {
+        if(temp->data==num)
+        {
+            if(temp==head)
+            {
+                head=temp->next;
+                free(temp);
+                return 1;
+            }
+            else
+            {
+                prev->next=temp->next;
+                free(temp);
+                return 1;
+            }
+        }
+        else
+        {
+            prev=temp;
+            temp= temp->next;
+        }
+    }
+    return 0;
+}
+
+/* added in H, edited in G */
+void add( int num )
+{
+    struct node *temp;
+    temp=(struct node *)malloc(sizeof(struct node));
+    temp->data=num;
+    if (head== NULL)
+    {
+        head=temp;
+        head->next=NULL;
+    }
+}
+
+/* insert() is deleted in I */
diff --git a/test/test_analytics/patch_test_files/example7_old.c b/test/test_analytics/patch_test_files/example7_old.c
new file mode 100644
index 00000000000..6157d001d3f
--- /dev/null
+++ b/test/test_analytics/patch_test_files/example7_old.c
@@ -0,0 +1,65 @@
+/* added in H */
+struct node
+{
+    int data;
+    struct node *next;
+}*head;
+
+/* added in H */
+void append(int num)
+{
+    struct node *temp,*right;
+    temp= (struct node *)malloc(sizeof(struct node));
+    temp->data=num;
+    right=(struct node *)head;
+    while(right->next != NULL)
+        right=right->next;
+    right->next =temp;
+    right=temp;
+    right->next=NULL;
+}
+
+/* added in H */
+void add( int num )
+{
+    struct node *temp;
+    temp=(struct node *)malloc(sizeof(struct node));
+    temp->data=num;
+    if (head== NULL)
+    {
+        head=temp;
+        head->next=NULL;
+    }
+    else
+    {
+        temp->next=head;
+        head=temp;
} +} + +/* added in H */ +void insert(int num) +{ + int c=0; + struct node *temp; + temp=head; + if(temp==NULL) + { + add(num); + } + else + { + while(temp!=NULL) + { + if(temp->datanext; + } + if(c==0) + add(num); + else if(c Date: Sun, 26 May 2019 12:24:32 +0800 Subject: [PATCH 39/39] fix test_analyzer.py --- test/test_analytics/test_analyzer.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/test/test_analytics/test_analyzer.py b/test/test_analytics/test_analyzer.py index fd011d463aa..3ed8ca8b7b5 100644 --- a/test/test_analytics/test_analyzer.py +++ b/test/test_analytics/test_analyzer.py @@ -32,23 +32,23 @@ async def test_analyzer_master_only(az): history_truth = { 'K': { - 'display': {'adds': 0, 'dels': 5} + 'display': {'adds': 0, 'dels': 5, 'added_units': 0, 'removed_units': 10} }, 'F': { - 'display': {'adds': 14, 'dels': 0}, - 'count': {'adds': 12, 'dels': 0} + 'display': {'adds': 14, 'dels': 0, 'added_units': 23, 'removed_units': 0}, + 'count': {'adds': 12, 'dels': 0, 'added_units': 19, 'removed_units': 0} }, 'E': { - 'append': {'adds': 29, 'dels': 0}, - 'add': {'adds': 11, 'dels': 0} + 'append': {'adds': 29, 'dels': 0, 'added_units': 44, 'removed_units': 0}, + 'add': {'adds': 11, 'dels': 0, 'added_units': 25, 'removed_units': 0} }, 'D': { - 'str_replace': {'adds': 26, 'dels': 0} + 'str_replace': {'adds': 26, 'dels': 0, 'added_units': 76, 'removed_units': 0} }, # TODO: fix \No newline at the end of file 'C': { - 'str_append_chr': {'adds': 30, 'dels': 4}, - 'str_equals': {'adds': 0, 'dels': 1} + 'str_append_chr': {'adds': 30, 'dels': 4, 'added_units': 78, 'removed_units': 21}, + 'str_equals': {'adds': 0, 'dels': 1, 'added_units': 0, 'removed_units': 0} }, # Commit `B` is an example of imperfect diff, # it removes `str_append` and adds a new function `str_append_chr` @@ -56,15 +56,16 @@ async def test_analyzer_master_only(az): # diff doesn't separate these changes into two chunks # please see here: https://github.com/UltimateBeaver/test_feature_branch/commit/caaac10f604ea7ac759c2147df8fb2b588ee2a27 'B': { - 'str_append': {'adds': 6, 'dels': 3}, - 'str_append_chr': {'adds': 3, 'dels': 2}, - 'str_equals': {'adds': 11, 'dels': 0} + 'str_append': {'adds': 6, 'dels': 3, 'added_units': 29, 'removed_units': 21}, + 'str_append_chr': {'adds': 3, 'dels': 2, 'added_units': 21, 'removed_units': 15}, + 'str_equals': {'adds': 11, 'dels': 0, 'added_units': 27, 'removed_units': 0} }, 'A': { - 'str_append': {'adds': 7, 'dels': 0}, - 'str_len': {'adds': 6, 'dels': 0} + 'str_append': {'adds': 7, 'dels': 0, 'added_units': 29, 'removed_units': 0}, + 'str_len': {'adds': 6, 'dels': 0, 'added_units': 13, 'removed_units': 0} }, + # # branch J from commit A, merge back through F # 'J': { # 'count': {'adds': 12, 'dels': 0},