From 8d4665cb4f3c0c5b10b19b90b24911aa64ef59fe Mon Sep 17 00:00:00 2001 From: xinyan Date: Sat, 11 May 2019 15:39:10 +0800 Subject: [PATCH 01/39] Add lifecycle methods in IGraphServer. --- .../abstractions/callcommitgraph.py | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/persper/analytics2/abstractions/callcommitgraph.py b/persper/analytics2/abstractions/callcommitgraph.py index 551a82a0b60..1f07eed72b1 100644 --- a/persper/analytics2/abstractions/callcommitgraph.py +++ b/persper/analytics2/abstractions/callcommitgraph.py @@ -293,8 +293,31 @@ class ICallCommitGraph(IReadOnlyCallCommitGraph, IWriteOnlyCallCommitGraph): class IGraphServer(ABC): """ - Provides basic functionality to trigger the commit analysis on graph server. + Provides basic functionality to trigger the commit analysis on graph server. + remarks + The call sequence: + * start + * update_graph + * update_graph + * ... + * update_graph + * stop """ + @abstractmethod + def start(self) -> None: + """ + When implemented, starts the graph server and get ready for commit analysis, if applicable. + This includes starting the graph server process, preparing workspace folder, etc. + """ + pass + + @abstractmethod + def stop(self) -> None: + """ + When implemented, stops the graph server and do necessary cleanup, if applicable. + """ + pass + @abstractmethod def update_graph(self, commit: ICommitInfo) -> None: """ From 2fc33f73a33d48238aea103799cf98dcd258892e Mon Sep 17 00:00:00 2001 From: Yang Zhikai Date: Sun, 12 May 2019 16:10:06 +0000 Subject: [PATCH 02/39] Squashed commits: e308d117 Merge remote-tracking branch 'origin/develop' into memory-ccg 9c5486bd delete comment code 6d4b6d06 update _ensure_node_exists method 3c674261 update ccg update-node-files edae8ef4 Merge branch 'hurthwell-refactor' into 'develop' 8d4665cb Add lifecycle methods in IGraphServer. 
b70339ce update ccg reuse method to construct ccg 1ec3c5e2 pass memory-ccg a08dbabb Merge branch 'master' into memory-ccg d29d45ee update memory-ccg 04d7c094 modify memory-ccg update_node_history 1532bf1c fix bugs f59d4fc8 modify memory-ccg bugs 5f62ab17 fix some exists problems 07c80d1d add memory-ccg test case and modify node data struture a28e6365 update ccg-memory node iterator 9ca1008c update memory-ccg cdbeeafc update ccg' a935aee1 update ccg-abstraction 874238b9 update memoryccg 866c61a8 update memoryccg ad711f3d implement basic momory-ccg ed3195bd implement basic momory-ccg bb0b4dfb modify memory-ccg bugs 3756f5f4 fix some exists problems 30c86fbd add memory-ccg test case and modify node data struture cccd1e77 Merge branch 'master' into memory-ccg 7cab91ab update ccg-memory node iterator 40aa8323 update memory-ccg cce89ea3 update ccg' 5e7d7641 Merge branch 'hurthwell-refactor' into memory-ccg 6ab9fb22 update ccg-abstraction 6cc9ece0 Merge remote-tracking branch 'origin/hurthwell-refactor' into memory-ccg c1dded46 update memoryccg d996ee1f update memoryccg 717f68a4 implement basic momory-ccg 04f1a682 implement basic momory-ccg --- .../abstractions/callcommitgraph.py | 6 +- persper/analytics2/memorycallcommitgraph.py | 149 ++++++++++++++++++ test/analytics2/__init__.py | 0 test/analytics2/abstractions/__init__.py | 0 .../abstractions/callcommitgraph.py | 29 ++-- test/analytics2/callcommitgraph.py | 14 -- test/analytics2/test_callcommitgraph.py | 13 ++ 7 files changed, 182 insertions(+), 29 deletions(-) create mode 100644 persper/analytics2/memorycallcommitgraph.py create mode 100644 test/analytics2/__init__.py create mode 100644 test/analytics2/abstractions/__init__.py delete mode 100644 test/analytics2/callcommitgraph.py create mode 100644 test/analytics2/test_callcommitgraph.py diff --git a/persper/analytics2/abstractions/callcommitgraph.py b/persper/analytics2/abstractions/callcommitgraph.py index 1f07eed72b1..fb438d2ae20 100644 --- a/persper/analytics2/abstractions/callcommitgraph.py +++ b/persper/analytics2/abstractions/callcommitgraph.py @@ -12,6 +12,10 @@ class NodeId(NamedTuple): name: str language: str + def __eq__(self, other): + return self.name == other.name and self.language == other.language + + class NodeHistoryItem: """ @@ -247,7 +251,7 @@ def update_node_history(self, node_id: NodeId, commit_hexsha: str, added_lines: pass @abstractmethod - def update_node_files(self, node_id: NodeId, files: Iterable[str] = None) -> None: + def update_node_files(self, node_id: NodeId, commit_hexsha: str, files: Iterable[str] = None) -> None: """ Sets or replaces the list of files that contains this node in the latest commit. Note that this method will replace the whole file list of the specified node. 
diff --git a/persper/analytics2/memorycallcommitgraph.py b/persper/analytics2/memorycallcommitgraph.py new file mode 100644 index 00000000000..7c18d9ade4d --- /dev/null +++ b/persper/analytics2/memorycallcommitgraph.py @@ -0,0 +1,149 @@ +from persper.analytics2.abstractions.callcommitgraph import * +from persper.analytics2.abstractions.repository import * +import sys +import logging +from collections import defaultdict + + +class MemoryCallCommitGraph(ICallCommitGraph): + def __init__(self, graph_data: dict=None): + self._nodes_dict = {} + self._edges_dict = {} + self._commits = {} + self._from_edges = defaultdict(list) + self._to_edges = defaultdict(list) + if graph_data: + for i in graph_data["nodes"]: + nodeid = NodeId(i["id"]['name'], i["id"]['language']) + for commit_id, history in i['history'].items: + self.update_node_history(nodeid, commit_id, history['adds'], history['dels']) + files = [] + for file in i["files"]: + files.append(file) + self.update_node_files(nodeid, files) + for i in graph_data["edges"]: + from_id = NodeId(i['from_id']["name"], i['from_id']["language"]) + to_id = NodeId(i['to_id']["name"], i['to_id']["language"]) + self.add_edge(from_id, to_id, i["added_by"]) + + for i in graph_data["commits"]: + self.add_commit(i["hex_sha"], Commit(i["hex_sha"], i["author_email"], i['author_name'], + i['author_date'], i['committer_email'], + i['committer_name'], i['commit_date'], + i['message'], i['parent']) ) + + def _ensure_node_exists(self, node_id: NodeId, commit_hexsha: str) -> None: + if node_id not in self._nodes_dict: + self._nodes_dict[node_id] = Node(node_id, added_by=commit_hexsha) + assert self._nodes_dict[node_id].added_by + + def get_node(self, id: NodeId) -> Node: + return self._nodes_dict.get(id, None) + + def get_nodes_count(self, name: str = None, language: str = None, + from_id: NodeId = None, to_id: NodeId = None) -> int: + base_set = self._nodes_dict.values() + if name is None and language is None and from_id is None and to_id is None: + return len(base_set) + count = 0 + for node in base_set: + if name is not None and node.node_id.name != name: + continue + if language is not None and node.node_id.language != language: + continue + if from_id is not None and node in self._from_edges[from_id]: + continue + if to_id is not None and node in self._to_edges[to_id]: + continue + count += 1 + return count + + def get_edge(self, from_id: NodeId, to_id: NodeId) -> Edge: + return self._edges_dict[(from_id, to_id)] + + def get_edges_count(self, from_name: str = None, from_language: str = None, to_name: str = None, + to_language: str = None) -> int: + base_set = self._edges_dict.values() + if from_name is None and from_language is None and to_name is None and to_language is None: + return len(base_set) + count = 0 + for edge in base_set: + if from_name is not None and edge.from_id.name != from_name: + continue + if to_name is not None and edge.to_id.name != to_name: + continue + if from_language is not None and edge.from_id.language != from_language: + continue + if to_language is not None and edge.to_id.language != to_language: + continue + count += 1 + return count + + def enum_edges(self, from_name: str = None, from_language: str = None, to_name: str = None, to_language: str = None) -> Iterable[Edge]: + base_set = self._edges_dict.values() + for edge in base_set: + if from_name is not None and edge.from_id.name != from_name: + continue + if to_name is not None and edge.to_id.name != to_name: + continue + if from_language is not None and edge.from_id.language != 
from_language: + continue + if to_language is not None and edge.to_id.language != to_language: + continue + yield edge + + def enum_nodes(self, name: str = None, language: str = None, from_id: NodeId = None, to_id: NodeId = None) -> Iterable[Node]: + base_set = self._nodes_dict.values() + for node in base_set: + if name is not None and node.name != name: + continue + if language is not None and node.language != language: + continue + if from_id is not None and node in self._from_edges[from_id]: + continue + if to_id is not None and node in self._to_edges[to_id]: + continue + yield node + + def enum_commits(self) -> Iterable[Commit]: + for commit in self._commits.values(): + yield commit + + def add_node(self, id: NodeId, node: Node) -> None: + self._nodes_dict[id] = node + + def update_node_history(self, node_id: NodeId, commit_hexsha: str, + added_lines: int = 0, removed_lines: int = 0) -> None: + self._ensure_node_exists(node_id, commit_hexsha) + for historyitem in self._nodes_dict[node_id].history: + if historyitem.hexsha == commit_hexsha: + self._nodes_dict[node_id].history = [NodeHistoryItem(commit_hexsha, + added_lines, removed_lines)] + return + self._nodes_dict[node_id].history.append(NodeHistoryItem(commit_hexsha, + added_lines, removed_lines)) + + def update_node_files(self, node_id: NodeId, commit_hexsha: str, + files: Iterable[str] = None) -> None: + self._ensure_node_exists(node_id, commit_hexsha) + self._nodes_dict[node_id].files = files + + def add_edge(self, from_id: NodeId, to_id: NodeId, commit_hexsha: str) -> None: + edge = Edge(from_id, to_id, commit_hexsha) + self._edges_dict[(from_id, to_id)] = edge + self._from_edges[from_id].append(to_id) + self._to_edges[to_id].append(from_id) + + def flush(self) -> None: + pass + + def add_commit(self, hex_sha: str, author_email: str, author_name: str, author_date: str, + committer_email: str, committer_name: str, commit_date: str, message: str) -> None: + self._commits[hex_sha] = Commit(hex_sha, author_email, author_name, + author_date, committer_email, committer_name, commit_date, message) + + def get_commit(self, hex_sha: str) -> Commit: + return self._commits[hex_sha] + + def update_commit(self, commit: Commit) -> None: + self._commits[commit.hexsha] = commit diff --git a/test/analytics2/__init__.py b/test/analytics2/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/test/analytics2/abstractions/__init__.py b/test/analytics2/abstractions/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/test/analytics2/abstractions/callcommitgraph.py b/test/analytics2/abstractions/callcommitgraph.py index 7b8d4f9c2ab..bf1e51f86ac 100644 --- a/test/analytics2/abstractions/callcommitgraph.py +++ b/test/analytics2/abstractions/callcommitgraph.py @@ -31,7 +31,7 @@ def create_dummy_commit(message: str = None, parents: Iterable[str] = None): def test_call_commit_graph(ccg: ICallCommitGraph): - #assert ccg + # assert ccg # commits commit1 = create_dummy_commit() commit2 = create_dummy_commit() @@ -59,12 +59,13 @@ def test_call_commit_graph(ccg: ICallCommitGraph): cppFiles = ["MyClass.h", "MyClass.cpp"] csFiles = ["MyClass.cs"] javaFiles = ["MyClass.java"] - ccg.update_node_files(cppnode1, added_files=cppFiles) - ccg.update_node_files(cppnode2, added_files=cppFiles) - ccg.update_node_files(cppnode3, added_files=cppFiles) - ccg.update_node_files(csnode2, added_files=csFiles) - ccg.update_node_files(csnode3, added_files=csFiles) - ccg.update_node_files(javanode1, added_files=javaFiles) + 
ccg.update_node_files(cppnode1, commit1.hexsha, files=cppFiles) + ccg.update_node_files(cppnode2, commit1.hexsha, files=cppFiles) + ccg.update_node_files(cppnode3, commit1.hexsha, files=cppFiles) + ccg.update_node_files(csnode1, commit2.hexsha, files=csFiles) + ccg.update_node_files(csnode2, commit2.hexsha, files=csFiles) + ccg.update_node_files(csnode3, commit2.hexsha, files=csFiles) + ccg.update_node_files(javanode1, commit3.hexsha, files=javaFiles) ccg.update_node_history(cppnode1, commit1.hexsha, 10, 0) # 10 will be overwritten ccg.update_node_history(cppnode1, commit1.hexsha, 20, -10) @@ -72,6 +73,7 @@ def test_call_commit_graph(ccg: ICallCommitGraph): ccg.update_node_history(cppnode3, commit1.hexsha, 10, 0) ccg.update_node_history(csnode2, commit2.hexsha, 5, 0) ccg.update_node_history(csnode3, commit2.hexsha, 4, 0) + ccg.update_node_history(csnode1, commit2.hexsha, 4, 0) ccg.add_edge(cppnode2, cppnode1, commit1.hexsha) ccg.add_edge(cppnode3, cppnode1, commit1.hexsha) # csnode1 is implicitly added @@ -79,7 +81,6 @@ def test_call_commit_graph(ccg: ICallCommitGraph): ccg.add_edge(csnode2, csnode1, commit1.hexsha) ccg.add_edge(csnode3, csnode2, commit1.hexsha) ccg.flush() - assert ccg.get_nodes_count() == 7 assert ccg.get_nodes_count(name=csnode2.name) == 2 assert ccg.get_nodes_count(name=csnode2.name, language=csnode2.language) == 1 @@ -91,7 +92,7 @@ def test_call_commit_graph(ccg: ICallCommitGraph): assert ccg.get_nodes_count(language="non_existent") == 0 assert ccg.get_edges_count() == 5 assert ccg.get_edges_count(from_language="cs") == 3 - assert ccg.get_edges_count(to_language="cpp") == 4 + assert ccg.get_edges_count(to_language="cpp") == 3 assert ccg.get_edges_count(from_language="cs", to_language="cpp") == 1 assert ccg.get_edges_count(to_name=cppnode1.name) == 3 @@ -101,15 +102,15 @@ def assertNode(node_id, added_by, files): assert node.node_id == node_id assert node.added_by == added_by assert set(node.files) == set(files) + return node - assert ccg.get_node(NodeId("non_existent", "cpp")) == None - assert ccg.get_node(NodeId(cppnode1.name, "non_existent")) == None + assert ccg.get_node(NodeId("non_existent", "cpp")) is None + assert ccg.get_node(NodeId(cppnode1.name, "non_existent")) is None assertNode(cppnode1, added_by=commit1.hexsha, files=cppFiles) assertNode(cppnode2, added_by=commit1.hexsha, files=cppFiles) assertNode(cppnode3, added_by=commit1.hexsha, files=cppFiles) - assertNode(csnode1, added_by=commit3.hexsha, files=csFiles) + assertNode(csnode1, added_by=commit2.hexsha, files=csFiles) assertNode(csnode2, added_by=commit2.hexsha, files=csFiles) assertNode(csnode3, added_by=commit2.hexsha, files=csFiles) - # javanode1 is not connected nor has node history, so it shouldn't have added_by. 
- assertNode(javanode1, added_by=None, files=javaFiles) + assertNode(javanode1, added_by=commit3.hexsha, files=javaFiles) diff --git a/test/analytics2/callcommitgraph.py b/test/analytics2/callcommitgraph.py deleted file mode 100644 index dab597a1b19..00000000000 --- a/test/analytics2/callcommitgraph.py +++ /dev/null @@ -1,14 +0,0 @@ -import os.path -import subprocess -import test.analytics2.abstractions.callcommitgraph as ccghelper - -# TODO import your call commit graph implementation(s) -# from persper.analytics2.callcommitgraph import InMemoryCallCommitGraph -from persper.util.path import root_path - - -def test_in_memory_call_commit_graph(): - ccg = None - # TODO create an instance for testing - #ccg = InMemoryCallCommitGraph() - ccghelper.test_call_commit_graph(ccg) diff --git a/test/analytics2/test_callcommitgraph.py b/test/analytics2/test_callcommitgraph.py new file mode 100644 index 00000000000..575ea9e187a --- /dev/null +++ b/test/analytics2/test_callcommitgraph.py @@ -0,0 +1,13 @@ +import subprocess +import pytest +from ..analytics2.abstractions import callcommitgraph as ccghelper + +# TODO import your call commit graph implementation(s) +from persper.analytics2.memorycallcommitgraph import MemoryCallCommitGraph +from persper.util.path import root_path + + +def test_memory_call_commit_graph(): + ccg = MemoryCallCommitGraph() + ccghelper.test_call_commit_graph(ccg) + From d16a6b0f41148b85482b9acbf4fa1565a52dcb6d Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 8 May 2019 23:43:18 +0800 Subject: [PATCH 03/39] Rename IRepositoryHistoryProvider -> ICommitRepository. --- persper/analytics2/abstractions/repository.py | 2 +- persper/analytics2/repository.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/persper/analytics2/abstractions/repository.py b/persper/analytics2/abstractions/repository.py index 91437f5d048..28f4599a2a6 100644 --- a/persper/analytics2/abstractions/repository.py +++ b/persper/analytics2/abstractions/repository.py @@ -208,7 +208,7 @@ def __repr__(self): type(self).__name__, self.old_file, self.new_file, str(self.operation)) -class IRepositoryHistoryProvider(ABC): +class ICommitRepository(ABC): """ Provides functionality for accessing commit history of a specified commit. """ diff --git a/persper/analytics2/repository.py b/persper/analytics2/repository.py index 44820376424..a66b6e1a4d9 100644 --- a/persper/analytics2/repository.py +++ b/persper/analytics2/repository.py @@ -6,16 +6,18 @@ from git import Blob, Commit, Diff, DiffIndex, Repo -from persper.analytics2.abstractions.repository import ( - FileDiffOperation, ICommitInfo, IFileDiff, IFileInfo, - IRepositoryHistoryProvider, IWorkspaceFileFilter) +from persper.analytics2.abstractions.repository import (FileDiffOperation, + ICommitInfo, + ICommitRepository, + IFileDiff, IFileInfo, + IWorkspaceFileFilter) _logger = logging.getLogger(__name__) EMPTY_TREE_SHA = '4b825dc642cb6eb9a060e54bf8d69288fbee4904' -class GitRepository(IRepositoryHistoryProvider): +class GitRepository(ICommitRepository): def __init__(self, repo_path: str, first_parent_only: bool = False): """ params From 618f2671f0d89736932f2c987221a7e61a342421 Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 8 May 2019 23:49:51 +0800 Subject: [PATCH 04/39] Add skeleton for MetaAnalyzer. 
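Intended wiring, as a rough sketch (the two analyzer objects below are placeholders for ICommitAnalyzer / IPostAnalyzer implementations, and analyze() is still a stub in this commit):

    from persper.analytics2.metaanalyzer import MetaAnalyzer
    from persper.analytics2.repository import GitRepository

    repo = GitRepository("/path/to/repo")            # an ICommitRepository implementation
    ma = MetaAnalyzer(repo,
                      commit_analyzers=[my_commit_analyzer],
                      post_analyzers=[my_post_analyzer],
                      origin_commit=None, terminal_commit="HEAD")
    ma.analyze(max_commits=100)                      # intended to walk origin..terminal, at most 100 commits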
--- persper/analytics2/metaanalyzer.py | 52 ++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 persper/analytics2/metaanalyzer.py diff --git a/persper/analytics2/metaanalyzer.py b/persper/analytics2/metaanalyzer.py new file mode 100644 index 00000000000..ea2dac265d1 --- /dev/null +++ b/persper/analytics2/metaanalyzer.py @@ -0,0 +1,52 @@ +from typing import Iterable + +from persper.analytics2.abstractions.analyzers import ICommitAnalyzer, IPostAnalyzer +from persper.analytics2.abstractions.repository import ICommitRepository + + +class MetaAnalyzer(): + """ + Coordinates `ICommitAnalyzer` and `IPostAnalyzer` implementation, doing analysis through the commit history. + """ + def __init__(self, history_provider: ICommitRepository, + commit_analyzers: Iterable[ICommitAnalyzer], post_analyzers: Iterable[IPostAnalyzer], + origin_commit: str = None, terminal_commit: str = "HEAD", + first_parent_only: bool = False, + analyzed_commits: Iterable[str] = None): + if not isinstance(history_provider, ICommitRepository): + raise ValueError("Expect ICommitRepository instance for history_provider.") + # do necessary defensive copies + self._history_provider = history_provider + self._commit_analyzers = list(commit_analyzers) + self._post_analyzers = list(post_analyzers) + self._origin_commit = origin_commit + self._terminal_commit = terminal_commit + self._first_parent_only = first_parent_only + self._analyzed_commits = set(analyzed_commits) if analyzed_commits else set() + + @property + def origin_commit(self): + return self._origin_commit + + @origin_commit.setter + def origin_commit(self, value: str): + self._origin_commit = value + + @property + def terminal_commit(self): + return self._terminal_commit + + @terminal_commit.setter + def terminal_commit(self, value: str): + self._terminal_commit = value + + @property + def first_parent_only(self): + return self._first_parent_only + + @first_parent_only.setter + def first_parent_only(self, value: bool): + self._first_parent_only = value + + def analyze(self, max_commits: int = 100): + pass From ba46358d734babde2ef822b4e09902f34b312c0c Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 8 May 2019 23:54:10 +0800 Subject: [PATCH 05/39] Remove MetaAnalyzer.first_parent_only because we actually cannot control this. --- persper/analytics2/metaanalyzer.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/persper/analytics2/metaanalyzer.py b/persper/analytics2/metaanalyzer.py index ea2dac265d1..e61bc9a3614 100644 --- a/persper/analytics2/metaanalyzer.py +++ b/persper/analytics2/metaanalyzer.py @@ -7,11 +7,13 @@ class MetaAnalyzer(): """ Coordinates `ICommitAnalyzer` and `IPostAnalyzer` implementation, doing analysis through the commit history. + params + origin_commit, terminal_commit: See `ICommitRepository.enum_commits` for details. 
""" + def __init__(self, history_provider: ICommitRepository, commit_analyzers: Iterable[ICommitAnalyzer], post_analyzers: Iterable[IPostAnalyzer], origin_commit: str = None, terminal_commit: str = "HEAD", - first_parent_only: bool = False, analyzed_commits: Iterable[str] = None): if not isinstance(history_provider, ICommitRepository): raise ValueError("Expect ICommitRepository instance for history_provider.") @@ -21,7 +23,6 @@ def __init__(self, history_provider: ICommitRepository, self._post_analyzers = list(post_analyzers) self._origin_commit = origin_commit self._terminal_commit = terminal_commit - self._first_parent_only = first_parent_only self._analyzed_commits = set(analyzed_commits) if analyzed_commits else set() @property @@ -40,13 +41,5 @@ def terminal_commit(self): def terminal_commit(self, value: str): self._terminal_commit = value - @property - def first_parent_only(self): - return self._first_parent_only - - @first_parent_only.setter - def first_parent_only(self, value: bool): - self._first_parent_only = value - def analyze(self, max_commits: int = 100): pass From 88340026cf1e9bdafa44fda1ce6369ba7025a446 Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 8 May 2019 23:55:33 +0800 Subject: [PATCH 06/39] Add pep8 config for analytics2 module. Extend max-line-length. --- persper/analytics2/setup.cfg | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 persper/analytics2/setup.cfg diff --git a/persper/analytics2/setup.cfg b/persper/analytics2/setup.cfg new file mode 100644 index 00000000000..68859ad034c --- /dev/null +++ b/persper/analytics2/setup.cfg @@ -0,0 +1,2 @@ +[pep8] +max-line-length = 120 From 1788e8ef41775be5d807878af4ad0debd3e9a54e Mon Sep 17 00:00:00 2001 From: xinyan Date: Fri, 10 May 2019 00:32:44 +0800 Subject: [PATCH 07/39] Implement MetaAnalyzer.analyze. --- persper/analytics2/metaanalyzer.py | 97 +++++++++++++++++++++++++++--- 1 file changed, 90 insertions(+), 7 deletions(-) diff --git a/persper/analytics2/metaanalyzer.py b/persper/analytics2/metaanalyzer.py index e61bc9a3614..75f99d2c9a1 100644 --- a/persper/analytics2/metaanalyzer.py +++ b/persper/analytics2/metaanalyzer.py @@ -1,7 +1,16 @@ +import logging +import re +import traceback from typing import Iterable -from persper.analytics2.abstractions.analyzers import ICommitAnalyzer, IPostAnalyzer -from persper.analytics2.abstractions.repository import ICommitRepository +from persper.analytics2.abstractions.analyzers import ( + AnalysisStatus, CommitAnalysisStopReason, ICommitAnalyzer, IPostAnalyzer) +from persper.analytics2.abstractions.repository import (ICommitInfo, + ICommitRepository, + repr_hexsha) + +_logger = logging.getLogger(__file__) +_whitespace_re = re.compile(r"\s+") class MetaAnalyzer(): @@ -11,14 +20,14 @@ class MetaAnalyzer(): origin_commit, terminal_commit: See `ICommitRepository.enum_commits` for details. 
""" - def __init__(self, history_provider: ICommitRepository, + def __init__(self, repository: ICommitRepository, commit_analyzers: Iterable[ICommitAnalyzer], post_analyzers: Iterable[IPostAnalyzer], origin_commit: str = None, terminal_commit: str = "HEAD", analyzed_commits: Iterable[str] = None): - if not isinstance(history_provider, ICommitRepository): - raise ValueError("Expect ICommitRepository instance for history_provider.") + if not isinstance(repository, ICommitRepository): + raise ValueError("Expect ICommitRepository instance for repository.") # do necessary defensive copies - self._history_provider = history_provider + self._repository = repository self._commit_analyzers = list(commit_analyzers) self._post_analyzers = list(post_analyzers) self._origin_commit = origin_commit @@ -42,4 +51,78 @@ def terminal_commit(self, value: str): self._terminal_commit = value def analyze(self, max_commits: int = 100): - pass + _logger.info("Start analyzing: %s..%s, max_commits=%d .", + self._origin_commit, self._terminal_commit, max_commits) + analyzedCommits = [] + currentSkippedCommits = 0 + currentSkippedFirstCommit = None + currentSkippedLastCommit = None + stopReason = CommitAnalysisStopReason.ReachedTerminalCommit + lastCommitRef = None + # XXX determine whether we need to add this into AnalysisStatus + failedAnalyzer = None + failedAnalyzerException = None + for commit in self._repository.enum_commits(self._origin_commit, self._terminal_commit): + assert isinstance(commit, ICommitInfo) + if len(analyzedCommits) >= max_commits: + _logger.info("Max analyzed commits reached.") + stopReason = CommitAnalysisStopReason.ReachedMaximumCommits + break + lastCommitRef = commit.hexsha + # Skip commits we have already analyzed previously + if lastCommitRef in self._analyzed_commits: + currentSkippedLastCommit = lastCommitRef + if currentSkippedFirstCommit == None: + currentSkippedCommits = 0 + currentSkippedFirstCommit = currentSkippedLastCommit + currentSkippedCommits += 1 + continue + if currentSkippedFirstCommit != None: + _logger.info("Skipped %s analyzed commits: %s..%s .", + currentSkippedCommits, currentSkippedFirstCommit, currentSkippedLastCommit) + currentSkippedFirstCommit = None + # Analyze commit + if _logger.getEffectiveLevel <= logging.INFO: + briefMessage = commit.message + trimmed = len(briefMessage) > 50 + briefMessage = re.sub(_whitespace_re, " ", briefMessage[:60])[:47] + if trimmed: + briefMessage += "..." 
+ _logger.info("Analyzing commit [%s]: %s", repr_hexsha(lastCommitRef), briefMessage) + analyzer = None + analyzerIndex = 0 + try: + for analyzer in self._commit_analyzers: + assert isinstance(analyzer, ICommitAnalyzer) + _logger.debug("Analyzing with [%d]: %s .", analyzerIndex, analyzer) + analyzer.analyze(commit) + analyzerIndex += 1 + if _logger.getEffectiveLevel <= logging.DEBUG: + _logger.debug("Finished analyzing commit [%s].", repr_hexsha(lastCommitRef)) + except Exception as ex: + _logger.error("Failed to analyze commit [%s] with analyzer [%d][%s].\n%s", + lastCommitRef, analyzerIndex, analyzer, ex) + failedAnalyzer = analyzer + failedAnalyzerException = ex + stopReason = CommitAnalysisStopReason.FatalError + break + analyzedCommits.append(lastCommitRef) + self._analyzed_commits.add(lastCommitRef) + # Post analysis + if self._post_analyzers: + analyzer = None + analyzerIndex = 0 + status = AnalysisStatus(stop_reason=stopReason, exception=failedAnalyzerException, + origin_commit_ref=self._origin_commit, terminal_commit_ref=self._terminal_commit, + analyzed_commits_ref=analyzedCommits, last_commit_ref=lastCommitRef) + try: + for analyzer in self._post_analyzers: + assert isinstance(analyzer, IPostAnalyzer) + _logger.debug("Post-analyzing with [%d]: %s .", analyzerIndex, analyzer) + analyzer.analyze(status) + analyzerIndex += 1 + except Exception as ex: + _logger.error("Failed during post-analysis with analyzer [%d][%s].\n%s", + analyzerIndex, analyzer, ex) + # We can do nothing about it. Crash the caller. + raise From 9fa881c6d9479f5fb97688ad8c90a8ddf7b0e95b Mon Sep 17 00:00:00 2001 From: xinyan Date: Fri, 10 May 2019 00:40:05 +0800 Subject: [PATCH 08/39] Add profiling log for MetaAnalyzer. --- persper/analytics2/metaanalyzer.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/persper/analytics2/metaanalyzer.py b/persper/analytics2/metaanalyzer.py index 75f99d2c9a1..3b86183ee66 100644 --- a/persper/analytics2/metaanalyzer.py +++ b/persper/analytics2/metaanalyzer.py @@ -1,6 +1,7 @@ import logging import re import traceback +from time import monotonic from typing import Iterable from persper.analytics2.abstractions.analyzers import ( @@ -62,7 +63,13 @@ def analyze(self, max_commits: int = 100): # XXX determine whether we need to add this into AnalysisStatus failedAnalyzer = None failedAnalyzerException = None + t0 = monotonic() + tAnalyzer = None + analyzerEllapsedTime = 0 for commit in self._repository.enum_commits(self._origin_commit, self._terminal_commit): + if tAnalyzer is not None: + analyzerEllapsedTime += monotonic() - tAnalyzer + tAnalyzer = monotonic() assert isinstance(commit, ICommitInfo) if len(analyzedCommits) >= max_commits: _logger.info("Max analyzed commits reached.") @@ -108,7 +115,13 @@ def analyze(self, max_commits: int = 100): break analyzedCommits.append(lastCommitRef) self._analyzed_commits.add(lastCommitRef) + if tAnalyzer is not None: + analyzerEllapsedTime += monotonic() - tAnalyzer + _logger.info("Analyzed %d commits in %.2fs, analyzer exclusive %.2fs.", + len(analyzedCommits), monotonic() - t0, analyzerEllapsedTime) # Post analysis + t0 = monotonic() + _logger.info("Start post-analyzing: %s..%s .", self._origin_commit, self._terminal_commit) if self._post_analyzers: analyzer = None analyzerIndex = 0 @@ -121,6 +134,7 @@ def analyze(self, max_commits: int = 100): _logger.debug("Post-analyzing with [%d]: %s .", analyzerIndex, analyzer) analyzer.analyze(status) analyzerIndex += 1 + _logger.info("Finished post-analyzing in %.2fs.", monotonic() - 
t0) except Exception as ex: _logger.error("Failed during post-analysis with analyzer [%d][%s].\n%s", analyzerIndex, analyzer, ex) From f884d2abb4b7d42245fb590e4d7f1868df9bbee4 Mon Sep 17 00:00:00 2001 From: xinyan Date: Fri, 10 May 2019 00:49:11 +0800 Subject: [PATCH 09/39] Put traceback rather than exception message in log. --- persper/analytics2/metaanalyzer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/persper/analytics2/metaanalyzer.py b/persper/analytics2/metaanalyzer.py index 3b86183ee66..0fd434a24e2 100644 --- a/persper/analytics2/metaanalyzer.py +++ b/persper/analytics2/metaanalyzer.py @@ -108,7 +108,7 @@ def analyze(self, max_commits: int = 100): _logger.debug("Finished analyzing commit [%s].", repr_hexsha(lastCommitRef)) except Exception as ex: _logger.error("Failed to analyze commit [%s] with analyzer [%d][%s].\n%s", - lastCommitRef, analyzerIndex, analyzer, ex) + lastCommitRef, analyzerIndex, analyzer, traceback.format_exc()) failedAnalyzer = analyzer failedAnalyzerException = ex stopReason = CommitAnalysisStopReason.FatalError @@ -137,6 +137,6 @@ def analyze(self, max_commits: int = 100): _logger.info("Finished post-analyzing in %.2fs.", monotonic() - t0) except Exception as ex: _logger.error("Failed during post-analysis with analyzer [%d][%s].\n%s", - analyzerIndex, analyzer, ex) + analyzerIndex, analyzer, traceback.format_exc()) # We can do nothing about it. Crash the caller. raise From 344c9fef316dcebacbb0d47cc53d315486d38c6d Mon Sep 17 00:00:00 2001 From: xinyan Date: Sat, 11 May 2019 15:58:21 +0800 Subject: [PATCH 10/39] Implement CallCommitGraphAnalyzer. --- persper/analytics2/devrank.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/persper/analytics2/devrank.py b/persper/analytics2/devrank.py index fff0e602ea3..646bbbd31be 100644 --- a/persper/analytics2/devrank.py +++ b/persper/analytics2/devrank.py @@ -1,23 +1,35 @@ +import logging +from time import monotonic from typing import Iterable from persper.analytics2.abstractions.analyzers import (AnalysisStatus, ICommitAnalyzer, IPostAnalyzer) -from persper.analytics2.abstractions.callcommitgraph import (IGraphServer, - IReadOnlyCallCommitGraph, - IWriteOnlyCallCommitGraph) +from persper.analytics2.abstractions.callcommitgraph import ( + IGraphServer, IReadOnlyCallCommitGraph, IWriteOnlyCallCommitGraph) from persper.analytics2.abstractions.repository import ICommitInfo +_logger = logging.getLogger(__file__) + class CallCommitGraphAnalyzer(ICommitAnalyzer): def __init__(self, graph_servers: Iterable[IGraphServer], call_commit_graph: IWriteOnlyCallCommitGraph): assert graph_servers assert call_commit_graph self._graph_servers = list(graph_servers) + # We only need this for flushing. + # We actually can flush the graph at a later stage. self._call_commit_graph = call_commit_graph def analyze(self, commit: ICommitInfo): - raise NotImplementedError() + for gs in self._graph_servers: + t0 = monotonic() + _logger.info("Analyzing %s with %s...", commit, gs) + assert isinstance(gs, IGraphServer) + _logger.info("%s finished in %.2fs.", gs, monotonic() - t0) + t0 = monotonic() + self._call_commit_graph.flush() + _logger.info("Call commit graph flush used %.2fs.", monotonic() - t0) class DevRankAnalyzer(IPostAnalyzer): From 1ad6d86341c4b4c56efd9ee9975e249c9eb767b9 Mon Sep 17 00:00:00 2001 From: xinyan Date: Sat, 11 May 2019 16:35:32 +0800 Subject: [PATCH 11/39] Add unit test for MetaAnalyzer. Fixed bugs in MetaAnalyzer to get it works. 
Let MetaAnalyzer.analyze return analysis status. --- persper/analytics2/metaanalyzer.py | 42 ++++++------- test/analytics2/abstractions/repository.py | 9 ++- test/analytics2/metaanalyzer.py | 71 ++++++++++++++++++++++ 3 files changed, 98 insertions(+), 24 deletions(-) create mode 100644 test/analytics2/metaanalyzer.py diff --git a/persper/analytics2/metaanalyzer.py b/persper/analytics2/metaanalyzer.py index 0fd434a24e2..3343e922103 100644 --- a/persper/analytics2/metaanalyzer.py +++ b/persper/analytics2/metaanalyzer.py @@ -51,7 +51,7 @@ def terminal_commit(self): def terminal_commit(self, value: str): self._terminal_commit = value - def analyze(self, max_commits: int = 100): + def analyze(self, max_commits: int = 100) -> AnalysisStatus: _logger.info("Start analyzing: %s..%s, max_commits=%d .", self._origin_commit, self._terminal_commit, max_commits) analyzedCommits = [] @@ -89,7 +89,7 @@ def analyze(self, max_commits: int = 100): currentSkippedCommits, currentSkippedFirstCommit, currentSkippedLastCommit) currentSkippedFirstCommit = None # Analyze commit - if _logger.getEffectiveLevel <= logging.INFO: + if _logger.getEffectiveLevel() <= logging.INFO: briefMessage = commit.message trimmed = len(briefMessage) > 50 briefMessage = re.sub(_whitespace_re, " ", briefMessage[:60])[:47] @@ -104,7 +104,7 @@ def analyze(self, max_commits: int = 100): _logger.debug("Analyzing with [%d]: %s .", analyzerIndex, analyzer) analyzer.analyze(commit) analyzerIndex += 1 - if _logger.getEffectiveLevel <= logging.DEBUG: + if _logger.getEffectiveLevel() <= logging.DEBUG: _logger.debug("Finished analyzing commit [%s].", repr_hexsha(lastCommitRef)) except Exception as ex: _logger.error("Failed to analyze commit [%s] with analyzer [%d][%s].\n%s", @@ -120,23 +120,23 @@ def analyze(self, max_commits: int = 100): _logger.info("Analyzed %d commits in %.2fs, analyzer exclusive %.2fs.", len(analyzedCommits), monotonic() - t0, analyzerEllapsedTime) # Post analysis + status = AnalysisStatus(stop_reason=stopReason, exception=failedAnalyzerException, + origin_commit_ref=self._origin_commit, terminal_commit_ref=self._terminal_commit, + analyzed_commits_ref=analyzedCommits, last_commit_ref=lastCommitRef) t0 = monotonic() _logger.info("Start post-analyzing: %s..%s .", self._origin_commit, self._terminal_commit) - if self._post_analyzers: - analyzer = None - analyzerIndex = 0 - status = AnalysisStatus(stop_reason=stopReason, exception=failedAnalyzerException, - origin_commit_ref=self._origin_commit, terminal_commit_ref=self._terminal_commit, - analyzed_commits_ref=analyzedCommits, last_commit_ref=lastCommitRef) - try: - for analyzer in self._post_analyzers: - assert isinstance(analyzer, IPostAnalyzer) - _logger.debug("Post-analyzing with [%d]: %s .", analyzerIndex, analyzer) - analyzer.analyze(status) - analyzerIndex += 1 - _logger.info("Finished post-analyzing in %.2fs.", monotonic() - t0) - except Exception as ex: - _logger.error("Failed during post-analysis with analyzer [%d][%s].\n%s", - analyzerIndex, analyzer, traceback.format_exc()) - # We can do nothing about it. Crash the caller. 
- raise + analyzer = None + analyzerIndex = 0 + try: + for analyzer in self._post_analyzers: + assert isinstance(analyzer, IPostAnalyzer) + _logger.debug("Post-analyzing with [%d]: %s .", analyzerIndex, analyzer) + analyzer.analyze(status) + analyzerIndex += 1 + _logger.info("Finished post-analyzing in %.2fs.", monotonic() - t0) + except Exception as ex: + _logger.error("Failed during post-analysis with analyzer [%d][%s].\n%s", + analyzerIndex, analyzer, traceback.format_exc()) + # We can do nothing about it. Crash the caller. + raise + return status diff --git a/test/analytics2/abstractions/repository.py b/test/analytics2/abstractions/repository.py index 85905d5e331..a676dc3e2e6 100644 --- a/test/analytics2/abstractions/repository.py +++ b/test/analytics2/abstractions/repository.py @@ -3,13 +3,15 @@ from itertools import islice from random import randint -from persper.analytics2.abstractions.repository import ( - FileDiffOperation, ICommitInfo, IFileDiff, IRepositoryHistoryProvider) +from persper.analytics2.abstractions.repository import (FileDiffOperation, + ICommitInfo, + ICommitRepository, + IFileDiff) _logger = logging.getLogger(__file__) -def test_repository_history_provider(rhp: IRepositoryHistoryProvider): +def test_repository_history_provider(rhp: ICommitRepository): assert rhp # We enumerate from the beginning commits = list(islice(rhp.enum_commits(None, "HEAD"), 1000)) @@ -22,6 +24,7 @@ def test_repository_history_provider(rhp: IRepositoryHistoryProvider): seenCommits = set() for c in commits: assert isinstance(c, ICommitInfo) + assert isinstance(c.hexsha, str) # We should see every commit only once assert c.hexsha not in seenCommits seenCommits.add(c.hexsha) diff --git a/test/analytics2/metaanalyzer.py b/test/analytics2/metaanalyzer.py new file mode 100644 index 00000000000..cd36f0314b6 --- /dev/null +++ b/test/analytics2/metaanalyzer.py @@ -0,0 +1,71 @@ +import logging +from itertools import islice +from test.analytics2.repository import prepare_repository + +from persper.analytics2.abstractions.analyzers import ( + AnalysisStatus, CommitAnalysisStopReason, ICommitAnalyzer, IPostAnalyzer) +from persper.analytics2.abstractions.repository import ICommitInfo +from persper.analytics2.metaanalyzer import MetaAnalyzer +from persper.analytics2.repository import GitRepository + +_logger = logging.getLogger(__file__) + + +class DummyCommitAnalyzer(ICommitAnalyzer): + def __init__(self, raiseExceptionAtIndex=-1): + self.analyzedCommits = [] + self._raiseExceptionAtIndex = raiseExceptionAtIndex + + def analyze(self, commit: ICommitInfo) -> None: + assert commit + index = self.analyzedCommits + print("Current commit #{0}, hexsha {1}", index, commit.hexsha) + if index == self._raiseExceptionAtIndex: + raise Exception("Raised exception at commit #{0}.".format(index)) + self.analyzedCommits.append(commit.hexsha) + + +class DummyPostAnalyzer(IPostAnalyzer): + def __init__(self): + self.status = None + + def analyze(self, status: AnalysisStatus) -> None: + self.status = status + + +def test_meta_analyzer(): + repoPath = prepare_repository("test_feature_branch") + repo = GitRepository(repoPath) + ca = DummyCommitAnalyzer() + pa = DummyPostAnalyzer() + ma = MetaAnalyzer(repo, [ca], [pa], origin_commit=None, terminal_commit="HEAD", analyzed_commits=()) + status = ma.analyze(100) + assert status == pa.status + + commits = [c.hexsha for c in islice(repo.enum_commits(None, "HEAD"), 101)] + if len(commits) <= 100: + assert pa.status.stop_reason == CommitAnalysisStopReason.ReachedTerminalCommit + 
else: + assert pa.status.stop_reason == CommitAnalysisStopReason.ReachedMaximumCommits + commits = commits[:100] + assert ca.analyzedCommits == commits + assert status.analyzed_commits_ref == commits + assert status.origin_commit_ref == None + assert status.terminal_commit_ref == "HEAD" + assert status.last_commit_ref == commits[-1] + assert status.exception == None + + if len(commits) < 2: + _logger.warning("Skipped exception test because it needs repository have at least 2 commits.") + exceptionIndex = len(commits)//2 + ca = DummyCommitAnalyzer(raiseExceptionAtIndex=exceptionIndex) + pa = DummyPostAnalyzer() + ma = MetaAnalyzer(repo, [ca], [pa], origin_commit=None, terminal_commit="HEAD", analyzed_commits=()) + status = ma.analyze(100) + assert status == pa.status + assert status.stop_reason == CommitAnalysisStopReason.FatalError + assert isinstance(status.exception, Exception) + assert status.analyzed_commits_ref == commits[:exceptionIndex] + assert status.origin_commit_ref == None + assert status.terminal_commit_ref == "HEAD" + assert status.last_commit_ref == commits[exceptionIndex] From 26eee25abca20221f9ac5c9df9efcba4c72dca23 Mon Sep 17 00:00:00 2001 From: xinyan Date: Tue, 14 May 2019 21:39:45 +0800 Subject: [PATCH 12/39] Add missing update_graph call in CallCommitGraphAnalyzer. --- persper/analytics2/devrank.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/persper/analytics2/devrank.py b/persper/analytics2/devrank.py index 646bbbd31be..bb7b93f67ac 100644 --- a/persper/analytics2/devrank.py +++ b/persper/analytics2/devrank.py @@ -22,10 +22,12 @@ def __init__(self, graph_servers: Iterable[IGraphServer], call_commit_graph: IWr self._call_commit_graph = call_commit_graph def analyze(self, commit: ICommitInfo): + assert commit for gs in self._graph_servers: t0 = monotonic() _logger.info("Analyzing %s with %s...", commit, gs) assert isinstance(gs, IGraphServer) + gs.update_graph(commit) _logger.info("%s finished in %.2fs.", gs, monotonic() - t0) t0 = monotonic() self._call_commit_graph.flush() @@ -38,4 +40,5 @@ def __init__(self, call_commit_graph: IReadOnlyCallCommitGraph): self._call_commit_graph = call_commit_graph def analyze(self, status: AnalysisStatus): + # TODO put analysis code here. pass From 93d726fa0586e058b24f07123dc70f184bee4547 Mon Sep 17 00:00:00 2001 From: xinyan Date: Tue, 14 May 2019 21:48:36 +0800 Subject: [PATCH 13/39] Add docs for analyzer invocation order in MetaAnalyzer. --- persper/analytics2/metaanalyzer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/persper/analytics2/metaanalyzer.py b/persper/analytics2/metaanalyzer.py index 3343e922103..5230bf79ae2 100644 --- a/persper/analytics2/metaanalyzer.py +++ b/persper/analytics2/metaanalyzer.py @@ -17,14 +17,18 @@ class MetaAnalyzer(): """ Coordinates `ICommitAnalyzer` and `IPostAnalyzer` implementation, doing analysis through the commit history. - params - origin_commit, terminal_commit: See `ICommitRepository.enum_commits` for details. """ def __init__(self, repository: ICommitRepository, commit_analyzers: Iterable[ICommitAnalyzer], post_analyzers: Iterable[IPostAnalyzer], origin_commit: str = None, terminal_commit: str = "HEAD", analyzed_commits: Iterable[str] = None): + """ + params + commit_analyzers: a list of commit analyzers. They will be invoked sequentially in each commit. + post_analyzers: a list of post analyzers. They will be invoked sequentially after the analysis ends successfully or in fault. 
+ origin_commit, terminal_commit: see `ICommitRepository.enum_commits` for details. + """ if not isinstance(repository, ICommitRepository): raise ValueError("Expect ICommitRepository instance for repository.") # do necessary defensive copies From 97d218aa9c9bf2501ed7103e7e6411e12806d0b1 Mon Sep 17 00:00:00 2001 From: xinyan Date: Mon, 13 May 2019 22:10:56 +0800 Subject: [PATCH 14/39] Use explicit member imports instead of wildcard imports. --- persper/analytics2/memorycallcommitgraph.py | 34 +++++++++++++-------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/persper/analytics2/memorycallcommitgraph.py b/persper/analytics2/memorycallcommitgraph.py index 7c18d9ade4d..023bfad6664 100644 --- a/persper/analytics2/memorycallcommitgraph.py +++ b/persper/analytics2/memorycallcommitgraph.py @@ -1,12 +1,18 @@ -from persper.analytics2.abstractions.callcommitgraph import * -from persper.analytics2.abstractions.repository import * -import sys import logging +import sys from collections import defaultdict +from persper.analytics2.abstractions.callcommitgraph import (Commit, Edge, + ICallCommitGraph, + Node, + NodeHistoryItem, + NodeId) +from persper.analytics2.abstractions.repository import ( + ICommitInfo, IRepositoryHistoryProvider) +from typing import Iterable class MemoryCallCommitGraph(ICallCommitGraph): - def __init__(self, graph_data: dict=None): + def __init__(self, graph_data: dict = None): self._nodes_dict = {} self._edges_dict = {} self._commits = {} @@ -16,21 +22,23 @@ def __init__(self, graph_data: dict=None): for i in graph_data["nodes"]: nodeid = NodeId(i["id"]['name'], i["id"]['language']) for commit_id, history in i['history'].items: - self.update_node_history(nodeid, commit_id, history['adds'], history['dels']) + self.update_node_history( + nodeid, commit_id, history['adds'], history['dels']) files = [] for file in i["files"]: files.append(file) self.update_node_files(nodeid, files) for i in graph_data["edges"]: - from_id = NodeId(i['from_id']["name"], i['from_id']["language"]) + from_id = NodeId(i['from_id']["name"], + i['from_id']["language"]) to_id = NodeId(i['to_id']["name"], i['to_id']["language"]) self.add_edge(from_id, to_id, i["added_by"]) for i in graph_data["commits"]: self.add_commit(i["hex_sha"], Commit(i["hex_sha"], i["author_email"], i['author_name'], - i['author_date'], i['committer_email'], - i['committer_name'], i['commit_date'], - i['message'], i['parent']) ) + i['author_date'], i['committer_email'], + i['committer_name'], i['commit_date'], + i['message'], i['parent'])) def _ensure_node_exists(self, node_id: NodeId, commit_hexsha: str) -> None: if node_id not in self._nodes_dict: @@ -118,13 +126,13 @@ def update_node_history(self, node_id: NodeId, commit_hexsha: str, for historyitem in self._nodes_dict[node_id].history: if historyitem.hexsha == commit_hexsha: self._nodes_dict[node_id].history = [NodeHistoryItem(commit_hexsha, - added_lines, removed_lines)] + added_lines, removed_lines)] return self._nodes_dict[node_id].history.append(NodeHistoryItem(commit_hexsha, - added_lines, removed_lines)) + added_lines, removed_lines)) def update_node_files(self, node_id: NodeId, commit_hexsha: str, - files: Iterable[str] = None) -> None: + files: Iterable[str] = None) -> None: self._ensure_node_exists(node_id, commit_hexsha) self._nodes_dict[node_id].files = files @@ -140,7 +148,7 @@ def flush(self) -> None: def add_commit(self, hex_sha: str, author_email: str, author_name: str, author_date: str, committer_email: str, committer_name: str, commit_date: str, 
message: str) -> None: self._commits[hex_sha] = Commit(hex_sha, author_email, author_name, - author_date, committer_email, committer_name, commit_date, message) + author_date, committer_email, committer_name, commit_date, message) def get_commit(self, hex_sha: str) -> Commit: return self._commits[hex_sha] From e37c18db6bc9dfaf0a4fced43ad788f37754ffd6 Mon Sep 17 00:00:00 2001 From: xinyan Date: Mon, 13 May 2019 23:31:00 +0800 Subject: [PATCH 15/39] Refactor memory ccg deserialization logic. --- persper/analytics2/memorycallcommitgraph.py | 105 +++++++++++++------- 1 file changed, 68 insertions(+), 37 deletions(-) diff --git a/persper/analytics2/memorycallcommitgraph.py b/persper/analytics2/memorycallcommitgraph.py index 023bfad6664..158e92800f8 100644 --- a/persper/analytics2/memorycallcommitgraph.py +++ b/persper/analytics2/memorycallcommitgraph.py @@ -1,44 +1,78 @@ +import json import logging import sys from collections import defaultdict +from typing import Iterable, TextIO from persper.analytics2.abstractions.callcommitgraph import (Commit, Edge, ICallCommitGraph, Node, NodeHistoryItem, NodeId) -from persper.analytics2.abstractions.repository import ( - ICommitInfo, IRepositoryHistoryProvider) -from typing import Iterable +from persper.analytics2.abstractions.repository import (ICommitInfo, + ICommitRepository) + + +def serialize_node_id(d: NodeId) -> dict: + return {"name": d.name, "language": d.language} + + +def deserialize_node_id(d: dict) -> NodeId: + return NodeId(d["name"], d["language"]) + + +def deserialize_node_history_item(d: dict) -> NodeHistoryItem: + return NodeHistoryItem(hexsha=d["hexsha"], added_lines=d["added_lines"], removed_lines=d["removed_lines"]) + + +def deserialize_node(d: dict) -> NodeId: + return Node(node_id=deserialize_node_id(d["id"]), added_by=d["added_by"], + history=[deserialize_node_history_item(i) for i in d["history"]], + files=list(d["files"])) + + +def deserialize_edge(d: dict) -> Edge: + return Edge(from_id=deserialize_node_id(d["from_id"]), to_id=d["to_id"], added_by=d["added_by"]) + + +def deserialize_commit(d: dict) -> Commit: + return Commit(d["hex_sha"], + d["author_email"], d['author_name'], d['author_date'], + d['committer_email'], d['committer_name'], d['commit_date'], + d['message'], d['parents']) + class MemoryCallCommitGraph(ICallCommitGraph): - def __init__(self, graph_data: dict = None): + def __init__(self): self._nodes_dict = {} self._edges_dict = {} self._commits = {} self._from_edges = defaultdict(list) self._to_edges = defaultdict(list) - if graph_data: - for i in graph_data["nodes"]: - nodeid = NodeId(i["id"]['name'], i["id"]['language']) - for commit_id, history in i['history'].items: - self.update_node_history( - nodeid, commit_id, history['adds'], history['dels']) - files = [] - for file in i["files"]: - files.append(file) - self.update_node_files(nodeid, files) - for i in graph_data["edges"]: - from_id = NodeId(i['from_id']["name"], - i['from_id']["language"]) - to_id = NodeId(i['to_id']["name"], i['to_id']["language"]) - self.add_edge(from_id, to_id, i["added_by"]) - - for i in graph_data["commits"]: - self.add_commit(i["hex_sha"], Commit(i["hex_sha"], i["author_email"], i['author_name'], - i['author_date'], i['committer_email'], - i['committer_name'], i['commit_date'], - i['message'], i['parent'])) + + @staticmethod + def deserialize_dict(graph_data: dict) -> "MemoryCallCommitGraph": + graph = MemoryCallCommitGraph() + for nd in graph_data["nodes"]: + node = deserialize_node(nd) + graph._add_node_direct(node) + for ed in 
graph_data["edges"]: + edge = deserialize_edge(ed) + graph._add_edge_direct(edge) + for cd in graph_data["commits"]: + commit = deserialize_commit(cd) + graph.update_commit(commit) + return graph + + @staticmethod + def load_from(fp: TextIO) -> "MemoryCallCommitGraph": + d = json.load(fp) + return MemoryCallCommitGraph.deserialize_dict(d) + + @staticmethod + def load(json_content: str) -> "MemoryCallCommitGraph": + d = json.loads(json_content) + return MemoryCallCommitGraph.deserialize_dict(d) def _ensure_node_exists(self, node_id: NodeId, commit_hexsha: str) -> None: if node_id not in self._nodes_dict: @@ -117,8 +151,8 @@ def enum_commits(self) -> Iterable[Commit]: for commit in self._commits.values(): yield commit - def add_node(self, id: NodeId, node: Node) -> None: - self._nodes_dict[id] = node + def _add_node_direct(self, node: Node) -> None: + self._nodes_dict[node.id] = node def update_node_history(self, node_id: NodeId, commit_hexsha: str, added_lines: int = 0, removed_lines: int = 0) -> None: @@ -137,21 +171,18 @@ def update_node_files(self, node_id: NodeId, commit_hexsha: str, self._nodes_dict[node_id].files = files def add_edge(self, from_id: NodeId, to_id: NodeId, commit_hexsha: str) -> None: - edge = Edge(from_id, to_id, commit_hexsha) - self._edges_dict[(from_id, to_id)] = edge - self._from_edges[from_id].append(to_id) - self._to_edges[to_id].append(from_id) + self._add_edge_direct(Edge(from_id, to_id, commit_hexsha)) + + def _add_edge_direct(self, edge: Edge) -> None: + self._edges_dict[(edge.from_id, edge.to_id)] = edge + self._from_edges[edge.from_id].append(edge.to_id) + self._to_edges[edge.to_id].append(edge.from_id) def flush(self) -> None: pass - def add_commit(self, hex_sha: str, author_email: str, author_name: str, author_date: str, - committer_email: str, committer_name: str, commit_date: str, message: str) -> None: - self._commits[hex_sha] = Commit(hex_sha, author_email, author_name, - author_date, committer_email, committer_name, commit_date, message) - def get_commit(self, hex_sha: str) -> Commit: - return self._commits[hex_sha] + return self._commits.get(hex_sha, None) def update_commit(self, commit: Commit) -> None: self._commits[commit.hexsha] = commit From 368eadc1bd24e434c5834b2297d06dcae22173b0 Mon Sep 17 00:00:00 2001 From: xinyan Date: Tue, 14 May 2019 22:57:24 +0800 Subject: [PATCH 16/39] Add serialization support in test_memory_call_commit_graph. Add test for serialization/deserialization. Bug fix in MemoryCallCommitGraph to make it pass the test. 
--- persper/analytics2/memorycallcommitgraph.py | 58 ++++++++++++++++--- .../abstractions/callcommitgraph.py | 17 ++++++ test/analytics2/test_callcommitgraph.py | 8 ++- 3 files changed, 72 insertions(+), 11 deletions(-) diff --git a/persper/analytics2/memorycallcommitgraph.py b/persper/analytics2/memorycallcommitgraph.py index 158e92800f8..17824688110 100644 --- a/persper/analytics2/memorycallcommitgraph.py +++ b/persper/analytics2/memorycallcommitgraph.py @@ -2,8 +2,11 @@ import logging import sys from collections import defaultdict +from datetime import datetime from typing import Iterable, TextIO +import pytz + from persper.analytics2.abstractions.callcommitgraph import (Commit, Edge, ICallCommitGraph, Node, @@ -13,12 +16,34 @@ ICommitRepository) -def serialize_node_id(d: NodeId) -> dict: - return {"name": d.name, "language": d.language} +def serialize_node_id(o: NodeId) -> tuple: + return (o.name, o.language) + + +def serialize_node_history_item(o: NodeHistoryItem) -> dict: + return {"hexsha": o.hexsha, "added_lines": o.added_lines, "removed_lines": o.removed_lines} + + +def serialize_node(o: Node) -> dict: + return {"id": o.node_id, "added_by": o.added_by, + "history": [serialize_node_history_item(h) for h in o.history], + "files": list(o.files)} + + +def serialize_edge(o: Edge) -> dict: + return {"from_id": serialize_node_id(o.from_id), "to_id": serialize_node_id(o.to_id), + "added_by": o.added_by} -def deserialize_node_id(d: dict) -> NodeId: - return NodeId(d["name"], d["language"]) +def serialize_commit(o: Commit) -> dict: + return {"hex_sha": o.hexsha, + "author_email": o.author_email, "author_name": o.author_name, "authored_time": str(o.authored_time), + "committer_email": o.committer_email, "committer_name": o.committer_name, "committed_time": str(o.committed_time), + "message": o.message, "parents": o.parents} + + +def deserialize_node_id(t: tuple) -> NodeId: + return NodeId(t[0], t[1]) def deserialize_node_history_item(d: dict) -> NodeHistoryItem: @@ -32,13 +57,13 @@ def deserialize_node(d: dict) -> NodeId: def deserialize_edge(d: dict) -> Edge: - return Edge(from_id=deserialize_node_id(d["from_id"]), to_id=d["to_id"], added_by=d["added_by"]) + return Edge(from_id=deserialize_node_id(d["from_id"]), to_id=deserialize_node_id(d["to_id"]), added_by=d["added_by"]) def deserialize_commit(d: dict) -> Commit: return Commit(d["hex_sha"], - d["author_email"], d['author_name'], d['author_date'], - d['committer_email'], d['committer_name'], d['commit_date'], + d["author_email"], d['author_name'], datetime.fromisoformat(d['authored_time']), + d['committer_email'], d['committer_name'], datetime.fromisoformat(d['committed_time']), d['message'], d['parents']) @@ -70,10 +95,24 @@ def load_from(fp: TextIO) -> "MemoryCallCommitGraph": return MemoryCallCommitGraph.deserialize_dict(d) @staticmethod - def load(json_content: str) -> "MemoryCallCommitGraph": + def deserialize(json_content: str) -> "MemoryCallCommitGraph": d = json.loads(json_content) return MemoryCallCommitGraph.deserialize_dict(d) + def serialize_dict(self) -> dict: + return { + "nodes": [serialize_node(n) for n in self._nodes_dict.values()], + "edges": [serialize_edge(n) for n in self._edges_dict.values()], + "commits": [serialize_commit(n) for n in self._commits.values()], + } + + def save_to(self, fp: TextIO): + d = self.serialize_dict() + json.dump(d, fp) + + def serialize(self) -> str: + return json.dumps(self.serialize_dict()) + def _ensure_node_exists(self, node_id: NodeId, commit_hexsha: str) -> None: if node_id not in 
self._nodes_dict: self._nodes_dict[node_id] = Node(node_id, added_by=commit_hexsha) @@ -152,7 +191,7 @@ def enum_commits(self) -> Iterable[Commit]: yield commit def _add_node_direct(self, node: Node) -> None: - self._nodes_dict[node.id] = node + self._nodes_dict[node.node_id] = node def update_node_history(self, node_id: NodeId, commit_hexsha: str, added_lines: int = 0, removed_lines: int = 0) -> None: @@ -174,6 +213,7 @@ def add_edge(self, from_id: NodeId, to_id: NodeId, commit_hexsha: str) -> None: self._add_edge_direct(Edge(from_id, to_id, commit_hexsha)) def _add_edge_direct(self, edge: Edge) -> None: + self._edges_dict[(edge.from_id, edge.to_id)] = edge self._from_edges[edge.from_id].append(edge.to_id) self._to_edges[edge.to_id].append(edge.from_id) diff --git a/test/analytics2/abstractions/callcommitgraph.py b/test/analytics2/abstractions/callcommitgraph.py index bf1e51f86ac..a538adb39f5 100644 --- a/test/analytics2/abstractions/callcommitgraph.py +++ b/test/analytics2/abstractions/callcommitgraph.py @@ -5,6 +5,7 @@ from persper.analytics2.abstractions.callcommitgraph import (Commit, Edge, ICallCommitGraph, + IReadOnlyCallCommitGraph, Node, NodeId) @@ -114,3 +115,19 @@ def assertNode(node_id, added_by, files): assertNode(csnode2, added_by=commit2.hexsha, files=csFiles) assertNode(csnode3, added_by=commit2.hexsha, files=csFiles) assertNode(javanode1, added_by=commit3.hexsha, files=javaFiles) + + +def assert_graph_same(expected: IReadOnlyCallCommitGraph, actual: IReadOnlyCallCommitGraph, compare_hexsha: bool = True): + for n1 in expected.enum_nodes(): + n2 = actual.get_node(n1.node_id) + assert n2, "Node missing: {0}".format(n1.node_id) + assert n1.node_id == n2.node_id + if compare_hexsha: + assert n1.added_by == n2.added_by + else: + c1 = expected.get_commit(n1.added_by) + c2 = actual.get_commit(n2.added_by) + assert c1 + assert c2 + assert c1.message == c2.message + # TODO add more assertions diff --git a/test/analytics2/test_callcommitgraph.py b/test/analytics2/test_callcommitgraph.py index 575ea9e187a..91c6d0ef638 100644 --- a/test/analytics2/test_callcommitgraph.py +++ b/test/analytics2/test_callcommitgraph.py @@ -1,6 +1,6 @@ import subprocess import pytest -from ..analytics2.abstractions import callcommitgraph as ccghelper +import test.analytics2.abstractions.callcommitgraph as ccghelper # TODO import your call commit graph implementation(s) from persper.analytics2.memorycallcommitgraph import MemoryCallCommitGraph @@ -10,4 +10,8 @@ def test_memory_call_commit_graph(): ccg = MemoryCallCommitGraph() ccghelper.test_call_commit_graph(ccg) - + serialized = ccg.serialize() + print("Serialized:", serialized) + assert isinstance(serialized, str) + ccg2 = MemoryCallCommitGraph.deserialize(serialized) + ccghelper.assert_graph_same(ccg, ccg2) From da50ed180b58ea50e82cf065c69a153e5bade58d Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 15 May 2019 23:17:03 +0800 Subject: [PATCH 17/39] Add more tests in assert_graph_same. 
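assert_graph_same can now compare commits either strictly by hexsha (the default) or loosely by commit message, e.g. for graphs rebuilt from repositories whose hashes differ. Rough usage sketch (the two graph variables are assumed to be populated elsewhere):

    from test.analytics2.abstractions.callcommitgraph import (
        assert_graph_same, commit_assertion_by_comment)

    assert_graph_same(expected_ccg, actual_ccg)      # default: commit_assertion_by_hexsha
    assert_graph_same(expected_ccg, actual_ccg,
                      commit_assertion=commit_assertion_by_comment)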
--- .../abstractions/callcommitgraph.py | 78 +++++++++++++++---- 1 file changed, 64 insertions(+), 14 deletions(-) diff --git a/test/analytics2/abstractions/callcommitgraph.py b/test/analytics2/abstractions/callcommitgraph.py index a538adb39f5..22c16e4a75b 100644 --- a/test/analytics2/abstractions/callcommitgraph.py +++ b/test/analytics2/abstractions/callcommitgraph.py @@ -3,10 +3,9 @@ from random import randint from typing import Iterable -from persper.analytics2.abstractions.callcommitgraph import (Commit, Edge, - ICallCommitGraph, - IReadOnlyCallCommitGraph, - Node, NodeId) +from persper.analytics2.abstractions.callcommitgraph import ( + Commit, Edge, ICallCommitGraph, IReadOnlyCallCommitGraph, Node, + NodeHistoryItem, NodeId) def commit_equals(x: Commit, y: Commit): @@ -117,17 +116,68 @@ def assertNode(node_id, added_by, files): assertNode(javanode1, added_by=commit3.hexsha, files=javaFiles) -def assert_graph_same(expected: IReadOnlyCallCommitGraph, actual: IReadOnlyCallCommitGraph, compare_hexsha: bool = True): +def commit_assertion_skip(expectedGraph, actualGraph, expectedHexsha, actualHexsha): + pass + + +def commit_assertion_by_hexsha(expectedGraph, actualGraph, expectedHexsha, actualHexsha): + assert expectedHexsha == actualHexsha, "Commits are not the same by hexsha." + + +def commit_assertion_by_comment(expectedGraph, actualGraph, expectedHexsha, actualHexsha): + c1 = expectedGraph.get_commit(expectedHexsha) + c2 = actualGraph.get_commit(actualHexsha) + assert c1, "Expected-side of commit is missing." + assert c2, "Actual-side of commit is missing." + assert c1.message == c2.message, "Commits are not the same by commit message." + + +def assert_graph_same(expected: IReadOnlyCallCommitGraph, actual: IReadOnlyCallCommitGraph, + commit_assertion=commit_assertion_by_hexsha): + def assertCommitEqual(expectedHexsha, actualHexsha): + return commit_assertion(expected, actual, expectedHexsha, actualHexsha) for n1 in expected.enum_nodes(): n2 = actual.get_node(n1.node_id) assert n2, "Node missing: {0}".format(n1.node_id) assert n1.node_id == n2.node_id - if compare_hexsha: - assert n1.added_by == n2.added_by - else: - c1 = expected.get_commit(n1.added_by) - c2 = actual.get_commit(n2.added_by) - assert c1 - assert c2 - assert c1.message == c2.message - # TODO add more assertions + assertCommitEqual(n1.added_by, n2.added_by) + keyExtractor = None + if commit_assertion == commit_assertion_by_hexsha: + # Make autopep8 happy. 
+ def f(h): + return h.hexsha + keyExtractor = f + elif commit_assertion == commit_assertion_by_comment: + def f(h): + return h.message + keyExtractor = f + if keyExtractor: + d1 = dict((keyExtractor, h) for h in n1.history) + d2 = dict((keyExtractor, h) for h in n2.history) + for k, h1 in d1: + h2 = d2.get(k, None) + assert isinstance(h1, NodeHistoryItem) + assert h2, "Commit history {0} missing for node {1}.".format(h1, n1.node_id) + assert isinstance(h2, NodeHistoryItem) + assert h1.added_lines == h2.added_lines, "In commit: {0}".format(h1) + assert h1.removed_lines == h2.removed_lines, "In commit: {0}".format(h1) + if len(d1) < len(d2): + # there are extra node history + for k, h2 in d2: + h1 = d1.get(k, None) + assert h2, "Extra commit history {0} for node {1}.".format(h1, n1.node_id) + assert set(n1.files) == set(n2.files) + if expected.get_nodes_count() < actual.get_nodes_count(): + # there are extra nodes + for n2 in actual.enum_nodes(): + n1 = expected.get_node(n2.node_id) + assert n1, "Extra node: {0}".format(n2.node_id) + for b1 in expected.enum_edges(): + b2 = actual.get_edge(b1.from_id, b1.to_id) + assert b2, "Edge missing: {0} -> {1}".format(b1.from_id, b1.to_id) + assertCommitEqual(b1.added_by, b2.added_by) + if expected.get_edges_count() < actual.get_edges_count(): + # there are extra edges + for n2 in actual.enum_edges(): + n1 = expected.get_edge(n2.from_id, n2.to_id) + assert n1, "Extra edge: {0} -> {1}".format(b2.from_id, b2.to_id) From 52b6b4bc71a58fe31e253cf2f668c08e36833cab Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 15 May 2019 23:26:04 +0800 Subject: [PATCH 18/39] Reorganize test_analytics2. Make file names compatible with pytest. --- test/analytics2/abstractions/__init__.py | 0 .../__init__.py | 0 .../helpers/__init__.py} | 0 .../helpers}/callcommitgraph.py | 0 .../helpers}/repository.py | 0 test/{analytics2 => test_analytics2}/setup.cfg | 0 .../test_callcommitgraph.py | 2 +- .../test_metaanalyzer.py} | 2 +- .../test_repository.py} | 2 +- test/test_analytics2/utilities.py | 18 ++++++++++++++++++ 10 files changed, 21 insertions(+), 3 deletions(-) delete mode 100644 test/analytics2/abstractions/__init__.py rename test/{analytics2 => test_analytics2}/__init__.py (100%) rename test/{analytics2/utilities.py => test_analytics2/helpers/__init__.py} (100%) rename test/{analytics2/abstractions => test_analytics2/helpers}/callcommitgraph.py (100%) rename test/{analytics2/abstractions => test_analytics2/helpers}/repository.py (100%) rename test/{analytics2 => test_analytics2}/setup.cfg (100%) rename test/{analytics2 => test_analytics2}/test_callcommitgraph.py (89%) rename test/{analytics2/metaanalyzer.py => test_analytics2/test_metaanalyzer.py} (98%) rename test/{analytics2/repository.py => test_analytics2/test_repository.py} (93%) create mode 100644 test/test_analytics2/utilities.py diff --git a/test/analytics2/abstractions/__init__.py b/test/analytics2/abstractions/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/test/analytics2/__init__.py b/test/test_analytics2/__init__.py similarity index 100% rename from test/analytics2/__init__.py rename to test/test_analytics2/__init__.py diff --git a/test/analytics2/utilities.py b/test/test_analytics2/helpers/__init__.py similarity index 100% rename from test/analytics2/utilities.py rename to test/test_analytics2/helpers/__init__.py diff --git a/test/analytics2/abstractions/callcommitgraph.py b/test/test_analytics2/helpers/callcommitgraph.py similarity index 100% rename from 
test/analytics2/abstractions/callcommitgraph.py rename to test/test_analytics2/helpers/callcommitgraph.py diff --git a/test/analytics2/abstractions/repository.py b/test/test_analytics2/helpers/repository.py similarity index 100% rename from test/analytics2/abstractions/repository.py rename to test/test_analytics2/helpers/repository.py diff --git a/test/analytics2/setup.cfg b/test/test_analytics2/setup.cfg similarity index 100% rename from test/analytics2/setup.cfg rename to test/test_analytics2/setup.cfg diff --git a/test/analytics2/test_callcommitgraph.py b/test/test_analytics2/test_callcommitgraph.py similarity index 89% rename from test/analytics2/test_callcommitgraph.py rename to test/test_analytics2/test_callcommitgraph.py index 91c6d0ef638..07a9bc3cc67 100644 --- a/test/analytics2/test_callcommitgraph.py +++ b/test/test_analytics2/test_callcommitgraph.py @@ -1,6 +1,6 @@ import subprocess import pytest -import test.analytics2.abstractions.callcommitgraph as ccghelper +import test.analytics2.helpers.callcommitgraph as ccghelper # TODO import your call commit graph implementation(s) from persper.analytics2.memorycallcommitgraph import MemoryCallCommitGraph diff --git a/test/analytics2/metaanalyzer.py b/test/test_analytics2/test_metaanalyzer.py similarity index 98% rename from test/analytics2/metaanalyzer.py rename to test/test_analytics2/test_metaanalyzer.py index cd36f0314b6..7d72caa5a53 100644 --- a/test/analytics2/metaanalyzer.py +++ b/test/test_analytics2/test_metaanalyzer.py @@ -1,6 +1,6 @@ import logging from itertools import islice -from test.analytics2.repository import prepare_repository +from test.analytics2.utilities import prepare_repository from persper.analytics2.abstractions.analyzers import ( AnalysisStatus, CommitAnalysisStopReason, ICommitAnalyzer, IPostAnalyzer) diff --git a/test/analytics2/repository.py b/test/test_analytics2/test_repository.py similarity index 93% rename from test/analytics2/repository.py rename to test/test_analytics2/test_repository.py index 1eac74d43db..de785ecb346 100644 --- a/test/analytics2/repository.py +++ b/test/test_analytics2/test_repository.py @@ -1,6 +1,6 @@ import os.path import subprocess -import test.analytics2.abstractions.repository as repositoryhelper +import test.analytics2.helpers.repository as repositoryhelper from persper.analytics2.repository import GitRepository from persper.util.path import root_path diff --git a/test/test_analytics2/utilities.py b/test/test_analytics2/utilities.py new file mode 100644 index 00000000000..e2a71d525c4 --- /dev/null +++ b/test/test_analytics2/utilities.py @@ -0,0 +1,18 @@ +import os.path +import subprocess +import test.analytics2.helpers.repository as repositoryhelper + +from persper.analytics2.repository import GitRepository +from persper.util.path import root_path + + +def prepare_repository(repo_name: str): + # build the repo first if not exists yet + repo_path = os.path.join(root_path, 'repos/' + repo_name) + script_path = os.path.join(root_path, 'tools/repo_creater/create_repo.py') + test_src_path = os.path.join(root_path, 'test/' + repo_name) + if not os.path.isdir(repo_path): + cmd = '{} {}'.format(script_path, test_src_path) + subprocess.call(cmd, shell=True) + print("Repository path: ", repo_path) + return repo_path From a70445938f1f057c239268305fc7f177a061f68a Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 15 May 2019 23:37:44 +0800 Subject: [PATCH 19/39] Fix module imports in test_analytics2 accordingly. 
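For reference, a sketch of how the relocated helper is intended to be consumed once the imports below are fixed. It mirrors `test_callcommitgraph.py`; only the explicit choice of commit-comparison strategy is added here for illustration.

```python
import test.test_analytics2.helpers.callcommitgraph as ccghelper
from persper.analytics2.memorycallcommitgraph import MemoryCallCommitGraph


def test_round_trip_equivalence():
    ccg = MemoryCallCommitGraph()
    ccghelper.test_call_commit_graph(ccg)         # populate via the shared scenario
    ccg2 = MemoryCallCommitGraph.deserialize(ccg.serialize())
    # Hexshas survive a serialize/deserialize round trip, so the strict comparison
    # applies; commit_assertion_by_comment relaxes the check to commit messages,
    # e.g. for graphs built from separately re-created test repositories.
    ccghelper.assert_graph_same(ccg, ccg2,
                                commit_assertion=ccghelper.commit_assertion_by_hexsha)
```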
--- test/test_analytics2/test_callcommitgraph.py | 2 +- test/test_analytics2/test_metaanalyzer.py | 2 +- test/test_analytics2/test_repository.py | 2 +- test/test_analytics2/utilities.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_analytics2/test_callcommitgraph.py b/test/test_analytics2/test_callcommitgraph.py index 07a9bc3cc67..b2a459f4cc3 100644 --- a/test/test_analytics2/test_callcommitgraph.py +++ b/test/test_analytics2/test_callcommitgraph.py @@ -1,6 +1,6 @@ import subprocess import pytest -import test.analytics2.helpers.callcommitgraph as ccghelper +import test.test_analytics2.helpers.callcommitgraph as ccghelper # TODO import your call commit graph implementation(s) from persper.analytics2.memorycallcommitgraph import MemoryCallCommitGraph diff --git a/test/test_analytics2/test_metaanalyzer.py b/test/test_analytics2/test_metaanalyzer.py index 7d72caa5a53..6aaa61df0f5 100644 --- a/test/test_analytics2/test_metaanalyzer.py +++ b/test/test_analytics2/test_metaanalyzer.py @@ -1,6 +1,6 @@ import logging from itertools import islice -from test.analytics2.utilities import prepare_repository +from test.test_analytics2.utilities import prepare_repository from persper.analytics2.abstractions.analyzers import ( AnalysisStatus, CommitAnalysisStopReason, ICommitAnalyzer, IPostAnalyzer) diff --git a/test/test_analytics2/test_repository.py b/test/test_analytics2/test_repository.py index de785ecb346..89404b90b57 100644 --- a/test/test_analytics2/test_repository.py +++ b/test/test_analytics2/test_repository.py @@ -1,6 +1,6 @@ import os.path import subprocess -import test.analytics2.helpers.repository as repositoryhelper +import test.test_analytics2.helpers.repository as repositoryhelper from persper.analytics2.repository import GitRepository from persper.util.path import root_path diff --git a/test/test_analytics2/utilities.py b/test/test_analytics2/utilities.py index e2a71d525c4..a0f9044b201 100644 --- a/test/test_analytics2/utilities.py +++ b/test/test_analytics2/utilities.py @@ -1,6 +1,6 @@ import os.path import subprocess -import test.analytics2.helpers.repository as repositoryhelper +import test.test_analytics2.helpers.repository as repositoryhelper from persper.analytics2.repository import GitRepository from persper.util.path import root_path From a21716000df8d117d668adc0779659850b9625f1 Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 15 May 2019 23:40:33 +0800 Subject: [PATCH 20/39] Add docs. --- persper/analytics2/memorycallcommitgraph.py | 24 +++++++++++++++++++ .../helpers/callcommitgraph.py | 6 +++++ 2 files changed, 30 insertions(+) diff --git a/persper/analytics2/memorycallcommitgraph.py b/persper/analytics2/memorycallcommitgraph.py index 17824688110..8415cf18340 100644 --- a/persper/analytics2/memorycallcommitgraph.py +++ b/persper/analytics2/memorycallcommitgraph.py @@ -15,6 +15,8 @@ from persper.analytics2.abstractions.repository import (ICommitInfo, ICommitRepository) +# Helper methods for data model <--> dict serialization. + def serialize_node_id(o: NodeId) -> tuple: return (o.name, o.language) @@ -68,6 +70,10 @@ def deserialize_commit(d: dict) -> Commit: class MemoryCallCommitGraph(ICallCommitGraph): + """ + Represents a call commit graph stored in-memory. + """ + def __init__(self): self._nodes_dict = {} self._edges_dict = {} @@ -77,6 +83,9 @@ def __init__(self): @staticmethod def deserialize_dict(graph_data: dict) -> "MemoryCallCommitGraph": + """ + Deserializes a MemoryCallCommitGraph from dict generated by `serialize_dict` method. 
+ """ graph = MemoryCallCommitGraph() for nd in graph_data["nodes"]: node = deserialize_node(nd) @@ -91,15 +100,24 @@ def deserialize_dict(graph_data: dict) -> "MemoryCallCommitGraph": @staticmethod def load_from(fp: TextIO) -> "MemoryCallCommitGraph": + """ + Deserializes a MemoryCallCommitGraph from the specified text IO containing JSON. + """ d = json.load(fp) return MemoryCallCommitGraph.deserialize_dict(d) @staticmethod def deserialize(json_content: str) -> "MemoryCallCommitGraph": + """ + Deserializes a MemoryCallCommitGraph from the specified JSON string. + """ d = json.loads(json_content) return MemoryCallCommitGraph.deserialize_dict(d) def serialize_dict(self) -> dict: + """ + Serializes the call commit graph contained in the current instance into a simple dict. + """ return { "nodes": [serialize_node(n) for n in self._nodes_dict.values()], "edges": [serialize_edge(n) for n in self._edges_dict.values()], @@ -107,10 +125,16 @@ def serialize_dict(self) -> dict: } def save_to(self, fp: TextIO): + """ + Serializes the call commit graph contained in the current instance into the specified text IO as JSON. + """ d = self.serialize_dict() json.dump(d, fp) def serialize(self) -> str: + """ + Serializes the call commit graph contained in the current instance into JSON string. + """ return json.dumps(self.serialize_dict()) def _ensure_node_exists(self, node_id: NodeId, commit_hexsha: str) -> None: diff --git a/test/test_analytics2/helpers/callcommitgraph.py b/test/test_analytics2/helpers/callcommitgraph.py index 22c16e4a75b..9b35f70ea49 100644 --- a/test/test_analytics2/helpers/callcommitgraph.py +++ b/test/test_analytics2/helpers/callcommitgraph.py @@ -134,6 +134,12 @@ def commit_assertion_by_comment(expectedGraph, actualGraph, expectedHexsha, actu def assert_graph_same(expected: IReadOnlyCallCommitGraph, actual: IReadOnlyCallCommitGraph, commit_assertion=commit_assertion_by_hexsha): + """ + Asserts two `IReadOnlyCallCommitGraph` instances contain the equivalent content. + params + commit_assertion: Specifies how to treat two commits as equivalent. You need to choose between + `commit_assertion_skip`, `commit_assertion_by_hexsha`, and `commit_assertion_by_comment`. + """ def assertCommitEqual(expectedHexsha, actualHexsha): return commit_assertion(expected, actual, expectedHexsha, actualHexsha) for n1 in expected.enum_nodes(): From bb09cc6a5c82a642b03cf03db320a9fa0a265082 Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 15 May 2019 23:52:38 +0800 Subject: [PATCH 21/39] Add MetaAnalyzer.load/save_dict methods. --- persper/analytics2/metaanalyzer.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/persper/analytics2/metaanalyzer.py b/persper/analytics2/metaanalyzer.py index 5230bf79ae2..8026a7d5b39 100644 --- a/persper/analytics2/metaanalyzer.py +++ b/persper/analytics2/metaanalyzer.py @@ -28,6 +28,10 @@ def __init__(self, repository: ICommitRepository, commit_analyzers: a list of commit analyzers. They will be invoked sequentially in each commit. post_analyzers: a list of post analyzers. They will be invoked sequentially after the analysis ends successfully or in fault. origin_commit, terminal_commit: see `ICommitRepository.enum_commits` for details. + remarks + You may use `load_state_dict` to load `origin_commit`, `terminal_commit`, and `analyzed_commits` from + dict after instantiating this class. Still you need to inject required services (indicated by required + parameters) so that you can instantiate this class. 
""" if not isinstance(repository, ICommitRepository): raise ValueError("Expect ICommitRepository instance for repository.") @@ -39,6 +43,22 @@ def __init__(self, repository: ICommitRepository, self._terminal_commit = terminal_commit self._analyzed_commits = set(analyzed_commits) if analyzed_commits else set() + def save_state_dict(self) -> dict: + """ + Save the current state into a dict with simple values. + """ + return {"origin_commit": self._origin_commit, + "terminal_commit": self._terminal_commit, + "analyzed_commits": list(self._analyzed_commits)} + + def load_state_dict(self, d: dict): + """ + Load the current state from a dict with simple values. + """ + self.origin_commit = d["origin_commit"] + self.terminal_commit = d["terminal_commit"] + self.analyzed_commits = set(d["analyzed_commits"]) + @property def origin_commit(self): return self._origin_commit @@ -125,8 +145,8 @@ def analyze(self, max_commits: int = 100) -> AnalysisStatus: len(analyzedCommits), monotonic() - t0, analyzerEllapsedTime) # Post analysis status = AnalysisStatus(stop_reason=stopReason, exception=failedAnalyzerException, - origin_commit_ref=self._origin_commit, terminal_commit_ref=self._terminal_commit, - analyzed_commits_ref=analyzedCommits, last_commit_ref=lastCommitRef) + origin_commit_ref=self._origin_commit, terminal_commit_ref=self._terminal_commit, + analyzed_commits_ref=analyzedCommits, last_commit_ref=lastCommitRef) t0 = monotonic() _logger.info("Start post-analyzing: %s..%s .", self._origin_commit, self._terminal_commit) analyzer = None @@ -140,7 +160,7 @@ def analyze(self, max_commits: int = 100) -> AnalysisStatus: _logger.info("Finished post-analyzing in %.2fs.", monotonic() - t0) except Exception as ex: _logger.error("Failed during post-analysis with analyzer [%d][%s].\n%s", - analyzerIndex, analyzer, traceback.format_exc()) + analyzerIndex, analyzer, traceback.format_exc()) # We can do nothing about it. Crash the caller. raise return status From 40b765d36d5b7d416b6edd3c32a6e8926e535f33 Mon Sep 17 00:00:00 2001 From: xinyan Date: Fri, 17 May 2019 22:55:04 +0800 Subject: [PATCH 22/39] Add skip_rewind_diff param in Analyzer constructor. --- persper/analytics/analyzer2.py | 68 +++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/persper/analytics/analyzer2.py b/persper/analytics/analyzer2.py index 8439df5b0c9..bdb890157d4 100644 --- a/persper/analytics/analyzer2.py +++ b/persper/analytics/analyzer2.py @@ -4,7 +4,7 @@ import re import time from abc import ABC -from typing import List, Optional, Set, Union +from typing import List, Optional, Set, Union, Dict from git import Commit, Diff, DiffIndex, Repo @@ -20,7 +20,9 @@ class Analyzer: def __init__(self, repositoryRoot: str, graphServer: GraphServer, terminalCommit: str = 'HEAD', firstParentOnly: bool = False, - commit_classifier: Optional[CommitClassifier] = None): + commit_classifier: Optional[CommitClassifier] = None, + skip_rewind_diff: bool = False): + # skip_rewind_diff will skip diff, but rewind commit start/end will still be notified to the GraphServer. 
self._repositoryRoot = repositoryRoot self._graphServer = graphServer self._repo = Repo(repositoryRoot) @@ -32,6 +34,7 @@ def __init__(self, repositoryRoot: str, graphServer: GraphServer, self._observer: AnalyzerObserver = emptyAnalyzerObserver self._commit_classifier = commit_classifier self._clf_results: Dict[str, List[float]] = {} + self._skip_rewind_diff = skip_rewind_diff def __getstate__(self): state = self.__dict__.copy() @@ -207,7 +210,11 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C await result t1 = time.monotonic() - t1 - diff_index = diff_with_commit(self._repo, commit, parentCommit) + diff_index = None + if self._skip_rewind_diff: + _logger.info("Skipped diff for rewinding commit.") + else: + diff_index = diff_with_commit(self._repo, commit, parentCommit) # commit classification if self._commit_classifier and commit.hexsha not in self._clf_results: @@ -216,33 +223,34 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C # t2: update_graph time t2 = time.monotonic() - for diff in diff_index: - old_fname, new_fname = _get_fnames(diff) - # apply filter - # if a file comes into/goes from our view, we will set corresponding old_fname/new_fname to None, - # as if the file is introduced/removed in this commit. - # However, the diff will keep its original, no matter if the file has been filtered in/out. - if old_fname and not self._graphServer.filter_file(old_fname): - old_fname = None - if new_fname and not self._graphServer.filter_file(new_fname): - new_fname = None - if not old_fname and not new_fname: - # no modification - continue - - old_src = new_src = None - - if old_fname: - old_src = get_contents(self._repo, parentCommit, old_fname) - - if new_fname: - new_src = get_contents(self._repo, commit, new_fname) - - if old_src or new_src: - result = self._graphServer.update_graph( - old_fname, old_src, new_fname, new_src, diff.diff) - if asyncio.iscoroutine(result): - await result + if diff_index: + for diff in diff_index: + old_fname, new_fname = _get_fnames(diff) + # apply filter + # if a file comes into/goes from our view, we will set corresponding old_fname/new_fname to None, + # as if the file is introduced/removed in this commit. + # However, the diff will keep its original, no matter if the file has been filtered in/out. + if old_fname and not self._graphServer.filter_file(old_fname): + old_fname = None + if new_fname and not self._graphServer.filter_file(new_fname): + new_fname = None + if not old_fname and not new_fname: + # no modification + continue + + old_src = new_src = None + + if old_fname: + old_src = get_contents(self._repo, parentCommit, old_fname) + + if new_fname: + new_src = get_contents(self._repo, commit, new_fname) + + if old_src or new_src: + result = self._graphServer.update_graph( + old_fname, old_src, new_fname, new_src, diff.diff) + if asyncio.iscoroutine(result): + await result t2 = time.monotonic() - t2 # t3: end_commit time From a1adac22a102a4f19a8ae75e58b11e01193a70c9 Mon Sep 17 00:00:00 2001 From: xinyan Date: Sat, 18 May 2019 11:07:59 +0800 Subject: [PATCH 23/39] Skip rewind diff only when seekingMode is Rewind. 
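A minimal sketch of how the new flag is meant to be used, assuming the constructor signature above; the repository path is illustrative and the graph-server wiring follows the existing C analyzer tests.

```python
import asyncio

from persper.analytics.analyzer2 import Analyzer
from persper.analytics.c import CGraphServer
from persper.analytics.graph_server import C_FILENAME_REGEXES

az = Analyzer("repos/test_feature_branch", CGraphServer(C_FILENAME_REGEXES),
              skip_rewind_diff=True)
# Rewind commits still notify the graph server via start_commit/end_commit,
# but (with the refinement below) their diffs are neither computed nor replayed.
asyncio.run(az.analyze())
```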
--- persper/analytics/analyzer2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/persper/analytics/analyzer2.py b/persper/analytics/analyzer2.py index bdb890157d4..b56e922b239 100644 --- a/persper/analytics/analyzer2.py +++ b/persper/analytics/analyzer2.py @@ -211,7 +211,7 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C t1 = time.monotonic() - t1 diff_index = None - if self._skip_rewind_diff: + if self._skip_rewind_diff and seekingMode == CommitSeekingMode.Rewind: _logger.info("Skipped diff for rewinding commit.") else: diff_index = diff_with_commit(self._repo, commit, parentCommit) From aa9e9b0fbf15f9949da950b48f1d4619b1e1178d Mon Sep 17 00:00:00 2001 From: YingjieLiu <18706819589@163.com> Date: Mon, 20 May 2019 16:26:43 +0800 Subject: [PATCH 24/39] Count added or removed units for each patch in update_graph --- persper/analytics/c.py | 12 ++--- persper/analytics/call_commit_graph.py | 13 ++++++ persper/analytics/call_graph/c.py | 3 +- persper/analytics/detect_change.py | 62 ++++++++++++++++++++++---- test/test_analytics/test_analyzer_c.py | 46 +++++++++---------- 5 files changed, 99 insertions(+), 37 deletions(-) diff --git a/persper/analytics/c.py b/persper/analytics/c.py index 622a067948c..284caeb6131 100644 --- a/persper/analytics/c.py +++ b/persper/analytics/c.py @@ -8,7 +8,7 @@ from persper.analytics.call_commit_graph import CallCommitGraph -def function_change_stats(old_ast, new_ast, patch, patch_parser, ranges_func): +def function_change_stats(old_ast, old_src, new_ast, new_src, patch, patch_parser, ranges_func): """ Parse old/new source files and extract the change info for all functions """ @@ -19,19 +19,21 @@ def function_change_stats(old_ast, new_ast, patch, patch_parser, ranges_func): if old_ast is not None: forward_stats = get_changed_functions( - *ranges_func(old_ast), adds, dels, separate=True) + *ranges_func(old_ast), adds, dels, old_src, new_src, separate=True) if new_ast is not None: inv_adds, inv_dels = inverse_diff(adds, dels) bckward_stats = get_changed_functions( - *ranges_func(new_ast), inv_adds, inv_dels, separate=True) + *ranges_func(new_ast), inv_adds, inv_dels, new_src, old_src, separate=True) # merge forward and backward stats for func, fstat in bckward_stats.items(): if func not in forward_stats: forward_stats[func] = { 'adds': fstat['dels'], - 'dels': fstat['adds'] + 'dels': fstat['adds'], + 'added_units': fstat['removed_units'], + 'removed_units': fstat['added_units'] } return forward_stats @@ -85,7 +87,7 @@ def update_graph(self, old_filename, old_src, new_filename, new_src, patch): # Compatible with both the old and the new Analyzer change_stats = {} if self._seeking_mode != CommitSeekingMode.MergeCommit: - change_stats = function_change_stats(old_ast, new_ast, patch, + change_stats = function_change_stats(old_ast, old_src, new_ast, new_src, patch, self._parse_patch, get_func_ranges_c) diff --git a/persper/analytics/call_commit_graph.py b/persper/analytics/call_commit_graph.py index 6d3db3bb0f1..bd5de0fc81b 100644 --- a/persper/analytics/call_commit_graph.py +++ b/persper/analytics/call_commit_graph.py @@ -119,6 +119,19 @@ def update_node_history(self, node, num_adds, num_dels): else: node_history[self._current_commit_id] = {'adds': num_adds, 'dels': num_dels} + def update_node_history(self, node, fstat): + node_history = self._get_node_history(node) + # A commit might update a node's history more than once when + # a single FunctionNode corresponds to more than one actual functions + if 
self._current_commit_id in node_history: + node_history[self._current_commit_id]['adds'] += fstat['adds'] + node_history[self._current_commit_id]['dels'] += fstat['dels'] + node_history[self._current_commit_id]['added_units'] += fstat['added_units'] + node_history[self._current_commit_id]['removed_units'] += fstat['removed_units'] + else: + node_history[self._current_commit_id] = {'adds': fstat['adds'], 'dels': fstat['dels'], + 'added_units': fstat['added_units'], 'removed_units': fstat['removed_units']} + # read/write access to node history are thourgh this function def _get_node_history(self, node: str) -> Dict[str, Dict[str, int]]: return self._digraph.nodes[node]['history'] diff --git a/persper/analytics/call_graph/c.py b/persper/analytics/call_graph/c.py index 8563ddd3c30..1ab241b1b7e 100644 --- a/persper/analytics/call_graph/c.py +++ b/persper/analytics/call_graph/c.py @@ -179,7 +179,8 @@ def update_graph(ccgraph, ast_list, change_stats, new_fname_to_old_fname): if func not in ccgraph: print("%s in change_stats but not in ccgraph" % func) continue - ccgraph.update_node_history(func, fstat['adds'], fstat['dels']) + ccgraph.update_node_history(func, fstat) + # ccgraph.update_node_history(func, fstat['adds'], fstat['dels']) def get_func_ranges_c(root): diff --git a/persper/analytics/detect_change.py b/persper/analytics/detect_change.py index 419279cd87a..ceffec7bc39 100644 --- a/persper/analytics/detect_change.py +++ b/persper/analytics/detect_change.py @@ -1,3 +1,5 @@ +import re + def get_intersected_length(a, b): """ >>> get_intersected_length([1, 9], [2, 8]) @@ -16,9 +18,44 @@ def get_intersected_length(a, b): else: return end - start + 1 +def get_add_line_number(additions, deletions): + """ + Get the line number in new src for each added block + Input: + additions = [[7, 31], [27, 3], [44, 1], [50, 2], [70, 1], [77, 2], [99, 2]] + deletions = [[32, 44], [56, 70]] + Output: + [[8, 38], [59, 61], [66, 66], [73, 74], [80, 80], [88, 89], [112, 113]] + ground truth: + https://github.com/basicthinker/Sexain-MemController/commit/f050c6f6dd4b1d3626574b0d23bb41125f7b75ca + """ + add_line_number = [] + del_ptr, num_dels = 0, len(deletions) + add_num, del_num = 0, 0 + for add_range in additions: + while del_ptr < num_dels and deletions[del_ptr][1] <= add_range[0]: + del_num += deletions[del_ptr][1] - deletions[del_ptr][0] + 1 + del_ptr += 1 + start_line = add_range[0]+1+add_num-del_num + tmp_line_number = [start_line, start_line+add_range[1]-1] + add_line_number.append(tmp_line_number) + add_num += add_range[1] + return add_line_number +#need test +def get_units(src_list, line_number_range): + """ + Get the sum of units for each line in line_number_range + """ + units_sum = 0 + p = re.compile(r'\w+') + for i in range(line_number_range[0]-1, line_number_range[1]): + if i >= len(src_list): + break + units_sum += len(p.findall(src_list[i])) + return units_sum def get_changed_functions(func_names, func_ranges, additions, deletions, - separate=False): + old_src, new_src, separate=False): """ Args: func_names: A list of function names, @@ -38,29 +75,38 @@ def get_changed_functions(func_names, func_ranges, additions, deletions, info = {} if (func_names is None or func_ranges is None or - additions is None or deletions is None): + additions is None or deletions is None or + old_src is None or new_src is None): return info - def update_info(fn, num_lines, key): + def update_info(fn, num_lines, num_units, key1, key2): """key should be one of 'adds' or 'dels'.""" if fn in info: - info[fn][key] += 
num_lines + info[fn][key1] += num_lines + info[fn][key2] += num_units else: - info[fn] = {'adds': 0, 'dels': 0} - info[fn][key] = num_lines + info[fn] = {'adds': 0, 'dels': 0, 'added_units': 0, 'removed_units': 0} + info[fn][key1] = num_lines + info[fn][key2] = num_units + + old_src_list = old_src.split('\n') + new_src_list = new_src.split('\n') + add_line_number = get_add_line_number(additions, deletions) add_ptr, del_ptr = 0, 0 num_adds, num_dels = len(additions), len(deletions) for fn, fr in zip(func_names, func_ranges): for i in range(add_ptr, num_adds): if fr[0] <= additions[i][0] < fr[1]: - update_info(fn, additions[i][1], 'adds') + units = get_units(new_src_list, add_line_number[i]) + update_info(fn, additions[i][1], units, 'adds', 'added_units') add_ptr = i + 1 for j in range(del_ptr, num_dels): inter_length = get_intersected_length(fr, deletions[j]) if inter_length > 0: - update_info(fn, inter_length, 'dels') + units = get_units(old_src_list, [max(fr[0],deletions[j][0]), min(fr[1],deletions[j][1])]) + update_info(fn, inter_length, units, 'dels', 'removed_units') del_ptr = j if not separate: diff --git a/test/test_analytics/test_analyzer_c.py b/test/test_analytics/test_analyzer_c.py index 29e5f12af43..378f145eabc 100644 --- a/test/test_analytics/test_analyzer_c.py +++ b/test/test_analytics/test_analyzer_c.py @@ -32,23 +32,23 @@ async def test_analyzer_all_branches(az): history_truth = { 'K': { - 'display': {'adds': 0, 'dels': 5} + 'display': {'adds': 0, 'dels': 5, 'added_units': 0, 'removed_units': 10} }, 'F': { - 'display': {'adds': 14, 'dels': 0}, - 'count': {'adds': 12, 'dels': 0} + 'display': {'adds': 14, 'dels': 0, 'added_units': 23, 'removed_units': 0}, + 'count': {'adds': 12, 'dels': 0, 'added_units': 19, 'removed_units': 0} }, 'E': { - 'append': {'adds': 29, 'dels': 0}, - 'add': {'adds': 11, 'dels': 0} + 'append': {'adds': 29, 'dels': 0, 'added_units': 44, 'removed_units': 0}, + 'add': {'adds': 11, 'dels': 0, 'added_units': 25, 'removed_units': 0} }, 'D': { - 'str_replace': {'adds': 26, 'dels': 0} + 'str_replace': {'adds': 26, 'dels': 0, 'added_units': 76, 'removed_units': 0} }, # TODO: fix \No newline at the end of file 'C': { - 'str_append_chr': {'adds': 30, 'dels': 4}, - 'str_equals': {'adds': 0, 'dels': 1} + 'str_append_chr': {'adds': 30, 'dels': 4, 'added_units': 78, 'removed_units': 21}, + 'str_equals': {'adds': 0, 'dels': 1, 'added_units': 0, 'removed_units': 0} }, # Commit `B` is an example of imperfect diff, # it removes `str_append` and adds a new function `str_append_chr` @@ -56,38 +56,38 @@ async def test_analyzer_all_branches(az): # diff doesn't separate these changes into two chunks # please see here: https://github.com/UltimateBeaver/test_feature_branch/commit/caaac10f604ea7ac759c2147df8fb2b588ee2a27 'B': { - 'str_append': {'adds': 6, 'dels': 3}, - 'str_append_chr': {'adds': 3, 'dels': 2}, - 'str_equals': {'adds': 11, 'dels': 0} + 'str_append': {'adds': 6, 'dels': 3, 'added_units': 29, 'removed_units': 21}, + 'str_append_chr': {'adds': 3, 'dels': 2, 'added_units': 21, 'removed_units': 15}, + 'str_equals': {'adds': 11, 'dels': 0, 'added_units': 27, 'removed_units': 0} }, 'A': { - 'str_append': {'adds': 7, 'dels': 0}, - 'str_len': {'adds': 6, 'dels': 0} + 'str_append': {'adds': 7, 'dels': 0, 'added_units': 29, 'removed_units': 0}, + 'str_len': {'adds': 6, 'dels': 0, 'added_units': 13, 'removed_units': 0} }, # branch J from commit A, merge back through F 'J': { - 'count': {'adds': 12, 'dels': 0}, - 'display': {'adds': 14, 'dels': 0} + 'count': {'adds': 12, 
'dels': 0, 'added_units': 19, 'removed_units': 0}, + 'display': {'adds': 14, 'dels': 0, 'added_units': 23, 'removed_units': 0} }, # TODO: fix \No newline at the end of file # branch G from commit B, merge back through D 'G': { - 'str_equals': {'adds': 0, 'dels': 1}, - 'str_replace': {'adds': 26, 'dels': 0} + 'str_equals': {'adds': 0, 'dels': 1, 'added_units': 0, 'removed_units': 0}, + 'str_replace': {'adds': 26, 'dels': 0, 'added_units': 76, 'removed_units': 0} }, # branch H from commit D, merge back through E 'H': { - 'add': {'adds': 16, 'dels': 0}, - 'append': {'adds': 12, 'dels': 0}, - 'insert': {'adds': 25, 'dels': 0} + 'add': {'adds': 16, 'dels': 0, 'added_units': 31, 'removed_units': 0}, + 'append': {'adds': 12, 'dels': 0, 'added_units': 37, 'removed_units': 0}, + 'insert': {'adds': 25, 'dels': 0, 'added_units': 44, 'removed_units': 0} }, 'I': { - 'add': {'adds': 0, 'dels': 5}, - 'append': {'adds': 26, 'dels': 9}, - 'insert': {'adds': 0, 'dels': 25} + 'add': {'adds': 0, 'dels': 5, 'added_units': 0, 'removed_units': 6}, + 'append': {'adds': 26, 'dels': 9, 'added_units': 40, 'removed_units': 33}, + 'insert': {'adds': 0, 'dels': 25, 'added_units': 0, 'removed_units': 44} }, } From eecd9c39b00f0e54097cf6f6090b32142d18e7d4 Mon Sep 17 00:00:00 2001 From: YingjieLiu <18706819589@163.com> Date: Tue, 21 May 2019 10:25:50 +0800 Subject: [PATCH 25/39] Change name update_node_history to update_node_history_accurate. --- persper/analytics/call_commit_graph.py | 2 +- persper/analytics/call_graph/c.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/persper/analytics/call_commit_graph.py b/persper/analytics/call_commit_graph.py index bd5de0fc81b..652319cb5d3 100644 --- a/persper/analytics/call_commit_graph.py +++ b/persper/analytics/call_commit_graph.py @@ -119,7 +119,7 @@ def update_node_history(self, node, num_adds, num_dels): else: node_history[self._current_commit_id] = {'adds': num_adds, 'dels': num_dels} - def update_node_history(self, node, fstat): + def update_node_history_accurate(self, node, fstat): node_history = self._get_node_history(node) # A commit might update a node's history more than once when # a single FunctionNode corresponds to more than one actual functions diff --git a/persper/analytics/call_graph/c.py b/persper/analytics/call_graph/c.py index 1ab241b1b7e..7ff8bbda713 100644 --- a/persper/analytics/call_graph/c.py +++ b/persper/analytics/call_graph/c.py @@ -179,7 +179,7 @@ def update_graph(ccgraph, ast_list, change_stats, new_fname_to_old_fname): if func not in ccgraph: print("%s in change_stats but not in ccgraph" % func) continue - ccgraph.update_node_history(func, fstat) + ccgraph.update_node_history_accurate(func, fstat) # ccgraph.update_node_history(func, fstat['adds'], fstat['dels']) From d177672907224ac862c96748cb90bc52622070e8 Mon Sep 17 00:00:00 2001 From: YingjieLiu <18706819589@163.com> Date: Wed, 22 May 2019 09:34:43 +0800 Subject: [PATCH 26/39] Update regex; Update update_info; If old_src is None or new_src is None, we still have to count the logic units. 
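Since "logic units" drive the new history fields, a self-contained sketch of the counting rule may help. It mirrors the spirit of `get_units` with the identifier regex adopted below, but it is not the library function itself; the sample lines and expected count are illustrative.

```python
import re

# Token-per-line counting: every identifier-like token on a changed line
# contributes one logic unit.
p = re.compile(r'[\w_][\w\d_]*')

changed_lines = [
    "int str_len(char *s)",     # int, str_len, char, s   -> 4 units
    "{",                        #                         -> 0 units
    "    return strlen(s);",    # return, strlen, s       -> 3 units
]
units = sum(len(p.findall(line)) for line in changed_lines)
assert units == 7
```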
--- persper/analytics/detect_change.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/persper/analytics/detect_change.py b/persper/analytics/detect_change.py index ceffec7bc39..22254e36f18 100644 --- a/persper/analytics/detect_change.py +++ b/persper/analytics/detect_change.py @@ -41,13 +41,14 @@ def get_add_line_number(additions, deletions): add_line_number.append(tmp_line_number) add_num += add_range[1] return add_line_number -#need test + def get_units(src_list, line_number_range): """ Get the sum of units for each line in line_number_range """ units_sum = 0 - p = re.compile(r'\w+') + # p = re.compile(r'\w+') + p = re.compile(r'[\w_][\w\d_]*') for i in range(line_number_range[0]-1, line_number_range[1]): if i >= len(src_list): break @@ -65,6 +66,8 @@ def get_changed_functions(func_names, func_ranges, additions, deletions, in the same order of func_names. additions: A list of pair of integers, deletions: A list of pair of integers, + old_src: Old source files, + new_src: New source files, separate: A boolean flag, if set to True, additions and deletions are reported separately. @@ -75,22 +78,25 @@ def get_changed_functions(func_names, func_ranges, additions, deletions, info = {} if (func_names is None or func_ranges is None or - additions is None or deletions is None or - old_src is None or new_src is None): + additions is None or deletions is None): return info - def update_info(fn, num_lines, num_units, key1, key2): + def update_info(fn, num_lines, num_units, key_lines, key_units): """key should be one of 'adds' or 'dels'.""" if fn in info: - info[fn][key1] += num_lines - info[fn][key2] += num_units + info[fn][key_lines] += num_lines + info[fn][key_units] += num_units else: info[fn] = {'adds': 0, 'dels': 0, 'added_units': 0, 'removed_units': 0} - info[fn][key1] = num_lines - info[fn][key2] = num_units + info[fn][key_lines] = num_lines + info[fn][key_units] = num_units - old_src_list = old_src.split('\n') - new_src_list = new_src.split('\n') + old_src_list = [] + new_src_list = [] + if not old_src is None: + old_src_list = old_src.split('\n') + if not new_src is None: + new_src_list = new_src.split('\n') add_line_number = get_add_line_number(additions, deletions) add_ptr, del_ptr = 0, 0 From b237e35be81042dac6da484a466c862105dab388 Mon Sep 17 00:00:00 2001 From: Yizhe Yuan Date: Mon, 20 May 2019 18:24:11 +0800 Subject: [PATCH 27/39] Filter out monolithic commits --- persper/analytics/analyzer2.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/persper/analytics/analyzer2.py b/persper/analytics/analyzer2.py index b56e922b239..f432c951650 100644 --- a/persper/analytics/analyzer2.py +++ b/persper/analytics/analyzer2.py @@ -21,7 +21,8 @@ def __init__(self, repositoryRoot: str, graphServer: GraphServer, terminalCommit: str = 'HEAD', firstParentOnly: bool = False, commit_classifier: Optional[CommitClassifier] = None, - skip_rewind_diff: bool = False): + skip_rewind_diff: bool = False, + monolithic_commit_lines_threshold: int = 5000): # skip_rewind_diff will skip diff, but rewind commit start/end will still be notified to the GraphServer. 
self._repositoryRoot = repositoryRoot self._graphServer = graphServer @@ -35,6 +36,7 @@ def __init__(self, repositoryRoot: str, graphServer: GraphServer, self._commit_classifier = commit_classifier self._clf_results: Dict[str, List[float]] = {} self._skip_rewind_diff = skip_rewind_diff + self._monolithic_commit_lines_threshold = monolithic_commit_lines_threshold def __getstate__(self): state = self.__dict__.copy() @@ -198,6 +200,9 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C if type(commit) != Commit: commit = self._repo.commit(commit) + # filter monolithic commit + seekingMode = self._filter_monolithic_commit(commit, seekingMode) + # t0: Total time usage t0 = time.monotonic() self._observer.onBeforeCommit(self, commit, seekingMode) @@ -266,6 +271,19 @@ async def _analyzeCommit(self, commit: Union[Commit, str], parentCommit: Union[C assert self._graphServer.get_workspace_commit_hexsha() == commit.hexsha, \ "GraphServer.get_workspace_commit_hexsha should be return the hexsha seen in last start_commit." + def _filter_monolithic_commit(self, commit: Commit, seeking_mode: CommitSeekingMode) -> CommitSeekingMode: + # filter monolithic commit + if seeking_mode == CommitSeekingMode.NormalForward and len(commit.parents) == 1: + changed_lines = 0 + files = commit.stats.files + for fname in files: + if self._graphServer.filter_file(fname): + changed_lines += files[fname]['lines'] + if changed_lines > self._monolithic_commit_lines_threshold: + # enforce using CommitSeekingMode.MergeCommit to update graph without updating node history + return CommitSeekingMode.MergeCommit + return seeking_mode + def _get_fnames(diff: Diff): if diff.new_file: From 2fea1335c3df685c06858898c31cf03f7dc91ec5 Mon Sep 17 00:00:00 2001 From: Yizhe Yuan Date: Fri, 24 May 2019 00:36:33 +0800 Subject: [PATCH 28/39] Add unit test for filter_monolithic_commit --- test/test_analytics/test_analyzer2.py | 69 +++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 test/test_analytics/test_analyzer2.py diff --git a/test/test_analytics/test_analyzer2.py b/test/test_analytics/test_analyzer2.py new file mode 100644 index 00000000000..b7eb532e498 --- /dev/null +++ b/test/test_analytics/test_analyzer2.py @@ -0,0 +1,69 @@ +import os +import pytest +import subprocess +import shutil +from persper.analytics.c import CGraphServer +from persper.analytics.analyzer2 import Analyzer +from persper.analytics.graph_server import C_FILENAME_REGEXES, CommitSeekingMode +from persper.util.path import root_path + + +@pytest.fixture(scope='module') +def az(): + # build the repo first if not exists yet + repo_path = os.path.join(root_path, 'repos/test_feature_branch') + script_path = os.path.join(root_path, 'tools/repo_creater/create_repo.py') + test_src_path = os.path.join(root_path, 'test/test_feature_branch') + + # Always use latest source to create test repo + if os.path.exists(repo_path): + shutil.rmtree(repo_path) + + cmd = '{} {}'.format(script_path, test_src_path) + subprocess.call(cmd, shell=True) + + return Analyzer(repo_path, CGraphServer(C_FILENAME_REGEXES)) + + +def test_analyzer_filter_monolithic_commit(az): + threshold = az._monolithic_commit_lines_threshold + + case_1_files = { + 'main.c': {'lines': threshold + 1}, + } + case_1_commit = MockCommit(case_1_files, 0) + case_1_seeking_mode = az._filter_monolithic_commit(case_1_commit, CommitSeekingMode.NormalForward) + assert case_1_seeking_mode == CommitSeekingMode.NormalForward + + case_2_files = { + 'a.c': {'lines': threshold}, + } 
+ case_2_commit = MockCommit(case_2_files, 1) + case_2_seeking_mode = az._filter_monolithic_commit(case_2_commit, CommitSeekingMode.NormalForward) + assert case_2_seeking_mode == CommitSeekingMode.NormalForward + + case_3_files = { + 'a.c': {'lines': threshold}, + 'b.c': {'lines': 1}, + } + case_3_commit = MockCommit(case_3_files, 1) + case_3_seeking_mode = az._filter_monolithic_commit(case_3_commit, CommitSeekingMode.NormalForward) + assert case_3_seeking_mode == CommitSeekingMode.MergeCommit + + case_4_files = { + 'a.c': {'lines': threshold}, + } + case_4_commit = MockCommit(case_4_files, 2) + case_4_seeking_mode = az._filter_monolithic_commit(case_4_commit, CommitSeekingMode.MergeCommit) + assert case_4_seeking_mode == CommitSeekingMode.MergeCommit + + +class MockCommit: + def __init__(self, files: dict, parent_number: int = 1): + self.stats = MockCommitStats(files) + self.parents = [{}] * parent_number + + +class MockCommitStats: + def __init__(self, files: dict): + self.files = files From 1b4d3f052e4f03f28cd52c1a0e5eb80e953a70a0 Mon Sep 17 00:00:00 2001 From: Hezheng Yin Date: Thu, 23 May 2019 13:56:12 -0700 Subject: [PATCH 29/39] Add comments to filter commit tests --- .../{test_analyzer2.py => test_filter_commit.py} | 8 ++++++++ 1 file changed, 8 insertions(+) rename test/test_analytics/{test_analyzer2.py => test_filter_commit.py} (84%) diff --git a/test/test_analytics/test_analyzer2.py b/test/test_analytics/test_filter_commit.py similarity index 84% rename from test/test_analytics/test_analyzer2.py rename to test/test_analytics/test_filter_commit.py index b7eb532e498..802ffbaddb9 100644 --- a/test/test_analytics/test_analyzer2.py +++ b/test/test_analytics/test_filter_commit.py @@ -28,6 +28,8 @@ def az(): def test_analyzer_filter_monolithic_commit(az): threshold = az._monolithic_commit_lines_threshold + # case 1: changes above threshold, but the commit is the first commit + # expected result: normal forward case_1_files = { 'main.c': {'lines': threshold + 1}, } @@ -35,6 +37,8 @@ def test_analyzer_filter_monolithic_commit(az): case_1_seeking_mode = az._filter_monolithic_commit(case_1_commit, CommitSeekingMode.NormalForward) assert case_1_seeking_mode == CommitSeekingMode.NormalForward + # case 2: changes equal to threshold, the commit has one parent commit + # expected result: normal forward case_2_files = { 'a.c': {'lines': threshold}, } @@ -42,6 +46,8 @@ def test_analyzer_filter_monolithic_commit(az): case_2_seeking_mode = az._filter_monolithic_commit(case_2_commit, CommitSeekingMode.NormalForward) assert case_2_seeking_mode == CommitSeekingMode.NormalForward + # case 3: changes above threshold, the commit has one parent commit + # expected result: merge commit case_3_files = { 'a.c': {'lines': threshold}, 'b.c': {'lines': 1}, @@ -50,6 +56,8 @@ def test_analyzer_filter_monolithic_commit(az): case_3_seeking_mode = az._filter_monolithic_commit(case_3_commit, CommitSeekingMode.NormalForward) assert case_3_seeking_mode == CommitSeekingMode.MergeCommit + # case 4: changes equal to threshold, the commit is a merge commit + # expected result: merge commit case_4_files = { 'a.c': {'lines': threshold}, } From c027f15f9981c6cca85e1fd66b8c9a314228d944 Mon Sep 17 00:00:00 2001 From: xinyan Date: Sat, 18 May 2019 22:47:43 +0800 Subject: [PATCH 30/39] Add preliminary project complexity metric based on LOC, nodes and edges. 
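Before moving on, a hedged sketch of tuning the monolithic-commit threshold introduced above; only the parameter name and its 5000-line default come from the diff, the path and the chosen value are illustrative.

```python
from persper.analytics.analyzer2 import Analyzer
from persper.analytics.c import CGraphServer
from persper.analytics.graph_server import C_FILENAME_REGEXES

# Lower the default so unusually large commits are not credited line-by-line.
az = Analyzer("repos/test_feature_branch", CGraphServer(C_FILENAME_REGEXES),
              monolithic_commit_lines_threshold=2000)
# Single-parent commits whose filtered files change more than 2000 lines are
# analyzed in MergeCommit mode: graph structure is updated, node history is not.
```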
--- persper/analytics/analyzer2.py | 9 +++++++++ persper/analytics/call_commit_graph.py | 11 ++++++++++- persper/analytics/complexity.py | 24 ++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 persper/analytics/complexity.py diff --git a/persper/analytics/analyzer2.py b/persper/analytics/analyzer2.py index f432c951650..ecfc398e850 100644 --- a/persper/analytics/analyzer2.py +++ b/persper/analytics/analyzer2.py @@ -128,6 +128,15 @@ def compute_commit_scores(self, alpha: float, label_weights: List[float], top_one=top_one, additive=additive) + def compute_project_complexity(self, r_n: int, r_e: int): + """ + Evaluates project complexity. + params + r_n: The conversion factor from node count to logic units. + r_e: The conversion factor from edge count to logic units. + """ + return self.graph.eval_project_complexity(r_n, r_e) + async def analyze(self, maxAnalyzedCommits=None, suppressStdOutLogs=False): commitSpec = self._terminalCommit if self._originCommit: diff --git a/persper/analytics/call_commit_graph.py b/persper/analytics/call_commit_graph.py index 652319cb5d3..0c83fc928c6 100644 --- a/persper/analytics/call_commit_graph.py +++ b/persper/analytics/call_commit_graph.py @@ -8,7 +8,7 @@ from persper.analytics.devrank import devrank from persper.analytics.score import normalize from typing import Union, Set, List, Dict, Optional - +from persper.analytics.complexity import eval_project_complexity class CommitIdGenerators: @staticmethod @@ -171,6 +171,15 @@ def _set_all_edges_weight(self): for nbr, datadict in self._digraph.pred[node].items(): datadict['weight'] = self._digraph.nodes[node]['size'] + def eval_project_complexity(self, r_n: float, r_e: float): + """ + Evaluates project complexity. + params + r_n: The conversion factor from node count to logic units. + r_e: The conversion factor from edge count to logic units. + """ + return eval_project_complexity(self._digraph, r_n, r_e) + def function_devranks(self, alpha, black_set=None): """ Args: diff --git a/persper/analytics/complexity.py b/persper/analytics/complexity.py new file mode 100644 index 00000000000..523aeff4b7a --- /dev/null +++ b/persper/analytics/complexity.py @@ -0,0 +1,24 @@ +from typing import Dict, List + +import numpy as np +from networkx import DiGraph + + +def eval_project_complexity(G: DiGraph, r_n: float, r_e: float): + """ + Evaluates project complexity from the specified bare call commit graph. + remarks + The formula is + complexity = sum_by_node(added_units + removed_units) + r_n*len(nodes) + r_e*len(edges) + """ + logical_units = 0 + for _, data in G.nodes(data=True): + added = 0 + removed = 0 + for _, v in data["history"].items(): + # TODO change from LOC to logic units + added += v["adds"] + removed += v["dels"] + logical_units += added + removed + complexity = logical_units + r_n*len(G.nodes) + r_e*len(G.edges) + return complexity From 3ac4854423f9f81555d3bb39bffa32d04e03ec02 Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 22 May 2019 22:20:07 +0800 Subject: [PATCH 31/39] Use logic units to evaluate complexity where possible. 
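A worked instance of the complexity formula above may make the conversion factors concrete; the numbers are purely illustrative.

```python
# complexity = sum_by_node(added_units + removed_units) + r_n*len(nodes) + r_e*len(edges)
logic_units = 100          # total added_units + removed_units across all nodes
nodes, edges = 3, 2
r_n, r_e = 20, 10
complexity = logic_units + r_n * nodes + r_e * edges
assert complexity == 180   # 100 + 60 + 20
```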
--- persper/analytics/complexity.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/persper/analytics/complexity.py b/persper/analytics/complexity.py index 523aeff4b7a..013184b9e1f 100644 --- a/persper/analytics/complexity.py +++ b/persper/analytics/complexity.py @@ -1,8 +1,11 @@ +import logging from typing import Dict, List import numpy as np from networkx import DiGraph +_logger = logging.getLogger(__file__) + def eval_project_complexity(G: DiGraph, r_n: float, r_e: float): """ @@ -12,13 +15,22 @@ def eval_project_complexity(G: DiGraph, r_n: float, r_e: float): complexity = sum_by_node(added_units + removed_units) + r_n*len(nodes) + r_e*len(edges) """ logical_units = 0 + useFallback = None for _, data in G.nodes(data=True): added = 0 removed = 0 for _, v in data["history"].items(): - # TODO change from LOC to logic units - added += v["adds"] - removed += v["dels"] + if useFallback == None: + useFallback = not "added_units" in v + if useFallback: + _logger.warning( + "Will use LOC instead of logic units to measure complexity.") + if useFallback: + added += v["adds"] + removed += v["dels"] + else: + added += v["added_units"] + removed += v["removed_units"] logical_units += added + removed complexity = logical_units + r_n*len(G.nodes) + r_e*len(G.edges) return complexity From 62b00390a17b656d1cd51fa9742c25ff64c2ac09 Mon Sep 17 00:00:00 2001 From: xinyan Date: Wed, 22 May 2019 23:44:40 +0800 Subject: [PATCH 32/39] Add unit test assertion in test_analyzer_all_branches. --- test/test_analytics/test_analyzer_c.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_analytics/test_analyzer_c.py b/test/test_analytics/test_analyzer_c.py index 378f145eabc..97268c6cc3e 100644 --- a/test/test_analytics/test_analyzer_c.py +++ b/test/test_analytics/test_analyzer_c.py @@ -127,3 +127,4 @@ async def test_analyzer_all_branches(az): ('insert', 'append') ] assert set(ccgraph.edges()) == set(edges_truth) + assert ccgraph.eval_project_complexity(20, 10) == 1157 From d67cd207a9432238a6cb6ce7d598fd0df6156828 Mon Sep 17 00:00:00 2001 From: Yang Zhikai Date: Fri, 24 May 2019 03:41:11 +0000 Subject: [PATCH 33/39] delete test_iterator.py --- test/test_analytics/test_iterator.py | 65 ---------------------------- 1 file changed, 65 deletions(-) delete mode 100644 test/test_analytics/test_iterator.py diff --git a/test/test_analytics/test_iterator.py b/test/test_analytics/test_iterator.py deleted file mode 100644 index 745b879c83e..00000000000 --- a/test/test_analytics/test_iterator.py +++ /dev/null @@ -1,65 +0,0 @@ -import os -import pytest -import pickle -import subprocess -from persper.analytics.iterator import RepoIterator -from persper.util.path import root_path - - -def serialized_messages(commits): - return ' '.join([c.message.strip() for c in commits]) - - -@pytest.fixture(scope='module') -def ri(): - # build the repo first if not exists yet - repo_path = os.path.join(root_path, 'repos/test_processor') - script_path = os.path.join(root_path, 'tools/repo_creater/create_repo.py') - test_src_path = os.path.join(root_path, 'test/test_processor') - if not os.path.isdir(repo_path): - cmd = '{} {}'.format(script_path, test_src_path) - subprocess.call(cmd, shell=True) - - repo_path = os.path.join(root_path, 'repos/test_processor') - ri = RepoIterator(repo_path) - return ri - - -def test_iterator(ri): - commits, branch_commits = ri.iter(from_beginning=True, into_branches=True) - # from A to L - # use `git log --graph` to view ground truth - assert len(ri.visited) == 12 - 
assert len(commits) == 4 - assert len(branch_commits) == 8 - assert serialized_messages(commits) == 'D C B A' - assert serialized_messages(branch_commits) == 'G F E J I H L K' - - -def test_continue_iter(ri): - commits, branch_commits = ri.iter( - from_beginning=True, num_commits=2, into_branches=True) - assert serialized_messages(commits) == 'B A' - assert serialized_messages(branch_commits) == '' - commits2, branch_commits2 = ri.iter( - continue_iter=True, num_commits=2, into_branches=True) - assert serialized_messages(commits2) == 'D C' - assert serialized_messages(branch_commits2) == 'G F E J I H L K' - - -def test_rev(ri): - commits, branch_commits = ri.iter(rev='C', into_branches=True) - assert serialized_messages(commits) == 'C B A' - assert serialized_messages(branch_commits) == '' - commits2, branch_commits2 = ri.iter( - continue_iter=True, end_commit_sha='D', into_branches=True) - assert serialized_messages(commits2) == 'D' - assert serialized_messages(branch_commits2) == 'G F E J I H L K' - - -def test_iter_twice(ri): - commits, branch_commits = ri.iter(from_beginning=True, into_branches=True) - commits2, branch_commits2 = ri.iter( - from_beginning=True, into_branches=True) - assert commits == commits2 - assert branch_commits == branch_commits2 From 9b311913a11b7d2b6ad3d39dbcf20ab6a65212af Mon Sep 17 00:00:00 2001 From: Yang zhikai Date: Fri, 24 May 2019 11:49:39 +0800 Subject: [PATCH 34/39] setup-ci --- .gitlab-ci.yml | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 .gitlab-ci.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 00000000000..6bdf1342a7b --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,45 @@ +stages: + - build + - test + + +test_ci: + stage: test + image: ubuntu:18.04 +# only: +# - setup-ci + before_script: + - apt update && apt install -y openssh-client wget libarchive-dev libcurl4-openssl-dev git python3.7 python3-pip + - wget http://131.123.42.38/lmcrs/beta/srcML-Ubuntu18.04.deb + - dpkg -i srcML-Ubuntu18.04.deb + - mkdir -p ~/.ssh + - echo "${DEPLOY_KEY}" | tr -d '\r' > ~/.ssh/id_rsa + - chmod 600 ~/.ssh/id_rsa + - eval "$(ssh-agent -s)" + - ssh-keyscan -H "gitlab.com" >> ~/.ssh/known_hosts + - chmod 644 ~/.ssh/known_hosts + - set LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib + - export LC_ALL=C.UTF-8 + - export LANG=C.UTF-8 + script: + - apt-get update + - git config --global user.email "merico@meir.co" + - git config --global user.name "merico" + - pip3 install pipenv + + - echo -e "machine gitlab.com\nlogin ${GITLAB_USER}\npassword ${GITLAB_PASSWD}" > ~/.netrc + - git clone https://gitlab.com/persper/code-analytics.git && cd code-analytics + #&& git checkout ${CI_COMMIT_REF_NAME} + - export PYTHONPATH=$PYTHONPATH:/root/code-analytics + - pipenv install --python 3.7 + - pipenv run pytest -s test/test_analytics/test_analyzer_c.py + - pipenv run pytest -s test/test_analytics/test_inverse_diff.py + - pipenv run pytest -s test/test_analytics/test_analyzer.py + - pipenv run pytest -s test/test_analytics/test_call_commit_graph.py + - pipenv run pytest -s test/test_analytics/test_detect_change.py + - pipenv run pytest -s test/test_analytics/test_devrank.py + - pipenv run pytest -s test/test_analytics/test_diff.py + - pipenv run pytest -s test/test_analytics/test_score.py + - pipenv run pytest -s test/test_analytics/test_srcml.py + - pipenv run pytest -s test/test_analytics2 + - echo "Done" From e2132e82608e3f331615025c7d12b607cbbab169 Mon Sep 17 00:00:00 2001 From: Hezheng Yin Date: Fri, 
24 May 2019 13:22:34 -0700 Subject: [PATCH 35/39] Properly ignore lsp tests --- .gitlab-ci.yml | 12 ++---------- test/test_analytics/conftest.py | 2 ++ 2 files changed, 4 insertions(+), 10 deletions(-) create mode 100644 test/test_analytics/conftest.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6bdf1342a7b..218e1bed254 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -23,7 +23,7 @@ test_ci: - export LANG=C.UTF-8 script: - apt-get update - - git config --global user.email "merico@meir.co" + - git config --global user.email "merico@meri.co" - git config --global user.name "merico" - pip3 install pipenv @@ -32,14 +32,6 @@ test_ci: #&& git checkout ${CI_COMMIT_REF_NAME} - export PYTHONPATH=$PYTHONPATH:/root/code-analytics - pipenv install --python 3.7 - - pipenv run pytest -s test/test_analytics/test_analyzer_c.py - - pipenv run pytest -s test/test_analytics/test_inverse_diff.py - - pipenv run pytest -s test/test_analytics/test_analyzer.py - - pipenv run pytest -s test/test_analytics/test_call_commit_graph.py - - pipenv run pytest -s test/test_analytics/test_detect_change.py - - pipenv run pytest -s test/test_analytics/test_devrank.py - - pipenv run pytest -s test/test_analytics/test_diff.py - - pipenv run pytest -s test/test_analytics/test_score.py - - pipenv run pytest -s test/test_analytics/test_srcml.py + - pipenv run pytest -s test/test_analytics - pipenv run pytest -s test/test_analytics2 - echo "Done" diff --git a/test/test_analytics/conftest.py b/test/test_analytics/conftest.py new file mode 100644 index 00000000000..a4c40467487 --- /dev/null +++ b/test/test_analytics/conftest.py @@ -0,0 +1,2 @@ + +collect_ignore = ["test_analyzer_cpp.py", "test_analyzer_lsp_ccls.py"] From fc3ad6147e68f10e603263703004c6d20d78a2ca Mon Sep 17 00:00:00 2001 From: Hezheng Yin Date: Fri, 24 May 2019 13:31:15 -0700 Subject: [PATCH 36/39] Update test readme --- test/README.md | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/test/README.md b/test/README.md index 76e7cf7cb20..88e589fb43b 100644 --- a/test/README.md +++ b/test/README.md @@ -2,12 +2,14 @@ Our recommended way to run tests is through [pytest](https://docs.pytest.org/en/latest/). -Installation with your favorite package manager: +It should have been installed if you have run `pipenv install`. Otherwise, install pytest with your favorite package manager: -``` -pip install -U pytest # pip +```bash +// pip +$ pip install -U pytest -conda install pytest # conda +// or conda +$ conda install pytest ``` ## Run Tests @@ -15,14 +17,26 @@ conda install pytest # conda To run the entire test suite, simply: ``` -cd test -pytest +cd ${root} +pipenv run pytest -s test/ ``` To test a specific module: ``` -pytest .py +pipenv run pytest -s .py ``` -To learn more about how pytest detects tests, follow this [link](https://docs.pytest.org/en/latest/goodpractices.html#goodpractices). \ No newline at end of file +To learn more about how pytest detects tests, follow this [link](https://docs.pytest.org/en/latest/goodpractices.html#goodpractices). + +## Tests that are ignored + +You can ignore certain tests by customizing test collection using `conftest.py`. For details, please see [here](https://docs.pytest.org/en/latest/example/pythoncollection.html#customizing-test-collection). + +Here is a list of tests that are currently ignored: + +1. `test/test_analytics/test_analyzer_cpp.py` +2. 
From 8dbd2703ee7dcd8cc9c2ee5828253d98cc6b48d1 Mon Sep 17 00:00:00 2001
From: xinyan
Date: Thu, 23 May 2019 23:20:11 +0800
Subject: [PATCH 37/39] #46 graphToDict should not modify passed-in CallCommitGraph.

---
 test/test_analytics/utility/graph_baseline.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/test/test_analytics/utility/graph_baseline.py b/test/test_analytics/utility/graph_baseline.py
index 00164fda366..e246f1f33a9 100644
--- a/test/test_analytics/utility/graph_baseline.py
+++ b/test/test_analytics/utility/graph_baseline.py
@@ -24,12 +24,14 @@ def formatEdgeId(u: str, v: str):
 
 
 def graphToDict(ccg: CallCommitGraph):
-    nodes = ccg.nodes(data=True)
-    for name, attr in nodes:
-        if "files" in attr:
-            files = list(attr["files"])
+    nodes = {}
+    for name, attr in ccg.nodes(data=True):
+        attr1 = dict(attr)
+        nodes[name] = attr1
+        if "files" in attr1:
+            files = list(attr1["files"])
             files.sort()
-            attr["files"] = files
+            attr1["files"] = files
     result = {
         "nodes": dict(nodes),
         "edges": dict(((formatEdgeId(u, v), data) for (u, v, data) in ccg.edges(data=True)))

From 3205c5ff3f287b863328a6ad8563994e2c48d8e2 Mon Sep 17 00:00:00 2001
From: YingjieLiu <18706819589@163.com>
Date: Sun, 26 May 2019 11:32:12 +0800
Subject: [PATCH 38/39] Fix test cases

---
 .../patch_test_files/example7_new.c        | 52 +++++++++++++++
 .../patch_test_files/example7_old.c        | 65 +++++++++++++++++++
 test/test_analytics/test_analyzer_cpp.py   |  8 +--
 test/test_analytics/test_detect_change.py  | 22 +++++--
 4 files changed, 138 insertions(+), 9 deletions(-)
 create mode 100644 test/test_analytics/patch_test_files/example7_new.c
 create mode 100644 test/test_analytics/patch_test_files/example7_old.c

diff --git a/test/test_analytics/patch_test_files/example7_new.c b/test/test_analytics/patch_test_files/example7_new.c
new file mode 100644
index 00000000000..9782b3507f9
--- /dev/null
+++ b/test/test_analytics/patch_test_files/example7_new.c
@@ -0,0 +1,52 @@
+/* added in H */
+struct node
+{
+    int data;
+    struct node *next;
+}*head;
+
+/* added in H, edited in I */
+void append(int num)
+{
+    struct node *temp, *prev;
+    temp=head;
+    while(temp!=NULL)
+    {
+        if(temp->data==num)
+        {
+            if(temp==head)
+            {
+                head=temp->next;
+                free(temp);
+                return 1;
+            }
+            else
+            {
+                prev->next=temp->next;
+                free(temp);
+                return 1;
+            }
+        }
+        else
+        {
+            prev=temp;
+            temp= temp->next;
+        }
+    }
+    return 0;
+}
+
+/* added in H, edited in G */
+void add( int num )
+{
+    struct node *temp;
+    temp=(struct node *)malloc(sizeof(struct node));
+    temp->data=num;
+    if (head== NULL)
+    {
+        head=temp;
+        head->next=NULL;
+    }
+}
+
+/* insert() is deleted in I */
diff --git a/test/test_analytics/patch_test_files/example7_old.c b/test/test_analytics/patch_test_files/example7_old.c
new file mode 100644
index 00000000000..6157d001d3f
--- /dev/null
+++ b/test/test_analytics/patch_test_files/example7_old.c
@@ -0,0 +1,65 @@
+/* added in H */
+struct node
+{
+    int data;
+    struct node *next;
+}*head;
+
+/* added in H */
+void append(int num)
+{
+    struct node *temp,*right;
+    temp= (struct node *)malloc(sizeof(struct node));
+    temp->data=num;
+    right=(struct node *)head;
+    while(right->next != NULL)
+        right=right->next;
+    right->next =temp;
+    right=temp;
+    right->next=NULL;
+}
+
+/* added in H */
+void add( int num )
+{
+    struct node *temp;
+    temp=(struct node *)malloc(sizeof(struct node));
+    temp->data=num;
+    if (head== NULL)
+    {
+        head=temp;
+        head->next=NULL;
+    }
+    else
+    {
+        temp->next=head;
+        head=temp;
} +} + +/* added in H */ +void insert(int num) +{ + int c=0; + struct node *temp; + temp=head; + if(temp==NULL) + { + add(num); + } + else + { + while(temp!=NULL) + { + if(temp->datanext; + } + if(c==0) + add(num); + else if(c Date: Sun, 26 May 2019 12:24:32 +0800 Subject: [PATCH 39/39] fix test_analyzer.py --- test/test_analytics/test_analyzer.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/test/test_analytics/test_analyzer.py b/test/test_analytics/test_analyzer.py index fd011d463aa..3ed8ca8b7b5 100644 --- a/test/test_analytics/test_analyzer.py +++ b/test/test_analytics/test_analyzer.py @@ -32,23 +32,23 @@ async def test_analyzer_master_only(az): history_truth = { 'K': { - 'display': {'adds': 0, 'dels': 5} + 'display': {'adds': 0, 'dels': 5, 'added_units': 0, 'removed_units': 10} }, 'F': { - 'display': {'adds': 14, 'dels': 0}, - 'count': {'adds': 12, 'dels': 0} + 'display': {'adds': 14, 'dels': 0, 'added_units': 23, 'removed_units': 0}, + 'count': {'adds': 12, 'dels': 0, 'added_units': 19, 'removed_units': 0} }, 'E': { - 'append': {'adds': 29, 'dels': 0}, - 'add': {'adds': 11, 'dels': 0} + 'append': {'adds': 29, 'dels': 0, 'added_units': 44, 'removed_units': 0}, + 'add': {'adds': 11, 'dels': 0, 'added_units': 25, 'removed_units': 0} }, 'D': { - 'str_replace': {'adds': 26, 'dels': 0} + 'str_replace': {'adds': 26, 'dels': 0, 'added_units': 76, 'removed_units': 0} }, # TODO: fix \No newline at the end of file 'C': { - 'str_append_chr': {'adds': 30, 'dels': 4}, - 'str_equals': {'adds': 0, 'dels': 1} + 'str_append_chr': {'adds': 30, 'dels': 4, 'added_units': 78, 'removed_units': 21}, + 'str_equals': {'adds': 0, 'dels': 1, 'added_units': 0, 'removed_units': 0} }, # Commit `B` is an example of imperfect diff, # it removes `str_append` and adds a new function `str_append_chr` @@ -56,15 +56,16 @@ async def test_analyzer_master_only(az): # diff doesn't separate these changes into two chunks # please see here: https://github.com/UltimateBeaver/test_feature_branch/commit/caaac10f604ea7ac759c2147df8fb2b588ee2a27 'B': { - 'str_append': {'adds': 6, 'dels': 3}, - 'str_append_chr': {'adds': 3, 'dels': 2}, - 'str_equals': {'adds': 11, 'dels': 0} + 'str_append': {'adds': 6, 'dels': 3, 'added_units': 29, 'removed_units': 21}, + 'str_append_chr': {'adds': 3, 'dels': 2, 'added_units': 21, 'removed_units': 15}, + 'str_equals': {'adds': 11, 'dels': 0, 'added_units': 27, 'removed_units': 0} }, 'A': { - 'str_append': {'adds': 7, 'dels': 0}, - 'str_len': {'adds': 6, 'dels': 0} + 'str_append': {'adds': 7, 'dels': 0, 'added_units': 29, 'removed_units': 0}, + 'str_len': {'adds': 6, 'dels': 0, 'added_units': 13, 'removed_units': 0} }, + # # branch J from commit A, merge back through F # 'J': { # 'count': {'adds': 12, 'dels': 0},