typilus · Jun 10, 2020 · Jun 10, 2020 · Jun 12, 2020 · Jun 12, 2020
Showing with 218 additions and 10 deletions.

+38 −0 README.md

+14 −7 entrypoint.py

+2 −0 src/changeutils.py

+3 −3 src/graph_generator/extract_graphs.py

+161 −0 typilus.py
diff --git a/README.md b/README.md
@@ -27,6 +27,44 @@ suggestions with only a partial context, at the cost of suggesting some false
 positives.
 
 
+### Run locally from CLI
+
+```
+git clone <your-python-repo> test-repo
+cat <<EOF >pr.json
+{
+    "action": "opened",
+    "pull_request": {
+      "url": "https://api.github.com/repos/<your-python-repo>/pulls/1",
+      "review_comments_url": "https://api.github.com/repos/<your-python-repo>/pulls/1/comments",
+      "head": {
+        "sha": "ec26c3e57ca3a959ca5aad62de7213c562f8c821"
+      }
+    }
+}
+EOF
+
+docker build -t typilus:v0.9-cli-auth-rename .
+docker run -it \
+    -v "$PWD":/data \
+    -e TY_DRY_RUN="1" \
+    -e TY_REPO_PATH=/data/test-repo \
+    -e GITHUB_USER="<your-github-username>" \
+    -e GITHUB_TOKEN="<your-personal-access-token>" \
+    -e GITHUB_EVENT_NAME=pull_request \
+    -e GITHUB_EVENT_PATH=/data/pr.json \
+    typilus:v0.9-cli-auth-rename .
+```
+
+### CLI reporting tool
+
+```sh
+pip3 install -r requirements.txt
+wget https://github.com/typilus/typilus-action/releases/download/v0.1/typilus20200507.pkl.gz
+
+./typilus.py --model typilus20200507.pkl.gz --repo . --file ./entrypoint.py
+```
+
 ### Install Action in your Repository
 
 To use the GitHub action, create a workflow file. For example,

diff --git a/entrypoint.py b/entrypoint.py
@@ -53,15 +53,15 @@ def __repr__(self) -> str:
     os.environ["GITHUB_EVENT_NAME"] == "pull_request"
 ), "This action runs only on pull request events."
 github_token = os.environ["GITHUB_TOKEN"]
-debug = False
+debug = os.getenv("TY_DEBUG", False)
 
 with open(os.environ["GITHUB_EVENT_PATH"]) as f:
     event_data = json.load(f)
     if debug:
         print("Event data:")
         print(json.dumps(event_data, indent=4))
 
-repo_path = "."  # TODO: Is this always true?
+repo_path = os.getenv("TY_REPO_PATH", "/usr/src") #"."  # TODO: Is this always true?
 
 if debug:
     print("ENV Variables")
@@ -71,9 +71,10 @@ def __repr__(self) -> str:
 diff_rq = requests.get(
     event_data["pull_request"]["url"],
     headers={
-        "authorization": f"Bearer {github_token}",
+        #"authorization": f"Bearer {github_token}",
         "Accept": "application/vnd.github.v3.diff",
     },
+    auth=(os.environ["GITHUB_USER"], github_token),
 )
 print("Diff GET Status Code: ", diff_rq.status_code)
 
@@ -164,7 +165,7 @@ def data_iter():
     for suggestion in type_suggestions:
         if suggestion.symbol_kind == "class-or-function":
             suggestion.annotation_lineno = find_annotation_line(
-                suggestion.filepath[1:], suggestion.file_location, suggestion.name
+                os.path.join(repo_path, suggestion.filepath[1:]), suggestion.file_location, suggestion.name
             )
         else:  # when the underlying symbol is a parameter
             suggestion.annotation_lineno = suggestion.file_location[0]
@@ -188,11 +189,12 @@ def report_confidence(suggestions):
             for s in suggestions
         )
 
+    dry_run = os.environ["TY_DRY_RUN"]
     for same_line_suggestions in grouped_suggestions:
         suggestion = same_line_suggestions[0]
         path = suggestion.filepath[1:]  # No slash in the beginning
         annotation_lineno = suggestion.annotation_lineno
-        with open(path) as file:
+        with open(os.path.join(repo_path, path)) as file:
             target_line = file.readlines()[annotation_lineno - 1]
         data = {
             "path": path,
@@ -210,7 +212,12 @@ def report_confidence(suggestions):
             "authorization": f"Bearer {github_token}",
             "Accept": "application/vnd.github.v3.raw+json",
         }
-        r = requests.post(comment_url, data=json.dumps(data), headers=headers)
+
+        if dry_run:
+            print("Skip posting actual comment to Github")
+        else:
+            r = requests.post(comment_url, data=json.dumps(data), headers=headers)
+
         if debug:
             print("URL: ", comment_url)
-            print(f"Data: {data}. Status Code: {r.status_code}. Text: {r.text}")
+            print(f"Data: {data}" + (f" Status Code: {r.status_code}. Text: {r.text}" if not dry_run else ""))
diff --git a/src/changeutils.py b/src/changeutils.py
@@ -47,6 +47,8 @@ def get_changed_files(diff: str, suffix=".py") -> Dict[str, Set[int]]:
         elif file_diff_lines[1].startswith("similarity"):
             assert file_diff_lines[2].startswith("rename")
             assert file_diff_lines[3].startswith("rename")
+            if len(file_diff_lines) == 4:
+                continue # skip file renames \wo any changes
             assert file_diff_lines[4].startswith("index")
             assert file_diff_lines[5].startswith("--- a/")
             assert file_diff_lines[6].startswith("+++ b/")

diff --git a/src/graph_generator/extract_graphs.py b/src/graph_generator/extract_graphs.py
@@ -65,13 +65,13 @@ def explore_files(
         if not os.path.isfile(file_path):
             continue
         with open(file_path, encoding="utf-8", errors="ignore") as f:
-            monitoring.increment_count()
-            monitoring.enter_file(file_path)
-
             # import pdb; pdb.set_trace()
             if file_path[len(root_dir) :] not in files_to_extract:
                 continue
 
+            monitoring.increment_count()
+            monitoring.enter_file(file_path)
+
             graph = build_graph(f.read(), monitoring, type_lattice)
             if graph is None or len(graph["supernodes"]) == 0:
                 continue

diff --git a/typilus.py b/typilus.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python
+
+import argparse
+import os
+import sys
+import uuid
+
+from glob import iglob
+from os.path import dirname
+from pathlib import Path
+from typing import Tuple, List
+sys.path.append(os.path.join(dirname(__file__), "src"))
+
+from dpu_utils.utils import load_jsonl_gz
+from ptgnn.implementations.typilus.graph2class import Graph2Class
+
+from annotationutils import (
+    annotate_line,
+    find_annotation_line,
+    group_suggestions,
+    annotation_rewrite,
+)
+from changeutils import get_changed_files
+from graph_generator.extract_graphs import extract_graphs
+
+import warnings
+warnings.filterwarnings("ignore")
+
+# Copied from entrypoint.py: that module has no main() guard, so importing it would execute the whole script.
+# from entrypoint import TypeSuggestion
+class TypeSuggestion:
+    """A single predicted type annotation for one symbol in a source file."""
+
+    def __init__(
+        self,
+        filepath: str,
+        name: str,
+        file_location: Tuple[int, int],
+        suggestion: str,
+        symbol_kind: str,
+        confidence: float,
+        annotation_lineno: int = 0,
+        is_disagreement: bool = False,
+    ):
+        """Store the prediction details verbatim on the instance.
+
+        Args:
+            filepath: File the symbol belongs to.
+            name: Name of the symbol the suggestion is about.
+            file_location: Pair of ints locating the symbol in the file.
+            suggestion: The suggested type annotation.
+            symbol_kind: Kind of the symbol the suggestion targets.
+            confidence: Value rendered as a percentage by ``__repr__``.
+            annotation_lineno: Line to annotate; defaults to 0.
+            is_disagreement: Whether the suggestion conflicts with an
+                existing annotation; defaults to False.
+        """
+        # Where the symbol lives.
+        self.filepath, self.name = filepath, name
+        self.file_location = file_location
+        # What was predicted for it.
+        self.suggestion, self.symbol_kind = suggestion, symbol_kind
+        self.confidence = confidence
+        # Reporting metadata.
+        self.annotation_lineno = annotation_lineno
+        self.is_disagreement = is_disagreement
+
+    def __repr__(self) -> str:
+        """Return a one-line, human-readable summary of the suggestion."""
+        where = f"Suggestion@{self.filepath}:{self.file_location} "
+        what = f"Symbol Name: `{self.name}` Suggestion `{self.suggestion}` "
+        how_sure = f"Confidence: {self.confidence:.2%}"
+        return where + what + how_sure
+
+
+
+# Command-line interface of the standalone reporting tool; main() reads
+# model_path, repo_path, file_path and debug from the parsed namespace.
+parser = argparse.ArgumentParser(
+    description="Inference from the pretrained model using https://github.com/typilus/typilus"
+)
+parser.add_argument(
+    "--model",
+    dest="model_path",
+    required=True,
+    help="path to the pretrained model in .pkl.gz format",
+)
+parser.add_argument(
+    "--repo",
+    dest="repo_path",
+    required=True,
+    help="path to source code repository to analyze",
+)
+parser.add_argument(
+    "--file",
+    dest="file_path",
+    required=True,
+    help="suggest type only for a given file (must be under --repo)",
+)
+parser.add_argument(
+    "-v", dest="debug", action="store_true", default=False, help="verbose debug output"
+)
+# Not implemented yet: read a diff from stdin and restrict suggestions to it.
+# parser.add_argument('-', dest="diff_stdin", action="store_true", default=False, help="suggest types only for the changed files (read diff from stdin)")
+
+# Usage:
+# wget https://github.com/typilus/typilus-action/releases/download/v0.1/typilus20200507.pkl.gz
+# ./typilus.py --model typilus20200507.pkl.gz --repo . --file ./entrypoint.py
+
+# TODO(bzz):
+# ./typilus.py --model typilus20200507.pkl.gz --repo .
+# git diff master^ | ./typilus.py --model typilus20200507.pkl.gz --repo . -
+
+def _repo_relative_key(repo_root: str, file_path: str) -> str:
+    """Return ``file_path`` as a separator-prefixed path relative to ``repo_root``.
+
+    Args:
+        repo_root: Absolute path of the repository root.
+        file_path: Path of the file to analyze, absolute or relative to the
+            current working directory.
+
+    Returns:
+        The path of the file below ``repo_root`` with a leading ``os.sep``,
+        e.g. ``/entrypoint.py``.
+
+    Raises:
+        ValueError: If ``file_path`` does not point below ``repo_root``.
+    """
+    rel = os.path.relpath(os.path.abspath(file_path), repo_root)
+    if rel == os.curdir or rel.split(os.sep, 1)[0] == os.pardir:
+        raise ValueError(f"'{file_path}' is not a file under '{repo_root}'")
+    return os.sep + rel
+
+
+def main():
+    """Extract graphs for one file, run the model on them and print suggestions.
+
+    Reads ``--model``, ``--repo``, ``--file`` and ``-v`` from the module-level
+    ``parser``. Intermediate graphs are written to a fresh ``graph/<uuid>``
+    directory under the current working directory. Every prediction for a
+    non-variable symbol is printed; the ones that pass the filters below are
+    counted in the final summary line.
+
+    Raises:
+        FileNotFoundError: If ``src/metadata/typingRules.json`` is missing
+            next to this script.
+    """
+    args = parser.parse_args()
+    debug = args.debug
+    model_path = args.model_path
+
+    # Work with an absolute root without a trailing separator, so that slicing
+    # the root off a path (as explore_files does with file_path[len(root_dir):])
+    # leaves a separator-prefixed relative path.
+    # NOTE(review): assumes graph["filename"] uses the same form - confirm
+    # against extract_graphs.
+    repo_path = os.path.abspath(args.repo_path)
+    try:
+        target_key = _repo_relative_key(repo_path, args.file_path)
+    except ValueError as err:
+        parser.error(str(err))  # prints usage and exits with status 2
+
+    # Maps each requested file to the line numbers of interest. An empty set
+    # means "every line", which is what the single-file mode wants.
+    # TODO: list all files under --repo by default; accept a diff on stdin and
+    # use get_changed_files(diff) to fill in the line sets.
+    changed_files = {target_key: set()}
+
+    # NOTE(review): entrypoint.py's threshold definition is not visible here;
+    # the variable name and the 0.5 default are assumptions to be confirmed.
+    suggestion_confidence_threshold = float(
+        os.getenv("SUGGESTION_CONFIDENCE_THRESHOLD", "0.5")
+    )
+
+    typing_rules_path = os.path.join(
+        dirname(__file__), "src", "metadata", "typingRules.json"
+    )
+    if not Path(typing_rules_path).exists():
+        raise FileNotFoundError(typing_rules_path)
+
+    out_dir = os.path.join("graph", str(uuid.uuid4()))
+    print(f"Intermediate output is saved under '{out_dir}'")
+    Path(out_dir).mkdir(parents=True)
+    extract_graphs(
+        repo_path,
+        typing_rules_path,
+        files_to_extract=set(changed_files),
+        target_folder=out_dir,
+    )
+
+    ## the rest mirrors entrypoint.py
+    def data_iter():
+        """Yield every graph stored in the extracted ``*.jsonl.gz`` files."""
+        for datafile_path in iglob(os.path.join(out_dir, "*.jsonl.gz")):
+            print(f"\nLooking into {datafile_path}...")
+            yield from load_jsonl_gz(datafile_path)
+
+    model, nn = Graph2Class.restore_model(model_path, "cpu")
+
+    type_suggestions: List[TypeSuggestion] = []
+    for graph, predictions in model.predict(data_iter(), nn, "cpu"):
+        # predictions has the type: Dict[int, Tuple[str, float]]
+        filepath = graph["filename"]
+
+        if debug:
+            print("Predictions:", predictions)
+            print("SuperNodes:", graph["supernodes"])
+
+        # None: this file was not requested. Empty set: no line restriction.
+        requested_lines = changed_files.get(filepath)
+        if requested_lines is None:
+            print(f"Note: '{filepath}' was not requested; its suggestions are not counted.")
+
+        for supernode_idx, (predicted_type, predicted_prob) in predictions.items():
+            supernode_data = graph["supernodes"][str(supernode_idx)]
+            if supernode_data["type"] == "variable":
+                continue  # Do not suggest annotations on variables for now.
+            lineno, colno = supernode_data["location"]
+            existing_annotation = supernode_data["annotation"]
+            suggestion = TypeSuggestion(
+                filepath,
+                supernode_data["name"],
+                (lineno, colno),
+                annotation_rewrite(predicted_type),
+                supernode_data["type"],
+                predicted_prob,
+                is_disagreement=existing_annotation != "??"
+                and existing_annotation != predicted_type,
+            )
+
+            print("\t", suggestion)
+
+            if requested_lines is None:
+                continue
+            if requested_lines and lineno not in requested_lines:
+                continue
+            if suggestion.name == "%UNK%":
+                continue
+
+            if (
+                existing_annotation == "??"
+                and suggestion.confidence > suggestion_confidence_threshold
+            ):
+                type_suggestions.append(suggestion)
+            # TODO: Disagreements with existing annotations are disabled for
+            # now; they would need their own confidence threshold.
+
+    print(f"Done, {len(type_suggestions)} suggestions found.")
+
+
+if __name__ == "__main__":
+    main()