From a80b6a1a56900bbc62fa96932c35a58180ae162d Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Mon, 29 Apr 2024 04:10:29 -0700 Subject: [PATCH] i #282 add issues_or_pr endpoint and networks By testing the notebook and comparing outputs, found that the search endpoint only downloads issues. Also added a new notebook to display the communication network for github. Minor changes to graph display in a few notebooks so graph is enlarged in interactive form in the notebook. Signed-off-by: Carlos Paradis --- NAMESPACE | 1 + R/identity.R | 1 + conf/kaiaulu.yml | 4 ++-- vignettes/download_github_comments.Rmd | 32 +++++++++++++++++++------- vignettes/gitlog_entity_showcase.Rmd | 2 +- vignettes/gitlog_showcase.Rmd | 2 +- 6 files changed, 30 insertions(+), 12 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 0fb77822..08d904eb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -186,6 +186,7 @@ importFrom(stringi,stri_c) importFrom(stringi,stri_cmp_eq) importFrom(stringi,stri_detect_regex) importFrom(stringi,stri_match_all) +importFrom(stringi,stri_match_first) importFrom(stringi,stri_match_first_regex) importFrom(stringi,stri_replace_all_regex) importFrom(stringi,stri_replace_first) diff --git a/R/identity.R b/R/identity.R index b11728e9..94f52f47 100644 --- a/R/identity.R +++ b/R/identity.R @@ -185,6 +185,7 @@ identity_match <- function(project_log,name_column,assign_identity_function, # Various imports #' @importFrom stringi stri_replace_last #' @importFrom stringi stri_replace_first +#' @importFrom stringi stri_match_first #' @importFrom stringi stri_c #' @importFrom stringi stri_cmp_eq #' @importFrom stringi stri_replace_last_regex diff --git a/conf/kaiaulu.yml b/conf/kaiaulu.yml index fc26a928..d834ca39 100644 --- a/conf/kaiaulu.yml +++ b/conf/kaiaulu.yml @@ -36,7 +36,7 @@ version_control: # Where is the git log located locally? # This is the path to the .git of the project repository you are analyzing. 
# The .git is hidden, so you can see it using `ls -a` - log: ../../rawdata/git_repo/kaiaulu/.git + log: ../../rawdata/kaiaulu/git/kaiaulu_clone/.git # From where the git log was downloaded? log_url: https://github.com/sailuh/kaiaulu # List of branches used for analysis @@ -69,7 +69,7 @@ issue_tracker: # Download using `download_github_comments.Rmd` issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/sailuh_kaiaulu/ issue: ../../rawdata/kaiaulu/github/issue/sailuh_kaiaulu/ - issue_search: ../..rawdata/kaiaulu/github/issue_search/sailuh_kaiaulu/ + issue_search: ../../rawdata/kaiaulu/github/issue_search/sailuh_kaiaulu/ pull_request: ../../kaiaulu/github/pull_request/sailuh_kaiaulu/ commit: ../../rawdata/kaiaulu/github/commit/sailuh_kaiaulu/ # project_key_2: diff --git a/vignettes/download_github_comments.Rmd b/vignettes/download_github_comments.Rmd index 262fbf11..4f8e6f41 100644 --- a/vignettes/download_github_comments.Rmd +++ b/vignettes/download_github_comments.Rmd @@ -52,8 +52,8 @@ save_path_pull_request <- conf[["issue_tracker"]][["github"]][["project_key_1"]] save_path_issue_or_pr_comments <- conf[["issue_tracker"]][["github"]][["project_key_1"]][["issue_or_pr_comment"]] save_path_commit <- conf[["issue_tracker"]][["github"]][["project_key_1"]][["commit"]] # Path you wish to save all raw data. A folder with the repo name and sub-folders will be created. -owner <- conf[["issue_tracker"]][["github"]][["owner"]] # Has to match github organization (e.g. github.com/sailuh) -repo <- conf[["issue_tracker"]][["github"]][["repo"]] # Has to match github repository (e.g. github.com/sailuh/perceive) +owner <- conf[["issue_tracker"]][["github"]][["project_key_1"]][["owner"]] # Has to match github organization (e.g. github.com/sailuh) +repo <- conf[["issue_tracker"]][["github"]][["project_key_1"]][["repo"]] # Has to match github repository (e.g. 
github.com/sailuh/perceive) # your file github_token (a text file) contains the GitHub token API token <- scan("~/.ssh/github_token",what="character",quiet=TRUE) ``` @@ -84,7 +84,7 @@ If you would like to retrieve only issues **before** a certain date, set `date_l ```{r Collect issues by date x, eval = FALSE} -created_lower_bound_issue <- "2020-01-01" +created_lower_bound_issue <- "1990-01-01" created_upper_bound_issue <- "2021-01-01" # make initial API CALL gh_response <- github_api_project_issue_by_date(owner, @@ -209,6 +209,8 @@ github_api_iterate_pages(token,gh_response, prefix="issue", verbose=TRUE) ``` + + ```{r} all_issue_or_pr_comments <- lapply(list.files(save_path_issue_or_pr_comments, full.names = TRUE),read_json) @@ -219,6 +221,8 @@ all_issue_or_pr_comments <- rbindlist(all_issue_or_pr_comments,fill=TRUE) head(all_issue_or_pr_comments,2) %>% gt(auto_align = FALSE) ``` + + ## Refresh Issue or PR Comment Similar to the refresh of the issues, this chunk allows for the downloading of comments that have been created and/or updated since the most recently created date among data already downloaded. This allows us to 'refresh' the comments, downloading comments made or updated since that date or continue downloading if a rate limit was reached. @@ -232,7 +236,9 @@ Because the endpoint this function relies on is based on the updated timestamp, # get the data gh_response_issue_or_pr_comment <- github_api_project_issue_or_pr_comment_refresh(owner, repo, - token, save_path_issue_or_pr_comments, verbose=TRUE) + token, + save_path_issue_or_pr_comments, + verbose=TRUE) # create directory and iterate over data #dir.create(save_path_issue_or_pr_comments) @@ -275,11 +281,9 @@ github_api_iterate_pages(token,gh_response, ``` -## Issues Endpoint +## Issue or PR Endpoint -```{r eval= FALSE} -save_path_issue <- paste0(save_path,"/issue/") -``` +The refresh search endpoint does **not** include pull requests. 
If the intent is to download both issues and pull requests, then the `/issue/` endpoint should be used instead: ```{r Collect all issues from issue endpoint, eval = FALSE} @@ -290,6 +294,18 @@ github_api_iterate_pages(token,gh_response, prefix="issue") ``` +```{r} +all_issue_or_pr <- lapply(list.files(save_path_issue, + full.names = TRUE),read_json) +all_issue_or_pr <- lapply(all_issue_or_pr, + github_parse_project_issue_or_pr_comments) +all_issue_or_pr <- rbindlist(all_issue_or_pr,fill=TRUE) + +head(all_issue_or_pr,2) %>% + gt(auto_align = FALSE) +``` + + ## Pull Requests Endpoint Similarly to the Issue API, we can also obtain other metadata from pull requests, including their labels. diff --git a/vignettes/gitlog_entity_showcase.Rmd b/vignettes/gitlog_entity_showcase.Rmd index d6e4cbb8..b5efcea3 100644 --- a/vignettes/gitlog_entity_showcase.Rmd +++ b/vignettes/gitlog_entity_showcase.Rmd @@ -108,7 +108,7 @@ id_project_git <- project_log[["project_git"]] Author-Entity Network -```{r} +```{r , out.width="800px" , out.height="800px"} project_collaboration_network <- transform_gitlog_to_entity_bipartite_network(last(id_project_git, 1000), mode = "author-entity") diff --git a/vignettes/gitlog_showcase.Rmd b/vignettes/gitlog_showcase.Rmd index 9e4a95b7..adaa237d 100644 --- a/vignettes/gitlog_showcase.Rmd +++ b/vignettes/gitlog_showcase.Rmd @@ -159,7 +159,7 @@ project_collaboration_network <- transform_gitlog_to_bipartite_network(project_g And then we plot it! Try moving the mouse cursor over the image, and then zooming in or dragging by hold the left mouse button. When zoomed in sufficiently, the labels will be displayed on the nodes. -```{r} +```{r , out.width="800px" , out.height="800px"} plot_project_collaboration_network <- igraph::graph_from_data_frame(d=project_collaboration_network[["edgelist"]], directed = TRUE, vertices = project_collaboration_network[["nodes"]])