From 2e1b477af112041ea52a5d5085bea046f4be37d8 Mon Sep 17 00:00:00 2001
From: Mark Burgess
Date: Tue, 10 Dec 2024 17:49:55 -1000
Subject: [PATCH] i #324 Re-added refresh function

---
 R/github.R                             | 135 +++++++++++++++++++++++++
 vignettes/download_github_comments.Rmd |   2 +-
 2 files changed, 136 insertions(+), 1 deletion(-)

diff --git a/R/github.R b/R/github.R
index 33028e2..995ab95 100644
--- a/R/github.R
+++ b/R/github.R
@@ -414,6 +414,7 @@ github_parse_project_commits <- function(api_responses){
 #' available requests in the used user's token).
 #' The user can also define the maximum number of pages to download.
 #' Downloaded JSON files use POSIXct format (%Y-%m-%dT%H:%M:%SZ, UTC)
+#' The function only downloads the first 100 comments for each discussion.
 #'
 #' @param token Your Github API token
 #' @param owner Github's repository owner (e.g. sailuh)
@@ -439,6 +440,8 @@ github_api_discussions <- function(token, owner, repo, save_folder_path, max_pag
     discussions (first: 100', if(!is.null(cursor)) paste0(', after: "', cursor,'"'),') {
       pageInfo {
         hasNextPage
+        hasPreviousPage
+        startCursor
         endCursor
       }
       edges {
@@ -504,6 +507,138 @@ github_api_discussions <- function(token, owner, repo, save_folder_path, max_pag
   }
 }
 
+#' Refresh for Github Discussions downloader
+#'
+#' Download Discussions from the GraphQL API endpoint.
+#' Uses a query to obtain only the data defined by the user.
+#' Checks whether the save folder is empty, and calls the regular downloader if it is.
+#' Otherwise, it uses the unix timestamps in the downloaded JSON filenames to locate
+#' the file with the most recent discussion createdAt date,
+#' formatted as POSIXct (%Y-%m-%dT%H:%M:%SZ, UTC).
+#' GitHub API endpoints return data in pages, each containing 100 entries by default.
+#' Starting from that file's saved startCursor, the function iterates over the
+#' preceding pages in order to download only the discussions created since the
+#' last download (up to the requests remaining on the user's token).
+#' The function only downloads the first 100 comments of each discussion.
+#'
+#' @param token Your Github API token
+#' @param owner Github's repository owner (e.g. sailuh)
+#' @param repo Github's repository name (e.g. kaiaulu)
+#' @param save_folder_path A folder path to save the downloaded json pages "as-is".
+#' @param verbose A boolean value. When TRUE (default), progress messages are printed.
+#' @references For details, see \url{https://docs.github.com/en/graphql/guides/using-the-graphql-api-for-discussions}
+#' @export
+github_api_discussions_refresh <- function(token, owner, repo, save_folder_path, verbose=TRUE) {
+  # List all json files within the save_folder_path
+  contents <- list.files(save_folder_path)
+
+  # If there are no json files, download all discussions
+  if (length(contents) == 0) {
+    # Run the regular downloader
+    discussions <- github_api_discussions(token, owner, repo, save_folder_path)
+    return(discussions)
+  }
+
+  # Get the name of the file with the most recent date
+  latest_discussion <- contents[which.max(sapply(contents, function(filename) {
+    # Use regex to extract the latest unix timestamp from the filename
+    as.numeric(sub(".*_(\\d+)\\.json$", "\\1", basename(filename)))
+  }))]
+  # Read the JSON file
+  json_data <- fromJSON(file.path(save_folder_path, latest_discussion), simplifyVector = FALSE)
+  # Get the createdAt values
+  created_at <- sapply(json_data[["data"]][["repository"]][["discussions"]][["edges"]],
+                       function(edge) edge[["node"]][["createdAt"]])
+
+  # Find the latest createdAt
+  created_at <- max(created_at)
+
+  if (verbose) {
+    message("Latest discussion file: ", latest_discussion)
+    message("Latest created at: ", created_at)
+  }
+
+  # Initialize the cursor with the startCursor field: initializing it with
+  # endCursor instead could download duplicate values
+  cursor <- json_data[["data"]][["repository"]][["discussions"]][["pageInfo"]][["startCursor"]]
+
+  while (TRUE) {
+    # Form a new query requesting the pages before the saved cursor
+    query <- paste0('query {
+      repository (owner:"', owner, '", name:"', repo, '") {
+        discussions (first: 100, before: "', cursor, '") {
+          pageInfo {
+            hasNextPage
+            hasPreviousPage
+            startCursor
+            endCursor
+          }
+          edges {
+            node {
+              title
+              bodyText
+              author { login }
+              createdAt
+              category { name }
+              id
+              answer { id }
+              comments(first: 100) {
+                edges {
+                  node {
+                    discussion { id }
+                    bodyText
+                    author { login }
+                    id
+                    createdAt
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }')
+    # Make a new API call with the query
+    gh_response <- gh::gh("POST /graphql", query=query, .token=token)
+
+    # Stop when the response has no new discussions
+    if (length(gh_response[["data"]][["repository"]][["discussions"]][["edges"]]) == 0) {
+      if (verbose) {
+        message("No new discussions")
+      }
+      break
+    }
+
+    # Make the list of all created dates
+    created_dates <- sapply(gh_response[["data"]][["repository"]][["discussions"]][["edges"]],
+                            function(edge) edge[["node"]][["createdAt"]])
+    # Remove NULL entries and flatten to a character vector
+    created_dates <- unlist(Filter(Negate(is.null), created_dates))
+
+    # Convert to POSIXct date objects
+    date_objects <- as.POSIXct(created_dates, format="%Y-%m-%dT%H:%M:%SZ", tz="UTC")
+
+    # Find the greatest and smallest date
+    latest_date <- max(date_objects)
+    latest_date_unix <- as.numeric(latest_date)
+    oldest_date <- min(date_objects)
+    oldest_date_unix <- as.numeric(oldest_date)
+
+    # Construct the file_name
+    file_name <- paste0(save_folder_path,
+                        owner, "_", repo, "_", oldest_date_unix, "_", latest_date_unix,
+                        ".json")
+    # Save the json to the folder path
+    write_json(gh_response, file_name, pretty=TRUE, auto_unbox=TRUE)
+
+    # Check if more pages need to be downloaded
+    has_previous_page <- gh_response[["data"]][["repository"]][["discussions"]][["pageInfo"]][["hasPreviousPage"]]
+
+    if (has_previous_page) {
+      cursor <- gh_response[["data"]][["repository"]][["discussions"]][["pageInfo"]][["startCursor"]]
+    } else {
+      break
+    }
+  }
+}
+
 #' Parse Discussions JSON to Table
 #'
 #' @description This function parses through the JSON of Github Discussions
diff --git a/vignettes/download_github_comments.Rmd b/vignettes/download_github_comments.Rmd
index 8d6c6f5..c45cc0c 100644
--- a/vignettes/download_github_comments.Rmd
+++ b/vignettes/download_github_comments.Rmd
@@ -410,7 +410,7 @@ The function by default iterates through all available discussion pages (if user
 The user can define the maximum number of pages to download in the function parameter (e.g. max_pages=1)
 
 ```{r Collect Github Discussions, eval = FALSE, warning=FALSE}
-gh_response <- github_api_discussions(token, owner, repo, save_path_discussions)
+gh_response <- github_api_discussions_refresh(token, owner, repo, save_path_discussions)
 ```
 
 Two separate tables can be created from the api response.
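
The refresh depends on the filename convention used when pages are saved: `owner_repo_<oldest_unix>_<latest_unix>.json`. A minimal sketch of how the latest timestamp is recovered from such a name, using the same regex as in `github_api_discussions_refresh` (the filename below is made up for illustration):

```r
# Hypothetical filename following the owner_repo_<oldest>_<latest>.json
# convention used when pages are saved.
filename <- "sailuh_kaiaulu_1700000000_1702000000.json"

# The capture group grabs the digits between the last "_" and ".json",
# i.e. the latest discussion createdAt date as a unix timestamp.
latest_unix <- as.numeric(sub(".*_(\\d+)\\.json$", "\\1", basename(filename)))

# Converting the unix timestamp back to POSIXct (UTC) recovers the date.
as.POSIXct(latest_unix, origin = "1970-01-01", tz = "UTC")
```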
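
And a sketch of the intended two-step workflow from the vignette's perspective, assuming `token`, `owner`, `repo`, and `save_path_discussions` are defined as earlier in the vignette:

```r
# First run: save_path_discussions is empty, so the refresh falls back
# to github_api_discussions and downloads every discussion page.
gh_response <- github_api_discussions_refresh(token, owner, repo, save_path_discussions)

# Subsequent runs: only pages before the saved startCursor (i.e.
# discussions created since the last download) are fetched and saved.
gh_response <- github_api_discussions_refresh(token, owner, repo, save_path_discussions)
```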