Update README.Rmd #17

279 changes: 210 additions & 69 deletions README.Rmd
@@ -11,116 +11,254 @@ This project's main contents are located in the project's [Wiki](wiki#welcome-to
# USCbiostats R packages

```{r setup, include=FALSE}
library(httr)
library(stringr)
library(knitr)
library(scholar) # <--- The key difference
```


```{r, include=FALSE}
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
```

```{r listing-pkgs, include=FALSE}
# We'll assume `packages.csv` has columns:
# name, repo, on_bioc, scholar_id, pubid, google_scholar, description
# Lines starting with '#' in CSV are ignored.
pkgs <- read.csv("packages.csv", comment.char = "#", stringsAsFactors = FALSE)
# Alphabetically ordered
pkgs <- pkgs[order(pkgs$name),,drop=FALSE]
# If on_bioc does not exist, create it
if (!"on_bioc" %in% names(pkgs)) {
  pkgs$on_bioc <- FALSE
} else {
  # Convert text "TRUE"/"FALSE" to logical
  pkgs$on_bioc <- pkgs$on_bioc %in% c("TRUE", "True", "true")
}

# Check CRAN status
pkgs$on_cran <- TRUE
for (i in seq_len(nrow(pkgs))) {
  nm   <- pkgs$name[i]
  url  <- sprintf("https://cran.r-project.org/package=%s", nm)
  resp <- tryCatch(httr::GET(url), error = function(e) e)
  # Not on CRAN if the request errored or did not return HTTP 200
  if (inherits(resp, "error") || httr::status_code(resp) != 200) {
    pkgs$on_cran[i] <- FALSE
    next
  }
}
# Drop rows with missing package names (the table is already sorted above)
pkgs <- pkgs[!(is.na(pkgs$name) | pkgs$name == ""), , drop = FALSE]

# Build the data frame that will become our final table
dat <- data.frame(
  Name        = character(nrow(pkgs)),
  Description = character(nrow(pkgs)),
  Citations   = character(nrow(pkgs)), # filled in below
  stringsAsFactors = FALSE
)
for (i in seq_len(nrow(pkgs))) {
  nm <- pkgs$name[i]

  repo_url <- if (!is.na(pkgs$repo[i]) && nzchar(pkgs$repo[i])) {
    pkgs$repo[i]
  } else {
    paste0("https://github.com/USCbiostats/", nm)
  }

  # The clickable package name
  dat$Name[i] <- sprintf("[**%s**](%s)", nm, repo_url)

  desc_txt <- pkgs$description[i] # base description

  # If on CRAN, add version and download badges
  if (pkgs$on_cran[i]) {
    desc_txt <- paste(
      desc_txt,
      sprintf("[![CRAN status](https://www.r-pkg.org/badges/version/%1$s)](https://CRAN.R-project.org/package=%1$s)", nm),
      sprintf("[![CRAN downloads](https://cranlogs.r-pkg.org/badges/grand-total/%1$s)](https://CRAN.R-project.org/package=%1$s)", nm)
    )
  }

  # If on Bioconductor, add build-status and download badges
  if (pkgs$on_bioc[i]) {
    desc_txt <- paste(
      desc_txt,
      # Build status shield
      sprintf("[![BioC build status](https://bioconductor.org/shields/build/release/bioc/%1$s.svg)](https://bioconductor.org/packages/release/bioc/html/%1$s.html)", nm),
      # Downloads rank shield
      sprintf("[![BioC downloads](https://bioconductor.org/shields/downloads/release/%1$s.svg)](https://bioconductor.org/packages/release/bioc/html/%1$s.html)", nm)
    )
  }

  dat$Description[i] <- desc_txt
}
# Initialize Citations
dat$Citations <- ""
```
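For reference, here is a minimal sketch of what `packages.csv` might look like given the columns assumed above; the package names, IDs, and URL in these rows are made-up placeholders, not entries from the actual registry:

```r
# Hypothetical packages.csv contents (placeholder rows, for illustration only)
example_csv <- c(
  "name,repo,on_bioc,scholar_id,pubid,google_scholar,description",
  "# comment lines like this one are ignored",
  "examplepkg,,FALSE,ABC123xyz,,https://scholar.google.com/scholar?cites=123,An example package.",
  "biocpkg,https://github.com/USCbiostats/biocpkg,TRUE,,,,A Bioconductor example."
)
str(read.csv(text = example_csv, comment.char = "#", stringsAsFactors = FALSE))
```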

```{r citations, include=FALSE}
# -----------------------------
# 1) Scholar approach:
# -----------------------------
get_scholar_citation_count <- function(sid, pubid, pkg_name) {
  # If there's a specific publication ID,
  # use get_article_cite_history() and sum the 'cites' column
  if (!is.na(pubid) && nzchar(pubid)) {
    article_hist <- tryCatch(
      get_article_cite_history(sid, pubid),
      error = function(e) NULL
    )
    if (is.data.frame(article_hist) && nrow(article_hist) > 0 && "cites" %in% names(article_hist)) {
      return(sum(article_hist$cites))
    } else {
      return(NA_integer_)
    }
  } else {
    # Otherwise, fall back to a fuzzy match on the package name in get_publications()
    pubs <- tryCatch(
      get_publications(sid),
      error = function(e) NULL
    )
    if (!is.null(pubs) && is.data.frame(pubs) && nrow(pubs) > 0) {
      idx <- which(grepl(pkg_name, pubs$title, ignore.case = TRUE))
      if (length(idx) > 0) {
        # Return the first match's total cites
        return(pubs$cites[idx[1]])
      }
    }
    return(NA_integer_)
  }
}
# -----------------------------
# 2) Old HTML scraping approach:
# -----------------------------
# Try to parse a Google Scholar URL (like ?cites=...) using GET + iconv,
# then run a regex to find an "XXX results" line.
# If found, return XXX as an integer; otherwise NA.
get_html_scrape_citation_count <- function(gs_url) {
  # If there is no URL, there is nothing to scrape
  if (is.na(gs_url) || !nzchar(gs_url)) {
    return(NA_integer_)
  }
  # Fetch the page as raw bytes and convert to text
  page_txt <- tryCatch({
    resp <- httr::GET(gs_url)
    if (httr::status_code(resp) != 200) {
      stop("HTTP status not 200")
    }
    raw_ct <- httr::content(resp, as = "raw")
    iconv(rawToChar(raw_ct), from = "UTF-8", to = "UTF-8", sub = "byte")
  }, error = function(e) NULL)
  if (is.null(page_txt)) {
    return(NA_integer_)
  }
  # Split into lines
  lines <- strsplit(page_txt, "\n", fixed = TRUE)[[1]]
  # Remove some HTML tags (might or might not help)
  lines <- gsub("\\<[[:alnum:]_/-]+\\>", "", lines, perl = TRUE)
  # The old code looked for something like "123 results (0.23 sec)",
  # but Scholar may also say "About 123 results...", so match both forms
  re <- "About\\s+([0-9,]+)\\s+results\\s*(\\([^)]*\\))?|([0-9,]+)\\s+results\\s*(\\([^)]*\\))?"
  m <- regexpr(re, lines, perl = TRUE, ignore.case = TRUE)
  # Find the first line that matches
  line_idx <- which(m != -1)
  if (length(line_idx) == 0) {
    return(NA_integer_)
  }
  line_of_interest <- lines[line_idx[1]]
  # Extract the numeric portion with stringr
  nums_found <- stringr::str_extract_all(line_of_interest, "[0-9,]+")[[1]]
  if (length(nums_found) == 0) {
    return(NA_integer_)
  }
  # Convert e.g. "1,234" -> 1234
  as.integer(gsub("[^0-9]", "", nums_found[1]))
}
tot_citations <- 0L
# Now loop over each package row
for (i in seq_len(nrow(pkgs))) {
  pkg_name <- pkgs$name[i]
  sid      <- if ("scholar_id" %in% names(pkgs)) pkgs$scholar_id[i] else NA_character_
  pubid    <- if ("pubid" %in% names(pkgs)) pkgs$pubid[i] else NA_character_
  old_link <- if ("google_scholar" %in% names(pkgs)) pkgs$google_scholar[i] else NA_character_

  cval <- NA_integer_

  # 1) Try the scholar approach if sid is not empty
  if (!is.na(sid) && nzchar(sid)) {
    cval <- get_scholar_citation_count(sid, pubid, pkg_name)
    if (!is.na(cval) && cval >= 0) {
      if (!is.na(pubid) && nzchar(pubid)) {
        # We have a link to the actual publication
        dat$Citations[i] <- sprintf("[%d](%s)", cval, old_link)
      } else {
        # We only have the count, no direct pub link
        dat$Citations[i] <- as.character(cval)
      }
      tot_citations <- tot_citations + cval
      next # Done with this package
    }
  }

  # 2) Fallback: the old HTML approach using the google_scholar column
  cval_html <- get_html_scrape_citation_count(old_link)
  if (!is.na(cval_html) && cval_html >= 0) {
    dat$Citations[i] <- sprintf("[%d](%s)", cval_html, old_link)
    tot_citations <- tot_citations + cval_html
  }
}

# Sanity check: a total of zero almost certainly means scraping failed
if (tot_citations == 0L)
  stop("There can't be 0 citations! Make sure things are running as expected!")
```
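As a rough usage sketch, the two helpers defined above would be called like this; the scholar ID, publication ID, and URL below are placeholders, not real records:

```r
# Placeholder IDs/URL for illustration -- not real Scholar records
get_scholar_citation_count("ABC123xyz", "DEF456uvw", "examplepkg")   # sums the article's cite history
get_scholar_citation_count("ABC123xyz", NA_character_, "examplepkg") # falls back to a title match
get_html_scrape_citation_count("https://scholar.google.com/scholar?cites=1234567890")
```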

As of `r Sys.Date()`, the packages listed here have been cited **`r tot_citations`** times
(source: Google Scholar).

```{r printing, echo = FALSE}
knitr::kable(dat, row.names = FALSE)
```

To update this list, modify the file [packages.csv](packages.csv). The
`README.md` file is updated automatically using GitHub Actions, so there's no
need to "manually" recompile the README file after updating the list.
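For local testing, a one-liner along these lines should approximate what the workflow does, assuming the `rmarkdown` package is installed:

```r
# Re-render README.Rmd into README.md (GitHub-flavored markdown output)
rmarkdown::render("README.Rmd", output_format = "github_document")
```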


# Coding Standards

1. [Coding Standards](wiki#coding-standards)
@@ -148,3 +286,6 @@ We do have some direct guidelines developed as issue templates [here](templates)
The Happy Scientist Seminars are educational seminars sponsored by Core D of IMAGE, the Biostats Program Project award. The series is aimed at providing educational material for members of Biostats, both students and faculty, about a variety of tools and methods that might prove useful to them. If you have any suggestions for subjects you would like to learn about in the future, please send an email to Kim Siegmund at ([email protected]). Our agenda will be driven by your specific interests as far as possible.

A list of past seminars with material can be found [here](/happy_scientist/).