Fix wordcloud: read abstracts with bibtex and resolve paths with here

nuest · nuest · commit ead63288210d · 2021-06-28T19:06:56.000+02:00
diff --git a/.binder/environment.yml b/.binder/environment.yml
@@ -19,7 +19,7 @@ dependencies:
   - r-ggpubr
   - r-ggthemes
   - r-here
-  - libstdcxx-ng
+  - r-bibtex
   - conda-build
   - autopep8
   - entrez-direct
diff --git a/.binder/start b/.binder/start
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# Work around "GLIBCXX_3.4.26 not found" in RStudio; source: https://discourse.jupyter.org/t/glibcxx-3-4-26-not-found-from-rstudio/7778/8
+set -e  # abort on the first failing command
+export LD_LIBRARY_PATH="${NB_PYTHON_PREFIX}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"  # append the old value only if non-empty: a trailing ':' would add the current directory to the search path
+exec "$@"  # replace this shell with the command given as arguments
diff --git a/.gitignore b/.gitignore
@@ -29,3 +29,5 @@ src/timeline.html
 .bashrc
 
 .jupyter-server-log.txt
+
+src/timeline/timeline.html
diff --git a/.here b/.here
diff --git a/src/wordcloud/wordcloud.R b/src/wordcloud/wordcloud.R
@@ -1,31 +1,45 @@
-library(bib2df)
-library(dplyr)
-library(tidytext)
-library(stringr)
-library(wordcloud)
-library(knitr)
-library(readr)
+library("bibtex")
+library("dplyr")
+library("tidytext")
+library("stringr")
+library("wordcloud")
+library("knitr")
+library("readr")
+library("here")
 
-pal <- brewer.pal(8,"Dark2")
+pal <- brewer.pal(8, "Dark2")
 
-useFullText<-TRUE
+useFullText <- FALSE
 
-if(useFullText==TRUE){
-  #full text from pdfs
-  readr::read_file("../data/citations/tokens.txt.gz") %>% 
-    stringr::str_replace_all("'","") %>% 
-    stringr::str_replace_all("\\[","") %>% 
-    stringr::str_replace_all("\\]","") %>% 
-    stringr::str_replace_all(" ","") %>% 
-    stringr::str_split(pattern=',',simplify = TRUE) %>%
+if (useFullText == TRUE) {
+  # full-text tokens extracted from the PDFs; this file cannot be shared publicly
+  readr::read_file(here::here("data/citations/tokens.txt.gz")) %>%
+    stringr::str_replace_all("'", "") %>%
+    stringr::str_replace_all("\\[", "") %>%
+    stringr::str_replace_all("\\]", "") %>%
+    stringr::str_replace_all(" ", "") %>%
+    stringr::str_split(pattern = ",", simplify = TRUE) %>%
     stringr::str_to_lower() -> tokenvec
-    data.frame(word=tokenvec) %>% anti_join(stop_words) %>% count(word, sort = TRUE) %>% ungroup() -> tokens_clean
-}else{
+    data.frame(word = tokenvec) %>%
+      anti_join(stop_words, by = "word") %>%  # drop stop words
+      count(word, sort = TRUE) %>%  # word frequencies, most frequent first
+      ungroup() -> tokens_clean
+} else {
   #just the abstracts
-  path<-"../data/citations/metadata-in-rcr-refs.bib"
-  df <- bib2df(path)
-  df %>% dplyr::filter(!is.na(ABSTRACT)) %>% unnest_tokens(word,ABSTRACT) %>% anti_join(stop_words) %>% count(word, sort = TRUE) %>% ungroup() -> tokens_clean
+  path <- here::here("data/citations/metadata-in-rcr-refs.bib")
+  bib <- bibtex::read.bib(path)
+  df <- data.frame(ABSTRACT = unlist(
+    lapply(bib, function(b) b$abstract)))  # entries without an abstract yield NULL, which unlist() drops
+  df %>% dplyr::filter(!is.na(ABSTRACT)) %>%
+    unnest_tokens(word, ABSTRACT) %>%
+    anti_join(stop_words, by = "word") %>%  # drop stop words
+    count(word, sort = TRUE) %>%  # word frequencies, most frequent first
+    ungroup() -> tokens_clean
 }
 
 tokens_clean %>%
-with(wordcloud(word, n, random.order = FALSE, max.words = 100, colors=pal)) -> word_cloud
+  with(wordcloud(word,
+    n,
+    random.order = FALSE,
+    max.words = 100,
+    colors = pal)) -> word_cloud