|
1 |
| -library(bib2df) |
2 |
| -library(dplyr) |
3 |
| -library(tidytext) |
4 |
| -library(stringr) |
5 |
| -library(wordcloud) |
6 |
| -library(knitr) |
7 |
| -library(readr) |
| 1 | +library("bibtex") |
| 2 | +library("dplyr") |
| 3 | +library("tidytext") |
| 4 | +library("stringr") |
| 5 | +library("wordcloud") |
| 6 | +library("knitr") |
| 7 | +library("readr") |
| 8 | +library("here") |
8 | 9 |
|
9 |
| -pal <- brewer.pal(8,"Dark2") |
| 10 | +pal <- brewer.pal(8, "Dark2") |
10 | 11 |
|
11 |
| -useFullText<-TRUE |
| 12 | +useFullText <- FALSE |
12 | 13 |
|
13 |
| -if(useFullText==TRUE){ |
14 |
| - #full text from pdfs |
15 |
| - readr::read_file("../data/citations/tokens.txt.gz") %>% |
16 |
| - stringr::str_replace_all("'","") %>% |
17 |
| - stringr::str_replace_all("\\[","") %>% |
18 |
| - stringr::str_replace_all("\\]","") %>% |
19 |
| - stringr::str_replace_all(" ","") %>% |
20 |
| - stringr::str_split(pattern=',',simplify = TRUE) %>% |
| 14 | +if (useFullText == TRUE) { |
| 15 | + #full text from pdfs, cannot be shared publicly |
| 16 | + readr::read_file("../data/citations/tokens.txt.gz") %>% |
| 17 | + stringr::str_replace_all("'", "") %>% |
| 18 | + stringr::str_replace_all("\\[", "") %>% |
| 19 | + stringr::str_replace_all("\\]", "") %>% |
| 20 | + stringr::str_replace_all(" ", "") %>% |
| 21 | + stringr::str_split(pattern = ",", simplify = TRUE) %>% |
21 | 22 | stringr::str_to_lower() -> tokenvec
|
22 |
| - data.frame(word=tokenvec) %>% anti_join(stop_words) %>% count(word, sort = TRUE) %>% ungroup() -> tokens_clean |
23 |
| -}else{ |
| 23 | + data.frame(word = tokenvec) %>% |
| 24 | + anti_join(stop_words) %>% |
| 25 | + count(word, sort = TRUE) %>% |
| 26 | + ungroup() -> tokens_clean |
| 27 | +} else { |
24 | 28 | #just the abstracts
|
25 |
| - path<-"../data/citations/metadata-in-rcr-refs.bib" |
26 |
| - df <- bib2df(path) |
27 |
| - df %>% dplyr::filter(!is.na(ABSTRACT)) %>% unnest_tokens(word,ABSTRACT) %>% anti_join(stop_words) %>% count(word, sort = TRUE) %>% ungroup() -> tokens_clean |
| 29 | + path <- here::here("data/citations/metadata-in-rcr-refs.bib") |
| 30 | + bib <- bibtex::read.bib(path) |
| 31 | + df <- data.frame(`ABSTRACT` = unlist( |
| 32 | + sapply(bib, function(b) { b$abstract }))) |
| 33 | + df %>% dplyr::filter(!is.na(ABSTRACT)) %>% |
| 34 | + unnest_tokens(word, ABSTRACT) %>% |
| 35 | + anti_join(stop_words) %>% |
| 36 | + count(word, sort = TRUE) %>% |
| 37 | + ungroup() -> tokens_clean |
28 | 38 | }
|
29 | 39 |
|
30 | 40 | tokens_clean %>%
|
31 |
| -with(wordcloud(word, n, random.order = FALSE, max.words = 100, colors=pal)) -> word_cloud |
| 41 | + with(wordcloud(word, |
| 42 | + n, |
| 43 | + random.order = FALSE, |
| 44 | + max.words = 100, |
| 45 | + colors = pal)) -> word_cloud |
0 commit comments