-
Notifications
You must be signed in to change notification settings - Fork 13
/
index.Rmd
88 lines (76 loc) · 2.18 KB
/
index.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
---
title: "HSDN Symptoms"
output:
  html_document:
    theme: cosmo
    highlight: pygments
---
```{r, message=FALSE}
# Packages used throughout this document:
#   dplyr   - data-frame verbs (filter, select, rename, inner_join, summarize)
#             and the %>% pipe
#   DT      - interactive HTML tables via DT::datatable()
#   ggplot2 - histograms and violin plots
library(dplyr)
library(DT)
library(ggplot2)
# Keep text columns as character vectors rather than factors when data frames
# are created or read (this is already the default from R 4.0 onwards).
options(stringsAsFactors=FALSE)
```
Symptoms from the [HSDN](https://dx.doi.org/10.1038/ncomms5212) (Human symptoms--disease network), mapped to Disease Ontology slim terms through their MeSH cross-references.
```{r}
# Load the Disease Ontology (slim) cross-reference table and keep only the
# rows whose resource is 'MSH' (the DO-to-MESH mapping). The cross-referenced
# identifier is exposed as `disease_id`; the now-constant `resource` column is
# dropped.
# Alternative source, currently unused:
#   'http://git.dhimmel.com/disease-ontology/data/xrefs-prop-slim.tsv'
mesh.df <-
  read.delim('http://git.dhimmel.com/disease-ontology/data/xrefs-slim.tsv') %>%
  dplyr::filter(resource == 'MSH') %>%
  dplyr::rename(disease_id = resource_id) %>%
  dplyr::select(-resource)
# Load the HSDN supplement that was mapped to MESH. The first column of the
# file supplies the row names, the original headers are kept verbatim
# (check.names = FALSE), and the columns used below are then given
# snake_case names.
hsdn.df <- dplyr::rename(
  read.delim(
    'https://raw.githubusercontent.com/LABrueggs/HSDN/master/Combined-Output.tsv',
    row.names = 1,
    check.names = FALSE
  ),
  disease_id   = `MeSH Disease ID`,
  disease_name = `MeSH Disease Term`,
  symptom_id   = `MeSH Symptom ID`,
  symptom_name = `MeSH Symptom Term`,
  cooccurs     = `PubMed occurrence`,
  tfidf_score  = `TFIDF score`
)
# Add the Disease Ontology columns from mesh.df to every HSDN row whose MESH
# disease has a mapping. The key is stated explicitly so the join cannot
# silently pick up any other column name the two tables might share, and so
# no "Joining, by = ..." message is emitted.
hsdn.df <- hsdn.df %>%
  dplyr::inner_join(mesh.df, by = 'disease_id')

# Save the mapped table as a tab-separated file. The output directory is
# created first (a no-op when it already exists) so write.table() does not
# fail on a fresh checkout without a 'data' folder.
path <- file.path('data', 'symptoms-DO.tsv')
dir.create(dirname(path), showWarnings = FALSE, recursive = TRUE)
write.table(hsdn.df, path, sep='\t', row.names=FALSE, quote=FALSE)

# Interactive table restricted to associations with tfidf_score >= 25
DT::datatable(hsdn.df %>% dplyr::filter(tfidf_score >= 25))
```
```{r}
# Distribution of tfidf_scores (log10 x-axis).
# bins = 30 is ggplot2's default bin count; stating it leaves the plot
# unchanged while avoiding the "Pick better value with `binwidth`" message.
hsdn.df %>%
  ggplot(aes(x = tfidf_score)) +
  geom_histogram(alpha = 0.6, bins = 30) +
  scale_x_log10() +
  theme_bw()

# Distribution of tfidf_scores (zoomed in to 0-50).
# coord_cartesian() zooms the view without touching the data. xlim() instead
# sets scale limits, which discards every score outside the range before
# binning and drops the bars that straddle the limits (both with warnings).
hsdn.df %>%
  ggplot(aes(x = tfidf_score)) +
  geom_histogram(alpha = 0.6, binwidth = 1) +
  coord_cartesian(xlim = c(0, 50)) +
  theme_bw()
```
```{r, fig.height=40}
# Violin plot of tfidf_score (log10 scale), one violin per DO slim disease.
# coord_flip() puts the disease names on the vertical axis.
ggplot(hsdn.df, aes(x = doid_name, y = tfidf_score)) +
  geom_violin(color = NA, fill = 'darkgreen') +
  # geom_jitter(alpha = 0.4) +
  coord_flip() +
  scale_y_log10() +
  theme_bw()
```
```{r, fig.height=40}
# Number of symptoms per DO slim disease, plus the count and percentage of
# those symptoms with tfidf_score >= 25.
hsdn.df %>%
  dplyr::group_by(doid_code, doid_name) %>%
  dplyr::summarize(
    n_symptoms = n(),
    number_above_25 = sum(tfidf_score >= 25),
    percent_above_25 = round(100 * mean(tfidf_score >= 25), 3)
  ) %>%
  # summarize() removes only the last grouping variable, so the result would
  # otherwise stay grouped by doid_code; hand a plain table to the widget.
  dplyr::ungroup() %>%
  DT::datatable()
```