-
Notifications
You must be signed in to change notification settings - Fork 1
/
chipClinvarMatch
95 lines (78 loc) · 3.03 KB
/
chipClinvarMatch
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# ---- 23andMe data processing script ----
# rentrez provides the R interface to the NCBI APIs; tidyr and dplyr supply
# the reshaping and filtering verbs used below.
library("rentrez")
library("tidyr")
library("dplyr")
# Read the chip CSV and keep only its RsID column as a one-column data frame.
rsids <- read.csv(file = "C:/Users/luist/Desktop/GSA-24v2-0_A1_b150_rsids.csv")
RSID <- rsids$RsID
RSIDD <- data.frame(RSID = RSID)
# A cell may hold several ids joined by ","; split them and give each id
# its own row.
RSIDDD <- RSIDD %>%
  mutate(RSID = strsplit(as.character(RSID), ",")) %>%
  unnest(RSID)
# Drop the rows whose id is the placeholder ".".
subID <- filter(RSIDDD, RSID != ".")
# ------ search from ClinVar --------
# For every id in subID$RSID, search ClinVar and store the start position
# (g38 / g37) and chromosome (loc38 / loc37) taken from the record's
# variation_loc entry. Rows with no hit or no location data stay NA.
subID$g37 <- subID$loc37 <- subID$g38 <- subID$loc38 <- NA
# seq_len() gives an empty sequence for a zero-row table, unlike 1:nrow().
for (row in seq_len(nrow(subID))) {
  i <- subID$RSID[row]
  cat("\n i: ", i, " row ", row, " ")
  # Reset the per-row values so a record without location data leaves NA
  # instead of silently re-using the coordinates of an earlier row.
  g37 <- chr37 <- g38 <- chr38 <- NA
  result <- entrez_search(db = "clinvar", term = i)
  if (length(result$ids) > 0) {
    # Summarise only the first hit: when several ids are passed,
    # entrez_summary() returns a list of records and the field access
    # below would find nothing.
    record <- entrez_summary(db = "clinvar", id = result$ids[1])
    loc <- record$variation_set$variation_loc
    if (!is.null(loc) && length(loc) > 0) {
      # NOTE(review): assumes entry 1 is the GRCh38 location and entry 2
      # the GRCh37 location -- confirm against the ClinVar summary layout.
      g38 <- loc[[1]]$start[1]
      g37 <- loc[[1]]$start[2]
      chr38 <- loc[[1]]$chr[1]
      chr37 <- loc[[1]]$chr[2]
    }
    if (length(result$ids) > 1) {
      cat("multiple records found")
    }
    subID$g37[row] <- g37
    subID$loc37[row] <- chr37
    subID$g38[row] <- g38
    subID$loc38[row] <- chr38
  }
}
# ---- AncestryDNA processing script ----
# rentrez provides the R interface to the NCBI APIs; tidyr and dplyr supply
# the reshaping and filtering verbs used below.
library("rentrez")
library("tidyr")
library("dplyr")
# Read the chip CSV and keep only its RsID column as a one-column data frame.
ancestrychipraw <- read.csv(file = "C:/Users/luist/Desktop/InfiniumOmniExpress-24v1-3_A1_b150_rsids.csv")
ancestrychipid <- ancestrychipraw$RsID
ancestrychipdf <- data.frame(ancestrychipid = ancestrychipid)
# A cell may hold several ids joined by ","; split them and give each id
# its own row.
ANchip <- ancestrychipdf %>%
  mutate(ancestrychipid = strsplit(as.character(ancestrychipid), ",")) %>%
  unnest(ancestrychipid)
# Drop the rows whose id is the placeholder ".".
ANchips <- filter(ANchip, ancestrychipid != ".")
# ------ search from ClinVar --------
# For every id in ANchips$ancestrychipid, search ClinVar and store the start
# position (g38 / g37) and chromosome (loc38 / loc37) taken from the record's
# variation_loc entry. Rows with no hit or no location data stay NA.
ANchips$g37 <- ANchips$loc37 <- ANchips$g38 <- ANchips$loc38 <- NA
# seq_len() gives an empty sequence for a zero-row table, unlike 1:nrow().
for (row in seq_len(nrow(ANchips))) {
  i <- ANchips$ancestrychipid[row]
  cat("\n i: ", i, " row ", row, " ")
  # Reset the per-row values so a record without location data leaves NA
  # instead of silently re-using coordinates left over from an earlier
  # lookup.
  g37 <- chr37 <- g38 <- chr38 <- NA
  result <- entrez_search(db = "clinvar", term = i)
  if (length(result$ids) > 0) {
    # Summarise only the first hit: when several ids are passed,
    # entrez_summary() returns a list of records and the field access
    # below would find nothing.
    record <- entrez_summary(db = "clinvar", id = result$ids[1])
    loc <- record$variation_set$variation_loc
    if (!is.null(loc) && length(loc) > 0) {
      # NOTE(review): assumes entry 1 is the GRCh38 location and entry 2
      # the GRCh37 location -- confirm against the ClinVar summary layout.
      g38 <- loc[[1]]$start[1]
      g37 <- loc[[1]]$start[2]
      chr38 <- loc[[1]]$chr[1]
      chr37 <- loc[[1]]$chr[2]
    }
    if (length(result$ids) > 1) {
      cat("multiple records found")
    }
    ANchips$g37[row] <- g37
    ANchips$loc37[row] <- chr37
    ANchips$g38[row] <- g38
    ANchips$loc38[row] <- chr38
  }
}