-
Notifications
You must be signed in to change notification settings - Fork 0
/
Filter_PP.R
206 lines (145 loc) · 8.72 KB
/
Filter_PP.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
######################################################################
#FILTER_PP:
#This script create a xml file with filted data from Protein Pilot export
######################################################################
list.of.packages <- c("readr", "readxl","WriteXLS", "openxlsx")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
library('readr'); library('readxl'); library('WriteXLS'); library("openxlsx")
#Suppress warnings globally
options(warn = -1)
print("Ejecutando... Puede tardar un poco. ")
#Count files
files_glob_peptides <- (Sys.glob("*_PeptideSummary.txt"))
files_glob_proteins <- (Sys.glob("*_ProteinSummary.txt"))
if (length(files_glob_peptides) != length(files_glob_proteins)) {
print("??WARNING!! Different number of files: Protein and Peptides")
} else {
i <- 1
while(i <= length(files_glob_peptides)){
#Suppress warnings globally
options(warn = -1)
print("Ejecutando... Puede tardar un poco. ")
#Loading data
# Proteins_PP <- read_delim("Y:/ENRIQUE/Filtro PP en R/17-42_Muestra 161_NCBISmMt_ProteinSummary.txt",
# "\t", escape_double = FALSE, trim_ws = TRUE)
#
# Peptidos_PP <- read_delim("Y:/ENRIQUE/Filtro PP en R/17-42_Muestra 161_NCBISmMt_PeptideSummary.txt",
# "\t", escape_double = FALSE, trim_ws = TRUE)
Proteins_PP <- read_delim(files_glob_proteins[i],
"\t", escape_double = FALSE, trim_ws = TRUE)
Peptidos_PP <- read_delim(files_glob_peptides[i],
"\t", escape_double = FALSE, trim_ws = TRUE)
#choose the columns that we want.
Proteins_PP2 <- data.frame(Proteins_PP$N,Proteins_PP$Unused,Proteins_PP$`%Cov(95)`,Proteins_PP$`Peptides(95%)`,Proteins_PP$Accession,Proteins_PP$Name,Proteins_PP$Species)
Peptidos_PP2 <- data.frame(Peptidos_PP$N,Peptidos_PP$Unused,Peptidos_PP$`%Cov(95)`,Peptidos_PP$Accessions,Peptidos_PP$Names,Peptidos_PP$Contrib,Peptidos_PP$Conf,Peptidos_PP$Sequence,Peptidos_PP$Modifications,Peptidos_PP$Cleavages,Peptidos_PP$dMass,Peptidos_PP$`Prec MW`,Peptidos_PP$`Prec m/z`,Peptidos_PP$`Theor MW`,Peptidos_PP$`Theor m/z`,Peptidos_PP$`Theor z`,Peptidos_PP$Sc,Peptidos_PP$Spectrum,Peptidos_PP$Time)
#Remove rows with REVERSED as string
Proteins_PP3 <- Proteins_PP2[!grepl("REVERSED", Proteins_PP2$Proteins_PP.Name),]
Peptidos_PP3 <- Peptidos_PP2[!grepl("REVERSED", Peptidos_PP2$Peptidos_PP.Names),]
#Proteins_PP2[- grep("^REVERSED ", Proteins_PP2$Name),]
#Proteins_PP2[ grep("REVERSED ", Proteins_PP2$Name, invert = TRUE) , ]
#Remove rows with Unised value => 1,3
Proteins_PP4 <- subset(Proteins_PP3, Proteins_PP3$Proteins_PP.Unused > 1.3 )
Proteins_PP4 <- subset(Proteins_PP4, Proteins_PP4$Proteins_PP...Cov.95.. > 0)
Proteins_PP4 <- subset(Proteins_PP4, Proteins_PP4$Proteins_PP..Peptides.95... > 0)
#Filter peptide and filters
Peptidos_PP4 <- subset(Peptidos_PP3, Peptidos_PP3$Peptidos_PP.Unused > 1.3)
Peptidos_PP4 <- subset(Peptidos_PP4, Peptidos_PP4$Peptidos_PP.Contrib > 0)
Peptidos_PP4 <- subset(Peptidos_PP4, Peptidos_PP4$Peptidos_PP.Conf > 95)
Peptidos_PP4 <- subset(Peptidos_PP4, Peptidos_PP4$Peptidos_PP...Cov.95.. > 0)
#Round numbers
colnames(Proteins_PP4) <- c("N", "Unused", "%Cov(95)", "Peptides(95%)", "Accession","Name","Species")
colnames(Peptidos_PP4) <- c("N","Unused", "%Cov(95)", "Accessions", "Names", "Contrib", "Conf", "Sequence", "Modifications", "Cleavages", "dMass", "Prec MW", "Prec m/z", "Theor MW", "Theor m/z", "Theor z", "Sc", "Spectrum", "Time")
round_df <- function(df, digits) {
nums <- vapply(df, is.numeric, FUN.VALUE = logical(1))
df[,nums] <- round(df[,nums], digits = digits)
(df)
}
Proteins_PP5 <- round_df(Proteins_PP4, digits=2)
Peptidos_PP5 <- round_df(Peptidos_PP4, digits=2)
##############
#comparar tablas test:
duplicates <- data.frame(Peptidos_PP5$Accession)
duplicate2 <- aggregate(list(numdup=rep(1,nrow(duplicates))), duplicates, length)
duplicate3 <- with(duplicate2, duplicate2[order(duplicate2$Peptidos_PP5.Accession) , ])
#Protein accesion sorted
duplicate4 <- with(Proteins_PP5, Proteins_PP5[order(Proteins_PP5$Accession) , ])
#Replace in protein table
#If we need to remove specific protein from Proteins file you can use this.
#duplicate4 <- duplicate4[ ! ( ( duplicate4$N ==192)) , ]
#In order to remove this duplicates, we use if lenght to compare. Then we create
#a new table with the reverse elements in peptides, remove letters after ; and duplicates
#
# if (length(duplicate4$`Peptides(95%)`)!= length(duplicate3$numdup)){
# tabla_reversed <- data.frame(Peptidos_PP2[grepl("REVERSED", Peptidos_PP2$Peptidos_PP.Names),])
# tabla_reversed$Peptidos_PP.Accessions <- gsub("\\;.*","", tabla_reversed$Peptidos_PP.Accessions)
# tabla_reversed <- unique(tabla_reversed$Peptidos_PP.Accessions)
#
# #Remove element from the list
# duplicate4 <- duplicate4[ ! duplicate4$Accession %in% tabla_reversed, ]
#
# }
#If we want to have the same number of proteins and peptides, we've to remove those who are in proteins but
#not in peptides, for that reason we are going to create two new tables.
#We take the data that we want and remove everthing that is after the first ";".
df1 <- data.frame("Accesion"=gsub("\\;.*","", duplicate4$Accession), stringsAsFactors = FALSE)
df2 <- data.frame("Accesion2"=gsub("\\;.*","", duplicate3$Peptidos_PP5.Accession), stringsAsFactors = FALSE)
#Create a list
lst <- list(df1, df2)
#Create a list where we can test similars
alltests <- unique(trimws(unlist(lst, recursive = TRUE)))
df_final <- as.data.frame(
setNames(lapply(lst, function(a) alltests[ match(alltests, a[,1]) ]),
sapply(lst, names)),
stringsAsFactors = FALSE
)
#Create a data frame with only element of the column that have
df_final_NA <- df_final[is.na(df_final$Accesion2),][1]
for (i in df_final_NA$Accesion){
print(i)
duplicate4 <- duplicate4[!duplicate4$Accession == as.character(i), ]
}
duplicate4$`Peptides(95%)` <- duplicate3$numdup
duplicate4$Accession <- gsub("\\;.*","", duplicate4$Accession)
#Sort columns by peptides
Proteins_PP6 <- duplicate4[ order(-duplicate4[,4], -duplicate4[,2]), ]
Proteins_PP6$N <- 1:nrow(Proteins_PP6)
Proteins_PP6$Species <- gsub('\\d+\\.?','', Proteins_PP6$Species)
#Compare and create a new column with element assigned by comparation with N peptide table.
Peptidos_PP6 <- Peptidos_PP5[order(Peptidos_PP5[,1],-Peptidos_PP5[,6]),]
Peptidos_PP6$Accessions <- gsub("\\;.*","", Peptidos_PP6$Accessions)
for (i in 1:length(Peptidos_PP6$Accessions)) {
#print(match(prueba1_2$Peptidos_PP6.Accessions[i], prueba1$Proteins_PP6.Accession, nomatch=NA))
Peptidos_PP6$N[i] <- match(Peptidos_PP6$Accessions[i],Proteins_PP6$Accession, nomatch=NA)
}
Peptidos_PP6 <- Peptidos_PP6[order(Peptidos_PP6[,1],-Peptidos_PP6[,6]),]
#Export xml
# WriteXLS(Proteins_PP6, ExcelFileName = "Proteins_summary.xls", SheetNames = NULL, perl = "perl",
# verbose = FALSE, Encoding = c("UTF-8", "latin1", "cp1252"),
# row.names = FALSE, col.names = TRUE,
# AdjWidth = FALSE, AutoFilter = FALSE, BoldHeaderRow = FALSE,
# na = "",
# FreezeRow = 0, FreezeCol = 0,
# envir = parent.frame())
#Check if there is any different between N and Proteins
test <- data.frame(Peptidos_PP6$N)
colnames(test) <- c("N")
for (z in 1:nrow(test)){
if (z==1){
test$New[z] <- NA
test$New[z] <- 1
}
else if (test$N[z]==test$N[z-1]){
test$New[z] <- test$New[z-1]
}
else if (test$N[z]!=test$N[z-1]){
test$New[z] <- test$New[z-1]+1
}
}
Peptidos_PP6$N <- test$New
x <- list(Proteins = data.frame(Proteins_PP6), Peptides = data.frame(Peptidos_PP6))
#WriteXLS(x, paste(gsub("*?PeptideSummary.txt","",files_glob_peptides[i]), "Summary.xlsx", sep = "", na=""), names(x))
write.xlsx(x, file = paste(gsub("*?PeptideSummary.txt","",files_glob_peptides[i]), "Summary.xlsx", sep = "", na=""))
i <- i + 1
}
}