-
Notifications
You must be signed in to change notification settings - Fork 0
/
Data Filtering
55 lines (46 loc) · 2.27 KB
/
Data Filtering
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
##------------------------------------------------------------------------------------------------------
## GeneFamilies analysis
## Date - 27 September 2024
##------------------------------------------------------------------------------------------------------
##------------------------------------------------------------------------------------------------------
# (1) Make sure the readr package is available, then attach it.
# Install only when readr is missing, so that re-running the script does not
# re-download and reinstall the package (or fail when there is no network).
if (!requireNamespace("readr", quietly = TRUE)) {
  install.packages("readr")
}
library(readr)
##------------------------------------------------------------------------------------------------------
## (2) set working directory
## Every file name used below is relative to this directory.
## NOTE(review): this is an absolute, machine-specific path; setwd() stops the
## script with an error on any machine where the directory does not exist.
setwd("C:/Users/smeeta.shrestha/Desktop/2024_human_choline_bgi_2/humann3_genefam/")
##------------------------------------------------------------------------------------------------------
## (3) Load in the genefamilies relab
# Read the merged table with readr::read_tsv() and report its dimensions.
# NOTE(review): this reads "merge_genefamilies_relab.tsv", whereas `df` below
# is read from "merge_genefamilies_relab_uniref90.tsv" -- confirm that both
# files are intended. `genefam` is not used again in this script.
genefam <- readr::read_tsv("merge_genefamilies_relab.tsv")
dim(genefam)
# Read the uniref90 table with read.delim() into `df`, the object that the
# cleaning steps below operate on. Tab is already read.delim()'s default
# separator; it is spelled out for clarity. Text columns are kept as character
# vectors rather than factors.
df <- read.delim(
  "merge_genefamilies_relab_uniref90.tsv",
  sep = "\t",
  stringsAsFactors = FALSE
)
# Quick inspection: dimensions, first rows, class and column names.
dim(df)
head(df)
class(df)
columns <- colnames(df)
columns
##------------------------------------------------------------------------------------------------------
## (4) clean the data
# All three filters match on the gene-family label column, which read.delim()
# stores under the name "X..Gene.Family" (presumably the sanitised form of a
# "# Gene Family" header -- verify against the input file).
gene_col <- "X..Gene.Family"
# Fail loudly when the column is absent: a missing column yields NULL, and
# filtering on NULL would silently return a table with zero rows.
stopifnot(gene_col %in% colnames(df))
## (4a) Filter - remove the rows whose label contains the word "unclassified"
# fixed = TRUE matches the text literally instead of as a regular expression.
df1 <- df[!grepl("unclassified", df[[gene_col]], fixed = TRUE), ]
dim(df1)
head(df1)
## (4b) Filter - remove the rows whose label contains the word "NO_NAME"
df2 <- df1[!grepl("NO_NAME", df1[[gene_col]], fixed = TRUE), ]
dim(df2)
head(df2)
## (4c) Keep only those rows which have bacteria/taxa information along with
## the uniref90 gene name, i.e. rows whose label contains "g__"
# Base grepl() does the same partial string match as data.table's %like%, so
# the script no longer needs the data.table package for this step.
df3 <- df2[grepl("g__", df2[[gene_col]], fixed = TRUE), ]
dim(df3)
##------------------------------------------------------------------------------------------------------
## (5) export
# Write the cleaned table as CSV into the current working directory.
# write.csv() keeps its default row.names = TRUE here, so the first column of
# the output holds the data frame's row names.
clean_csv <- "genefam_uniref90_tax_clean.csv"
write.csv(x = df3, file = clean_csv)
## End of Cleaning script
##------------------------------------------------------------------------------------------------------