-
Notifications
You must be signed in to change notification settings - Fork 0
/
read_fasta.R
66 lines (50 loc) · 2.36 KB
/
read_fasta.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
mlreadFASTA = function (file = "/Users/mliku/Desktop/perl_tests/R_scriptsNES/orf_trans.fasta",
legacy.mode = TRUE, seqonly = FALSE) {
#Read the fasta file as a vector of strings
lines=readLines(file)
#lines=readLines("orf_trans.fasta") #reads line by line like in perl
# Remove comment lines starting with a semicolon ';' Can try without this argument
if (legacy.mode) {
comments = grep("^;", lines)
if (length(comments) > 0) {
lines = lines[-comments]
}
}
#Get the line numbers where sequencs names are
ind=which(substr(lines, 1L, 1L)==">") #stores the line numbers in which ">" are located. It starts at 1 and ends at 1
#compute the total number of sequences
nseq=length(ind) #the number of positions where there is ">"
if(nseq==0) stop("no line starting with a > character found")
#Localize sequence data. start is a list of second lines where sequence begins after line with >.
#end is a list of lines preceeding > the first line with > is excluded. These are the end of sequences
start=ind+1
end=ind-1
end=c(end[-1], length(lines))
#since end does not contain the last line, length(lines) is appended to the list and
#to keep the index consistent, the first value "0" is removed, since there is nothing preceeding it
#Read in sequences
sequences=lapply(seq_len(nseq), function (i) paste(lines[start[i]:end[i]], collapse=""))
if (seqonly) return(sequences)
#Read in sequence names
nomseq=lapply(seq_len(nseq), function (i)
{
firstword=strsplit(lines[ind[i]], " ")[[1]][1]
substr(firstword, 2, nchar(firstword))
})
#Read the common gene name
alias=lapply(seq_len(nseq), function (i)
{
secondword=strsplit(lines[ind[i]], " ")[[1]][2]
})
#examples to show how it is structured
#strsplit(lines[ind[1]], " ") #just typing this returns all the substrings generated by using " " space
#separator for the first indexed line
#> strsplit(lines[ind[1]], " ")[[1]][1]
#[1] ">YAL001C"
#> strsplit(lines[ind[2]], " ")[[1]][1]
#[1] ">YAL002W"
#the lines indexed by "ind" of
#thatdata<-cbind(nomseq, alias, sequences) This didn't work
thisdata<-c(nomseq, alias, sequences)
return(as.data.frame(matrix(unlist(thisdata), nrow=nseq, byrow=FALSE), stringsAsFactors=FALSE))
}