read_fasta.R

mlreadFASTA = function (file = "/Users/mliku/Desktop/perl_tests/R_scriptsNES/orf_trans.fasta",
                        legacy.mode = TRUE, seqonly = FALSE) {
  
  #Read the fasta file as a vector of strings
  lines=readLines(file)
  #lines=readLines("orf_trans.fasta")  #reads line by line like in perl
  
  # Remove comment lines starting with a semicolon ';' Can try without this argument
  
  if (legacy.mode) {
    comments = grep("^;", lines)
    if (length(comments) > 0) {
      lines = lines[-comments]
    }
  }
  
  #Get the line numbers where sequencs names are 
  ind=which(substr(lines, 1L, 1L)==">") #stores the line numbers in which ">" are located. It starts at 1 and ends at 1
  
  #compute the total number of sequences
  nseq=length(ind) #the number of positions where there is ">"
  
  if(nseq==0) stop("no line starting with a > character found")
  
  #Localize sequence data. start is a list of second lines where sequence begins after line with >.
  #end is a list of lines preceeding > the first line with > is excluded. These are the end of sequences
  start=ind+1
  end=ind-1
  end=c(end[-1], length(lines))
  
  #since end does not contain the last line, length(lines) is appended to the list and 
  #to keep the index consistent, the first value "0" is removed, since there is nothing preceeding it
  
  #Read in sequences
  sequences=lapply(seq_len(nseq), function (i) paste(lines[start[i]:end[i]], collapse=""))
  if (seqonly) return(sequences)
  
  #Read in sequence names
  nomseq=lapply(seq_len(nseq), function (i) 
  {
    firstword=strsplit(lines[ind[i]], " ")[[1]][1] 
    substr(firstword, 2, nchar(firstword))
  })
  #Read the common gene name
  alias=lapply(seq_len(nseq), function (i) 
  {
    secondword=strsplit(lines[ind[i]], " ")[[1]][2]  
  })
  
  #examples to show how it is structured
  #strsplit(lines[ind[1]], " ") #just typing this returns all the substrings generated by using " " space 
  #separator for the first indexed line
  #> strsplit(lines[ind[1]], " ")[[1]][1]
  #[1] ">YAL001C"
  #> strsplit(lines[ind[2]], " ")[[1]][1]
  #[1] ">YAL002W"
  #the lines indexed by "ind" of 
  
  #thatdata<-cbind(nomseq, alias, sequences) This didn't work
  
  thisdata<-c(nomseq, alias, sequences)
  
  return(as.data.frame(matrix(unlist(thisdata), nrow=nseq, byrow=FALSE), stringsAsFactors=FALSE))
  
}