forked from eacooper400/Rio
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparseAssemblytics.R
108 lines (89 loc) · 4.96 KB
/
parseAssemblytics.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
setwd("~/Desktop/Rio_Analysis/New_Mummer_032018/assemblytics/")
options(stringsAsFactors=FALSE)
source("../../scripts/pan_transcriptome_fxns.R")
### Read in Gene Blocks for unique Mums and SV events
block.table = read.table("GeneBlocks_wAssemblyticsCoords.txt", header=T)
block.table2 = read.table("StrucVar_GeneBlocks.txt", header=T)
### Count relocated duplications as genes where the Rio ID
### appears only once, but the BTx ID appears more than once
### Be sure to exclude "-" matches with no transcript in BTx
reloc.dup = intersect(which(table(block.table$AlignmentA)[block.table$AlignmentA]==1), which(table(block.table$AlignmentB)[block.table$AlignmentB]>1 & table(block.table$AlignmentB)[block.table$AlignmentB]<4))
reloc.dup.rio = unique(block.table$AlignmentA[reloc.dup])
reloc.dup.btx = unique(block.table$AlignmentB[reloc.dup])
### Count relocated deletions as genes where the Rio ID
### appears more than once, but the BTx ID only appears once
reloc.del = intersect(which(table(block.table$AlignmentB)[block.table$AlignmentB]==1), which(table(block.table$AlignmentA)[block.table$AlignmentA]>1 & table(block.table$AlignmentA)[block.table$AlignmentA]<4))
reloc.del.rio = unique(block.table$AlignmentA[reloc.del])
reloc.del.btx = unique(block.table$AlignmentB[reloc.del])
### Read in the transcript file for each
rio.genes=parse.gff("../../Annotation_Files/SbicolorRiov2.1.gene.gff3")
btx.genes = parse.gff("../../Annotation_Files/Sbicolor_454_v3.1.1.gene.gff3")
### Determine which genes are not in any Mummer blocks
rio1 = rio.genes$Name[which(is.na(match(rio.genes$Name, block.table$AlignmentA)))]
btx1 = btx.genes$Name[which(is.na(match(btx.genes$Name, block.table$AlignmentB)))]
### Determine which missing Rio genes can be found in assemblytics blocks
m1 = match(rio1, block.table2$AlignmentA)
r = which(!(is.na(m1)))
### Of these, how many match a missing Btx gene?
b = block.table2$AlignmentB[m1[r]]
m2 = match(btx1, b)
### Add Feature Types to the Mummer Table
block.table$Type = rep("Collinear", nrow(block.table))
reloc = which(block.table$BtxChr != block.table$RioChr)
block.table$Type[reloc] = "Relocation"
block.table$Type[reloc.dup] = "Relocation_wDuplication_Rio"
block.table$Type[reloc.del] = "Relocation_wDeletion_Rio"
block.table$Type[which(block.table$AlignmentA=="-")] = "NoTranscript_Rio"
block.table$Type[which(block.table$AlignmentB=="-")] = "NoTranscript_Btx"
block.table$Type[which(block.table$RioS > block.table$RioE)] = "Inversion"
### Set aside genes in expansions, then merge
### the blocks with missing Rio genes into the block.table
to.add = block.table2[m1[r],]
rio.exp = to.add$AlignmentA[which(b=="-")]
to.add$Type = rep("Collinear_Gap", nrow(to.add))
to.add$Type[which(b=="-")] = "Expansion_Rio"
bdup = seq(1, length(b))[-unique(c(m2[which(!(is.na(m2)))], which(b=="-")))]
to.add$Type[bdup] = "Other_Duplication_Rio"
block.table = rbind(block.table, to.add)
### Subtract the found rio genes and found BTx genes
### from the lists of missing genes
rio1 = rio1[-r]
btx1 = btx1[-(which(!(is.na(m2))))]
### Determine which additional btx genes can be found in Assemblytics blocks
block.table3 = block.table2[-(m1[r]),]
m3 = match(btx1, block.table3$AlignmentB)
b = which(!(is.na(m3)))
block.table4 = block.table3[m3[which(!(is.na(m3)))],]
block.table4$Type = rep("Gap", nrow(block.table4))
block.table4$Type[grep("-", block.table4$AlignmentA)] = "Contraction_Rio"
rio.del = block.table4$AlignmentB[grep("-", block.table4$AlignmentA)]
block.table4$Type[-(grep("-", block.table4$AlignmentA))] = "Other_Duplication_BTx"
btx1 = btx1[-(which(!(is.na(m3))))]
block.table = rbind(block.table, block.table4)
### For the remaining missing genes,
### filter out those that map to a chromosome other than 1-10
rio1 = rio1[-(which(rio.genes$Chr[match(rio1, rio.genes$Name)]>10))]
btx1 = btx1[-(which(btx.genes$Chr[match(btx1, btx.genes$Name)]>10))]
### Run Blat on missing genes against whole genome sequence,
### then read in the results
btx.blat = parse.blat("BtxMissing_v_Rio.psl")
rio.blat = parse.blat("RioMissing_v_Btx.psl")
btx.blat = btx.blat[which(btx.blat$Qcoverage>=0.8),]
rawblat = read.table("RioMissing_v_Btx.psl", header=F, skip=5)
rawblat = rawblat[which(rio.blat$Qcoverage>=0.8),]
rio.blat = rio.blat[which(rio.blat$Qcoverage>=0.8),]
### Sort the alignment table
block.table3 = block.table[order(block.table$RioChr, block.table$RioS),]
rblock.m = data.frame(block=rep(NA, length(rio1)),
BtxS=rep(NA, length(rio1)),
BtxE=rep(NA, length(rio1)),
RioS=rio.genes$Start[m],
RioE=rio.genes$End[m],
BtxL=rep(NA, length(rio1)),
RioL=(rio.genes$End[m]-rio.genes$Start[m]),
BtxChr=rep(NA, length(rio1)),
RioChr=rio.genes$Chr[m],
AlignmentA=rio1,
AlignmentB=rep("-", length(rio1)),
IDY=rep(NA, length(rio1)),
Type=rep("Unique", length(rio1)))