Skip to content

Commit

Permalink
[build] Add mergeTours()
Browse files Browse the repository at this point in the history
  • Loading branch information
tanghaibao committed Jul 22, 2018
1 parent 7cf2674 commit 16de042
Show file tree
Hide file tree
Showing 10 changed files with 94 additions and 73 deletions.
11 changes: 5 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ allhic optimize tests/test.counts_GATC.2g2.txt tests/test.clm
Build genome release, including `.agp` and `.fasta` output.

```console
allhic build tests/test.counts_GATC.2g1.tour seq.fasta.gz
allhic build tests/test.counts_GATC.2g?.tour tests/seq.fasta.gz tests/asm-2g.chr.fasta
```

### <kbd>Plot</kbd>
Expand All @@ -89,7 +89,7 @@ allhic plot tests/test.bam tests/test.counts_GATC.2g1.tour

![allhicplot](images/allhic-plot-s.png)

## Pipeline
## <kbd>Pipeline</kbd>

Following the 4 steps of `prune`, `extract`, `partition`, `optimize`, as described above.
In summary, we have:
Expand All @@ -99,7 +99,7 @@ allhic extract tests/test.bam tests/seq.fasta.gz
allhic partition tests/test.counts_GATC.txt tests/test.pairs.txt 2
allhic optimize tests/test.counts_GATC.2g1.txt tests/test.clm
allhic optimize tests/test.counts_GATC.2g2.txt tests/test.clm
allhic build tests/test.tour seq.fasta.gz
allhic build tests/test.counts_GATC.2g?.txt tests/seq.fasta.gz tests/asm-2g.chr.fasta
```

Or, in a single step:
Expand All @@ -121,10 +121,9 @@ FASTA file.
- [x] Use clustering when k = 1
- [x] Isolate matrix generation to "plot"
- [x] Add "pipeline" to simplify execution
- [ ] Make "build" to merge subgroup tours
- [x] Make "build" to merge subgroup tours
- [x] Provide better error messages for "file not found"
- [ ] Plot the boundary of the contigs in "plot" using genome.json
- [ ] Provide better error messages for "file not found"
- [ ] Merge tours from multiple partitions back to a single file
- [ ] Add dot plot to "plot"
- [ ] Compare numerical output with Lachesis
- [ ] Improve Ler0 results
Expand Down
6 changes: 3 additions & 3 deletions agp.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ package allhic
import (
"bufio"
"bytes"
"os"
"strconv"
"strings"

Expand Down Expand Up @@ -86,7 +85,7 @@ func (r *AGP) Add(row string) {
// buildFasta builds target FASTA based on info from agpfile
func buildFasta(agpfile string, seqs map[string]*seq.Seq) {
agp := NewAGP(agpfile)
file, _ := os.Open(agpfile)
file := mustOpen(agpfile)

log.Noticef("Parse agpfile `%s`", agpfile)
scanner := bufio.NewScanner(file)
Expand All @@ -95,7 +94,7 @@ func buildFasta(agpfile string, seqs map[string]*seq.Seq) {
}

var buf bytes.Buffer
outFile := RemoveExt(agpfile) + ".chr.fasta"
outFile := RemoveExt(agpfile) + ".fasta"
outfh, _ := xopen.Wopen(outFile)
prevObject := ""
for _, line := range agp.lines {
Expand Down Expand Up @@ -124,6 +123,7 @@ func buildFasta(agpfile string, seqs map[string]*seq.Seq) {
// Last one
writeRecord(prevObject, buf, outfh)
buf.Reset()
log.Noticef("Assembly FASTA file `%s` built", outFile)
}

// writeRecord writes the FASTA record to the file
Expand Down
2 changes: 1 addition & 1 deletion anchor.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ func (r *Anchorer) makeTrivialPaths(contigs []*Contig, flanksize int64) PathSet

// ExtractInterContigLinks extracts links from the Bamfile
func (r *Anchorer) ExtractInterContigLinks() {
fh, _ := os.Open(r.Bamfile)
fh := mustOpen(r.Bamfile)
prefix := RemoveExt(r.Bamfile)
disfile := prefix + ".dis"
idsfile := prefix + ".ids"
Expand Down
8 changes: 2 additions & 6 deletions assess.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,11 +98,7 @@ func (r *Assesser) writePostProb(outfile string) {

// readBed parses the bedfile to extract the start and stop for all the contigs
func (r *Assesser) readBed() {
fh, err := os.Open(r.Bedfile)
if err != nil {
log.Errorf("bedfile `%s` does not exist", r.Bedfile)
os.Exit(1)
}
fh := mustOpen(r.Bedfile)
log.Noticef("Parse bedfile `%s`", r.Bedfile)
reader := bufio.NewReader(fh)

Expand Down Expand Up @@ -142,7 +138,7 @@ func checkInRange(pos, start, end int) bool {

// extractContigLinks builds the probability distribution of link sizes
func (r *Assesser) extractContigLinks() {
fh, _ := os.Open(r.Bamfile)
fh := mustOpen(r.Bamfile)
log.Noticef("Parse bamfile `%s`", r.Bamfile)
br, _ := bam.NewReader(fh, 0)
defer br.Close()
Expand Down
18 changes: 17 additions & 1 deletion base.go
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,7 @@ func Percentage(a, b int) string {
func ReadCSVLines(filename string) [][]string {
log.Noticef("Parse csvfile `%s`", filename)

fh, _ := os.Open(filename)
fh := mustOpen(filename)
defer fh.Close()

var data [][]string
Expand Down Expand Up @@ -384,3 +384,19 @@ func sortInt64s(a []int64) {
func searchInt64(a []int64, x int64) int {
return sort.Search(len(a), func(i int) bool { return a[i] >= x })
}

// mustExist panics when a file is not found
func mustExist(filename string) {
if _, err := os.Stat(filename); os.IsNotExist(err) {
log.Fatal(err)
}
}

// mustOpen wraps os.Open but panics if file not found
func mustOpen(filename string) *os.File {
f, err := os.Open(filename)
if err != nil {
log.Fatal(err)
}
return f
}
49 changes: 25 additions & 24 deletions build.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@ import (

// Builder reconstructs the genome release AGP and FASTA files
type Builder struct {
Tourfile string
Tourfiles []string
Fastafile string
AGPfile string
// Output file
OutAGPfile string
OutFastafile string
}

// OOLine describes a simple contig entry in a scaffolding experiment
Expand Down Expand Up @@ -59,15 +61,6 @@ func (r *OO) getFastaSizes(fastafile string) {
}
}

// readFiles initializes OO object
func (r *Builder) readFiles() *OO {
oo := new(OO)
oo.getFastaSizes(r.Fastafile)
oo.parseLastTour(r.Tourfile)

return oo
}

// Add instantiates a new OOLine object and add to the array in OO
func (r *OO) Add(scaffold, ctg string, ctgsize int, strand byte) {
o := OOLine{scaffold, ctg, ctgsize, strand}
Expand All @@ -76,7 +69,7 @@ func (r *OO) Add(scaffold, ctg string, ctgsize int, strand byte) {

// writeAGP converts the simplistic OOLine into AGP format
func (r *Builder) writeAGP(oo *OO) {
filename := r.AGPfile
r.OutAGPfile = RemoveExt(r.OutFastafile) + ".agp"
gapSize := 100
gapType := "scaffold"
linkage := "yes"
Expand All @@ -86,7 +79,7 @@ func (r *Builder) writeAGP(oo *OO) {
objectEnd := 1
partNumber := 0
componentType := 'W'
f, _ := os.Create(filename)
f, _ := os.Create(r.OutAGPfile)
defer f.Close()
w := bufio.NewWriter(f)
components := 0
Expand Down Expand Up @@ -120,28 +113,36 @@ func (r *Builder) writeAGP(oo *OO) {
components++
}
w.Flush()
log.Noticef("A total of %d tigs written to `%s`", components, filename)
log.Noticef("A total of %d tigs written to `%s`", components, r.OutAGPfile)
}

// Run kicks off the Builder
// Run kicks off the Build and constructs molecule using component FASTA sequence
func (r *Builder) Run() {
r.AGPfile = RemoveExt(r.Tourfile) + ".agp"
r.Build(r.readFiles())
oo := new(OO)
oo.getFastaSizes(r.Fastafile)
// oo.parseLastTour(r.Tourfile)
oo.mergeTours(r.Tourfiles)
r.writeAGP(oo)
buildFasta(r.OutAGPfile, oo.seqs)
log.Notice("Success")
}

// Build constructs molecule using component FASTA sequence
func (r *Builder) Build(oo *OO) {
r.writeAGP(oo)
buildFasta(r.AGPfile, oo.seqs)
// mergeTours merges a number of tours typically generated by partition and optimize
// In contrast to parseLastTour which only parse one tour
func (r *OO) mergeTours(tourfiles []string) {
for i, tourfile := range tourfiles {
seqid := fmt.Sprintf("g%d", i+1)
log.Noticef("Import `%s` => %s", tourfile, seqid)
r.parseLastTour(tourfile, seqid)
}
}

// parseLastTour reads tour from file
//
// A tour file has the following format:
// > name
// contig1+ contig2- contig3?
func (r *OO) parseLastTour(tourfile string) {
func (r *OO) parseLastTour(tourfile string, seqid string) {
words := parseTourFile(tourfile)
var strand byte
for _, tig := range words {
Expand All @@ -151,7 +152,7 @@ func (r *OO) parseLastTour(tourfile string) {
} else {
strand = '?'
}
r.Add("FINAL", tig, r.seqs[tig].Length(), strand)
r.Add(seqid, tig, r.seqs[tig].Length(), strand)
}
}

Expand All @@ -163,7 +164,7 @@ func (r *OO) parseLastTour(tourfile string) {
func (r *OO) ParseAllTours(tourfile string) {
log.Noticef("Parse tourfile `%s`", tourfile)

file, _ := os.Open(tourfile)
file := mustOpen(tourfile)
scanner := bufio.NewScanner(file)
var (
name string
Expand Down
5 changes: 2 additions & 3 deletions clm.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ import (
"io"
"math"
"math/rand"
"os"
"strconv"
"strings"
"sync"
Expand Down Expand Up @@ -113,7 +112,7 @@ func NewCLM(Clmfile, REfile string) *CLM {
// tig00035238 46779 recover
// tig00030900 119291
func (r *CLM) readRE() {
file, _ := os.Open(r.REfile)
file := mustOpen(r.REfile)
log.Noticef("Parse REfile `%s`", r.REfile)
scanner := bufio.NewScanner(file)
idx := 0
Expand All @@ -140,7 +139,7 @@ func rr(b byte) byte {

// readClmLines parses the clmfile into a slice of CLMLine
func readClmLines(clmfile string) []CLMLine {
file, _ := os.Open(clmfile)
file := mustOpen(clmfile)
log.Noticef("Parse clmfile `%s`", clmfile)
reader := bufio.NewReader(file)

Expand Down
Loading

0 comments on commit 16de042

Please sign in to comment.