Skip to content

Commit

Permalink
[cmd] Add pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
tanghaibao committed Jul 22, 2018
1 parent e6c66e9 commit 7cf2674
Show file tree
Hide file tree
Showing 5 changed files with 163 additions and 47 deletions.
31 changes: 23 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,9 @@ Given a set of Hi-C contacts between contigs, as specified in the
clmfile, reconstruct the highest scoring ordering and orientations
for these contigs.

Optimize uses Genetic Algorithm (GA) to search for the best scoring solution. GA has been successfully applied to genome scaffolding tasks in the past (see ALLMAPS; [Tang et al. *Genome Biology*, 2015](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-014-0573-1)).
Optimize uses Genetic Algorithm (GA) to search for the best scoring solution.
GA has been successfully applied to genome scaffolding tasks in the past
(see ALLMAPS; [Tang et al. *Genome Biology*, 2015](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-014-0573-1)).

![ga](images/test-movie.gif)

Expand All @@ -74,7 +76,7 @@ allhic optimize tests/test.counts_GATC.2g2.txt tests/test.clm
Build genome release, including `.agp` and `.fasta` output.

```console
allhic build tests/test.counts_GATC.2g1.tour seq.fasta
allhic build tests/test.counts_GATC.2g1.tour seq.fasta.gz
```

### <kbd>Plot</kbd>
Expand All @@ -93,24 +95,37 @@ Following the 4 steps of `prune`, `extract`, `partition`, `optimize`, as describ
In summary, we have:

```console
allhic extract tests/{test.bam,seq.fasta.gz}
allhic partition tests/{test.counts_GATC.txt,test.pairs.txt} 2
allhic optimize tests/{test.counts_GATC.2g1.txt,test.clm}
allhic optimize tests/{test.counts_GATC.2g2.txt,test.clm}
allhic build tests/{test.tour,seq.fasta}
allhic extract tests/test.bam tests/seq.fasta.gz
allhic partition tests/test.counts_GATC.txt tests/test.pairs.txt 2
allhic optimize tests/test.counts_GATC.2g1.txt tests/test.clm
allhic optimize tests/test.counts_GATC.2g2.txt tests/test.clm
allhic build tests/test.tour seq.fasta.gz
```

Or, in a single step:

```console
allhic pipeline tests/test.bam tests/seq.fasta.gz 2
```

In summary, the pipeline requires a BAM file and the contigs FASTA file.
The user then needs to specify the Restriction Enzyme used, the number
`k` groups to partition into. Output include reconstructed chromosome
AGP file (containing how the contigs are linked together) and chromosomal
FASTA file.

## WIP features

- [x] Add restriction enzyme for better normalization of contig lengths
- [x] Add partition split inside "partition"
- [x] Use clustering when k = 1
- [x] Isolate matrix generation to "plot"
- [x] Add "pipeline" to simplify execution
- [ ] Make "build" to merge subgroup tours
- [ ] Plot the boundary of the contigs in "plot" using genome.json
- [ ] Provide better error messages for "file not found"
- [ ] Merge tours from multiple partitions back to a single file
- [ ] Add dot plot to "plot"
- [ ] Add "pipeline" to simplify execution
- [ ] Compare numerical output with Lachesis
- [ ] Improve Ler0 results
- [ ] Translate "prune" from C++ code to golang
Expand Down
7 changes: 5 additions & 2 deletions cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,9 +192,12 @@ func (r *Partitioner) Cluster() {
break
}
}
log.Noticef("Merge #%d: Clusters\t%d + %d -> %d, Linkage = %g",
nMerges, bestMerge.a, bestMerge.b, newClusterID, bestMerge.score)

if nMerges%50 == 0 {
log.Noticef("Merge #%d: Clusters\t%d + %d -> %d, Linkage = %g",
nMerges, bestMerge.a, bestMerge.b, newClusterID, bestMerge.score)

}
merges = newMerges
}

Expand Down
162 changes: 125 additions & 37 deletions cmd/allhic.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,29 @@
package main

import (
"fmt"
"os"
"strconv"
"strings"
"time"

".."
logging "github.com/op/go-logging"
"github.com/urfave/cli"
)

var log = logging.MustGetLogger("main")

// var format = logging.MustStringFormatter(
// `%{color}%{time:15:04:05} | main | %{level:.6s} %{color:reset} ** %{message} **`,
// )

// // Backend is the default stderr output
// var Backend = logging.NewLogBackend(os.Stderr, "", 0)

// // BackendFormatter contains the fancy debug formatter
// var BackendFormatter = logging.NewBackendFormatter(Backend, format)

// init customizes how cli layout the command interface
// Logo banner (Varsity style):
// http://patorjk.com/software/taag/#p=testall&f=3D-ASCII&t=ALLHIC
Expand All @@ -34,6 +48,14 @@ _/ / \ \_ _| |__/ | _| |__/ | _| | | |_ _| |_\ ` + "`" + `.___.'\
` + cli.AppHelpTemplate
}

// banner prints the separate steps
func banner(message string) {
message = "* " + message + " *"
log.Noticef(strings.Repeat("*", len(message)))
log.Noticef(message)
log.Noticef(strings.Repeat("*", len(message)))
}

// main is the entrypoint for the entire program, routes to commands
func main() {
logging.SetBackend(allhic.BackendFormatter)
Expand All @@ -45,6 +67,45 @@ func main() {
app.Usage = "Genome scaffolding based on Hi-C data"
app.Version = allhic.Version

extractFlags := []cli.Flag{
cli.StringFlag{
Name: "RE",
Usage: "Restriction site pattern",
Value: "GATC",
},
}

optimizeFlags := []cli.Flag{
cli.BoolFlag{
Name: "skipGA",
Usage: "Skip GA step",
},
cli.BoolFlag{
Name: "resume",
Usage: "Resume from existing tour file",
},
cli.Int64Flag{
Name: "seed",
Usage: "Random seed",
Value: 42,
},
cli.IntFlag{
Name: "npop",
Usage: "Population size",
Value: 100,
},
cli.IntFlag{
Name: "ngen",
Usage: "Number of generations for convergence",
Value: 5000,
},
cli.Float64Flag{
Name: "mutpb",
Usage: "Mutation prob in GA",
Value: .2,
},
}

app.Commands = []cli.Command{
{
Name: "prune",
Expand Down Expand Up @@ -79,13 +140,7 @@ Given a bamfile, the goal of the extract step is to calculate an empirical
distribution of Hi-C link size based on intra-contig links. The Extract function
also prepares for the latter steps of ALLHIC.
`,
Flags: []cli.Flag{
cli.StringFlag{
Name: "RE",
Usage: "Restriction site pattern",
Value: "GATC",
},
},
Flags: extractFlags,
Action: func(c *cli.Context) error {
if len(c.Args()) < 2 {
cli.ShowSubcommandHelp(c)
Expand All @@ -95,6 +150,7 @@ also prepares for the latter steps of ALLHIC.
bamfile := c.Args().Get(0)
fastafile := c.Args().Get(1)
RE := c.String("RE")

p := allhic.Extracter{Bamfile: bamfile, Fastafile: fastafile, RE: RE}
p.Run()
return nil
Expand Down Expand Up @@ -165,36 +221,7 @@ order appearing in "clusters.txt". Typically, if there are k clusters, we
can start k separate "optimize" commands for parallelism (for example,
on a cluster).
`,
Flags: []cli.Flag{
cli.BoolFlag{
Name: "skipGA",
Usage: "Skip GA step",
},
cli.BoolFlag{
Name: "resume",
Usage: "Resume from existing tour file",
},
cli.Int64Flag{
Name: "seed",
Usage: "Random seed",
Value: 42,
},
cli.IntFlag{
Name: "npop",
Usage: "Population size",
Value: 100,
},
cli.IntFlag{
Name: "ngen",
Usage: "Number of generations for convergence",
Value: 5000,
},
cli.Float64Flag{
Name: "mutpb",
Usage: "Mutation prob in GA",
Value: .2,
},
},
Flags: optimizeFlags,
Action: func(c *cli.Context) error {
if len(c.Args()) < 2 {
cli.ShowSubcommandHelp(c)
Expand Down Expand Up @@ -287,6 +314,67 @@ as a quality assessment step.
return nil
},
},
{
Name: "pipeline",
Usage: "Run extract-partition-optimize-build steps sequentially",
UsageText: `
allhic pipeline bamfile fastafile k [options]
Pipeline:
A convenience driver function. Chain the following steps sequentially.
- extract
- partion
- optimize
- build
`,
Flags: append(extractFlags, optimizeFlags...),
Action: func(c *cli.Context) error {
if len(c.Args()) < 3 {
cli.ShowSubcommandHelp(c)
return cli.NewExitError("Must specify distfile, clmfile and bamfile", 1)
}

bamfile := c.Args().Get(0)
fastafile := c.Args().Get(1)
k, _ := strconv.Atoi(c.Args().Get(2))
RE := c.String("RE")
runGA := !c.Bool("skipGA")
resume := c.Bool("resume")
seed := c.Int64("seed")
npop := c.Int("npop")
ngen := c.Int("ngen")
mutpb := c.Float64("mutpb")

// Extract the contig pairs, count RE sites
banner(fmt.Sprintf("Extractor started (RE = %s)", RE))
extractor := allhic.Extracter{Bamfile: bamfile, Fastafile: fastafile, RE: RE}
extractor.Run()

// Partition into k groups
banner(fmt.Sprintf("Partition into %d groups", k))
partitioner := allhic.Partitioner{Contigsfile: extractor.OutContigsfile,
Distfile: extractor.OutPairsfile, K: k}
partitioner.Run()

// Optimize the k groups separately
for i, refile := range partitioner.OutREfiles {
banner(fmt.Sprintf("Optimize group %d", i))
optimizer := allhic.Optimizer{REfile: refile,
Clmfile: extractor.OutClmfile,
RunGA: runGA, Resume: resume,
Seed: seed, NPop: npop, NGen: ngen, MutProb: mutpb}
optimizer.Run()
}

// Run the final build
// log.Noticef("***** Build started *****")
// builder := allhic.Builder{Tourfile: tourfile, Fastafile: fastafile}
// builder.Run()

return nil
},
},
}

app.Run(os.Args)
Expand Down
7 changes: 7 additions & 0 deletions extract.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ type Extracter struct {
contigToIdx map[string]int
model *LinkDensityModel
totalIntraLinks int
// Output file
OutContigsfile string
OutPairsfile string
OutClmfile string
}

// ContigInfo stores results calculated from f
Expand Down Expand Up @@ -172,6 +176,7 @@ func writeRE(outfile string, contigs []*ContigInfo) {
// readFastaAndWriteRE writes out the number of restriction fragments, one per line
func (r *Extracter) readFastaAndWriteRE() {
outfile := RemoveExt(r.Bamfile) + ".counts_" + r.RE + ".txt"
r.OutContigsfile = outfile
reader, _ := fastx.NewDefaultReader(r.Fastafile)
seq.ValidateSeq = false // This flag makes parsing FASTA much faster

Expand Down Expand Up @@ -242,6 +247,7 @@ func (r *Extracter) calcInterContigs() {
}

outfile := RemoveExt(r.Bamfile) + ".pairs.txt"
r.OutPairsfile = outfile
f, _ := os.Create(outfile)
w := bufio.NewWriter(f)
defer f.Close()
Expand Down Expand Up @@ -343,6 +349,7 @@ func (r *Extracter) extractContigLinks() {
fh, _ := os.Open(r.Bamfile)
prefix := RemoveExt(r.Bamfile)
clmfile := prefix + ".clm"
r.OutClmfile = clmfile

log.Noticef("Parse bamfile `%s`", r.Bamfile)
br, _ := bam.NewReader(fh, 0)
Expand Down
3 changes: 3 additions & 0 deletions partition.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ type Partitioner struct {
matrix [][]int64
longestRE int
clusters Clusters
// Output files
OutREfiles []string
}

// Run is the main function body of partition
Expand Down Expand Up @@ -194,6 +196,7 @@ func (r *Partitioner) splitRE() {
}
outfile := fmt.Sprintf("%s.%dg%d.txt", RemoveExt(r.Contigsfile), r.K, j+1)
writeRE(outfile, contigs)
r.OutREfiles = append(r.OutREfiles, outfile)
}
}

Expand Down

0 comments on commit 7cf2674

Please sign in to comment.