Skip to content

Commit

Permalink
Multiple REs (#14)
Browse files Browse the repository at this point in the history
* [extract] Add support for multipleREs

* [test] Remove support for go 1.11

* [cmd] Move options to the front of usage

* log.DebugF => log.NoticeF
  • Loading branch information
tanghaibao committed Dec 12, 2019
1 parent 61a95f8 commit 3e37155
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 17 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
go: [1.11, 1.12]
go: [1.12, 1.13]

steps:
- name: Set up Go ${{ matrix.go }}
Expand Down
2 changes: 1 addition & 1 deletion base.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import (

const (
// Version is the current version of ALLHIC
Version = "0.9.8"
Version = "0.9.12"
// LB is lower bound for GoldenArray
LB = 18
// UB is upper bound for GoldenArray
Expand Down
25 changes: 13 additions & 12 deletions cmd/allhic.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,14 @@ func main() {
app := cli.NewApp()
app.Compiled = time.Now()
app.Copyright = "(c) Haibao Tang, Xingtan Zhang 2017-2019"
app.Name = "ALLHIC"
app.Name = "ALLHiC"
app.Usage = "Genome scaffolding based on Hi-C data"
app.Version = allhic.Version

extractFlags := []cli.Flag{
&cli.StringFlag{
Name: "RE",
Usage: "Restriction site pattern",
Usage: "Restriction site pattern, use comma to separate multiple patterns (N is considered as [ACGT]), e.g. 'GATCGATC,GANTGATC,GANTANTC,GATCANTC'",
Value: "GATC",
},
&cli.IntFlag{
Expand Down Expand Up @@ -126,12 +126,12 @@ func main() {
Name: "extract",
Usage: "Extract Hi-C link size distribution",
UsageText: `
allhic extract bamfile fastafile [options]
allhic extract [options] bamfile fastafile
Extract function:
Given a bamfile, the goal of the extract step is to calculate an empirical
distribution of Hi-C link size based on intra-contig links. The Extract function
also prepares for the latter steps of ALLHIC.
also prepares for the latter steps of ALLHiC.
`,
Flags: extractFlags,
Action: func(c *cli.Context) error {
Expand All @@ -144,6 +144,7 @@ also prepares for the latter steps of ALLHIC.
fastafile := c.Args().Get(1)
RE := c.String("RE")
minLinks := c.Int("minLinks")
log.Noticef("RE=%s minLinks=%d", RE, minLinks)

p := allhic.Extracter{Bamfile: bamfile, Fastafile: fastafile, RE: RE, MinLinks: minLinks}
p.Run()
Expand All @@ -154,7 +155,7 @@ also prepares for the latter steps of ALLHIC.
Name: "alleles",
Usage: "Build alleles.table for `prune`",
UsageText: `
allhic alleles genome.paf genome.counts_RE.txt
allhic alleles [options] genome.paf genome.counts_RE.txt
Alleles function:
Given a paf file, we could identify and classify the allelic contigs to be used
Expand Down Expand Up @@ -182,7 +183,7 @@ ALLHiC generates "alleles.table", which can then be used for later steps.
Name: "prune",
Usage: "Prune allelic, cross-allelic and weak links",
UsageText: `
allhic prune alleles.table pairs.txt [options]
allhic prune [options] alleles.table pairs.txt
Prune function:
Given contig pairing, the goal of the prune step is to remove all inter-allelic
Expand Down Expand Up @@ -222,7 +223,7 @@ tig00030660,PRIMARY -> tig00003333,HAPLOTIG
Name: "partition",
Usage: "Separate contigs into k groups",
UsageText: `
allhic partition counts_RE.txt pairs.txt k [options]
allhic partition [options] counts_RE.txt pairs.txt k
Partition function:
Given a target k, number of partitions, the goal of the partitioning is to
Expand Down Expand Up @@ -255,7 +256,7 @@ can be generated with the "extract" sub-command.
Name: "optimize",
Usage: "Order-and-orient tigs in a group",
UsageText: `
allhic optimize counts_RE.txt clmfile [options]
allhic optimize [options] counts_RE.txt clmfile
Optimize function:
Given a set of Hi-C contacts between contigs, as specified in the
Expand Down Expand Up @@ -292,7 +293,7 @@ on a cluster).
Name: "build",
Usage: "Build genome release",
UsageText: `
allhic build tourfile1 tourfile2 ... contigs.fasta asm.chr.fasta [options]
allhic build [options] tourfile1 tourfile2 ... contigs.fasta asm.chr.fasta
Build function:
Convert the tourfile into the standard AGP file, which is then converted
Expand Down Expand Up @@ -324,7 +325,7 @@ into a FASTA genome release.
Name: "plot",
Usage: "Extract matrix of link counts and plot heatmap",
UsageText: `
allhic anchor bamfile tourfile [options]
allhic anchor [options] bamfile tourfile
Anchor function:
Given a bamfile, we extract matrix of link counts and plot heatmap.
Expand All @@ -348,7 +349,7 @@ Given a bamfile, we extract matrix of link counts and plot heatmap.
Name: "assess",
Usage: "Assess the orientations of contigs",
UsageText: `
allhic assess bamfile bedfile chr1
allhic assess [options] bamfile bedfile chr1
Assess function:
Compute the posterior probability of contig orientations after scaffolding
Expand All @@ -372,7 +373,7 @@ as a quality assessment step.
Name: "pipeline",
Usage: "Run extract-partition-optimize-build steps sequentially",
UsageText: `
allhic pipeline bamfile fastafile k [options]
allhic pipeline [options] bamfile fastafile k
Pipeline:
A convenience driver function. Chain the following steps sequentially.
Expand Down
56 changes: 53 additions & 3 deletions extract.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"io"
"math"
"os"
"regexp"
"sort"
"strings"

Expand Down Expand Up @@ -69,6 +70,13 @@ type ContigPair struct {
label string // allelic/cross-allelic/ok
}

// Pattern is a string pattern that is either simple or a regex
type Pattern struct {
pattern []byte
rePattern *regexp.Regexp
isRegex bool
}

// String outputs the string representation of ContigInfo
func (r ContigInfo) String() string {
return fmt.Sprintf("%s\t%d\t%d", r.name, r.recounts, r.length)
Expand Down Expand Up @@ -177,9 +185,50 @@ func writeRE(outfile string, contigs []*ContigInfo) {
len(contigs), totalCounts, totalBp/int64(totalCounts), outfile)
}

// MakePattern builds a regex-aware pattern that could be passed around and counted
// Multiple patterns will be split at comma (,) and N is converted to [ACGT]
func MakePattern(s string) Pattern {
rePatternStr := s
isRegex := false
if strings.Contains(s, ",") {
rePatternStr = ""
for i, pattern := range strings.Split(s, ",") {
if i != 0 {
rePatternStr += "|"
}
rePatternStr += fmt.Sprintf("(%s)", pattern)
}
isRegex = true
}
if strings.Contains(s, "N") {
rePatternStr = strings.ReplaceAll(rePatternStr, "N", "[ACGT]")
isRegex = true
}
rePattern := regexp.MustCompile(rePatternStr)
if isRegex {
log.Noticef("Compile '%s' => '%s'", s, rePatternStr)
}
return Pattern{
pattern: []byte(s),
rePattern: rePattern,
isRegex: isRegex,
}
}

// CountPattern count how many times a pattern occurs in seq
func CountPattern(seq []byte, pattern Pattern) int {
if pattern.isRegex {
if all := pattern.rePattern.FindAllIndex(seq, -1); all != nil {
return len(all)
}
return 0
}
return bytes.Count(seq, pattern.pattern)
}

// readFastaAndWriteRE writes out the number of restriction fragments, one per line
func (r *Extracter) readFastaAndWriteRE() {
outfile := RemoveExt(r.Bamfile) + ".counts_" + r.RE + ".txt"
outfile := RemoveExt(r.Bamfile) + ".counts_" + strings.ReplaceAll(r.RE, ",", "_") + ".txt"
r.OutContigsfile = outfile
mustExist(r.Fastafile)
reader, _ := fastx.NewDefaultReader(r.Fastafile)
Expand All @@ -189,6 +238,7 @@ func (r *Extracter) readFastaAndWriteRE() {
r.contigToIdx = map[string]int{}
totalCounts := 0
totalBp := int64(0)
pattern := MakePattern(r.RE)

for {
rec, err := reader.Read()
Expand All @@ -199,8 +249,8 @@ func (r *Extracter) readFastaAndWriteRE() {
name := string(rec.Name)
// Strip the sequence name to get the first part up to empty space
name = strings.Fields(name)[0]
// Add pseudocount of 1 to prevent division by zero
count := bytes.Count(rec.Seq.Seq, []byte(r.RE)) + 1
// Add pseudo-count of 1 to prevent division by zero
count := CountPattern(rec.Seq.Seq, pattern) + 1
length := rec.Seq.Length()
totalCounts += count
totalBp += int64(length)
Expand Down
36 changes: 36 additions & 0 deletions extract_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* extract_test.go
* allhic
*
* Created by Haibao Tang on 12/11/19
* Copyright © 2019 Haibao Tang. All rights reserved.
*/

package allhic_test

import (
"github.com/tanghaibao/allhic"
"testing"
)

func TestCountSimplePattern(t *testing.T) {
pattern := "GATC"
seq := []byte("GATCGATCGATC")
simplePattern := allhic.MakePattern(pattern)
got := allhic.CountPattern(seq, simplePattern)
expected := 3
if got != expected {
t.Errorf("CountPattern(#{seq}, #{pattern})=#{got}; want #{expected}")
}
}

func TestCountRegexPattern(t *testing.T) {
pattern := "GATCGATC,GANTGATC,GANTANTC,GATCANTC"
seq := []byte("GATCGATCGGACTGATCGACCGATCACTCACGCTAAATGCAGAATCGATTATTC")
regexPattern := allhic.MakePattern(pattern)
got := allhic.CountPattern(seq, regexPattern)
expected := 4
if got != expected {
t.Errorf("CountPattern(#{seq}, #{pattern})=#{got}; want #{expected}")
}
}

0 comments on commit 3e37155

Please sign in to comment.