From 3e37155a3d1fd97c6cee87842017d45183c8c166 Mon Sep 17 00:00:00 2001 From: Haibao Tang Date: Wed, 11 Dec 2019 23:56:44 -0800 Subject: [PATCH] Multiple REs (#14) * [extract] Add support for multipleREs * [test] Remove support for go 1.11 * [cmd] Move options to the front of usage * log.DebugF => log.NoticeF --- .github/workflows/build.yml | 2 +- base.go | 2 +- cmd/allhic.go | 25 +++++++++-------- extract.go | 56 +++++++++++++++++++++++++++++++++++-- extract_test.go | 36 ++++++++++++++++++++++++ 5 files changed, 104 insertions(+), 17 deletions(-) create mode 100644 extract_test.go diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6f0a328..5ee3c2e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -7,7 +7,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest] - go: [1.11, 1.12] + go: [1.12, 1.13] steps: - name: Set up Go ${{ matrix.go }} diff --git a/base.go b/base.go index 0494ab3..bfbe4a6 100644 --- a/base.go +++ b/base.go @@ -25,7 +25,7 @@ import ( const ( // Version is the current version of ALLHIC - Version = "0.9.8" + Version = "0.9.12" // LB is lower bound for GoldenArray LB = 18 // UB is upper bound for GoldenArray diff --git a/cmd/allhic.go b/cmd/allhic.go index 6bec8f4..4b128de 100644 --- a/cmd/allhic.go +++ b/cmd/allhic.go @@ -55,14 +55,14 @@ func main() { app := cli.NewApp() app.Compiled = time.Now() app.Copyright = "(c) Haibao Tang, Xingtan Zhang 2017-2019" - app.Name = "ALLHIC" + app.Name = "ALLHiC" app.Usage = "Genome scaffolding based on Hi-C data" app.Version = allhic.Version extractFlags := []cli.Flag{ &cli.StringFlag{ Name: "RE", - Usage: "Restriction site pattern", + Usage: "Restriction site pattern, use comma to separate multiple patterns (N is considered as [ACGT]), e.g. 'GATCGATC,GANTGATC,GANTANTC,GATCANTC'", Value: "GATC", }, &cli.IntFlag{ @@ -126,12 +126,12 @@ func main() { Name: "extract", Usage: "Extract Hi-C link size distribution", UsageText: ` - allhic extract bamfile fastafile [options] + allhic extract [options] bamfile fastafile Extract function: Given a bamfile, the goal of the extract step is to calculate an empirical distribution of Hi-C link size based on intra-contig links. The Extract function -also prepares for the latter steps of ALLHIC. +also prepares for the latter steps of ALLHiC. `, Flags: extractFlags, Action: func(c *cli.Context) error { @@ -144,6 +144,7 @@ also prepares for the latter steps of ALLHIC. fastafile := c.Args().Get(1) RE := c.String("RE") minLinks := c.Int("minLinks") + log.Noticef("RE=%s minLinks=%d", RE, minLinks) p := allhic.Extracter{Bamfile: bamfile, Fastafile: fastafile, RE: RE, MinLinks: minLinks} p.Run() @@ -154,7 +155,7 @@ also prepares for the latter steps of ALLHIC. Name: "alleles", Usage: "Build alleles.table for `prune`", UsageText: ` - allhic alleles genome.paf genome.counts_RE.txt + allhic alleles [options] genome.paf genome.counts_RE.txt Alleles function: Given a paf file, we could identify and classify the allelic contigs to be used @@ -182,7 +183,7 @@ ALLHiC generates "alleles.table", which can then be used for later steps. Name: "prune", Usage: "Prune allelic, cross-allelic and weak links", UsageText: ` - allhic prune alleles.table pairs.txt [options] + allhic prune [options] alleles.table pairs.txt Prune function: Given contig pairing, the goal of the prune step is to remove all inter-allelic @@ -222,7 +223,7 @@ tig00030660,PRIMARY -> tig00003333,HAPLOTIG Name: "partition", Usage: "Separate contigs into k groups", UsageText: ` - allhic partition counts_RE.txt pairs.txt k [options] + allhic partition [options] counts_RE.txt pairs.txt k Partition function: Given a target k, number of partitions, the goal of the partitioning is to @@ -255,7 +256,7 @@ can be generated with the "extract" sub-command. Name: "optimize", Usage: "Order-and-orient tigs in a group", UsageText: ` - allhic optimize counts_RE.txt clmfile [options] + allhic optimize [options] counts_RE.txt clmfile Optimize function: Given a set of Hi-C contacts between contigs, as specified in the @@ -292,7 +293,7 @@ on a cluster). Name: "build", Usage: "Build genome release", UsageText: ` - allhic build tourfile1 tourfile2 ... contigs.fasta asm.chr.fasta [options] + allhic build [options] tourfile1 tourfile2 ... contigs.fasta asm.chr.fasta Build function: Convert the tourfile into the standard AGP file, which is then converted @@ -324,7 +325,7 @@ into a FASTA genome release. Name: "plot", Usage: "Extract matrix of link counts and plot heatmap", UsageText: ` - allhic anchor bamfile tourfile [options] + allhic anchor [options] bamfile tourfile Anchor function: Given a bamfile, we extract matrix of link counts and plot heatmap. @@ -348,7 +349,7 @@ Given a bamfile, we extract matrix of link counts and plot heatmap. Name: "assess", Usage: "Assess the orientations of contigs", UsageText: ` - allhic assess bamfile bedfile chr1 + allhic assess [options] bamfile bedfile chr1 Assess function: Compute the posterior probability of contig orientations after scaffolding @@ -372,7 +373,7 @@ as a quality assessment step. Name: "pipeline", Usage: "Run extract-partition-optimize-build steps sequentially", UsageText: ` - allhic pipeline bamfile fastafile k [options] + allhic pipeline [options] bamfile fastafile k Pipeline: A convenience driver function. Chain the following steps sequentially. diff --git a/extract.go b/extract.go index 4769bcc..6d3f291 100644 --- a/extract.go +++ b/extract.go @@ -16,6 +16,7 @@ import ( "io" "math" "os" + "regexp" "sort" "strings" @@ -69,6 +70,13 @@ type ContigPair struct { label string // allelic/cross-allelic/ok } +// Pattern is a string pattern that is either simple or a regex +type Pattern struct { + pattern []byte + rePattern *regexp.Regexp + isRegex bool +} + // String outputs the string representation of ContigInfo func (r ContigInfo) String() string { return fmt.Sprintf("%s\t%d\t%d", r.name, r.recounts, r.length) @@ -177,9 +185,50 @@ func writeRE(outfile string, contigs []*ContigInfo) { len(contigs), totalCounts, totalBp/int64(totalCounts), outfile) } +// MakePattern builds a regex-aware pattern that could be passed around and counted +// Multiple patterns will be split at comma (,) and N is converted to [ACGT] +func MakePattern(s string) Pattern { + rePatternStr := s + isRegex := false + if strings.Contains(s, ",") { + rePatternStr = "" + for i, pattern := range strings.Split(s, ",") { + if i != 0 { + rePatternStr += "|" + } + rePatternStr += fmt.Sprintf("(%s)", pattern) + } + isRegex = true + } + if strings.Contains(s, "N") { + rePatternStr = strings.ReplaceAll(rePatternStr, "N", "[ACGT]") + isRegex = true + } + rePattern := regexp.MustCompile(rePatternStr) + if isRegex { + log.Noticef("Compile '%s' => '%s'", s, rePatternStr) + } + return Pattern{ + pattern: []byte(s), + rePattern: rePattern, + isRegex: isRegex, + } +} + +// CountPattern count how many times a pattern occurs in seq +func CountPattern(seq []byte, pattern Pattern) int { + if pattern.isRegex { + if all := pattern.rePattern.FindAllIndex(seq, -1); all != nil { + return len(all) + } + return 0 + } + return bytes.Count(seq, pattern.pattern) +} + // readFastaAndWriteRE writes out the number of restriction fragments, one per line func (r *Extracter) readFastaAndWriteRE() { - outfile := RemoveExt(r.Bamfile) + ".counts_" + r.RE + ".txt" + outfile := RemoveExt(r.Bamfile) + ".counts_" + strings.ReplaceAll(r.RE, ",", "_") + ".txt" r.OutContigsfile = outfile mustExist(r.Fastafile) reader, _ := fastx.NewDefaultReader(r.Fastafile) @@ -189,6 +238,7 @@ func (r *Extracter) readFastaAndWriteRE() { r.contigToIdx = map[string]int{} totalCounts := 0 totalBp := int64(0) + pattern := MakePattern(r.RE) for { rec, err := reader.Read() @@ -199,8 +249,8 @@ func (r *Extracter) readFastaAndWriteRE() { name := string(rec.Name) // Strip the sequence name to get the first part up to empty space name = strings.Fields(name)[0] - // Add pseudocount of 1 to prevent division by zero - count := bytes.Count(rec.Seq.Seq, []byte(r.RE)) + 1 + // Add pseudo-count of 1 to prevent division by zero + count := CountPattern(rec.Seq.Seq, pattern) + 1 length := rec.Seq.Length() totalCounts += count totalBp += int64(length) diff --git a/extract_test.go b/extract_test.go new file mode 100644 index 0000000..c045c1e --- /dev/null +++ b/extract_test.go @@ -0,0 +1,36 @@ +/* + * extract_test.go + * allhic + * + * Created by Haibao Tang on 12/11/19 + * Copyright © 2019 Haibao Tang. All rights reserved. + */ + +package allhic_test + +import ( + "github.com/tanghaibao/allhic" + "testing" +) + +func TestCountSimplePattern(t *testing.T) { + pattern := "GATC" + seq := []byte("GATCGATCGATC") + simplePattern := allhic.MakePattern(pattern) + got := allhic.CountPattern(seq, simplePattern) + expected := 3 + if got != expected { + t.Errorf("CountPattern(#{seq}, #{pattern})=#{got}; want #{expected}") + } +} + +func TestCountRegexPattern(t *testing.T) { + pattern := "GATCGATC,GANTGATC,GANTANTC,GATCANTC" + seq := []byte("GATCGATCGGACTGATCGACCGATCACTCACGCTAAATGCAGAATCGATTATTC") + regexPattern := allhic.MakePattern(pattern) + got := allhic.CountPattern(seq, regexPattern) + expected := 4 + if got != expected { + t.Errorf("CountPattern(#{seq}, #{pattern})=#{got}; want #{expected}") + } +}