Skip to content

Commit 3aab61c

Browse files
authored
add sequencing functions (#49)
* Add sequencing * add checks for minimap2 * Add external functions needed for sequencing example * add megamash (#50) * changed how generics work * add some docs to megamash * Added changelog
1 parent 919d685 commit 3aab61c

27 files changed

+923
-154
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
7171
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7272

7373
## [Unreleased]
74+
- Added the megamash algorithm [#50](https://github.com/Koeng101/dnadesign/pull/50)
75+
- Changed parsers to return values instead of pointers. Added some sequencing utils [#49](https://github.com/Koeng101/dnadesign/pull/49)
7476
- Added minimap2 and samtools(pileup) integrations in external [#46](https://github.com/Koeng101/dnadesign/pull/46)
7577
- Added sam parser [#5](https://github.com/Koeng101/dnadesign/pull/5)
7678
- Added the LinearFold folding algorithms [#38](https://github.com/Koeng101/dnadesign/pull/38)

external/minimap2/minimap2.go

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,15 @@ For more information on minimap2, please visit Heng Li's git: https://github.com
2020
package minimap2
2121

2222
import (
23+
"context"
2324
"io"
2425
"os"
2526
"os/exec"
27+
28+
"github.com/koeng101/dnadesign/lib/bio"
29+
"github.com/koeng101/dnadesign/lib/bio/fastq"
30+
"github.com/koeng101/dnadesign/lib/bio/sam"
31+
"golang.org/x/sync/errgroup"
2632
)
2733

2834
// Minimap2 aligns sequences using minimap2 over the command line. Right
@@ -65,7 +71,7 @@ func Minimap2(templateFastaInput io.Reader, fastqInput io.Reader, w io.Writer) e
6571
tmpFile.Close() // Close the file as it's no longer needed
6672

6773
// Start minimap2 pointing to the temporary file and stdin for sequencing data
68-
cmd := exec.Command("minimap2", "-ax", "map-ont", tmpFile.Name(), "-")
74+
cmd := exec.Command("minimap2", "-K", "100", "-ax", "map-ont", tmpFile.Name(), "-")
6975
cmd.Stdout = w
7076
cmd.Stdin = fastqInput
7177
if err := cmd.Start(); err != nil {
@@ -74,3 +80,52 @@ func Minimap2(templateFastaInput io.Reader, fastqInput io.Reader, w io.Writer) e
7480

7581
return cmd.Wait()
7682
}
83+
84+
// Minimap2Channeled uses channels rather than io.Reader and io.Writers.
85+
func Minimap2Channeled(fastaTemplates io.Reader, fastqChan <-chan fastq.Read, samChan chan<- sam.Alignment) error {
86+
ctx := context.Background()
87+
g, ctx := errgroup.WithContext(ctx)
88+
89+
// Create a pipe for writing fastq reads and reading them as an io.Reader
90+
fastqPr, fastqPw := io.Pipe()
91+
92+
// Goroutine to consume fastq reads and write them to the PipeWriter
93+
g.Go(func() error {
94+
defer fastqPw.Close()
95+
for read := range fastqChan {
96+
_, err := read.WriteTo(fastqPw)
97+
if err != nil {
98+
return err // return error to be handled by errgroup
99+
}
100+
}
101+
return nil
102+
})
103+
104+
// Create a pipe for SAM alignments.
105+
samPr, samPw := io.Pipe()
106+
107+
// Use Minimap2 function to process the reads and write SAM alignments.
108+
g.Go(func() error {
109+
defer samPw.Close()
110+
return Minimap2(fastaTemplates, fastqPr, samPw) // Minimap2 writes to samPw
111+
})
112+
113+
// Create a SAM parser from samPr (the PipeReader connected to Minimap2 output).
114+
samParser, err := bio.NewSamParser(samPr)
115+
if err != nil {
116+
return err
117+
}
118+
119+
// Parsing SAM and sending to channel.
120+
g.Go(func() error {
121+
return samParser.ParseToChannel(ctx, samChan, false)
122+
})
123+
124+
// Wait for all goroutines in the group to finish.
125+
if err := g.Wait(); err != nil {
126+
return err // This will return the first non-nil error from the group of goroutines
127+
}
128+
129+
// At this point, all goroutines have finished successfully
130+
return nil
131+
}

lib/align/megamash/megamash.go

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
/*
2+
Package megamash is an implementation of the megamash algorithm.
3+
4+
Megamash is an algorithm developed by Keoni Gandall to find templates from
5+
sequencing reactions. For example, you may have a pool of amplicons, and need
6+
to get a count of how many times each amplicon shows up in a given sequencing
7+
reaction.
8+
*/
9+
package megamash
10+
11+
import (
12+
"context"
13+
"fmt"
14+
15+
"github.com/koeng101/dnadesign/lib/bio/fasta"
16+
"github.com/koeng101/dnadesign/lib/bio/fastq"
17+
"github.com/koeng101/dnadesign/lib/transform"
18+
)
19+
20+
// StandardizedDNA returns the alphabetically lesser strand of a double
21+
// stranded DNA molecule.
22+
func StandardizedDNA(sequence string) string {
23+
var deterministicSequence string
24+
reverseComplement := transform.ReverseComplement(sequence)
25+
if sequence > reverseComplement {
26+
deterministicSequence = reverseComplement
27+
} else {
28+
deterministicSequence = sequence
29+
}
30+
return deterministicSequence
31+
}
32+
33+
// Default parameters that can be passed to NewMegamashMap.
var (
	// DefaultKmerSize is the default value for the kmerSize argument.
	DefaultKmerSize uint = 16
	// DefaultMinimalKmerCount is the default value for the kmerMinimalCount
	// argument.
	DefaultMinimalKmerCount uint = 10
	// DefaultScoreThreshold is the default value for the threshold argument.
	DefaultScoreThreshold float64 = 0.2
)
38+
39+
// MegamashMap holds, for a set of template sequences, the kmers that
// distinguish each template from the others. Build one with NewMegamashMap
// and query it with Match.
type MegamashMap struct {
	// Identifiers lists the template identifiers in insertion order.
	Identifiers []string
	// Kmers holds one kmer set per template. A value of true marks a kmer
	// that is unique to that template; NewMegamashMap flips a value to false
	// when a template added later turns out to contain the same kmer.
	Kmers []map[string]bool
	// KmerSize is the kmer length the map was built with.
	KmerSize uint
	// KmerMinimalCount is the minimal number of unique kmers NewMegamashMap
	// requires of a template.
	KmerMinimalCount uint
	// Threshold is the score a template must exceed for Match to report it.
	Threshold float64
}
46+
47+
// NewMegamashMap creates a megamash map that can be searched against.
48+
func NewMegamashMap(sequences []fasta.Record, kmerSize uint, kmerMinimalCount uint, threshold float64) (MegamashMap, error) {
49+
var megamashMap MegamashMap
50+
megamashMap.KmerSize = kmerSize
51+
megamashMap.KmerMinimalCount = kmerMinimalCount
52+
megamashMap.Threshold = threshold
53+
54+
for _, fastaRecord := range sequences {
55+
megamashMap.Identifiers = append(megamashMap.Identifiers, fastaRecord.Identifier)
56+
sequence := fastaRecord.Sequence
57+
58+
// First get all kmers with a given sequence
59+
kmerMap := make(map[string]bool)
60+
for i := 0; i <= len(sequence)-int(kmerSize); i++ {
61+
kmerString := StandardizedDNA(sequence[i : i+int(kmerSize)])
62+
kmerMap[kmerString] = true
63+
}
64+
65+
// Then, get unique kmers for this sequence and only this sequence
66+
uniqueKmerMap := make(map[string]bool)
67+
for kmerBase64 := range kmerMap {
68+
unique := true
69+
for _, otherMegaMashMap := range megamashMap.Kmers {
70+
_, ok := otherMegaMashMap[kmerBase64]
71+
// If this kmer is found, set both to false
72+
if ok {
73+
otherMegaMashMap[kmerBase64] = false
74+
unique = false
75+
break
76+
}
77+
}
78+
if unique {
79+
uniqueKmerMap[kmerBase64] = true
80+
}
81+
}
82+
// Check if we have the minimal kmerCount
83+
var kmerCount uint = 0
84+
for _, unique := range uniqueKmerMap {
85+
if unique {
86+
kmerCount++
87+
}
88+
}
89+
if kmerCount < kmerMinimalCount {
90+
return megamashMap, fmt.Errorf("Got only %d unique kmers of required %d for sequence %s", kmerCount, kmerMinimalCount, fastaRecord.Identifier)
91+
}
92+
93+
// Now we have a unique Kmer map for the given sequence.
94+
// Add it to megamashMap
95+
megamashMap.Kmers = append(megamashMap.Kmers, uniqueKmerMap)
96+
}
97+
return megamashMap, nil
98+
}
99+
100+
// Match is a single search result: the identifier of a template that the
// searched sequence potentially matches, along with the score of that match.
type Match struct {
	// Identifier names the matched template.
	Identifier string
	// Score is the score the searched sequence obtained against the template.
	Score float64
}
106+
107+
// Match matches a sequence to all the sequences in a megamash map.
108+
func (m *MegamashMap) Match(sequence string) []Match {
109+
var scores []float64
110+
// The algorithm is as follows:
111+
// - Go through each map.
112+
// - Get the number of matching kmers
113+
// - Divide that by the total kmers available for matching
114+
115+
// First, get the kmer total
116+
var kmerSize int
117+
out:
118+
for _, maps := range m.Kmers {
119+
for kmer := range maps {
120+
kmerSize = len(kmer)
121+
break out
122+
}
123+
}
124+
125+
// Now, iterate through each map
126+
for _, sequenceMap := range m.Kmers {
127+
var score float64
128+
var totalKmers = len(sequenceMap)
129+
var matchedKmers int
130+
for i := 0; i <= len(sequence)-kmerSize; i++ {
131+
kmerString := StandardizedDNA(sequence[i : i+kmerSize])
132+
unique, ok := sequenceMap[kmerString]
133+
if ok && unique {
134+
matchedKmers++
135+
}
136+
}
137+
if totalKmers == 0 {
138+
score = 0
139+
} else {
140+
score = float64(matchedKmers) / float64(totalKmers)
141+
}
142+
scores = append(scores, score)
143+
}
144+
145+
var matches []Match
146+
for i, score := range scores {
147+
if score > m.Threshold {
148+
matches = append(matches, Match{Identifier: m.Identifiers[i], Score: score})
149+
}
150+
}
151+
return matches
152+
}
153+
154+
// FastqMatchChannel processes a channel of fastq.Read and pushes to a channel of matches.
155+
func (m *MegamashMap) FastqMatchChannel(ctx context.Context, sequences <-chan fastq.Read, matches chan<- []Match) error {
156+
for {
157+
select {
158+
case <-ctx.Done():
159+
// Clean up resources, handle cancellation.
160+
return ctx.Err()
161+
case sequence, ok := <-sequences:
162+
if !ok {
163+
close(matches)
164+
return nil
165+
}
166+
sequenceMatches := m.Match(sequence.Sequence)
167+
matches <- sequenceMatches
168+
}
169+
}
170+
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
package megamash
2+
3+
import (
4+
"testing"
5+
6+
"github.com/koeng101/dnadesign/lib/bio/fasta"
7+
)
8+
9+
func TestMegamash(t *testing.T) {
10+
oligo1 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTCTCAATGACCAAACCAACGCAAGTCTTAGTTCGTTCAGTCTCTATTTTATTCTTCATCACACTGTTGCACTTGGTTGTTGCAATGAGATTTCCTAGTATTTTCACTGCTGTGCTGAGACCCGGATCGAACTTAGGTAGCCT"
11+
oligo2 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCACTAGTCATAAT"
12+
oligo3 := "CCGTGCGACAAGATTTCAAGGGTCTCTCTTCTATCGCAGCCAAGGAAGAAGGTGTATCTCTAGAGAAGCGTCGAGTGAGACCCGGATCGAACTTAGGTAGCCCCCTTCGAAGTGGCTCTGTCTGATCCTCCGCGGATGGCGACACCATCGGACTGAGGATATTGGCCACA"
13+
14+
samples := []string{"TTTTGTCTACTTCGTTCCGTTGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGACGGCGCCTCCGTGCGACGAGATTTCAAGGGTCTCTGTGCTATATTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCAGATCGACTTTTAGATTCCTCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGTGG", "TGTCCTTTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTACTTATCGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTATCTGAGACCGAAGTGGTTTGCCTAAACGCAGGTGCTGTTGGCAAAGGCAGAAAGTAGTCTTAACCTTGACAATGAGTGGTA", "GTTATTGTCGTCTCCTTTGACTCAGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTGCTGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTCCGCTTCTATCTGAGACCGAAGTGGTTAT", "TGTTCTGTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTCTGCCTTAGAGACCACGCCTCCGTGCGACAAGATTCAAGGGTCTCTGTGCTCTGCCGCTAGTTCCGCTCTAGCTGCTCCGGTATGCATCTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAACTGTTGGTT"}
15+
m, err := NewMegamashMap([]fasta.Record{{Sequence: oligo1, Identifier: "oligo1"}, {Sequence: oligo2, Identifier: "oligo2"}, {Sequence: oligo3, Identifier: "oligo3"}}, DefaultKmerSize, DefaultMinimalKmerCount, DefaultScoreThreshold)
16+
if err != nil {
17+
t.Errorf("Failed to make NewMegamashMap: %s", err)
18+
}
19+
for _, sample := range samples {
20+
scores := m.Match(sample)
21+
if scores[0].Identifier != "oligo2" {
22+
t.Errorf("Should have gotten oligo2. Got: %s", scores[0].Identifier)
23+
}
24+
}
25+
}
26+
27+
func BenchmarkMegamash(b *testing.B) {
28+
for i := 0; i < b.N; i++ {
29+
oligo1 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTCTCAATGACCAAACCAACGCAAGTCTTAGTTCGTTCAGTCTCTATTTTATTCTTCATCACACTGTTGCACTTGGTTGTTGCAATGAGATTTCCTAGTATTTTCACTGCTGTGCTGAGACCCGGATCGAACTTAGGTAGCCT"
30+
oligo2 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCACTAGTCATAAT"
31+
oligo3 := "CCGTGCGACAAGATTTCAAGGGTCTCTCTTCTATCGCAGCCAAGGAAGAAGGTGTATCTCTAGAGAAGCGTCGAGTGAGACCCGGATCGAACTTAGGTAGCCCCCTTCGAAGTGGCTCTGTCTGATCCTCCGCGGATGGCGACACCATCGGACTGAGGATATTGGCCACA"
32+
33+
samples := []string{"TTTTGTCTACTTCGTTCCGTTGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGACGGCGCCTCCGTGCGACGAGATTTCAAGGGTCTCTGTGCTATATTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCAGATCGACTTTTAGATTCCTCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGTGG", "TGTCCTTTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTACTTATCGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTATCTGAGACCGAAGTGGTTTGCCTAAACGCAGGTGCTGTTGGCAAAGGCAGAAAGTAGTCTTAACCTTGACAATGAGTGGTA", "GTTATTGTCGTCTCCTTTGACTCAGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTGCTGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTCCGCTTCTATCTGAGACCGAAGTGGTTAT", "TGTTCTGTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTCTGCCTTAGAGACCACGCCTCCGTGCGACAAGATTCAAGGGTCTCTGTGCTCTGCCGCTAGTTCCGCTCTAGCTGCTCCGGTATGCATCTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAACTGTTGGTT"}
34+
m, _ := NewMegamashMap([]fasta.Record{{Sequence: oligo1, Identifier: "oligo1"}, {Sequence: oligo2, Identifier: "oligo2"}, {Sequence: oligo3, Identifier: "oligo3"}}, DefaultKmerSize, DefaultMinimalKmerCount, DefaultScoreThreshold)
35+
for _, sample := range samples {
36+
_ = m.Match(sample)
37+
}
38+
}
39+
}

0 commit comments

Comments
 (0)