Koeng101
diff --git a/‎README.md‎
Lines changed: 2 additions & 0 deletions b/‎README.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎external/minimap2/minimap2.go‎
Lines changed: 56 additions & 1 deletion b/‎external/minimap2/minimap2.go‎
Lines changed: 56 additions & 1 deletion
diff --git a/‎lib/align/megamash/megamash.go‎
Lines changed: 170 additions & 0 deletions b/‎lib/align/megamash/megamash.go‎
Lines changed: 170 additions & 0 deletions
diff --git a/‎lib/align/megamash/megamash_test.go‎
Lines changed: 39 additions & 0 deletions b/‎lib/align/megamash/megamash_test.go‎
Lines changed: 39 additions & 0 deletions
@@ -71,6 +71,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+- Added the megamash algorithm [#50](https://github.com/Koeng101/dnadesign/pull/50)
+- Changed parsers to return values instead of pointers. Added some sequencing utils [#49](https://github.com/Koeng101/dnadesign/pull/49)
 - Added minimap2 and samtools(pileup) integrations in external [#46](https://github.com/Koeng101/dnadesign/pull/46)
 - Added sam parser [#5](https://github.com/Koeng101/dnadesign/pull/5)
 - Added the LinearFold folding algorithms [#38](https://github.com/Koeng101/dnadesign/pull/38)
 
@@ -20,9 +20,15 @@ For more information on minimap2, please visit Heng Li's git: https://github.com
 package minimap2
 
 import (
+	"context"
 	"io"
 	"os"
 	"os/exec"
+
+	"github.com/koeng101/dnadesign/lib/bio"
+	"github.com/koeng101/dnadesign/lib/bio/fastq"
+	"github.com/koeng101/dnadesign/lib/bio/sam"
+	"golang.org/x/sync/errgroup"
 )
 
 // Minimap2 aligns sequences using minimap2 over the command line. Right
@@ -65,7 +71,7 @@ func Minimap2(templateFastaInput io.Reader, fastqInput io.Reader, w io.Writer) e
 	tmpFile.Close() // Close the file as it's no longer needed
 
 	// Start minimap2 pointing to the temporary file and stdin for sequencing data
-	cmd := exec.Command("minimap2", "-ax", "map-ont", tmpFile.Name(), "-")
+	cmd := exec.Command("minimap2", "-K", "100", "-ax", "map-ont", tmpFile.Name(), "-")
 	cmd.Stdout = w
 	cmd.Stdin = fastqInput
 	if err := cmd.Start(); err != nil {
@@ -74,3 +80,52 @@ func Minimap2(templateFastaInput io.Reader, fastqInput io.Reader, w io.Writer) e
 
 	return cmd.Wait()
 }
+
+// Minimap2Channeled uses channels rather than io.Reader and io.Writers.
+func Minimap2Channeled(fastaTemplates io.Reader, fastqChan <-chan fastq.Read, samChan chan<- sam.Alignment) error {
+	ctx := context.Background()
+	g, ctx := errgroup.WithContext(ctx)
+
+	// Create a pipe for writing fastq reads and reading them as an io.Reader
+	fastqPr, fastqPw := io.Pipe()
+
+	// Goroutine to consume fastq reads and write them to the PipeWriter
+	g.Go(func() error {
+		defer fastqPw.Close()
+		for read := range fastqChan {
+			_, err := read.WriteTo(fastqPw)
+			if err != nil {
+				return err // return error to be handled by errgroup
+			}
+		}
+		return nil
+	})
+
+	// Create a pipe for SAM alignments.
+	samPr, samPw := io.Pipe()
+
+	// Use Minimap2 function to process the reads and write SAM alignments.
+	g.Go(func() error {
+		defer samPw.Close()
+		return Minimap2(fastaTemplates, fastqPr, samPw) // Minimap2 writes to samPw
+	})
+
+	// Create a SAM parser from samPr (the PipeReader connected to Minimap2 output).
+	samParser, err := bio.NewSamParser(samPr)
+	if err != nil {
+		return err
+	}
+
+	// Parsing SAM and sending to channel.
+	g.Go(func() error {
+		return samParser.ParseToChannel(ctx, samChan, false)
+	})
+
+	// Wait for all goroutines in the group to finish.
+	if err := g.Wait(); err != nil {
+		return err // This will return the first non-nil error from the group of goroutines
+	}
+
+	// At this point, all goroutines have finished successfully
+	return nil
+}
@@ -0,0 +1,170 @@
+/*
+Package megamash is an implementation of the megamash algorithm.
+
+Megamash is an algorithm developed by Keoni Gandall to find templates from
+sequencing reactions. For example, you may have a pool of amplicons, and need
+to get a count of how many times each amplicon shows up in a given sequencing
+reaction.
+*/
+package megamash
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/koeng101/dnadesign/lib/bio/fasta"
+	"github.com/koeng101/dnadesign/lib/bio/fastq"
+	"github.com/koeng101/dnadesign/lib/transform"
+)
+
+// StandardizedDNA returns the alphabetically lesser strand of a double
+// stranded DNA molecule.
+func StandardizedDNA(sequence string) string {
+	var deterministicSequence string
+	reverseComplement := transform.ReverseComplement(sequence)
+	if sequence > reverseComplement {
+		deterministicSequence = reverseComplement
+	} else {
+		deterministicSequence = sequence
+	}
+	return deterministicSequence
+}
+
+var (
+	DefaultKmerSize         uint    = 16
+	DefaultMinimalKmerCount uint    = 10
+	DefaultScoreThreshold   float64 = 0.2
+)
+
+type MegamashMap struct {
+	Identifiers      []string
+	Kmers            []map[string]bool
+	KmerSize         uint
+	KmerMinimalCount uint
+	Threshold        float64
+}
+
+// NewMegamashMap creates a megamash map that can be searched against.
+func NewMegamashMap(sequences []fasta.Record, kmerSize uint, kmerMinimalCount uint, threshold float64) (MegamashMap, error) {
+	var megamashMap MegamashMap
+	megamashMap.KmerSize = kmerSize
+	megamashMap.KmerMinimalCount = kmerMinimalCount
+	megamashMap.Threshold = threshold
+
+	for _, fastaRecord := range sequences {
+		megamashMap.Identifiers = append(megamashMap.Identifiers, fastaRecord.Identifier)
+		sequence := fastaRecord.Sequence
+
+		// First get all kmers with a given sequence
+		kmerMap := make(map[string]bool)
+		for i := 0; i <= len(sequence)-int(kmerSize); i++ {
+			kmerString := StandardizedDNA(sequence[i : i+int(kmerSize)])
+			kmerMap[kmerString] = true
+		}
+
+		// Then, get unique kmers for this sequence and only this sequence
+		uniqueKmerMap := make(map[string]bool)
+		for kmerBase64 := range kmerMap {
+			unique := true
+			for _, otherMegaMashMap := range megamashMap.Kmers {
+				_, ok := otherMegaMashMap[kmerBase64]
+				// If this kmer is found, set both to false
+				if ok {
+					otherMegaMashMap[kmerBase64] = false
+					unique = false
+					break
+				}
+			}
+			if unique {
+				uniqueKmerMap[kmerBase64] = true
+			}
+		}
+		// Check if we have the minimal kmerCount
+		var kmerCount uint = 0
+		for _, unique := range uniqueKmerMap {
+			if unique {
+				kmerCount++
+			}
+		}
+		if kmerCount < kmerMinimalCount {
+			return megamashMap, fmt.Errorf("Got only %d unique kmers of required %d for sequence %s", kmerCount, kmerMinimalCount, fastaRecord.Identifier)
+		}
+
+		// Now we have a unique Kmer map for the given sequence.
+		// Add it to megamashMap
+		megamashMap.Kmers = append(megamashMap.Kmers, uniqueKmerMap)
+	}
+	return megamashMap, nil
+}
+
+// Match contains the identifier and score of a potential match to the searched
+// sequence.
+type Match struct {
+	Identifier string
+	Score      float64
+}
+
+// Match matches a sequence to all the sequences in a megamash map.
+func (m *MegamashMap) Match(sequence string) []Match {
+	var scores []float64
+	// The algorithm is as follows:
+	// - Go through each map.
+	// - Get the number of matching kmers
+	// - Divide that by the total kmers available for matching
+
+	// First, get the kmer total
+	var kmerSize int
+out:
+	for _, maps := range m.Kmers {
+		for kmer := range maps {
+			kmerSize = len(kmer)
+			break out
+		}
+	}
+
+	// Now, iterate through each map
+	for _, sequenceMap := range m.Kmers {
+		var score float64
+		var totalKmers = len(sequenceMap)
+		var matchedKmers int
+		for i := 0; i <= len(sequence)-kmerSize; i++ {
+			kmerString := StandardizedDNA(sequence[i : i+kmerSize])
+			unique, ok := sequenceMap[kmerString]
+			if ok && unique {
+				matchedKmers++
+			}
+		}
+		if totalKmers == 0 {
+			score = 0
+		} else {
+			score = float64(matchedKmers) / float64(totalKmers)
+		}
+		scores = append(scores, score)
+	}
+
+	var matches []Match
+	for i, score := range scores {
+		if score > m.Threshold {
+			matches = append(matches, Match{Identifier: m.Identifiers[i], Score: score})
+		}
+	}
+	return matches
+}
+
+// FastqMatchChannel processes a channel of fastq.Read and pushes to a channel of matches.
+func (m *MegamashMap) FastqMatchChannel(ctx context.Context, sequences <-chan fastq.Read, matches chan<- []Match) error {
+	for {
+		select {
+		case <-ctx.Done():
+			// Clean up resources, handle cancellation.
+			return ctx.Err()
+		case sequence, ok := <-sequences:
+			if !ok {
+				close(matches)
+				return nil
+			}
+			sequenceMatches := m.Match(sequence.Sequence)
+			matches <- sequenceMatches
+		}
+	}
+}
@@ -0,0 +1,39 @@
+package megamash
+
+import (
+	"testing"
+
+	"github.com/koeng101/dnadesign/lib/bio/fasta"
+)
+
+func TestMegamash(t *testing.T) {
+	oligo1 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTCTCAATGACCAAACCAACGCAAGTCTTAGTTCGTTCAGTCTCTATTTTATTCTTCATCACACTGTTGCACTTGGTTGTTGCAATGAGATTTCCTAGTATTTTCACTGCTGTGCTGAGACCCGGATCGAACTTAGGTAGCCT"
+	oligo2 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCACTAGTCATAAT"
+	oligo3 := "CCGTGCGACAAGATTTCAAGGGTCTCTCTTCTATCGCAGCCAAGGAAGAAGGTGTATCTCTAGAGAAGCGTCGAGTGAGACCCGGATCGAACTTAGGTAGCCCCCTTCGAAGTGGCTCTGTCTGATCCTCCGCGGATGGCGACACCATCGGACTGAGGATATTGGCCACA"
+
+	samples := []string{"TTTTGTCTACTTCGTTCCGTTGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGACGGCGCCTCCGTGCGACGAGATTTCAAGGGTCTCTGTGCTATATTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCAGATCGACTTTTAGATTCCTCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGTGG", "TGTCCTTTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTACTTATCGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTATCTGAGACCGAAGTGGTTTGCCTAAACGCAGGTGCTGTTGGCAAAGGCAGAAAGTAGTCTTAACCTTGACAATGAGTGGTA", "GTTATTGTCGTCTCCTTTGACTCAGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTGCTGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTCCGCTTCTATCTGAGACCGAAGTGGTTAT", "TGTTCTGTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTCTGCCTTAGAGACCACGCCTCCGTGCGACAAGATTCAAGGGTCTCTGTGCTCTGCCGCTAGTTCCGCTCTAGCTGCTCCGGTATGCATCTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAACTGTTGGTT"}
+	m, err := NewMegamashMap([]fasta.Record{{Sequence: oligo1, Identifier: "oligo1"}, {Sequence: oligo2, Identifier: "oligo2"}, {Sequence: oligo3, Identifier: "oligo3"}}, DefaultKmerSize, DefaultMinimalKmerCount, DefaultScoreThreshold)
+	if err != nil {
+		t.Errorf("Failed to make NewMegamashMap: %s", err)
+	}
+	for _, sample := range samples {
+		scores := m.Match(sample)
+		if scores[0].Identifier != "oligo2" {
+			t.Errorf("Should have gotten oligo2. Got: %s", scores[0].Identifier)
+		}
+	}
+}
+
+func BenchmarkMegamash(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		oligo1 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTCTCAATGACCAAACCAACGCAAGTCTTAGTTCGTTCAGTCTCTATTTTATTCTTCATCACACTGTTGCACTTGGTTGTTGCAATGAGATTTCCTAGTATTTTCACTGCTGTGCTGAGACCCGGATCGAACTTAGGTAGCCT"
+		oligo2 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCACTAGTCATAAT"
+		oligo3 := "CCGTGCGACAAGATTTCAAGGGTCTCTCTTCTATCGCAGCCAAGGAAGAAGGTGTATCTCTAGAGAAGCGTCGAGTGAGACCCGGATCGAACTTAGGTAGCCCCCTTCGAAGTGGCTCTGTCTGATCCTCCGCGGATGGCGACACCATCGGACTGAGGATATTGGCCACA"
+
+		samples := []string{"TTTTGTCTACTTCGTTCCGTTGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGACGGCGCCTCCGTGCGACGAGATTTCAAGGGTCTCTGTGCTATATTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCAGATCGACTTTTAGATTCCTCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGTGG", "TGTCCTTTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTACTTATCGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTATCTGAGACCGAAGTGGTTTGCCTAAACGCAGGTGCTGTTGGCAAAGGCAGAAAGTAGTCTTAACCTTGACAATGAGTGGTA", "GTTATTGTCGTCTCCTTTGACTCAGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTGCTGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTCCGCTTCTATCTGAGACCGAAGTGGTTAT", "TGTTCTGTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTCTGCCTTAGAGACCACGCCTCCGTGCGACAAGATTCAAGGGTCTCTGTGCTCTGCCGCTAGTTCCGCTCTAGCTGCTCCGGTATGCATCTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAACTGTTGGTT"}
+		m, _ := NewMegamashMap([]fasta.Record{{Sequence: oligo1, Identifier: "oligo1"}, {Sequence: oligo2, Identifier: "oligo2"}, {Sequence: oligo3, Identifier: "oligo3"}}, DefaultKmerSize, DefaultMinimalKmerCount, DefaultScoreThreshold)
+		for _, sample := range samples {
+			_ = m.Match(sample)
+		}
+	}
+}