|
| 1 | +/* |
| 2 | +Package megamash is an implementation of the megamash algorithm. |
| 3 | +
|
| 4 | +Megamash is an algorithm developed by Keoni Gandall to find templates from |
| 5 | +sequencing reactions. For example, you may have a pool of amplicons, and need |
| 6 | +to get a count of how many times each amplicon shows up in a given sequencing |
| 7 | +reaction. |
| 8 | +*/ |
| 9 | +package megamash |
| 10 | + |
| 11 | +import ( |
| 12 | + "context" |
| 13 | + "fmt" |
| 14 | + |
| 15 | + "github.com/koeng101/dnadesign/lib/bio/fasta" |
| 16 | + "github.com/koeng101/dnadesign/lib/bio/fastq" |
| 17 | + "github.com/koeng101/dnadesign/lib/transform" |
| 18 | +) |
| 19 | + |
| 20 | +// StandardizedDNA returns the alphabetically lesser strand of a double |
| 21 | +// stranded DNA molecule. |
| 22 | +func StandardizedDNA(sequence string) string { |
| 23 | + var deterministicSequence string |
| 24 | + reverseComplement := transform.ReverseComplement(sequence) |
| 25 | + if sequence > reverseComplement { |
| 26 | + deterministicSequence = reverseComplement |
| 27 | + } else { |
| 28 | + deterministicSequence = sequence |
| 29 | + } |
| 30 | + return deterministicSequence |
| 31 | +} |
| 32 | + |
| 33 | +var ( |
| 34 | + DefaultKmerSize uint = 16 |
| 35 | + DefaultMinimalKmerCount uint = 10 |
| 36 | + DefaultScoreThreshold float64 = 0.2 |
| 37 | +) |
| 38 | + |
| 39 | +type MegamashMap struct { |
| 40 | + Identifiers []string |
| 41 | + Kmers []map[string]bool |
| 42 | + KmerSize uint |
| 43 | + KmerMinimalCount uint |
| 44 | + Threshold float64 |
| 45 | +} |
| 46 | + |
| 47 | +// NewMegamashMap creates a megamash map that can be searched against. |
| 48 | +func NewMegamashMap(sequences []fasta.Record, kmerSize uint, kmerMinimalCount uint, threshold float64) (MegamashMap, error) { |
| 49 | + var megamashMap MegamashMap |
| 50 | + megamashMap.KmerSize = kmerSize |
| 51 | + megamashMap.KmerMinimalCount = kmerMinimalCount |
| 52 | + megamashMap.Threshold = threshold |
| 53 | + |
| 54 | + for _, fastaRecord := range sequences { |
| 55 | + megamashMap.Identifiers = append(megamashMap.Identifiers, fastaRecord.Identifier) |
| 56 | + sequence := fastaRecord.Sequence |
| 57 | + |
| 58 | + // First get all kmers with a given sequence |
| 59 | + kmerMap := make(map[string]bool) |
| 60 | + for i := 0; i <= len(sequence)-int(kmerSize); i++ { |
| 61 | + kmerString := StandardizedDNA(sequence[i : i+int(kmerSize)]) |
| 62 | + kmerMap[kmerString] = true |
| 63 | + } |
| 64 | + |
| 65 | + // Then, get unique kmers for this sequence and only this sequence |
| 66 | + uniqueKmerMap := make(map[string]bool) |
| 67 | + for kmerBase64 := range kmerMap { |
| 68 | + unique := true |
| 69 | + for _, otherMegaMashMap := range megamashMap.Kmers { |
| 70 | + _, ok := otherMegaMashMap[kmerBase64] |
| 71 | + // If this kmer is found, set both to false |
| 72 | + if ok { |
| 73 | + otherMegaMashMap[kmerBase64] = false |
| 74 | + unique = false |
| 75 | + break |
| 76 | + } |
| 77 | + } |
| 78 | + if unique { |
| 79 | + uniqueKmerMap[kmerBase64] = true |
| 80 | + } |
| 81 | + } |
| 82 | + // Check if we have the minimal kmerCount |
| 83 | + var kmerCount uint = 0 |
| 84 | + for _, unique := range uniqueKmerMap { |
| 85 | + if unique { |
| 86 | + kmerCount++ |
| 87 | + } |
| 88 | + } |
| 89 | + if kmerCount < kmerMinimalCount { |
| 90 | + return megamashMap, fmt.Errorf("Got only %d unique kmers of required %d for sequence %s", kmerCount, kmerMinimalCount, fastaRecord.Identifier) |
| 91 | + } |
| 92 | + |
| 93 | + // Now we have a unique Kmer map for the given sequence. |
| 94 | + // Add it to megamashMap |
| 95 | + megamashMap.Kmers = append(megamashMap.Kmers, uniqueKmerMap) |
| 96 | + } |
| 97 | + return megamashMap, nil |
| 98 | +} |
| 99 | + |
| 100 | +// Match contains the identifier and score of a potential match to the searched |
| 101 | +// sequence. |
| 102 | +type Match struct { |
| 103 | + Identifier string |
| 104 | + Score float64 |
| 105 | +} |
| 106 | + |
| 107 | +// Match matches a sequence to all the sequences in a megamash map. |
| 108 | +func (m *MegamashMap) Match(sequence string) []Match { |
| 109 | + var scores []float64 |
| 110 | + // The algorithm is as follows: |
| 111 | + // - Go through each map. |
| 112 | + // - Get the number of matching kmers |
| 113 | + // - Divide that by the total kmers available for matching |
| 114 | + |
| 115 | + // First, get the kmer total |
| 116 | + var kmerSize int |
| 117 | +out: |
| 118 | + for _, maps := range m.Kmers { |
| 119 | + for kmer := range maps { |
| 120 | + kmerSize = len(kmer) |
| 121 | + break out |
| 122 | + } |
| 123 | + } |
| 124 | + |
| 125 | + // Now, iterate through each map |
| 126 | + for _, sequenceMap := range m.Kmers { |
| 127 | + var score float64 |
| 128 | + var totalKmers = len(sequenceMap) |
| 129 | + var matchedKmers int |
| 130 | + for i := 0; i <= len(sequence)-kmerSize; i++ { |
| 131 | + kmerString := StandardizedDNA(sequence[i : i+kmerSize]) |
| 132 | + unique, ok := sequenceMap[kmerString] |
| 133 | + if ok && unique { |
| 134 | + matchedKmers++ |
| 135 | + } |
| 136 | + } |
| 137 | + if totalKmers == 0 { |
| 138 | + score = 0 |
| 139 | + } else { |
| 140 | + score = float64(matchedKmers) / float64(totalKmers) |
| 141 | + } |
| 142 | + scores = append(scores, score) |
| 143 | + } |
| 144 | + |
| 145 | + var matches []Match |
| 146 | + for i, score := range scores { |
| 147 | + if score > m.Threshold { |
| 148 | + matches = append(matches, Match{Identifier: m.Identifiers[i], Score: score}) |
| 149 | + } |
| 150 | + } |
| 151 | + return matches |
| 152 | +} |
| 153 | + |
| 154 | +// FastqMatchChannel processes a channel of fastq.Read and pushes to a channel of matches. |
| 155 | +func (m *MegamashMap) FastqMatchChannel(ctx context.Context, sequences <-chan fastq.Read, matches chan<- []Match) error { |
| 156 | + for { |
| 157 | + select { |
| 158 | + case <-ctx.Done(): |
| 159 | + // Clean up resources, handle cancellation. |
| 160 | + return ctx.Err() |
| 161 | + case sequence, ok := <-sequences: |
| 162 | + if !ok { |
| 163 | + close(matches) |
| 164 | + return nil |
| 165 | + } |
| 166 | + sequenceMatches := m.Match(sequence.Sequence) |
| 167 | + matches <- sequenceMatches |
| 168 | + } |
| 169 | + } |
| 170 | +} |
0 commit comments