Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update to gracefully handle ambiguity codes in the kmerTable suffix r… #40

Merged
merged 1 commit into from
Mar 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions src/AwFmKmerTable.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,23 @@
#include "AwFmSearch.h"



/*
 * Reports whether the kmer's seed range may be read from the kmer seed table.
 *  The kmer must be at least index->config.kmerLengthInSeedTable characters long,
 *  and none of its trailing kmerLengthInSeedTable characters may be flagged by
 *  awFmLetterIsAmbiguous() for the index's alphabet.
 */
bool awFmQueryCanUseKmerTable(const struct AwFmIndex *_RESTRICT_ const index,
    const char *_RESTRICT_ const kmer, const uint8_t kmerLength) {

  // A kmer shorter than the table's seed length has no full-length suffix to look up.
  const bool longEnoughForTable = kmerLength >= index->config.kmerLengthInSeedTable;
  if(!longEnoughForTable) {
    return false;
  }

  // Walk only the suffix that would be used to query the table; characters
  // in front of that suffix are not inspected here.
  uint8_t position = kmerLength - index->config.kmerLengthInSeedTable;
  while(position < kmerLength) {
    const char suffixLetter = kmer[position];
    if(awFmLetterIsAmbiguous(suffixLetter, index->config.alphabetType)) {
      return false;
    }
    position++;
  }

  return true;
}


struct AwFmSearchRange awFmNucleotideKmerSeedRangeFromTable(
const struct AwFmIndex *_RESTRICT_ const index, const char *_RESTRICT_ const kmer, const uint8_t kmerLength) {

Expand Down Expand Up @@ -185,4 +202,6 @@ inline struct AwFmSearchRange awFmAminoPartialKmerSeedRangeFromTable(
return range;
}



#endif
18 changes: 18 additions & 0 deletions src/AwFmKmerTable.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,22 @@ struct AwFmSearchRange awFmAminoKmerSeedRangeFromTable(
const struct AwFmIndex *_RESTRICT_ const index, const char *_RESTRICT_ const kmer, const uint8_t kmerLength);


/*
 * Function:  awFmQueryCanUseKmerTable
 * --------------------
 * Determines if a given kmer can use the kmer seed table. A kmer is ineligible for using
 *  the kmer seed table if it is shorter than the index's kmerLengthInSeedTable, or if any of
 *  its final kmerLengthInSeedTable characters (the suffix that would be used to query
 *  the table) is an ambiguity character for the index's alphabet.
 *  Characters before that suffix are not examined.
 *
 *  Inputs:
 *    index:      AwFmIndex that contains the table to be used. Its config supplies
 *                the seed length and the alphabet type used for the ambiguity check.
 *    kmer:       pointer to the start of the kmer.
 *    kmerLength: length of the kmer in question.
 *
 *  Returns:
 *    true if the kmer is eligible for using the kmer seed table, or false otherwise.
 */
bool awFmQueryCanUseKmerTable(const struct AwFmIndex *_RESTRICT_ const index,
const char *_RESTRICT_ const kmer, const uint8_t kmerLength);


#endif /* end of include guard: AW_FM_KMER_TABLE_H */
33 changes: 31 additions & 2 deletions src/AwFmLetter.c
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#include "AwFmLetter.h"
#include "AwFmIndexStruct.h"

#include <stdlib.h>
#include <time.h>

#include "AwFmIndexStruct.h"
#include <ctype.h>


uint8_t awFmAsciiNucleotideToLetterIndex(const uint8_t asciiLetter) {
Expand Down Expand Up @@ -87,3 +87,32 @@ uint8_t awFmAminoAcidCompressedVectorToLetterIndex(const uint8_t compressedVecto

return letterLookup[compressedVectorLetter];
}

/*
 * Function:  awFmLetterIsAmbiguous
 * --------------------
 * Determines if the given ascii character is treated as an ambiguity code
 *  in the given alphabet. The comparison is case-insensitive.
 *
 *  For AwFmAlphabetAmino, only 'b', 'x', and 'z' are reported as ambiguous;
 *  every other character is reported as unambiguous.
 *  For any other alphabet value the character is treated as a nucleotide:
 *  'a', 'c', 'g', 't', and 'u' are unambiguous, and every other character
 *  (including non-letters) is reported as ambiguous.
 *
 *  Inputs:
 *    letter:   ascii character to check for ambiguity.
 *    alphabet: alphabet the character is from.
 *
 *  Returns:
 *    true if the letter is considered ambiguous, false otherwise.
 */
bool awFmLetterIsAmbiguous(const char letter, const enum AwFmAlphabetType alphabet) {
  // <ctype.h> functions require an argument that is representable as an
  // unsigned char (or EOF). Where plain char is signed, a byte >= 0x80 would be
  // passed as a negative value, which is undefined behavior, so convert first.
  const int lowercase = tolower((unsigned char)letter);

  if(alphabet == AwFmAlphabetAmino) {
    switch(lowercase) {
      case 'b':
      case 'x':
      case 'z':
        return true;
      default:
        return false;
    }
  }

  // Nucleotide alphabets: anything that is not a concrete base is ambiguous.
  switch(lowercase) {
    case 'a':
    case 'c':
    case 'g':
    case 't':
    case 'u':
      return false;
    default:
      return true;
  }
}
15 changes: 15 additions & 0 deletions src/AwFmLetter.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,4 +128,19 @@ uint8_t awFmAminoAcidLetterIndexToCompressedVector(const uint8_t letterIndex);
*/
uint8_t awFmAminoAcidCompressedVectorToLetterIndex(const uint8_t compressedVectorLetter);


/*
 * Function:  awFmLetterIsAmbiguous
 * --------------------
 * Determines if the given character representation is ambiguous.
 *  The check is case-insensitive. For the amino acid alphabet (AwFmAlphabetAmino),
 *  the characters 'b', 'x', and 'z' are ambiguous. For any other alphabet, every
 *  character other than 'a', 'c', 'g', 't', and 'u' is considered ambiguous.
 *
 *  Inputs:
 *    letter:   ascii character to check for ambiguity.
 *    alphabet: alphabet the character is from.
 *
 *  Returns:
 *    true if the letter is an ambiguity code, false if it
 *    represents a specific nucleotide or amino acid.
 */
bool awFmLetterIsAmbiguous(const char letter, const enum AwFmAlphabetType alphabet);

#endif /* end of include guard: AW_FM_LETTER_H */
33 changes: 22 additions & 11 deletions src/AwFmParallelSearch.c
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ enum AwFmReturnCode awFmParallelSearchLocate(const struct AwFmIndex *_RESTRICT_
threadBlockStartIndex += AW_FM_NUM_CONCURRENT_QUERIES) {

const size_t threadBlockEndIndex = threadBlockStartIndex + AW_FM_NUM_CONCURRENT_QUERIES > searchList->count ?
searchList->count :
threadBlockStartIndex + AW_FM_NUM_CONCURRENT_QUERIES;
searchList->count : threadBlockStartIndex + AW_FM_NUM_CONCURRENT_QUERIES;

struct AwFmSearchRange ranges[AW_FM_NUM_CONCURRENT_QUERIES];

parallelSearchFindKmerSeedsForBlock(index, searchList, ranges, threadBlockStartIndex, threadBlockEndIndex);
Expand Down Expand Up @@ -185,26 +185,37 @@ void parallelSearchFindKmerSeedsForBlock(const struct AwFmIndex *_RESTRICT_ cons

for(size_t kmerIndex = threadBlockStartIndex; kmerIndex < threadBlockEndIndex; kmerIndex++) {
const struct AwFmKmerSearchData *searchData = &searchList->kmerSearchData[kmerIndex];
const uint8_t kmerLength = searchData->kmerLength;
const char *kmerString = searchData->kmerString;
const uint8_t kmerLength = searchData->kmerLength;
const char *kmerString = searchData->kmerString;

const bool queryCanUseKmerTable = awFmQueryCanUseKmerTable(index, kmerString, kmerLength);

const uint64_t rangesIndex = kmerIndex - threadBlockStartIndex;

//these are used if the kmer is ineligible for using the kmerSeedTable
const uint8_t kmerStringNonSeededStart = kmerLength < index->config.kmerLengthInSeedTable?
0: kmerLength - index->config.kmerLengthInSeedTable;
const uint8_t kmerStringNonSeededLength = kmerLength < index->config.kmerLengthInSeedTable?
kmerLength: index->config.kmerLengthInSeedTable;

if(index->config.alphabetType != AwFmAlphabetAmino) {
// TODO: reimplement partial seeded search when it's implementable
if(kmerLength < index->config.kmerLengthInSeedTable) {
awFmNucleotideNonSeededSearch(index, kmerString, kmerLength, &ranges[rangesIndex]);
if(queryCanUseKmerTable) {
ranges[rangesIndex] = awFmNucleotideKmerSeedRangeFromTable(index, kmerString, kmerLength);
}
else {
ranges[rangesIndex] = awFmNucleotideKmerSeedRangeFromTable(index, kmerString, kmerLength);
awFmNucleotideNonSeededSearch(index, kmerString + kmerStringNonSeededStart,
kmerStringNonSeededLength, &ranges[rangesIndex]);
}
}
else {
if(kmerLength < index->config.kmerLengthInSeedTable) {
awFmAminoNonSeededSearch(index, kmerString, kmerLength, &ranges[rangesIndex]);
}
else {
if(queryCanUseKmerTable) {
ranges[rangesIndex] = awFmAminoKmerSeedRangeFromTable(index, kmerString, kmerLength);
}
else{
awFmAminoNonSeededSearch(index, kmerString + kmerStringNonSeededStart,
kmerStringNonSeededLength, &ranges[rangesIndex]);
}
}
}
}
Expand Down