Skip to content

Commit f5a77d1

Browse files
committed
KFF
1 parent 18e6eb2 commit f5a77d1

13 files changed

+83
-33
lines changed

kmc_tools/kff_db_reader.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313

1414
//TODO KFF: for dump operation encoding is important: implement it!
1515
//TODO KFF: consider minimizer sections
16-
//TODO KFF: only blocks with given k value must be prepared before -> for now maybe just reject KFF with multiple k values
1716

1817
//Forward declaration
1918
template<unsigned SIZE> class CKFFDbReaderSorted;

kmc_tools/kff_db_writer.h

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,18 @@ template<unsigned SIZE> class CKFFDbWriter : public CDbWriter<SIZE>
5555
store_buff();
5656
kff_writer.FinishSection();
5757
}
58+
59+
bool canonical()
60+
{
61+
auto& headers = CConfig::GetInstance().headers;
62+
bool both_stands = true;
63+
64+
for (auto& input : headers)
65+
both_stands = both_stands && input.both_strands; //if any input database is in both strands, output is also in both strands
66+
67+
return both_stands;
68+
}
69+
5870
public:
5971

6072
CKFFDbWriter(CBundle<SIZE>* bundle, COutputDesc& output_desc) :
@@ -64,9 +76,11 @@ template<unsigned SIZE> class CKFFDbWriter : public CDbWriter<SIZE>
6476
bundle(bundle),
6577
kff_writer(
6678
output_desc.file_src + ".kff",
67-
true /*TODO KFF: set this basing on inputs! */,
79+
canonical(),
6880
CConfig::GetInstance().kmer_len,
69-
counter_size
81+
counter_size,
82+
output_desc.cutoff_min,
83+
output_desc.cutoff_max
7084
/*TODO KFF: store encoding info!*/),
7185
output_desc(output_desc)
7286
{

kmc_tools/kff_info_reader.cpp

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ CKFFInfoReader::CKFFInfoReader(const std::string& path)
2929
if (!file)
3030
throw std::runtime_error("Error: cannot open file " + path);
3131

32-
3332
// Check markers
3433
char marker[4];
3534
marker[3] = '\0';
@@ -42,8 +41,6 @@ CKFFInfoReader::CKFFInfoReader(const std::string& path)
4241
if (strncmp(marker, "KFF", 3) != 0)
4342
throw std::runtime_error("Error: missing KFF marker at the end of file " + path);
4443

45-
//TODO KFF: index may be at the begining, so maybe there is no need to read footer
46-
4744
my_fseek(file, -23, SEEK_END);
4845
char footer_size_str[12];
4946
fread(footer_size_str, 1, 12, file);
@@ -73,7 +70,6 @@ CKFFInfoReader::CKFFInfoReader(const std::string& path)
7370

7471
//std::cerr << "footer nb_vars: " << nb_vars << "\n";
7572

76-
7773
for (uint64_t i = 0; i < nb_vars; ++i)
7874
{
7975
auto name = ReadVarName();
@@ -82,6 +78,8 @@ CKFFInfoReader::CKFFInfoReader(const std::string& path)
8278
LoadBigEndian(tmp.data(), val);
8379
//std::cerr << name << ": " << val << "\n";
8480

81+
kff_file_struct.footer[name] = val;
82+
8583
if (name == "first_index")
8684
first_index = val;
8785
}
@@ -95,14 +93,13 @@ CKFFInfoReader::CKFFInfoReader(const std::string& path)
9593
fread(tmp.data(), 1, 1, file);
9694
LoadBigEndian(tmp.data(), ver_minor);
9795

98-
uint8_t encoding;
96+
9997
fread(tmp.data(), 1, 1, file);
100-
LoadBigEndian(tmp.data(), encoding);
98+
LoadBigEndian(tmp.data(), kff_file_struct.encoding);
10199

102-
uint8_t unique;
103100
fread(tmp.data(), 1, 1, file);
104-
LoadBigEndian(tmp.data(), unique);
105-
if (unique == 0)
101+
LoadBigEndian(tmp.data(), kff_file_struct.all_unique);
102+
if (kff_file_struct.all_unique == 0)
106103
throw std::runtime_error("Error: only unique k-mers in KFF file are supported, file " + path);
107104

108105
uint8_t canonical;
@@ -130,7 +127,6 @@ CKFFInfoReader::CKFFInfoReader(const std::string& path)
130127
if (first_index == std::numeric_limits<uint64_t>::max())
131128
throw std::runtime_error("Error: no first_index in the footer and first section is not an index, file: " + path);
132129

133-
//std::vector<std::pair<char, int64_t>> index_pairs;
134130

135131
while (first_index)
136132
{
@@ -181,7 +177,6 @@ CKFFInfoReader::CKFFInfoReader(const std::string& path)
181177
throw std::runtime_error("Error: KFF index is inconsistent with file content");
182178
}
183179

184-
//some reading
185180
for (auto e : index)
186181
{
187182
my_fseek(file, e.section_pos, SEEK_SET);
@@ -227,6 +222,8 @@ void CKFFInfoReader::ReadVariableSection()
227222
section.data_size = val;
228223
else if (var_name == "m")
229224
section.minimizer_size = val;
225+
else if (var_name == "ordered")
226+
section.ordered = val;
230227
}
231228
if (kff_file_struct.scopes.size())
232229
{

kmc_tools/kff_info_reader.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,9 @@
22
#include <string>
33
#include <vector>
44
#include <limits>
5+
#include <string>
6+
#include <map>
57

6-
/*
7-
TODO KFF: add checking if section is ordered!!!!
8-
*/
98

109
template<typename T>
1110
void LoadBigEndian(const uint8_t* buff, T& data)
@@ -42,12 +41,16 @@ struct CKFFVariables
4241
uint64_t data_size = std::numeric_limits<uint64_t>::max(); //counter size
4342
uint64_t minimizer_size = std::numeric_limits<uint64_t>::max();
4443
uint64_t max_in_block = std::numeric_limits<uint64_t>::max();
44+
bool ordered = false; //defaulted so the flag is never read uninitialized when a section declares no "ordered" variable
4545
std::vector<CKFFDataSection> data_sections;
4646
};
4747

4848
struct CKFFFileStruct
4949
{
50+
std::map<std::string, uint64_t> footer;
5051
bool both_strands;
52+
uint8_t encoding;
53+
uint8_t all_unique;
5154
std::vector<CKFFVariables> scopes;
5255
};
5356

kmc_tools/kmc1_db_writer.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -330,9 +330,10 @@ template<unsigned SIZE> void CKMC1DbWriter<SIZE>::finish_writing()
330330
write_header_part(output_desc.cutoff_max);
331331
write_header_part(added_kmers);
332332

333-
bool both_stands = false;
333+
bool both_stands = true;
334+
334335
for (auto& input : config.headers)
335-
both_stands = both_stands || input.both_strands; //if any input database is in both strands, output is also in both strands
336+
both_stands = both_stands && input.both_strands; //if any input database is not canonical, output is also not canonical
336337

337338
write_header_part(!both_stands);
338339

kmc_tools/kmc_tools.cpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ template<unsigned SIZE> class CTools
158158
{
159159
std::cout << "This is KFF file, summary:\n";
160160
std::cout << "canonical : " << (header.kff_file_struct.both_strands ? "yes" : "no") << "\n";
161+
std::cout << "all k-mers unique : " << (header.kff_file_struct.all_unique ? "yes" : "no") << "\n";
161162
//TODO KFF: add encoding printing
162163
std::set<uint64_t> k_values;
163164
for (auto& e : header.kff_file_struct.scopes)
@@ -167,6 +168,12 @@ template<unsigned SIZE> class CTools
167168
std::cout << "max : " << e.max_in_block << "\n";
168169
if(e.minimizer_size != (std::numeric_limits<uint64_t>::max)())
169170
std::cout << "m : " << e.minimizer_size << "\n";
171+
172+
std::cerr << "footer values:\n";
173+
for (const auto& e : header.kff_file_struct.footer)
174+
{
175+
std::cerr << "\t" << e.first << " : " << e.second << "\n";
176+
}
170177
std::cout << "Data sections:\n";
171178
uint64_t tot_nb_blocks{};
172179
for (auto& s : e.data_sections)
@@ -178,8 +185,8 @@ template<unsigned SIZE> class CTools
178185
type = "raw";
179186
break;
180187
case KFFDataSectionType::MINIMIZER:
181-
type = "minimizer";
182-
break;
188+
type = "minimizer";
189+
break;
183190
default:
184191
{
185192
std::cerr << "Error: this should never happen, please contact authors: " << __FILE__ << "\t" << __LINE__ << "\n";

kmc_tools/kmc_tools.vcxproj.user

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
<?xml version="1.0" encoding="utf-8"?>
22
<Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
33
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
4-
<LocalDebuggerCommandArguments>transform o sort ox -okff</LocalDebuggerCommandArguments>
4+
<LocalDebuggerCommandArguments>info o</LocalDebuggerCommandArguments>
55
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
66
</PropertyGroup>
77
</Project>

kmc_tools/kmer_file_header.cpp

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,12 +131,26 @@ void CKmerFileHeader::read_from_kff_file(const std::string& fname)
131131
for (auto& s : kff_file_struct.scopes)
132132
{
133133
k_values.insert(s.kmer_size);
134+
if (!s.ordered)
135+
{
136+
std::cerr << "Error: kmc_tools requires all KFF sections to be ordered\n";
137+
exit(1);
138+
}
139+
140+
for (auto& section: s.data_sections)
141+
if (!(section.type == KFFDataSectionType::RAW))
142+
{
143+
std::cerr << "Error: currently kmc_tools supports only raw KFF sections\n";
144+
exit(1);
145+
}
146+
134147
if (s.data_size > this->counter_size)
135148
this->counter_size = s.data_size;
136149
}
137150

138-
this->min_count = 1; //TODO KFF: try to read this values from KFF file, and if not present set defaults
139-
this->max_count = std::numeric_limits<uint32>::max();
151+
this->min_count = GetFromFooterOrDefault("min_count", 1);
152+
this->max_count = GetFromFooterOrDefault("max_count", std::numeric_limits<uint32>::max());
153+
this->both_strands = kff_file_struct.both_strands;
140154

141155
if (k_values.size() == 1)
142156
kmer_len = *k_values.begin();

kmc_tools/kmer_file_header.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,15 @@ struct CKmerFileHeader
2626
bool is_kff_file(std::string& fname);
2727
void read_from_kff_file(const std::string& fname);
2828
CKFFFileStruct kff_file_struct;
29+
30+
uint64_t GetFromFooterOrDefault(const std::string& name, uint64_t default)
31+
{
32+
const auto& m = kff_file_struct.footer;
33+
auto r = m.find(name);
34+
if (r != m.end())
35+
return r->second;
36+
return default;
37+
}
2938
public:
3039
uint32 kmer_len = 0;
3140
uint32 mode = 0;

kmer_counter/kb_completer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ void CKmerBinCompleter::ProcessBinsFirstStage()
108108
}
109109
else if (output_type == OutputType::KFF)
110110
{
111-
kff_writer = new CKFFWriter(file_name + ".kff", both_strands, kmer_len, counter_size);
111+
kff_writer = new CKFFWriter(file_name + ".kff", both_strands, kmer_len, counter_size, cutoff_min, cutoff_max);
112112
}
113113
else
114114
{

0 commit comments

Comments
 (0)