Skip to content

Commit

Permalink
release 0.3.2
Browse files Browse the repository at this point in the history
  • Loading branch information
mklarqvist committed Feb 18, 2018
1 parent 7401777 commit 6ca779f
Show file tree
Hide file tree
Showing 10 changed files with 131 additions and 42 deletions.
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[![Build Status](https://travis-ci.org/mklarqvist/tomahawk.svg?branch=master)](https://travis-ci.org/mklarqvist/tomahawk)
[![Release](https://img.shields.io/badge/Release-beta_0.3-blue.svg)](https://github.com/mklarqvist/tomahawk/releases)
[![Release](https://img.shields.io/badge/Release-beta_0.3.2-blue.svg)](https://github.com/mklarqvist/tomahawk/releases)
[![License](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)

![screenshot](tomahawk.png)
Expand Down Expand Up @@ -43,19 +43,19 @@ Tomahawk comprises five primary commands: `import`, `calc`, `view`, `sort`, and
Executing `tomahawk` gives a list of commands with brief descriptions and `tomahawk <command>`
gives detailed details for that command.

All primary Tomahawk commands operate on the binary Tomahawk `twk` and Totempole `twi` file
All primary Tomahawk commands operate on the binary Tomahawk `twk` and Tomahawk output `two` file
format. Interconversions between `twk` and `vcf`/`bcf` is supported through the
commands `import` for `vcf`/`bcf`->`twk` and `view` for `twk`->`vcf`. Linkage
disequilibrium data is written out in `two` and `toi` format.
disequilibrium data is written out in `two` format.

### Importing to Tomahawk
By design Tomahawk only operates on bi-allelic SNVs and as such filters out
indels and complex variants. Tomahawk does not support mixed phasing of genotypes
in the same variant (e.g. `0|0`, `0/1`). If mixed phasing is found in a line,
all genotypes in that line are converted to unphased. Importing a variant document (`vcf`/`bcf`)
to Tomahawk requires the `import` command.
The following command line imports a `vcf` file and outputs `outPrefix.twk` and
`outPrefix.twk.twi` and filters out variants with >20% missingness and deviate
The following command line imports a `vcf` file and outputs `outPrefix.twk` and filters out
variants with >20% missingness and deviate
from Hardy-Weinberg equilibrium with a probability < 0.001
```bash
tomahawk import -i file.vcf -o outPrefix -m 0.2 -H 1e-3
Expand Down Expand Up @@ -91,7 +91,7 @@ Converting a `twk` file to `vcf`
tomahawk view -i file.twk -o file.vcf
```

### Sort `TWO` file
### Sorting `two` files
Partially sort `two` file in 500 MB chunks
```bash
tomahawk sort -i file.two -o partial.two -L 500
Expand Down
13 changes: 13 additions & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
# Release 0.3.2

## Breaking Changes
* Sorted files needs to be resorted. Using old sorted files does not break functionality

## Major Features And Improvements
* Sorting `two` files not correctly produces two indices used for fast queries
* Viewing sorted two files is significantly faster

## Bug Fixes and Other Changes
* Bug fixes
* Sort merge (`tomahawk sort -M`) now produces the correct index

# Release 0.3.1

## Bug Fixes and Other Changes
Expand Down
13 changes: 11 additions & 2 deletions src/algorithm/sort/output_sorter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ bool OutputSorter::sort(const std::string& input, const std::string& destination
std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to open: " << destinationPrefix << "..." << std::endl;
return false;
}
writer.getIndex().setSorted(false);
writer.getIndex().setPartialSorted(true);
writer.writeHeaders(this->reader.getHeader());

if(!SILENT)
Expand Down Expand Up @@ -125,6 +127,8 @@ bool OutputSorter::sortMerge(const std::string& inputFile, const std::string& de
return false;
}
writer.setFlushLimit(block_size);
writer.getIndex().setSorted(true);
writer.getIndex().setPartialSorted(false);
writer.writeHeaders(this->reader.getHeader());

const U32 n_blocks = this->reader.getIndex().size();
Expand Down Expand Up @@ -168,11 +172,15 @@ bool OutputSorter::sortMerge(const std::string& inputFile, const std::string& de
return false;
}

// Set first previous entry and current index bounds
writer.getPreviousEntry() = outQueue.top().data;
writer.getCurrentIndexEntry() = outQueue.top().data;

// while queue is not empty
while(outQueue.empty() == false){
// peek at top entry in queue
const U32 id = outQueue.top().streamID;
writer << outQueue.top().data;
writer.addSorted(outQueue.top().data);

// remove this record from the queue
outQueue.pop();
Expand All @@ -182,13 +190,14 @@ bool OutputSorter::sortMerge(const std::string& inputFile, const std::string& de
outQueue.push( queue_entry(e, id, entry_type::sortDescending) );
break;
}
writer << *e;
writer.addSorted(*e);
}
}

writer.setPartialSorted(false);
writer.setSorted(true);
writer.flush();
writer.getIndex().buildMetaIndex(this->reader.getHeader().getMagic().getNumberContigs());
writer.writeFinal();

if(!SILENT)
Expand Down
24 changes: 12 additions & 12 deletions src/index/index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ bool Index::buildMetaIndex(const U32 n_contigs){
return false;

meta_entry_type reference_entry;
reference_entry.index_begin = 0;
reference_entry.index_end = 1;
reference_entry.min_position = this->getContainer()[0].min_position;
reference_entry.max_position = this->getContainer()[0].max_position;
reference_entry.n_variants = this->getContainer()[0].n_variants;
reference_entry.index_begin = 0;
reference_entry.index_end = 1;
reference_entry.min_position = this->getContainer()[0].min_position;
reference_entry.max_position = this->getContainer()[0].max_position;
reference_entry.n_variants = this->getContainer()[0].n_variants;
reference_entry.uncompressed_size = this->getContainer()[0].uncompressed_size;
U32 reference_contig = this->getContainer()[0].contigID;

Expand All @@ -39,17 +39,17 @@ bool Index::buildMetaIndex(const U32 n_contigs){

temp_entries[reference_contig] = reference_entry;
reference_contig = this->getContainer()[i].contigID;
reference_entry.index_begin = i;
reference_entry.index_end = i + 1;
reference_entry.min_position = this->getContainer()[i].min_position;
reference_entry.max_position = this->getContainer()[i].max_position;
reference_entry.n_variants = this->getContainer()[i].n_variants;
reference_entry.index_begin = i;
reference_entry.index_end = i + 1;
reference_entry.min_position = this->getContainer()[i].min_position;
reference_entry.max_position = this->getContainer()[i].max_position;
reference_entry.n_variants = this->getContainer()[i].n_variants;
reference_entry.uncompressed_size = this->getContainer()[i].uncompressed_size;

} else {
++reference_entry.index_end;
reference_entry.max_position = this->getContainer()[i].max_position;
reference_entry.n_variants += this->getContainer()[i].n_variants;
reference_entry.max_position = this->getContainer()[i].max_position;
reference_entry.n_variants += this->getContainer()[i].n_variants;
reference_entry.uncompressed_size += this->getContainer()[i].uncompressed_size;
}
}
Expand Down
18 changes: 9 additions & 9 deletions src/index/index_container.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ void IndexContainer::resize(const size_t new_capacity){
this->n_capacity_ = new_capacity;
}

std::pair<const IndexEntry*, const IndexEntry*> IndexContainer::findOverlap(const S32& contigID) const{
std::pair<U32, U32> IndexContainer::findOverlap(const S32& contigID) const{
// Find first hit
size_t i = 0;
for(; i < this->size(); ++i){
Expand All @@ -88,18 +88,18 @@ std::pair<const IndexEntry*, const IndexEntry*> IndexContainer::findOverlap(cons
}

if(i == this->size())
return(std::pair<const IndexEntry*, const IndexEntry*>(&this->at(this->size()), &this->at(this->size())));
return(std::pair<U32, U32>(0, 0));

const size_t from = i;
for(; i < this->size(); ++i){
if(this->at(i).overlaps(contigID) == false)
break;
}

return(std::pair<const IndexEntry*, const IndexEntry*>(&this->at(from), &this->at(i)));
return(std::pair<U32,U32>(from, i));
}

std::pair<const IndexEntry*, const IndexEntry*> IndexContainer::findOverlap(const S32& contigID, const U64& position) const{
std::pair<U32, U32> IndexContainer::findOverlap(const S32& contigID, const U64& position) const{
// Find first hit
size_t i = 0;
for(; i < this->size(); ++i){
Expand All @@ -108,18 +108,18 @@ std::pair<const IndexEntry*, const IndexEntry*> IndexContainer::findOverlap(cons
}

if(i == this->size())
return(std::pair<const IndexEntry*, const IndexEntry*>(&this->at(this->size()), &this->at(this->size())));
return(std::pair<U32, U32>(0, 0));

const size_t from = i;
for(; i < this->size(); ++i){
if(this->at(i).overlaps(contigID, position) == false)
break;
}

return(std::pair<const IndexEntry*, const IndexEntry*>(&this->at(from), &this->at(i)));
return(std::pair<U32,U32>(from, i));
}

std::pair<const IndexEntry*, const IndexEntry*> IndexContainer::findOverlap(const S32& contigID, const U64& from_position, const U64& to_position) const{
std::pair<U32, U32> IndexContainer::findOverlap(const S32& contigID, const U64& from_position, const U64& to_position) const{
// Find first hit
size_t i = 0;
for(; i < this->size(); ++i){
Expand All @@ -128,15 +128,15 @@ std::pair<const IndexEntry*, const IndexEntry*> IndexContainer::findOverlap(cons
}

if(i == this->size())
return(std::pair<const IndexEntry*, const IndexEntry*>(&this->at(this->size()), &this->at(this->size())));
return(std::pair<U32, U32>(0, 0));

const size_t from = i;
for(; i < this->size(); ++i){
if(this->at(i).overlaps(contigID, from_position, to_position) == false)
break;
}

return(std::pair<const IndexEntry*, const IndexEntry*>(&this->at(from), &this->at(i)));
return(std::pair<U32,U32>(from, i));
}

}
Expand Down
6 changes: 3 additions & 3 deletions src/index/index_container.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,9 @@ class IndexContainer{

// Overlap functions: find blocks a target interval overlaps
// returns pairs of pointers. If pointerA == pointerB then the data is empty
std::pair<const_pointer, const_pointer> findOverlap(const S32& contigID) const;
std::pair<const_pointer, const_pointer> findOverlap(const S32& contigID, const U64& position) const;
std::pair<const_pointer, const_pointer> findOverlap(const S32& contigID, const U64& from_position, const U64& to_position) const;
std::pair<U32, U32> findOverlap(const S32& contigID) const;
std::pair<U32, U32> findOverlap(const S32& contigID, const U64& position) const;
std::pair<U32, U32> findOverlap(const S32& contigID, const U64& from_position, const U64& to_position) const;

private:
friend std::ofstream& operator<<(std::ofstream& stream, const self_type& container){
Expand Down
15 changes: 12 additions & 3 deletions src/index/index_entry.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef TOTEMPOLEENTRY_H_
#define TOTEMPOLEENTRY_H_

#include "../tomahawk/two/output_entry.h"
#include <fstream>

namespace Tomahawk{
Expand All @@ -10,7 +11,8 @@ namespace Totempole{

struct IndexEntry{
public:
typedef IndexEntry self_type;
typedef IndexEntry self_type;
typedef IO::OutputEntry entry_type;

public:
IndexEntry() :
Expand Down Expand Up @@ -53,6 +55,13 @@ struct IndexEntry{
inline void operator++(void){ ++this->n_variants; }
inline U64 sizeBytes(void) const{ return(this->byte_offset_end - this->byte_offset); }

inline self_type& operator=(const entry_type& entry){
this->contigID = entry.AcontigID;
this->min_position = entry.Aposition;
this->max_position = entry.Aposition;
return(*this);
}

friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){
stream << entry.byte_offset << '\t' << entry.byte_offset_end << '\t' << entry.contigID << '\t' << entry.min_position << '-' << entry.max_position << '\t' << entry.n_variants << '\t' << entry.uncompressed_size;
return stream;
Expand Down Expand Up @@ -98,8 +107,8 @@ struct IndexEntry{
}
inline const bool overlaps(const S32& contigID, const U64& from_position, const U64& to_position) const{
if(this->contigID != contigID) return false;
if(from_position < this->min_position) return false;
if(to_position > this->max_position) return false;
if(to_position < this->min_position) return false;
if(from_position > this->max_position) return false;
return true;
}

Expand Down
27 changes: 27 additions & 0 deletions src/io/output_writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ class OutputWriter{
inline const bool isSorted(void) const{ return(this->writing_sorted_); }
inline const bool isPartialSorted(void) const{ return(this->writing_sorted_partial_); }
inline index_type& getIndex(void) const{ return(*this->index_); }
inline entry_type& getPreviousEntry(void){ return(this->previous_entry); }
inline index_entry_type& getCurrentIndexEntry(void){ return(this->index_entry); }

bool open(const std::string& output_file);
int writeHeaders(twk_header_type& twk_header);
Expand Down Expand Up @@ -112,6 +114,30 @@ class OutputWriter{
this->flush();
}

/**<
*
* @param entry
*/
inline void addSorted(const entry_type& entry){
if(this->previous_entry.AcontigID != entry.AcontigID
|| this->previous_entry.BcontigID != entry.BcontigID){
std::cerr << "switch in contig" << std::endl;
this->flush();
this->index_entry = entry;
}

// Check if the buffer has to be flushed after adding this entry
if(this->buffer.size() > this->l_flush_limit){
this->flush();
this->index_entry = entry;
}

this->buffer << entry;
++this->n_entries;
this->previous_entry = entry;
this->index_entry.max_position = entry.Aposition;
}

/**<
* Overloaded operator for adding an entire container of `two` entries
* @param container Target container of entries
Expand Down Expand Up @@ -141,6 +167,7 @@ class OutputWriter{
U32 n_blocks; // number of index blocks writtenflush_limit
U32 l_flush_limit;
U32 l_largest_uncompressed;
entry_type previous_entry;
index_entry_type index_entry; // keep track of sort order
std::ofstream* stream;
buffer_type buffer;
Expand Down
2 changes: 1 addition & 1 deletion src/support/MagicConstants.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ extern std::string INTERPRETED_COMMAND;

// Versioning
const float PROGRAM_VERSION_MAJOR = 0.3; // major
const float PROGRAM_VERSION_MINOR = 0;
const float PROGRAM_VERSION_MINOR = 2;

const double ALLOWED_ROUNDING_ERROR = 0.001;

Expand Down
43 changes: 37 additions & 6 deletions src/tomahawk/two/TomahawkOutputReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "../../support/helpers.h"
#include "../two/TomahawkOutputStats.h"
#include "../../io/output_writer.h"
#include "../../index/index_entry.h"

namespace Tomahawk {

Expand Down Expand Up @@ -547,12 +548,42 @@ bool TomahawkOutputReader::__viewRegion(void){
std::cout << this->getHeader().getLiterals() << '\n';
}

if(this->interval_tree != nullptr){
while(this->parseBlock()){
output_container_reference_type o(this->data_);
for(U32 i = 0; i < o.size(); ++i)
this->__checkRegionNoIndex(o[i]);
} // end while next block
// File is sorted
if(this->getIndex().isSorted()){
if(this->interval_tree == nullptr)
return false;

for(U32 i = 0; i < this->interval_tree_entries->size(); ++i){
// Find this overlap
std::pair<U32, U32> ret = this->getIndex().getContainer().findOverlap(this->interval_tree_entries->at(i).contigID, this->interval_tree_entries->at(i).start, this->interval_tree_entries->at(i).stop);
const U32 n_blocks_hits = ret.second - ret.first;
if(n_blocks_hits == 0)
return false;

for(U32 j = 0; j < n_blocks_hits; ++j){
if(!this->seekBlock(ret.first + j)){
std::cerr << "failed to seek block" << std::endl;
return false;
}

if(!this->parseBlock())
return false;

output_container_reference_type o(this->data_);
for(U32 i = 0; i < o.size(); ++i)
this->__checkRegionNoIndex(o[i]);
}
}
}
// File is not sorted
else {
if(this->interval_tree != nullptr){
while(this->parseBlock()){
output_container_reference_type o(this->data_);
for(U32 i = 0; i < o.size(); ++i)
this->__checkRegionNoIndex(o[i]);
} // end while next block
}
}

return true;
Expand Down

0 comments on commit 6ca779f

Please sign in to comment.