From 9a6edb1e793af17bdce9218e4a1b1c5bce58b8d0 Mon Sep 17 00:00:00 2001 From: Vyacheslav Brover Date: Mon, 15 Jul 2024 15:25:32 -0400 Subject: [PATCH] PD-5038 --ncleotide_output --- .gitignore | 1 + Makefile | 8 +- common.cpp | 2 +- common.hpp | 22 +- fasta_extract.cpp | 267 ++++++++++++++++ stxtyper.cpp | 35 ++- tsv.cpp | 783 ++++++++++++++++++++++++++++++++++++++++++++++ tsv.hpp | 28 +- version.txt | 2 +- 9 files changed, 1127 insertions(+), 21 deletions(-) create mode 100644 fasta_extract.cpp create mode 100644 tsv.cpp diff --git a/.gitignore b/.gitignore index f725ec1..acd8b3e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *.o stxtyper fasta_check +fasta_extract stx.prot.* *.got diff --git a/Makefile b/Makefile index cdd8c32..da7a6e5 100644 --- a/Makefile +++ b/Makefile @@ -79,7 +79,7 @@ COMPILE.cpp= $(CXX) $(CPPFLAGS) $(SVNREV) $(DBDIR) $(TEST_UPDATE_DB) -c .PHONY: all clean install release test -BINARIES= stxtyper fasta_check +BINARIES= stxtyper fasta_check fasta_extract DATABASE= stx.prot all: $(BINARIES) @@ -90,7 +90,7 @@ all: $(BINARIES) common.o: common.hpp common.inc stxtyper.o: common.hpp common.inc -stxtyperOBJS=stxtyper.o common.o +stxtyperOBJS=stxtyper.o common.o tsv.o stxtyper: $(stxtyperOBJS) $(CXX) -o $@ $(stxtyperOBJS) -pthread $(DBDIR) @@ -99,6 +99,10 @@ fasta_checkOBJS=fasta_check.o common.o fasta_check: $(fasta_checkOBJS) $(CXX) -o $@ $(fasta_checkOBJS) +fasta_extract.o: common.hpp common.inc +fasta_extractOBJS=fasta_extract.o common.o +fasta_extract: $(fasta_extractOBJS) + $(CXX) -o $@ $(fasta_extractOBJS) clean: rm -f *.o diff --git a/common.cpp b/common.cpp index 2163008..b5d459f 100644 --- a/common.cpp +++ b/common.cpp @@ -1710,7 +1710,7 @@ void Xml::TextFile::tagStart (const string &tag) string tag_ (tag); replace (tag_, ':', '_'); if (! 
isIdentifier (tag_, true)) - throw runtime_error (FUNC "Bad tag name: " + strQuote (tag)); + throw runtime_error (FUNC "Bad textual XML tag name: " + strQuote (tag)); printRaw ("<" + tag + ">"); } diff --git a/common.hpp b/common.hpp index 1a0acfd..2249cee 100644 --- a/common.hpp +++ b/common.hpp @@ -156,6 +156,7 @@ void errorExitStr (const string &msg); void beep (); // Requires: !isRedirected() + // SHLVL = 1 ?? @@ -744,9 +745,11 @@ inline void trim (string &s) void trimLeading (string &s, char c); + // Invokes: isSpace() void trimTrailing (string &s, char c); + // Invokes: isSpace() inline void trim (string &s, char c) @@ -1295,13 +1298,13 @@ void copyText (const string &inFName, #ifndef _MSC_VER inline void moveFile (const string &from, const string &to) - { if (::rename (from. c_str (), to. c_str ())) - throw runtime_error ("Cannot move file + " + shellQuote (from) + " to " + shellQuote (to)); + { if (const int code = ::rename (from. c_str (), to. c_str ())) + throw runtime_error ("Cannot move file + " + shellQuote (from) + " to " + shellQuote (to) + " (" + to_string (code) + ")"); } inline void removeFile (const string &fName) - { if (::remove (fName. c_str ())) - throw runtime_error ("Cannot remove file + " + shellQuote (fName)); + { if (const int code = ::remove (fName. c_str ())) + throw runtime_error ("Cannot remove file + " + shellQuote (fName) + " (" + to_string (code) + ")"); } @@ -1892,6 +1895,7 @@ struct Xml struct TextFile : File + // Tag::name: identifier with possible '-' { private: struct XmlStream : OFStream @@ -1925,7 +1929,7 @@ struct Xml // ::= * 0 0 0 // ::= // Number of different Tag::name's <= 2^16 - // Tag::name has no: '\0', '\n' + // Tag::name: no '\0', '\n' { private: ofstream os; @@ -3913,8 +3917,9 @@ struct TokenInput : Root [[noreturn]] void error (const Token &wrongToken, - const string &expected) const - { throw TextPos::Error (wrongToken. 
tp, expected, true); } + const string &what, + bool expected = true) const + { throw TextPos::Error (wrongToken. tp, what, expected); } [[noreturn]] void error (const string &what, bool expected = true) const { ci. error (what, expected); } @@ -3948,7 +3953,8 @@ struct TokenInput : Root error (t, Token::type2str (Token::eDouble) + " " + toString (expected)); } void get (char expected) - { if (getNextChar (false) != expected) + { const Token t (get ()); + if (! t. isDelimiter (expected)) error (Token::type2str (Token::eDelimiter) + " " + strQuote (toString (expected), '\'')); } void setLast (Token &&t) diff --git a/fasta_extract.cpp b/fasta_extract.cpp new file mode 100644 index 0000000..786f4f8 --- /dev/null +++ b/fasta_extract.cpp @@ -0,0 +1,267 @@ +// fasta_extract.cpp + +/*=========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. 
+* +* =========================================================================== +* +* Author: Vyacheslav Brover +* +* File Description: +* Extract sequences out of a FASTA file +* +*/ + + +#undef NDEBUG + +#include "common.hpp" +using namespace Common_sp; + +#include "common.inc" + + + +namespace +{ + + + +struct Segment +// not circular +{ + size_t start {0}; + size_t stop {0}; + bool strand {true}; + // false <=> negative + string genesymbol; + string name; + + + bool isDna () const + { return stop; } + size_t size () const + { return stop - start; } + void saveText (ostream &os) const + { os << start + << '\t' << stop + << '\t' << strand + << '\t' << genesymbol + << '\t' << name + << endl; + } +}; + + + +char complementaryNucleotide (char wildNucleotide) +{ + char r = ' '; + switch (toLower (wildNucleotide)) + { + case 'a': r = 't'; break; + case 'c': r = 'g'; break; + case 'g': r = 'c'; break; + case 't': r = 'a'; break; + case 'm': r = 'k'; break; + case 'r': r = 'y'; break; + case 'w': r = 'w'; break; + case 's': r = 's'; break; + case 'y': r = 'r'; break; + case 'k': r = 'm'; break; + case 'v': r = 'b'; break; + case 'h': r = 'd'; break; + case 'd': r = 'h'; break; + case 'b': r = 'v'; break; + case 'n': r = 'n'; break; + case '-': r = '-'; break; + default: + throw runtime_error ("Bad nucleotide " + to_string (wildNucleotide)); + } + if (isupper (wildNucleotide)) + r = toUpper (r); + + return r; +} + + + +bool process (const string &id, + string &seq, + const map> &id2segments) +{ + if (id. empty ()) + return false; + const Vector* segments = findPtr (id2segments, id); + if (! segments) + return false; + + replaceStr (seq, "-", ""); + QC_ASSERT (! seq. empty ()); + + for (Segment& seg : var_cast (*segments)) + { + cout << '>' << id; + if (seg. isDna ()) + { + QC_ASSERT (seg. start <= seq. size ()); + minimize (seg. stop, seq. size ()); + QC_ASSERT (seg. start < seg. stop); + cout << ':' << seg. start + 1 << '-' << seg. stop << ' ' << "strand:" << (seg. 
strand ? '+' : '-'); + } + cout << ' ' << seg. genesymbol << ' ' << seg. name << endl; + string seq1 (seq); + if (seg. isDna ()) + { + ASSERT (seg. stop <= seq1. size ()); + seq1 = seq1. substr (seg. start, seg. size ()); + if (! seg. strand) + { + reverse (seq1); + for (char &c : seq1) + c = complementaryNucleotide (c); + } + //strLower (seq1); // Letter case can indicate nucleotide quality + } + //else + //strUpper (seq1); + constexpr size_t line_len = 60; // PAR + for (size_t i = 0; i < seq1. size (); i += line_len) + cout << seq1. substr (i, line_len) << endl; + } + + return true; +} + + + +struct ThisApplication : Application +{ + ThisApplication () + : Application ("Extract sequences out of a FASTA file") + { + addPositional ("fasta", "FASTA file"); + addPositional ("target", "Target identifiers in the FASTA file to extract.\n\ +Line format for amino acid sequences : \n\ +Line format for nucleotide sequences : =1)> = start)> \ +"); + addFlag ("aa", "Amino acid sequenes, otherwise nucleotide"); + version = SVN_REV; + } + + + + void body () const final + { + const string fName = getArg ("fasta"); + const string targetFName = getArg ("target"); + const bool aa = getFlag ("aa"); + + + map> id2segments; + { + LineInput f (targetFName); + string id; + Istringstream iss; + while (f. nextLine ()) + { + iss. reset (f. line); + Segment seg; + iss >> id; + if (! aa) + { + char strand = '\0'; + iss >> seg. start >> seg. stop >> strand; + QC_ASSERT (seg. start); + QC_ASSERT (seg. start <= seg. stop); + seg. start--; + QC_ASSERT ( strand == '+' + || strand == '-' + ); + seg. strand = (strand == '+'); + } + iss >> seg. genesymbol; + seg. name = f. line. substr ((size_t) iss. tellg ()); + trim (seg. name); + QC_ASSERT (aa == ! seg. isDna ()); + id2segments [id] << std::move (seg); + } + } + if (verbose ()) + for (const auto& it : id2segments) + { + cout << it. first << ": " << endl; + for (const Segment& seg : it. second) + { + cout << " "; + seg. 
saveText (cout); + } + } + if (id2segments. empty ()) + return; + + + size_t processed = 0; + { + LineInput f (fName); + string id; + string seq; + while (f. nextLine ()) + { + trimTrailing (f. line); + if (f. line. empty ()) + continue; + if (f. line [0] == '>') + { + processed += process (id, seq, id2segments); + size_t pos = 1; + while (pos < f. line. size () && ! isspace (f. line [pos])) + pos++; + id = f. line. substr (1, pos - 1); + seq. clear (); + } + else + seq += f. line; + } + processed += process (id, seq, id2segments); + } + if (processed != id2segments. size ()) + throw runtime_error ("Requested identifiers: " + to_string (id2segments. size ()) + ", but processed: " + to_string (processed)); + // Assumed: no duplicate identifiers in FASTA + } +}; + + + +} // namespace + + + +int main (int argc, + const char* argv[]) +{ + ThisApplication app; + return app. run (argc, argv); +} + + + diff --git a/stxtyper.cpp b/stxtyper.cpp index 9def385..176b838 100644 --- a/stxtyper.cpp +++ b/stxtyper.cpp @@ -32,6 +32,7 @@ * Dependencies: NCBI BLAST, gunzip (optional) * * Release changes: +* 1.0.21 07/15/2024 PD-5038 ----nucleotide_output * 1.0.20 05/21/2024 PD-5002 {A|B}_reference_subtype * 1.0.19 03/26/2024 BlastAlignment::targetAlign is removed * 1.0.18 03/19/2024 PD-4910 Element symbol is _operon, Element name contains operon quality attribute" @@ -384,7 +385,7 @@ struct BlastAlignment return (targetStrand == (subunit == 'B') && targetStart <= missed_max) || (targetStrand == (subunit == 'A') && targetLen - targetEnd <= missed_max); } - bool getExtended () const + bool getExtended () const // On C-terminus { ASSERT (! truncated ()); return ! 
refStart && refEnd + 1 == refLen; } @@ -501,7 +502,7 @@ struct Operon string stxType (getStxType (verboseP)); const string standard ("COMPLETE"); const bool novel = al1->stxClass != al2->stxClass - || getIdentity () < stxClass2identity [al1->stxClass] + || getIdentity () < stxClass2identity [al1->stxClass] // May be due to X's || stxType. size () <= 1; const string operonType = getA () -> frameshift || getB () -> frameshift @@ -787,6 +788,7 @@ struct ThisApplication : ShellApplication addKey ("blast_bin", "Directory for BLAST. Deafult: $BLAST_BIN", "", '\0', "BLAST_DIR"); addFlag ("amrfinder", "Print output in the nucleotide AMRFinderPlus format"); addFlag ("print_node", "Print AMRFinderPlus hierarchy node"); + addKey ("nucleotide_output", "Output nucleotide FASTA file of reported nucleotide sequences", "", '\0', "NUC_FASTA_OUT"); version = SVN_REV; } @@ -802,6 +804,7 @@ struct ThisApplication : ShellApplication string blast_bin = getArg ("blast_bin"); amrfinder = getFlag ("amrfinder"); print_node = getFlag ("print_node"); + const string dna_out = shellQuote (getArg ("nucleotide_output")); if (contains (input_name, '\t')) throw runtime_error ("NAME cannot contain a tab character"); @@ -903,9 +906,10 @@ struct ThisApplication : ShellApplication stxClass2identity ["2n"] = 0.98; stxClass2identity ["2o"] = 0.98; - - Cout out (output); - TsvOut td (& *out, 2, false); + + const string tmpOut (tmp + "/out"); + OFStream fOut (tmpOut); + TsvOut td (& fOut, 2, false); TsvOut logTd (logPtr, 2, false); @@ -1150,6 +1154,27 @@ struct ThisApplication : ShellApplication goodOperons. sort (Operon::reportLess); for (const Operon& op : goodOperons) op. saveTsvOut (td, false); + + // Output + { + TextTable tt (tmpOut); + tt. qc (); + { + Cout out (output); + tt. saveText (*out); + } + if (! emptyArg (dna_out)) + { + const StringVector columns {"target_contig", "target_start", "target_stop", "target_strand", "stx_type", "operon"}; + tt. filterColumns (columns); + tt. 
saveHeader = false; + tt. qc (); + const string extract (tmp + "/extract"); + tt. saveFile (extract); + prog2dir ["fasta_extract"] = execDir; + exec (fullProg ("fasta_extract") + dna_flat + " " + extract + qcS + " -log " + logFName + " > " + dna_out, logFName); + } + } } }; diff --git a/tsv.cpp b/tsv.cpp new file mode 100644 index 0000000..6bd81b3 --- /dev/null +++ b/tsv.cpp @@ -0,0 +1,783 @@ +// tsv.cpp + +/*=========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. 
+* +* =========================================================================== +* +* Author: Vyacheslav Brover +* +* File Description: +* TSV table +* +*/ + + +#undef NDEBUG + +#include "tsv.hpp" + +#include "common.inc" + + + + +namespace Common_sp +{ + + + +// Date + +Date Date::parse (const string &s, + Format fmt) +{ + istringstream iss (s); + short year = 0; + short month = 0; + short day = 0; + char c1 = '\0'; + char c2 = '\0'; + string tmp; + switch (fmt) + { + case fmt_Year: + iss >> year >> tmp; + if ( tmp. empty () + && isYear (year) + ) + return Date (year); + break; + case fmt_YMD: + iss >> year >> c1 >> month >> c2 >> day >> tmp; + month--; + day--; + if ( tmp. empty () + && isYear (year) + && isMonth (month) + && isDay (day) + && c1 == c2 + ) + return Date (year, (char) month, (char) day); + break; + default: throw runtime_error (FUNC "Unknown date format"); + } + return Date (); +} + + + +bool Date::less (const Date &other, + bool equal) const +{ + LESS_PART (*this, other, year); + LESS_PART (*this, other, month); + LESS_PART (*this, other, day); + return equal; +} + + + +Date Date::operator- (const Date &other) const +{ + Date d ( short (year - other. year) + , char (month - other. month) + , char (day - other. day) + ); + // Normalization + // day < 0 ?? + while (d. month < 0) + { + d. month = char (d. month + 12); + d. year --; + } + return d; +} + + + + +// TextTable + +void TextTable::Header::qc () const +{ + if (! qc_on) + return; + + Named::qc (); + + QC_IMPLY (scientific, numeric); + QC_IMPLY (decimals, numeric); +} + + + +TextTable::TextTable (const string &tableFName, + const string &columnSynonymsFName) +: Named (tableFName) +{ + { + LineInput f (tableFName); + bool dataExists = true; + // header + while (f. nextLine ()) + { + if (verbose ()) + cerr << f. lineNum << endl; + trimTrailing (f. line); + if (f. line. empty ()) + continue; + const bool thisPound = (f. line. front () == '#'); + if (thisPound) + { + pound = true; + f. line. 
erase (0, 1); + } + if (f. line. empty ()) + continue; + if (header. empty () || thisPound) + { + header. clear (); + StringVector h (f. line, '\t', true); + for (string& s : h) + header << std::move (Header (std::move (s))); + } + ASSERT (! header. empty ()); + if (! thisPound) + { + if (! pound) + dataExists = f. nextLine (); + break; + } + } + if (header. empty ()) + throw Error (*this, "Cannot read the table header"); + // dataExists <=> f.line is valid + // rows[] + while (dataExists) + { + trimTrailing (f. line); + if (! f. line. empty ()) + { + StringVector row (f. line, '\t', true); + FFOR_START (size_t, i, row. size (), header. size ()) + row << noString; + rows << std::move (row); + ASSERT (row. empty ()); + } + dataExists = f. nextLine (); + } + } + + if (! columnSynonymsFName. empty ()) + { + LineInput colF (columnSynonymsFName); + string mainSyn; + while (colF. nextLine ()) + { + trim (colF. line); + const string& syn = colF. line; + if (syn. empty ()) + mainSyn. clear (); + else + { + if (mainSyn. empty ()) + mainSyn = syn; + else + if (mainSyn != syn) + { + const ColNum i = col2num_ (syn); + if (i != no_index) + { + if (hasColumn (mainSyn)) + throw runtime_error ("Table " + strQuote (name) + ": Column " + strQuote (mainSyn) + " already exists"); + else + header [i]. name = mainSyn; + } + } + } + } + } + + setHeader (); +} + + + +void TextTable::setHeader () +{ + RowNum row_num = 0; + for (const StringVector& row : rows) + { + row_num++; + if (row. size () != header. size ()) + throw Error (*this, "Row " + to_string (row_num) + " contains " + to_string (row. size ()) + " columns whereas header has " + to_string (header. size ()) + " columns"); + FFOR (RowNum, i, row. size ()) + { + string field (row [i]); + trim (field); + Header& h = header [i]; + if (field. empty ()) + { + h. null = true; + continue; + } + maximize (h. len_max, field. size ()); + if (h. choices. size () <= Header::choices_max) + h. choices << field; + if (! h. 
numeric) + continue; + { + char* endptr = nullptr; + strtod (field. c_str (), & endptr); + if (endptr != field. c_str () + field. size ()) + { + h. numeric = false; + h. scientific = false; + h. decimals = 0; + } + } + if (h. numeric) + { + bool hasPoint = false; + streamsize decimals = 0; + if (getDecimals (field, hasPoint, decimals)) + h. scientific = true; + maximize (h. decimals, decimals); + } + } + } + + // Header::len_max for numeric + for (const StringVector& row : rows) + FFOR (RowNum, i, row. size ()) + { + const string& field = row [i]; + if (field. empty ()) + continue; + Header& h = header [i]; + if (h. numeric) + { + bool hasPoint = false; + streamsize decimals = 0; + getDecimals (field, hasPoint, decimals); + maximize (h. len_max, field. size () + (size_t) (h. decimals - decimals) + (! hasPoint)); + } + } +} + + + +void TextTable::qc () const // Throws if a header fails its own qc(), column names are duplicated, a row width differs from the header, or a field contains a tab or EOL +{ + if (! qc_on) + return; + if (! name. empty ()) + Named::qc (); + + { + StringVector v; v. reserve (header. size ()); + FFOR (size_t, i, header. size ()) + { + const Header& h = header [i]; + try { h. qc (); } + catch (const exception &e) + { + throw runtime_error ("Header column #" + to_string (i + 1) + ": " + e. what ()); + } + v << h. name; + } + v. sort (); + const size_t i = v. findDuplicate (); + if (i != no_index) + throw Error (*this, "Duplicate column name: " + strQuote (v [i])); + } + + FFOR (RowNum, i, rows. size ()) + { + if (rows [i]. size () != header. size ()) + throw Error (*this, "Row " + to_string (i + 1) + " contains " + to_string (rows [i]. size ()) + " columns whereas table has " + to_string (header. size ()) + " columns"); + FFOR (ColNum, col, rows [i]. size ()) // col, not the row number i, selects the header of the checked field; col < header. size () by the check above + { + if (contains (rows [i] [col], '\t')) + throw Error (*this, "Field " + strQuote (header [col]. name) + " of row " + to_string (i + 1) + " contains a tab character"); + if (contains (rows [i] [col], '\n')) + throw Error (*this, "Field " + strQuote (header [col]. 
name) + " of row " + to_string (i + 1) + " contains an EOL character"); + } + } +} + + + +void TextTable::saveText (ostream &os) const +{ + if (saveHeader) + { + if (pound) + os << '#'; + bool first = true; + for (const Header& h : header) + { + if (! first) + os << '\t'; + os << h. name; + first = false; + } + os << endl; + } + + for (const StringVector& row : rows) + { + save (os, row, '\t'); + os << endl; + } +} + + + +bool TextTable::getDecimals (string s, + bool &hasPoint, + streamsize &decimals) +{ + strUpper (s); + const size_t ePos = s. find ('E'); + const size_t pointPos = s. find ('.'); + + hasPoint = pointPos != string::npos; + + decimals = 0; + if (ePos == string::npos) + { + if (hasPoint) + decimals = (streamoff) (s. size () - (pointPos + 1)); + } + else + { + if (hasPoint && ePos > pointPos) + decimals = (streamoff) (ePos - (pointPos + 1)); + } + + return ePos != string::npos; +} + + + +void TextTable::printHeader (ostream &os) const +{ + FFOR (size_t, i, header. size ()) + { + os << i + 1 << '\t'; + header [i]. saveText (os); + os << endl; + } +} + + + +TextTable::ColNum TextTable::col2num_ (const string &columnName) const +{ + FFOR (size_t, i, header. size ()) + if (header [i]. name == columnName) + return i; + return no_index; +} + + + +void TextTable::duplicateColumn (const string &columnName_from, + const string &columnName_to) +{ + ASSERT (! columnName_to. empty ()); + const ColNum from = col2num (columnName_from); + if (hasColumn (columnName_to)) + throw runtime_error ("Table already has column " + strQuote (columnName_to)); + header << header [from]; + header. back (). name = columnName_to; + for (StringVector& row : rows) + row << row [from]; + qc (); +} + + + +TextTable::ColNum TextTable::findDate (Date::Format &fmt) const +{ + FFOR (ColNum, dateCol, header. size ()) + { + const Header& h = header [dateCol]; + if ( h. null + || h. 
scientific + ) + continue; + size_t fmt_ = 0; + while (fmt_ < Date::fmt_None) + { + fmt = Date::Format (fmt_); + bool isDate = true; + for (const StringVector& row : rows) + if (Date::parse (row [dateCol], fmt). empty ()) + { + isDate = false; + break; + } + if (isDate) + return dateCol; + fmt_++; + } + } + return no_index; +} + + + +bool TextTable::isKey (ColNum colNum) const +{ + ASSERT (colNum < header. size ()); + + const Header& h = header [colNum]; + if (h. null) + return false; + if (h. numeric) + if ( h. scientific + || h. decimals > 0 + ) + return false; + + unordered_set values; values. rehash (rows. size ()); + for (const StringVector& row : rows) + { + ASSERT (! row [colNum]. empty ()); + if (! values. insert (row [colNum]). second) + return false; + } + + return true; +} + + + +int TextTable::compare (const StringVector& row1, + const StringVector& row2, + ColNum column) const +{ + const string& s1 = row1 [column]; + const string& s2 = row2 [column]; + + if (header [column]. numeric) + { + const double a = s1. empty () ? 0.0 : stod (s1); + const double b = s2. empty () ? 0.0 : stod (s2); + if (a < b) + return -1; + if (a > b) + return 1; + return 0; + } + + if (s1 < s2) + return -1; + if (s1 > s2) + return 1; + + return 0; +} + + + +void TextTable::filterColumns (const StringVector &newColumnNames) +{ + const Vector colNums (columns2nums (newColumnNames)); + + { + Vector
newHeader; newHeader. reserve (colNums. size ()); + for (const ColNum i : colNums) + newHeader << header [i]; + header = std::move (newHeader); + } + + for (StringVector& row : rows) + { + StringVector newRow; newRow. reserve (colNums. size ()); + for (const ColNum i : colNums) + newRow << row [i]; + row = std::move (newRow); + } +} + + + +void TextTable::sort (const StringVector &by) +{ + const Vector byIndex (columns2nums (by)); + + const auto lt = [&byIndex,this] (const StringVector &a, const StringVector &b) + { for (const ColNum i : byIndex) + switch (this->compare (a, b, i)) + { case -1: return true; + case 1: return false; + } + // Tie resolution + FFOR (size_t, i, a. size ()) + switch (this->compare (a, b, i)) + { case -1: return true; + case 1: return false; + } + return false; + }; + Common_sp::sort (rows, lt); +} + + + +namespace +{ + struct ColumnPartitionQC + { + map name2oper; + + void add (const StringVector &cols, + const string &oper) + { + for (const string& s : cols) + if (name2oper [s]. empty ()) + name2oper [s] = oper; + else + throw runtime_error ("Column " + strQuote (s) + " is used for operations " + strQuote (name2oper [s]) + " and " + strQuote (oper)); + } + }; +} + + + +void TextTable::group (const StringVector &by, + const StringVector &sum, + const StringVector &minV, + const StringVector &maxV, + const StringVector &aggr) +{ + const Vector byIndex (columns2nums (by)); + const Vector sumIndex (columns2nums (sum)); + const Vector minIndex (columns2nums (minV)); + const Vector maxIndex (columns2nums (maxV)); + const Vector aggrIndex (columns2nums (aggr)); + + // QC + { + ColumnPartitionQC cp; + cp. add (by, "group by"); + cp. add (sum, "sum"); + cp. add (minV, "min"); + cp. add (maxV, "max"); + cp. add (aggr, "aggregation"); + } + for (const string& s : sum) + if (! header [col2num (s)]. 
numeric) + throw runtime_error ("Summation column " + strQuote (s) + " is not numeric"); + + sort (by); + + RowNum i = 0; + FFOR_START (RowNum, j, 1, rows. size ()) + { + ASSERT (i < j); + if (rows [i]. same (rows [j], byIndex)) + merge (i, j, sumIndex, minIndex, maxIndex, aggrIndex); + else + { + i++; + if (i < j) + rows [i] = std::move (rows [j]); + } + } + if (! rows. empty ()) + i++; + + ASSERT (rows. size () >= i); + FFOR (RowNum, k, rows. size () - i) + rows. pop_back (); + + StringVector newColumns; + newColumns << by << sum << minV << maxV << aggr; + filterColumns (newColumns); +} + + + +void TextTable::merge (RowNum toRowNum, + RowNum fromRowNum, + const Vector &sum, + const Vector &minV, + const Vector &maxV, + const Vector &aggr) +{ + ASSERT (toRowNum < fromRowNum); + + StringVector& to = rows [toRowNum]; + const StringVector& from = rows [fromRowNum]; + + for (const ColNum i : sum) + { + const Header& h = header [i]; + ASSERT (h. numeric); + ostringstream oss; + ONumber on (oss, h. decimals, h. scientific); + const string& s1 = to [i]; + const string& s2 = from [i]; + const double d1 = s1. empty () ? 0.0 : stod (s1); + const double d2 = s2. empty () ? 0.0 : stod (s2); + oss << (d1 + d2); + to [i] = oss. str (); + } + + for (const ColNum i : minV) + if ( to [i]. empty () + || ( ! from [i]. empty () + && compare (to, from, i) == 1 + ) + ) + to [i] = from [i]; + + for (const ColNum i : maxV) + if ( to [i]. empty () + || ( ! from [i]. empty () + && compare (to, from, i) == -1 + ) + ) + to [i] = from [i]; + + for (const ColNum i : aggr) + { + if (from [i]. empty ()) + continue; + if (contains (from [i], aggr_sep)) + throw runtime_error ("Cannot aggregate column " + header [i]. name + " for row " + to_string (fromRowNum + 1) + " because it contains " + strQuote (string (1, aggr_sep))); + if (to [i]. empty ()) + to [i] = from [i]; + else + { + StringVector vec (to [i], aggr_sep, true); + vec << from [i]; + vec. sort (); + vec. uniq (); + to [i] = vec. 
toString (string (1, aggr_sep)); + } + } +} + + + +void TextTable::colNumsRow2values (const Vector &colNums, + RowNum row_num, + StringVector &values) const +{ + values. clear (); + values. reserve (colNums. size ()); + const StringVector& row = rows [row_num]; + FFOR (ColNum, i, colNums. size ()) + values << row [colNums [i]]; +} + + + +TextTable::RowNum TextTable::find (const Vector &colNums, + const StringVector &targetValues, + RowNum row_num_start) const +{ + ASSERT (colNums. size () == targetValues. size ()); + ASSERT (row_num_start != no_index); + StringVector values; + FOR_START (RowNum, i, row_num_start, rows. size ()) + { + colNumsRow2values (colNums, i, values); + if (values == targetValues) + return i; + } + return no_index; +} + + + +StringVector TextTable::col2values (ColNum col) const +{ + QC_ASSERT (col < header. size ()); + + Set s; + for (const StringVector& row : rows) + if (! row [col]. empty ()) + s << row [col]; + + StringVector vec; vec. reserve (s. size ()); + insertAll (vec, s); + + return vec; +} + + + + +// TextTable::Key + + +TextTable::Key::Key (const TextTable &tab, + const StringVector &columns) +: colNums (tab. columns2nums (columns)) +{ + data. rehash (tab. rows. size ()); + StringVector values; + FFOR (RowNum, i, tab. rows. size ()) + { + tab. colNumsRow2values (colNums, i, values); + for (const string& s : values) + if (s. empty ()) + throw Error (tab, "Empty value in key, in row " + to_string (i + 1)); + if (data. find (values) != data. end ()) + throw Error (tab, "Duplicate key " + values. toString (",") + " for the key on " + columns. toString (",")); + ASSERT (i != no_index); + data [values] = i; + } +} + + + + +// TextTable::Index + + +TextTable::Index::Index (const TextTable &tab, + const StringVector &columns) +: colNums (tab. columns2nums (columns)) +{ + data. rehash (tab. rows. size ()); + StringVector values; + FFOR (RowNum, i, tab. rows. size ()) + { + tab. 
colNumsRow2values (colNums, i, values); + data [values] << i; + } +} + + + + +} + + diff --git a/tsv.hpp b/tsv.hpp index 6e58eba..26af24f 100644 --- a/tsv.hpp +++ b/tsv.hpp @@ -48,7 +48,7 @@ namespace Common_sp struct Date : Root { - enum Format {fmt_Year, fmt_None}; // not complete list ?? + enum Format {fmt_Year, fmt_YMD, fmt_None}; // not complete list ?? short year {0}; char month {0}; // 0 .. 12 - 1 @@ -66,6 +66,10 @@ struct Date : Root {} static bool isYear (short n) { return n > 1000 && n < 2500; } // PAR + static bool isMonth (short n) + { return between (n, 0, 12); } + static bool isDay (short n) + { return between (n, 0, 31); } // Must depend on month ?? static Date parse (const string &s, Format fmt); // Return: !empty() <=> success @@ -94,9 +98,20 @@ struct Date : Root && month == other. month && day == other. day; } - bool operator<= (const Date &other) const; + bool less (const Date &other, + bool equal) const; + bool operator<= (const Date &other) const + { return less (other, true); } + bool operator< (const Date &other) const + { return less (other, false); } Date operator- (const Date &other) const; // Requires: other <= *this + bool year_divisible () const + { return ! month && ! day; } + bool quarter_divisible () const + { return ! (month % 3) && ! day; } + bool month_divisible () const + { return ! day; } }; @@ -117,9 +132,11 @@ struct TextTable : Named bool scientific {false}; streamsize decimals {0}; bool null {false}; + // = can be empty() static constexpr size_t choices_max {7}; // PAR Set choices; // size() <= choices_max + 1 + Header () = default; explicit Header (const string &name_arg) : Named (name_arg) {} @@ -168,8 +185,9 @@ struct TextTable : Named explicit TextTable (const string &tableFName, const string &columnSynonymsFName = noString); - // columnSynonymsFName: syn_format - // Top lines starting with '#': comment + header + // Input: tableFName: format: [{'#' }* '#']
{ >}* + // empty lines are skipped + // columnSynonymsFName: // Rows where number of columns < header size are added empty values static constexpr const char* syn_format {"Column synonyms file with the format: {
{ }* {|}}*"}; TextTable () = default; @@ -217,6 +235,7 @@ struct TextTable : Named // Date column is not empty and has the same format fmt in all rows // Return: no_index <=> not found // Output: fmt, valid if return != no_index + bool isKey (ColNum colNum) const; private: int compare (const StringVector& row1, const StringVector& row2, @@ -232,6 +251,7 @@ struct TextTable : Named const StringVector &minV, const StringVector &maxV, const StringVector &aggr); + // aggr: slow // Invokes: filterColumns(by + sum + aggr) private: void merge (RowNum toRowNum, diff --git a/version.txt b/version.txt index c2320f5..b112f91 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -1.0.20 +1.0.21