From 9a6edb1e793af17bdce9218e4a1b1c5bce58b8d0 Mon Sep 17 00:00:00 2001
From: Vyacheslav Brover <vyacheslav.brover@nih.gov>
Date: Mon, 15 Jul 2024 15:25:32 -0400
Subject: [PATCH] PD-5038 --nucleotide_output

---
 .gitignore        |   1 +
 Makefile          |   8 +-
 common.cpp        |   2 +-
 common.hpp        |  22 +-
 fasta_extract.cpp | 267 ++++++++++++++++
 stxtyper.cpp      |  35 ++-
 tsv.cpp           | 783 ++++++++++++++++++++++++++++++++++++++++++++++
 tsv.hpp           |  28 +-
 version.txt       |   2 +-
 9 files changed, 1127 insertions(+), 21 deletions(-)
 create mode 100644 fasta_extract.cpp
 create mode 100644 tsv.cpp
diff --git a/.gitignore b/.gitignore
index f725ec1..acd8b3e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 *.o
 stxtyper
 fasta_check
+fasta_extract
 stx.prot.*
 *.got
diff --git a/Makefile b/Makefile
index cdd8c32..da7a6e5 100644
--- a/Makefile
+++ b/Makefile
@@ -79,7 +79,7 @@ COMPILE.cpp= $(CXX) $(CPPFLAGS) $(SVNREV) $(DBDIR) $(TEST_UPDATE_DB) -c
 
 .PHONY: all clean install release test
 
-BINARIES= stxtyper fasta_check 
+BINARIES= stxtyper fasta_check fasta_extract
 DATABASE= stx.prot
 
 all:	$(BINARIES)
@@ -90,7 +90,7 @@ all:	$(BINARIES)
 common.o:	common.hpp common.inc
 
 stxtyper.o:  common.hpp common.inc 
-stxtyperOBJS=stxtyper.o common.o
+stxtyperOBJS=stxtyper.o common.o tsv.o
 stxtyper:	$(stxtyperOBJS)
 	$(CXX) -o $@ $(stxtyperOBJS) -pthread $(DBDIR)
 
@@ -99,6 +99,10 @@ fasta_checkOBJS=fasta_check.o common.o
 fasta_check:	$(fasta_checkOBJS)
 	$(CXX) -o $@ $(fasta_checkOBJS)
 
+fasta_extract.o:	common.hpp common.inc 
+fasta_extractOBJS=fasta_extract.o common.o 
+fasta_extract:	$(fasta_extractOBJS)
+	$(CXX) -o $@ $(fasta_extractOBJS)
 
 clean:
 	rm -f *.o
diff --git a/common.cpp b/common.cpp
index 2163008..b5d459f 100644
--- a/common.cpp
+++ b/common.cpp
@@ -1710,7 +1710,7 @@ void Xml::TextFile::tagStart (const string &tag)
 	string tag_ (tag);
 	replace (tag_, ':', '_');
 	if (! isIdentifier (tag_, true))
-    throw runtime_error (FUNC "Bad tag name: " + strQuote (tag));
+    throw runtime_error (FUNC "Bad textual XML tag name: " + strQuote (tag));
 
 	printRaw ("<" + tag + ">"); 
 }
diff --git a/common.hpp b/common.hpp
index 1a0acfd..2249cee 100644
--- a/common.hpp
+++ b/common.hpp
@@ -156,6 +156,7 @@ void errorExitStr (const string &msg);
 
 void beep ();
   // Requires: !isRedirected()
+  //           SHLVL = 1 ??
     
 
 
@@ -744,9 +745,11 @@ inline void trim (string &s)
 
 void trimLeading (string &s,
                   char c);
+  // Invokes: isSpace()
 
 void trimTrailing (string &s,
                    char c);
+  // Invokes: isSpace()
 
 inline void trim (string &s,
                   char c)
@@ -1295,13 +1298,13 @@ void copyText (const string &inFName,
 #ifndef _MSC_VER
   inline void moveFile (const string &from,
                         const string &to)
-    { if (::rename (from. c_str (), to. c_str ()))
-        throw runtime_error ("Cannot move file + " + shellQuote (from) + " to " + shellQuote (to));
+    { if (const int code = ::rename (from. c_str (), to. c_str ()))
+        throw runtime_error ("Cannot move file + " + shellQuote (from) + " to " + shellQuote (to) + " (" + to_string (code) + ")");
     }
 
   inline void removeFile (const string &fName)
-    { if (::remove (fName. c_str ()))
-        throw runtime_error ("Cannot remove file + " + shellQuote (fName));
+    { if (const int code = ::remove (fName. c_str ()))
+        throw runtime_error ("Cannot remove file + " + shellQuote (fName) + " (" + to_string (code) + ")");
     }
 
       
@@ -1892,6 +1895,7 @@ struct Xml
 
   
   struct TextFile : File
+  // Tag::name: identifier with possible '-'
   {
   private:
     struct XmlStream : OFStream
@@ -1925,7 +1929,7 @@ struct Xml
   //   <Data> ::= <nameIndex> <Data>* 0 0 <text> 0
   //     <nameIndex> ::= <byte> <byte>
   //   Number of different Tag::name's <= 2^16
-  //   Tag::name has no: '\0', '\n'
+  //   Tag::name: no '\0', '\n'
   {
   private:
   	ofstream os;
@@ -3913,8 +3917,9 @@ struct TokenInput : Root
 
 
   [[noreturn]] void error (const Token &wrongToken,
-                           const string &expected) const
-    { throw TextPos::Error (wrongToken. tp, expected, true); }
+                           const string &what,
+                           bool expected = true) const
+    { throw TextPos::Error (wrongToken. tp, what, expected); }
   [[noreturn]] void error (const string &what,
 	                         bool expected = true) const
 		{ ci. error (what, expected); }  
@@ -3948,7 +3953,8 @@ struct TokenInput : Root
    			error (t, Token::type2str (Token::eDouble) + " " + toString (expected)); 
     }
 	void get (char expected)
-    { if (getNextChar (false) != expected)    
+    { const Token t (get ());
+      if (! t. isDelimiter (expected))
    			error (Token::type2str (Token::eDelimiter) + " " + strQuote (toString (expected), '\'')); 
     }
   void setLast (Token &&t)
diff --git a/fasta_extract.cpp b/fasta_extract.cpp
new file mode 100644
index 0000000..786f4f8
--- /dev/null
+++ b/fasta_extract.cpp
@@ -0,0 +1,267 @@
+// fasta_extract.cpp
+
+/*===========================================================================
+*
+*                            PUBLIC DOMAIN NOTICE                          
+*               National Center for Biotechnology Information
+*                                                                          
+*  This software/database is a "United States Government Work" under the   
+*  terms of the United States Copyright Act.  It was written as part of    
+*  the author's official duties as a United States Government employee and 
+*  thus cannot be copyrighted.  This software/database is freely available 
+*  to the public for use. The National Library of Medicine and the U.S.    
+*  Government have not placed any restriction on its use or reproduction.  
+*                                                                          
+*  Although all reasonable efforts have been taken to ensure the accuracy  
+*  and reliability of the software and data, the NLM and the U.S.          
+*  Government do not and cannot warrant the performance or results that    
+*  may be obtained by using this software or data. The NLM and the U.S.    
+*  Government disclaim all warranties, express or implied, including       
+*  warranties of performance, merchantability or fitness for any particular
+*  purpose.                                                                
+*                                                                          
+*  Please cite the author in any work or product based on this material.   
+*
+* ===========================================================================
+*
+* Author: Vyacheslav Brover
+*
+* File Description:
+*   Extract sequences out of a FASTA file
+*
+*/
+   
+   
+#undef NDEBUG 
+
+#include "common.hpp"
+using namespace Common_sp;
+
+#include "common.inc"
+
+
+
+namespace 
+{
+  
+  
+  
+struct Segment
+// Linear (non-circular) region of a sequence to extract; start/stop are 0-based, half-open
+{
+  size_t start {0};
+  size_t stop {0};
+    // 0 <=> the whole sequence is requested (see isDna())
+  bool strand {true};
+    // false <=> negative
+  string genesymbol;
+  string name;
+  
+  bool isDna () const
+    { return stop != 0; }
+  size_t size () const
+    { return stop - start; }
+  void saveText (ostream &os) const
+    { constexpr char tab = '\t';
+      os << start      << tab 
+         << stop       << tab 
+         << strand     << tab 
+         << genesymbol << tab 
+         << name       << endl;
+    }
+};
+
+
+
+char complementaryNucleotide (char wildNucleotide)
+// Return: complement of an IUPAC nucleotide code or '-', in the letter case of wildNucleotide
+// Throws: runtime_error for any other character
+{
+  const char lower = toLower (wildNucleotide);
+  char complement = lower;
+  switch (lower)
+  {
+    case 'a': complement = 't'; break;
+    case 'c': complement = 'g'; break;
+    case 'g': complement = 'c'; break;
+    case 't': complement = 'a'; break;
+    case 'm': complement = 'k'; break;
+    case 'r': complement = 'y'; break;
+    case 'y': complement = 'r'; break;
+    case 'k': complement = 'm'; break;
+    case 'v': complement = 'b'; break;
+    case 'h': complement = 'd'; break;
+    case 'd': complement = 'h'; break;
+    case 'b': complement = 'v'; break;
+    case 'w': case 's': case 'n': case '-': break;  // Self-complementary
+    default: 
+      throw runtime_error ("Bad nucleotide " + to_string (wildNucleotide));
+  }
+
+  if (isupper (wildNucleotide))
+    complement = toUpper (complement);
+  return complement;
+}
+
+
+
+bool process (const string &id,   // Identifier of the FASTA record just read; may be empty
+              string &seq,   // Sequence of that record; all '-' are removed from it here
+              const map<string/*id*/,Vector<Segment>> &id2segments)
+{ // Prints every segment requested for id to cout in FASTA format. Return: id has requested segments
+  if (id. empty ())
+    return false;
+  const Vector<Segment>* segments = findPtr (id2segments, id);
+  if (! segments)
+    return false;
+    
+  replaceStr (seq, "-", "");
+  QC_ASSERT (! seq. empty ());
+  
+  for (Segment& seg : var_cast (*segments))  // Non-const access: seg.stop may be changed by minimize() below
+  {
+    cout << '>' << id;
+    if (seg. isDna ())
+    {
+      QC_ASSERT (seg. start <= seq. size ());
+      minimize (seg. stop, seq. size ());  // Presumably clips a stop lying beyond the sequence end - confirm minimize()
+      QC_ASSERT (seg. start < seg. stop);
+      cout << ':' << seg. start + 1 << '-' << seg. stop << ' ' << "strand:" << (seg. strand ? '+' : '-');  // 1-based start
+    }
+    cout << ' ' << seg. genesymbol << ' ' << seg. name << endl;
+    string seq1 (seq);  // Whole sequence unless a DNA segment is cut out below
+    if (seg. isDna ())
+    {
+      ASSERT (seg. stop <= seq1. size ());
+      seq1 = seq1. substr (seg. start, seg. size ());
+      if (! seg. strand)
+      {
+        reverse (seq1);  // Negative strand: reverse, then complement each nucleotide
+        for (char &c : seq1)
+          c = complementaryNucleotide (c);
+      }
+    //strLower (seq1);  // Letter case can indicate nucleotide quality
+    }    
+  //else
+    //strUpper (seq1);
+    constexpr size_t line_len = 60;  // PAR
+    for (size_t i = 0; i < seq1. size (); i += line_len)  // At most line_len characters per output line
+      cout << seq1. substr (i, line_len) << endl;
+  }
+  
+  return true;
+}
+
+
+
+struct ThisApplication : Application
+{
+  ThisApplication ()
+    : Application ("Extract sequences out of a FASTA file")
+    {
+      addPositional ("fasta", "FASTA file");
+      addPositional ("target", "Target identifiers in the FASTA file to extract.\n\
+Line format for amino acid sequences : <id> <gene symbol> <product name>\n\
+Line format for nucleotide sequences : <id> <start (>=1)> <stop (>= start)> <strand (+/-)> <gene symbol> <product name>\
+");
+      addFlag ("aa", "Amino acid sequences, otherwise nucleotide");
+      version = SVN_REV;
+    }
+
+  // Reads the target file into id2segments, then reads the FASTA file record by record
+  // and prints the requested segments via process(). Throws if not all requested identifiers are found.
+  void body () const final
+  {
+    const string fName       = getArg ("fasta");
+    const string targetFName = getArg ("target");
+    const bool aa            = getFlag ("aa");
+    
+    // Targets: one segment per line, start is converted from 1-based to 0-based
+    map<string/*id*/,Vector<Segment>> id2segments;
+    {
+      LineInput f (targetFName);
+      string id;
+      Istringstream iss;
+      while (f. nextLine ())
+      {
+        iss. reset (f. line);
+        Segment seg;
+        iss >> id;
+        if (! aa)
+        {
+          char strand = '\0';
+          iss >> seg. start >> seg. stop >> strand;
+          QC_ASSERT (seg. start);
+          QC_ASSERT (seg. start <= seg. stop);
+          seg. start--;
+          QC_ASSERT (   strand == '+' 
+                     || strand == '-'
+                    );
+          seg. strand = (strand == '+');
+        }
+        iss >> seg. genesymbol;  QC_ASSERT (! iss. fail ());  // id and gene symbol are mandatory
+        getline (iss, seg. name);  // Rest of the line; name stays empty at end of line, where tellg() would return -1
+        trim (seg. name);
+        QC_ASSERT (aa == ! seg. isDna ());
+        id2segments [id] << std::move (seg);
+      }
+    }
+    if (verbose ())
+      for (const auto& it : id2segments)
+      {
+        cout << it. first << ": " << endl;
+        for (const Segment& seg : it. second)
+        {
+          cout << "  ";
+          seg. saveText (cout);
+        }
+      }
+    if (id2segments. empty ())
+      return;
+    
+    // FASTA: a record is processed when the next '>' line or the end of file is reached
+    size_t processed = 0;
+    {
+      LineInput f (fName); 
+      string id;
+      string seq;
+      while (f. nextLine ())
+      {
+        trimTrailing (f. line);
+        if (f. line. empty ())
+          continue;
+        if (f. line [0] == '>')
+        {
+          processed += process (id, seq, id2segments);
+          size_t pos = 1;  // id = text between '>' and the first whitespace
+          while (pos < f. line. size () && ! isspace ((unsigned char) f. line [pos]))  // isspace() is undefined for negative char
+            pos++;
+          id = f. line. substr (1, pos - 1);
+          seq. clear ();
+        }
+        else 
+          seq += f. line;
+      }
+      processed += process (id, seq, id2segments);
+    }
+    if (processed != id2segments. size ())  
+      throw runtime_error ("Requested identifiers: " + to_string (id2segments. size ()) + ", but processed: " + to_string (processed));
+      // Assumed: no duplicate identifiers in FASTA
+  }
+};
+
+
+
+}  // namespace
+
+
+
+// Entry point: runs ThisApplication with the command-line arguments
+int main (int argc, const char* argv[])
+{
+  ThisApplication extractor;
+  return extractor. run (argc, argv);
+}
+
+
+
diff --git a/stxtyper.cpp b/stxtyper.cpp
index 9def385..176b838 100644
--- a/stxtyper.cpp
+++ b/stxtyper.cpp
@@ -32,6 +32,7 @@
 * Dependencies: NCBI BLAST, gunzip (optional)
 *
 * Release changes:
+*  1.0.21 07/15/2024 PD-5038  --nucleotide_output 
 *  1.0.20 05/21/2024 PD-5002  {A|B}_reference_subtype
 *  1.0.19 03/26/2024          BlastAlignment::targetAlign is removed
 *  1.0.18 03/19/2024 PD-4910  Element symbol is <stx type>_operon, Element name contains operon quality attribute"
@@ -384,7 +385,7 @@ struct BlastAlignment
       return    (targetStrand == (subunit == 'B') && targetStart           <= missed_max)
              || (targetStrand == (subunit == 'A') && targetLen - targetEnd <= missed_max);
     }
-  bool getExtended () const
+  bool getExtended () const  // On C-terminus
     { ASSERT (! truncated ());
       return ! refStart && refEnd + 1 == refLen; 
     }
@@ -501,7 +502,7 @@ struct Operon
         string stxType (getStxType (verboseP));
         const string standard ("COMPLETE");
         const bool novel =    al1->stxClass != al2->stxClass 
-                           || getIdentity () < stxClass2identity [al1->stxClass]
+                           || getIdentity () < stxClass2identity [al1->stxClass]  // May be due to X's
                            || stxType. size () <= 1;
         const string operonType =    getA () -> frameshift
                                   || getB () -> frameshift
@@ -787,6 +788,7 @@ struct ThisApplication : ShellApplication
     	addKey ("blast_bin", "Directory for BLAST. Deafult: $BLAST_BIN", "", '\0', "BLAST_DIR");
     	addFlag ("amrfinder", "Print output in the nucleotide AMRFinderPlus format");
     	addFlag ("print_node", "Print AMRFinderPlus hierarchy node");
+      addKey ("nucleotide_output", "Output nucleotide FASTA file of reported nucleotide sequences", "", '\0', "NUC_FASTA_OUT");
 
       version = SVN_REV;
     }
@@ -802,6 +804,7 @@ struct ThisApplication : ShellApplication
           string blast_bin  =             getArg ("blast_bin");
                  amrfinder  =             getFlag ("amrfinder");
                  print_node =             getFlag ("print_node");
+    const string  dna_out   = shellQuote (getArg ("nucleotide_output"));
     
     if (contains (input_name, '\t'))
       throw runtime_error ("NAME cannot contain a tab character");
@@ -903,9 +906,10 @@ struct ThisApplication : ShellApplication
     stxClass2identity ["2n"] = 0.98;
     stxClass2identity ["2o"] = 0.98;
     
-    
-    Cout out (output);
-    TsvOut td (& *out, 2, false);
+
+    const string tmpOut (tmp + "/out");
+    OFStream fOut (tmpOut);
+    TsvOut td (& fOut, 2, false);
     TsvOut logTd (logPtr, 2, false);
 
     
@@ -1150,6 +1154,27 @@ struct ThisApplication : ShellApplication
     goodOperons. sort (Operon::reportLess);     
   	for (const Operon& op : goodOperons)
    	  op. saveTsvOut (td, false);
+
+    // Output
+    {
+      TextTable tt (tmpOut);
+      tt. qc ();
+      {
+        Cout out (output);
+   		  tt. saveText (*out);
+   		}
+      if (! emptyArg (dna_out))
+      {
+        const StringVector columns {"target_contig", "target_start", "target_stop", "target_strand", "stx_type", "operon"};
+        tt. filterColumns (columns);
+        tt. saveHeader = false;
+        tt. qc ();
+        const string extract (tmp + "/extract");
+        tt. saveFile (extract);
+        prog2dir ["fasta_extract"] = execDir;
+        exec (fullProg ("fasta_extract") + dna_flat + " " + extract + qcS + " -log " + logFName + " > " + dna_out, logFName);  
+      }
+    }
   }
 };
 
diff --git a/tsv.cpp b/tsv.cpp
new file mode 100644
index 0000000..6bd81b3
--- /dev/null
+++ b/tsv.cpp
@@ -0,0 +1,783 @@
+// tsv.cpp
+
+/*===========================================================================
+*
+*                            PUBLIC DOMAIN NOTICE                          
+*               National Center for Biotechnology Information
+*                                                                          
+*  This software/database is a "United States Government Work" under the   
+*  terms of the United States Copyright Act.  It was written as part of    
+*  the author's official duties as a United States Government employee and 
+*  thus cannot be copyrighted.  This software/database is freely available 
+*  to the public for use. The National Library of Medicine and the U.S.    
+*  Government have not placed any restriction on its use or reproduction.  
+*                                                                          
+*  Although all reasonable efforts have been taken to ensure the accuracy  
+*  and reliability of the software and data, the NLM and the U.S.          
+*  Government do not and cannot warrant the performance or results that    
+*  may be obtained by using this software or data. The NLM and the U.S.    
+*  Government disclaim all warranties, express or implied, including       
+*  warranties of performance, merchantability or fitness for any particular
+*  purpose.                                                                
+*                                                                          
+*  Please cite the author in any work or product based on this material.   
+*
+* ===========================================================================
+*
+* Author: Vyacheslav Brover
+*
+* File Description:
+*   TSV table
+*
+*/
+
+
+#undef NDEBUG
+
+#include "tsv.hpp"
+
+#include "common.inc"
+
+
+
+
+namespace Common_sp
+{
+ 
+
+
+// Date
+
+Date Date::parse (const string &s,
+                  Format fmt)
+{ // Return: the date read from s in format fmt; Date() if s does not match fmt. Throws for an unknown fmt
+  istringstream iss (s);
+  short year = 0;
+  short month = 0;
+  short day = 0;
+  char c1 = '\0';  // Character between year and month
+  char c2 = '\0';  // Character between month and day
+  string tmp;  // Text after the date: must be empty
+  switch (fmt)
+  {
+    case fmt_Year:
+      iss >> year >> tmp;
+      if (   tmp. empty ()
+          && isYear (year)
+         )
+        return Date (year);
+      break;
+    case fmt_YMD:
+      iss >> year >> c1 >> month >> c2 >> day >> tmp;
+      month--;  // month and day are validated and stored decremented by 1
+      day--;
+      if (   tmp. empty ()
+          && isYear  (year)
+          && isMonth (month)
+          && isDay   (day)
+          && c1 == c2  // Both separators must be the same character; which character is not checked
+         )
+        return Date (year, (char) month, (char) day);
+      break;
+    default: throw runtime_error (FUNC "Unknown date format");
+  }
+  return Date ();  // s does not match fmt
+}
+
+
+
+bool Date::less (const Date &other,
+                 bool equal) const
+{ // Compares *this with other by year, then month, then day
+  LESS_PART (*this, other, year);  // NOTE(review): LESS_PART presumably returns from less() if the parts differ - confirm in common.hpp
+  LESS_PART (*this, other, month);
+  LESS_PART (*this, other, day);
+  return equal;  // Reached if no LESS_PART returned
+}
+
+
+
+Date Date::operator- (const Date &other) const
+{ // Return: field-wise difference of *this and other, with a negative month borrowed from the year
+  Date d ( short (year  - other. year)
+         , char  (month - other. month)
+         , char  (day   - other. day)
+         );
+  // Normalization 
+  // day < 0 ??  (a negative day difference is not normalized)
+  while (d. month < 0)  // NOTE(review): requires Date::month to be a signed type - confirm
+  {
+    d. month = char (d. month + 12);
+    d. year --;
+  }
+  return d;
+}
+
+
+
+
+// TextTable
+
+void TextTable::Header::qc () const
+// If qc_on: checks Named and that scientific and decimals are set only if numeric
+{
+  if (qc_on)
+  {
+    Named::qc ();
+    QC_IMPLY (scientific, numeric);
+    QC_IMPLY (decimals, numeric);
+  }
+}
+
+
+
+TextTable::TextTable (const string &tableFName,
+                      const string &columnSynonymsFName)
+: Named (tableFName)
+{  // Reads header and rows from tableFName, renames columns by columnSynonymsFName (if not empty), invokes setHeader()
+  {
+    LineInput f (tableFName);
+    bool dataExists = true;
+    // header: the last leading '#'-line, or the first non-empty line if there is no '#'-line
+    while (f. nextLine ())
+    {
+      if (verbose ())
+        cerr << f. lineNum << endl;
+      trimTrailing (f. line);
+      if (f. line. empty ())
+        continue;
+      const bool thisPound = (f. line. front () == '#');
+      if (thisPound)
+      {
+        pound = true;
+        f. line. erase (0, 1);  // Drop '#'
+      }
+      if (f. line. empty ())
+        continue;
+      if (header. empty () || thisPound)
+      {
+        header. clear ();  // A later '#'-line replaces an earlier one
+        StringVector h (f. line, '\t', true);
+        for (string& s : h)
+          header << std::move (Header (std::move (s)));
+      }
+      ASSERT (! header. empty ());
+      if (! thisPound)  // End of the header
+      {
+        if (! pound)  // f.line was the header itself: advance to the first data line
+          dataExists = f. nextLine ();
+        break;  // Otherwise f.line is already the first data line
+      }
+    }
+    if (header. empty ())
+      throw Error (*this, "Cannot read the table header");
+    // dataExists <=> f.line is valid
+    // rows[]
+    while (dataExists)
+    {
+      trimTrailing (f. line);
+      if (! f. line. empty ())  // Empty lines are skipped
+      {
+        StringVector row (f. line, '\t', true);
+        FFOR_START (size_t, i, row. size (), header. size ())  // A row shorter than the header is padded
+          row << noString;        
+        rows << std::move (row);
+        ASSERT (row. empty ());  // row has been moved from
+      }
+      dataExists = f. nextLine ();
+    }
+  }
+  
+  if (! columnSynonymsFName. empty ())  // File of synonym blocks separated by empty lines; the first line of a block is the main name
+  {
+    LineInput colF (columnSynonymsFName);
+    string mainSyn;  // Main name of the current block; empty between blocks
+    while (colF. nextLine ())
+    {
+      trim (colF. line);
+      const string& syn = colF. line;
+      if (syn. empty ())
+        mainSyn. clear ();
+      else
+      {
+        if (mainSyn. empty ())
+          mainSyn = syn;
+        else
+          if (mainSyn != syn)
+          {
+            const ColNum i = col2num_ (syn);
+            if (i != no_index)  // The table has a column named by this synonym: rename it to mainSyn
+            {
+              if (hasColumn (mainSyn))
+                throw runtime_error ("Table " + strQuote (name) + ": Column " + strQuote (mainSyn) + " already exists");
+              else
+                header [i]. name = mainSyn;
+            }
+          }
+      }
+    }
+  }
+  
+  setHeader ();
+}
+
+
+
+void TextTable::setHeader ()
+{ // Update: Header::{null,len_max,choices,numeric,scientific,decimals} of each column from rows. Throws if a row width differs from the header
+  RowNum row_num = 0;  // 1-based, for the error message
+  for (const StringVector& row : rows)
+  {
+    row_num++;
+    if (row. size () != header. size ())
+      throw Error (*this, "Row " + to_string (row_num) + " contains " + to_string (row. size ()) + " columns whereas header has " + to_string (header. size ()) + " columns");
+    FFOR (RowNum, i, row. size ())  // i is a column index here
+    {
+      string field (row [i]);
+      trim (field);
+      Header& h = header [i];
+      if (field. empty ())
+      {
+        h. null = true;
+        continue;
+      }
+      maximize (h. len_max, field. size ());
+      if (h. choices. size () <= Header::choices_max)  // Stop collecting values once there are more than choices_max
+        h. choices << field;
+      if (! h. numeric)  // Column is already known to be non-numeric
+        continue;
+      {
+        char* endptr = nullptr;
+        strtod (field. c_str (), & endptr);
+        if (endptr != field. c_str () + field. size ())  // strtod() did not consume the whole field
+        {
+          h. numeric = false;
+          h. scientific = false;
+          h. decimals = 0;
+        }
+      }
+      if (h. numeric)
+      {
+        bool hasPoint = false;
+        streamsize decimals = 0;
+        if (getDecimals (field, hasPoint, decimals))
+          h. scientific = true;
+        maximize<streamsize> (h. decimals, decimals);
+      }
+    }
+  }
+
+  // Header::len_max for numeric: second pass, because Header::decimals is final only now
+  for (const StringVector& row : rows)
+    FFOR (RowNum, i, row. size ())
+    {
+      const string& field = row [i];  // Not trimmed, unlike in the first pass
+      if (field. empty ())
+        continue;
+      Header& h = header [i];
+      if (h. numeric)
+      {
+        bool hasPoint = false;
+        streamsize decimals = 0;
+        getDecimals (field, hasPoint, decimals);
+        maximize (h. len_max, field. size () + (size_t) (h. decimals - decimals) + (! hasPoint));  // Field length + missing decimals + 1 if there is no '.'
+      }
+    }
+}
+
+
+
+void TextTable::qc () const  // Throws if the table is inconsistent; does nothing unless qc_on
+{
+  if (! qc_on)
+    return;
+  if (! name. empty ())
+    Named::qc (); 
+  // Header: each column passes Header::qc(), column names are unique
+  {    
+    StringVector v;  v. reserve (header. size ());
+    FFOR (size_t, i, header. size ())
+    {
+      const Header& h = header [i];
+      try { h. qc (); }
+        catch (const exception &e)
+        {
+          throw runtime_error ("Header column #" + to_string (i + 1) + ": " + e. what ());  // Add the 1-based column number
+        }
+      v << h. name;
+    }
+    v. sort ();
+    const size_t i = v. findDuplicate ();
+    if (i != no_index)
+      throw Error (*this, "Duplicate column name: " + strQuote (v [i]));
+  }
+  // Rows: as many fields as the header has columns, no tab or EOL inside a field
+  FFOR (RowNum, i, rows. size ())
+  {
+    if (rows [i]. size () != header. size ())
+      throw Error (*this, "Row " + to_string (i + 1) + " contains " + to_string (rows [i]. size ()) + " columns whereas table has " + to_string (header. size ()) + " columns");
+    FFOR (size_t, col, header. size ())  // The column name must be taken by the column index col, not by the row index i
+    {
+      if (contains (rows [i] [col], '\t'))
+        throw Error (*this, "Field " + strQuote (header [col]. name) + " of row " + to_string (i + 1) + " contains a tab character");
+      if (contains (rows [i] [col], '\n'))
+        throw Error (*this, "Field " + strQuote (header [col]. name) + " of row " + to_string (i + 1) + " contains an EOL character");
+    }
+  }
+}
+
+
+
+void TextTable::saveText (ostream &os) const
+// Output: the header line if saveHeader (prefixed with '#' if pound), then each row via save() with '\t'
+{ 
+  if (saveHeader)
+  { 
+    if (pound)
+      os << '#';
+    FFOR (size_t, col, header. size ())
+    {
+      if (col)
+        os << '\t';
+      os << header [col]. name;
+    }
+    os << endl;
+  }
+  
+  // Data rows
+  for (const StringVector& fields : rows)
+  {
+    save (os, fields, '\t');
+    os << endl;
+  }
+}
+
+    
+    
+bool TextTable::getDecimals (string s,
+                             bool &hasPoint,
+                             streamsize &decimals)
+// Input: s: text of a number
+// Output: hasPoint: s contains '.'
+//         decimals: number of characters between '.' and 'E' (or the end of s); 0 if there are none
+// Return: s, after strUpper(), contains 'E'
+{
+  strUpper (s);
+  const size_t ePos     = s. find ('E');
+  const size_t pointPos = s. find ('.');
+  const bool hasExponent = (ePos != string::npos);
+  
+  hasPoint = (pointPos != string::npos);
+  
+  // The decimals end at 'E' if there is one
+  const size_t mantissaEnd = hasExponent ? ePos : s. size ();
+  if (hasPoint && mantissaEnd > pointPos)
+    decimals = (streamoff) (mantissaEnd - (pointPos + 1));
+  else
+    decimals = 0;
+  
+  return hasExponent;
+}
+
+    
+  
+void TextTable::printHeader (ostream &os) const
+{ // One line per column: 1-based column number, tab, Header::saveText()
+  size_t colNum = 0;
+  for (const Header& h : header)
+  {
+    os << ++colNum << '\t';
+    h. saveText (os);  os << endl;
+  }
+}
+
+
+
+TextTable::ColNum TextTable::col2num_ (const string &columnName) const
+{ // Return: index of the first column named columnName, or no_index
+  for (size_t col = 0; col < header. size (); col++)
+    if (columnName == header [col]. name)
+      return col;
+  return no_index;
+}
+
+
+
+void TextTable::duplicateColumn (const string &columnName_from,
+                                 const string &columnName_to)
+{ // Appends a copy of column columnName_from as the last column, named columnName_to
+  ASSERT (! columnName_to. empty ());
+  const ColNum from = col2num (columnName_from);
+  if (hasColumn (columnName_to))
+    throw runtime_error ("Table already has column " + strQuote (columnName_to));
+  header << header [from];  // The copy keeps all Header attributes of the source column
+  header. back (). name = columnName_to;
+  for (StringVector& row : rows)
+    row << row [from];
+  qc ();
+}
+
+
+
+TextTable::ColNum TextTable::findDate (Date::Format &fmt) const
+{ // Return: first column whose fields all parse as Date in one format, fmt = that format; no_index if there is none
+  FFOR (ColNum, dateCol, header. size ())
+  {
+    const Header& h = header [dateCol];
+    if (   h. null
+        || h. scientific
+       )
+      continue;
+    size_t fmt_ = 0;
+    while (fmt_ < Date::fmt_None)  // Try the formats preceding fmt_None in Date::Format order
+    {
+      fmt = Date::Format (fmt_);  // fmt is overwritten even if no date column is found
+      bool isDate = true;
+      for (const StringVector& row : rows)
+        if (Date::parse (row [dateCol], fmt). empty ())
+        {
+          isDate = false;
+          break;
+        }
+      if (isDate)  // Also true if rows is empty
+        return dateCol;
+      fmt_++;
+    }
+  }
+  return no_index;
+}
+
+
+
+bool TextTable::isKey (ColNum colNum) const
+{ // Return: column colNum has no null fields and no repeated values; columns with decimals or scientific notation are not keys
+  ASSERT (colNum < header. size ());
+
+  const Header& h = header [colNum];
+  if (h. null)
+    return false;
+  if (h. numeric)
+    if (   h. scientific
+        || h. decimals > 0
+       )
+      return false;
+    
+  unordered_set<string> values;  values. rehash (rows. size ());
+  for (const StringVector& row : rows)
+  {
+    ASSERT (! row [colNum]. empty ());
+    if (! values. insert (row [colNum]). second)  // Value has already been seen
+      return false;
+  }
+      
+  return true;
+}
+
+
+
+int TextTable::compare (const StringVector& row1,
+                        const StringVector& row2,
+                        ColNum column) const
+// Return: -1, 0 or 1 for row1[column] <, == or > row2[column]
+// Numeric columns are compared as doubles (empty field = 0.0), other columns as strings
+{
+  const string& left  = row1 [column];
+  const string& right = row2 [column];
+
+  if (! header [column]. numeric)
+  {
+    if (left == right)
+      return 0;
+    return left < right ? -1 : 1;
+  }
+
+  const double a = left.  empty () ? 0.0 : stod (left);
+  const double b = right. empty () ? 0.0 : stod (right);
+  if (a < b)
+    return -1;
+  if (a > b)
+    return 1;
+  // Equal or unordered values
+  return 0;
+}
+
+
+
+void TextTable::filterColumns (const StringVector &newColumnNames)
+{ // Update: header, rows: replaced by the columns columns2nums (newColumnNames)
+  const Vector<ColNum> kept (columns2nums (newColumnNames));
+  const size_t keptSize = kept. size ();
+  // Header
+  Vector<Header> headerKept;  headerKept. reserve (keptSize);
+  for (const ColNum col : kept)
+    headerKept << header [col];
+  header = std::move (headerKept);
+
+  // Rows
+  for (StringVector& row : rows)
+  {
+    StringVector rowKept;  rowKept. reserve (keptSize);
+    for (const ColNum col : kept)
+      rowKept << row [col];
+    row = std::move (rowKept);
+  }
+}
+
+
+
+void TextTable::sort (const StringVector &by)
+{ // Sorts rows by the columns `by` using compare(); ties are resolved by all columns from left to right
+  const Vector<ColNum> byIndex (columns2nums (by));
+  
+  const auto lt = [&byIndex,this] (const StringVector &a, const StringVector &b)  // Return: a precedes b
+                    { for (const ColNum i : byIndex) 
+                        switch (this->compare (a, b, i))
+                        { case -1: return true;
+                          case  1: return false;
+                        }  // 0: equal in this column, go to the next one
+                      // Tie resolution
+                      FFOR (size_t, i, a. size ())
+                        switch (this->compare (a, b, i))
+                        { case -1: return true;
+                          case  1: return false;
+                        }
+                      return false;  // Rows are equal in all columns
+                    };
+  Common_sp::sort (rows, lt);
+}
+
+
+
+namespace
+{
+  struct ColumnPartitionQC  // Checks that each column name is used by at most one operation
+  {
+    map<string/*colName*/,string/*operation*/> name2oper;
+    
+    void add (const StringVector &cols,
+              const string &oper)
+      { // Registers oper for each column of cols; throws if a column already has an operation
+        for (const string& s : cols)
+          if (name2oper [s]. empty ())  // operator[] inserts an empty operation for a new column name
+            name2oper [s] = oper;
+          else
+            throw runtime_error ("Column " + strQuote (s) + " is used for operations " + strQuote (name2oper [s]) + " and " + strQuote (oper)); 
+      }
+  };
+}
+
+
+
+void TextTable::group (const StringVector &by,
+                       const StringVector &sum,
+                       const StringVector &minV,
+                       const StringVector &maxV,
+                       const StringVector &aggr)
+{ // Merges rows equal in the columns `by` into one row (see merge()) and keeps only the columns by, sum, minV, maxV, aggr
+  const Vector<ColNum> byIndex   (columns2nums (by));
+  const Vector<ColNum> sumIndex  (columns2nums (sum));
+  const Vector<ColNum> minIndex  (columns2nums (minV));
+  const Vector<ColNum> maxIndex  (columns2nums (maxV));
+  const Vector<ColNum> aggrIndex (columns2nums (aggr));
+  
+  // QC: a column may be used for one operation only
+  {
+    ColumnPartitionQC cp;
+    cp. add (by, "group by");
+    cp. add (sum, "sum");
+    cp. add (minV, "min");
+    cp. add (maxV, "max");
+    cp. add (aggr, "aggregation");
+  }
+  for (const string& s : sum)
+    if (! header [col2num (s)]. numeric)
+      throw runtime_error ("Summation column " + strQuote (s) + " is not numeric");
+
+  sort (by);  // Rows of the same group become adjacent
+  
+  RowNum i = 0;  // Row of the current group; rows[0..i] are the merged groups found so far
+  FFOR_START (RowNum, j, 1, rows. size ())
+  {
+    ASSERT (i < j);
+    if (rows [i]. same (rows [j], byIndex))
+      merge (i, j, sumIndex, minIndex, maxIndex, aggrIndex);
+    else
+    {
+      i++;  // rows[j] starts a new group, which is stored right after the previous one
+      if (i < j)
+        rows [i] = std::move (rows [j]);
+    }
+  }
+  if (! rows. empty ())
+    i++;  // i = number of groups
+
+  ASSERT (rows. size () >= i);
+  FFOR (RowNum, k, rows. size () - i)  // Remove the rows left behind the groups
+    rows. pop_back ();
+    
+  StringVector newColumns;
+  newColumns << by << sum << minV << maxV << aggr;
+  filterColumns (newColumns);
+}
+
+
+
+void TextTable::merge (RowNum toRowNum,
+                       RowNum fromRowNum,
+                       const Vector<ColNum> &sum,
+                       const Vector<ColNum> &minV,
+                       const Vector<ColNum> &maxV,
+                       const Vector<ColNum> &aggr) 
+{ // Update: rows[toRowNum]: combined with rows[fromRowNum] in the columns sum, minV, maxV, aggr; other columns are unchanged
+  ASSERT (toRowNum < fromRowNum);
+  
+        StringVector& to   = rows [toRowNum];
+  const StringVector& from = rows [fromRowNum];
+
+  for (const ColNum i : sum)
+  {
+    const Header& h = header [i];
+    ASSERT (h. numeric);
+    ostringstream oss;
+    ONumber on (oss, h. decimals, h. scientific);  // Presumably sets the number format of oss for the column - confirm ONumber
+    const string& s1 = to   [i];
+    const string& s2 = from [i];
+    const double d1 = s1. empty () ? 0.0 : stod (s1);  // Empty field counts as 0
+    const double d2 = s2. empty () ? 0.0 : stod (s2);
+    oss << (d1 + d2);
+    to [i] = oss. str ();
+  }
+
+  for (const ColNum i : minV)  // A non-empty field replaces an empty or a greater one
+    if (   to [i]. empty ()
+        || (   ! from [i]. empty ()
+            && compare (to, from, i) == 1
+           )
+       )
+      to [i] = from [i];
+
+  for (const ColNum i : maxV)  // A non-empty field replaces an empty or a smaller one
+    if (   to [i]. empty ()
+        || (   ! from [i]. empty ()
+            && compare (to, from, i) == -1
+           )
+       )
+      to [i] = from [i];
+
+  for (const ColNum i : aggr)  // to[i] becomes the sorted list of distinct values separated by aggr_sep
+  {
+    if (from [i]. empty ())
+      continue;
+    if (contains (from [i], aggr_sep))
+      throw runtime_error ("Cannot aggregate column " + header [i]. name + " for row " + to_string (fromRowNum + 1) + " because it contains " + strQuote (string (1, aggr_sep)));
+    if (to [i]. empty ())
+      to [i] = from [i];
+    else
+    {
+      StringVector vec (to [i], aggr_sep, true);  // Values aggregated so far
+      vec << from [i];
+      vec. sort ();
+      vec. uniq ();
+      to [i] = vec. toString (string (1, aggr_sep));
+    }
+  }
+}
+
+
+
+void TextTable::colNumsRow2values (const Vector<ColNum> &colNums,
+                                   RowNum row_num,
+                                   StringVector &values) const
+{ // Output: values: the fields of row row_num in the columns colNums, in the order of colNums
+  values. clear ();
+  values. reserve (colNums. size ());
+  const StringVector& fields = rows [row_num];
+  for (const ColNum colNum : colNums)
+    values << fields [colNum];
+}
+
+
+
+TextTable::RowNum TextTable::find (const Vector<ColNum> &colNums,
+                                   const StringVector &targetValues,
+                                   RowNum row_num_start) const
+{ // Return: the first row number >= row_num_start whose values in colNums equal targetValues; no_index if there is none
+  ASSERT (colNums. size () == targetValues. size ());
+  ASSERT (row_num_start != no_index);
+  StringVector rowValues;  // buffer refilled for each examined row
+  FOR_START (RowNum, rowNum, row_num_start, rows. size ())
+  {
+    colNumsRow2values (colNums, rowNum, rowValues);
+    if (rowValues == targetValues)
+      return rowNum;
+  }
+  return no_index;
+}
+
+
+
+StringVector TextTable::col2values (ColNum col) const
+{ // Return: the distinct non-empty values of column col, in the iteration order of Set<string>
+  QC_ASSERT (col < header. size ());
+  Set<string> distinct;
+  for (const StringVector& row : rows)
+  {
+    const string& value = row [col];
+    if (! value. empty ())
+      distinct << value;
+  }
+  StringVector result;  result. reserve (distinct. size ());
+  insertAll (result, distinct);
+  return result;
+}
+
+
+
+
+// TextTable::Key
+
+
+TextTable::Key::Key (const TextTable &tab,
+                     const StringVector &columns)
+: colNums (tab. columns2nums (columns))
+{ // Builds data: values of columns in a row -> row number; an empty value or a repeated key throws Error
+  data. rehash (tab. rows. size ());
+  StringVector key;  // buffer refilled for each row
+  FFOR (RowNum, rowNum, tab. rows. size ())
+  {
+    tab. colNumsRow2values (colNums, rowNum, key);
+    for (const string& field : key)
+      if (field. empty ())
+        throw Error (tab, "Empty value in key, in row " + to_string (rowNum + 1));
+    if (data. find (key) != data. end ())
+      throw Error (tab, "Duplicate key " + key. toString (",") + " for the key on " + columns. toString (","));
+    ASSERT (rowNum != no_index);  // no_index is what a failed search returns, it must not be stored
+    data [key] = rowNum;
+  }
+}
+
+
+
+
+// TextTable::Index
+
+
+TextTable::Index::Index (const TextTable &tab,
+                         const StringVector &columns)
+: colNums (tab. columns2nums (columns))
+{ // Builds data: values of columns in a row -> row numbers having these values; empty and repeated values are allowed
+  data. rehash (tab. rows. size ());
+  StringVector key;  // buffer refilled for each row
+  FFOR (RowNum, rowNum, tab. rows. size ())
+  {
+    tab. colNumsRow2values (colNums, rowNum, key);
+    data [key] << rowNum;
+  }
+}
+
+
+
+
+}
+
+
diff --git a/tsv.hpp b/tsv.hpp
index 6e58eba..26af24f 100644
--- a/tsv.hpp
+++ b/tsv.hpp
@@ -48,7 +48,7 @@ namespace Common_sp
 
 struct Date : Root
 {
-  enum Format {fmt_Year, fmt_None};  // not complete list ??
+  enum Format {fmt_Year, fmt_YMD, fmt_None};  // not complete list ??
   short year {0};
   char month {0};
     // 0 .. 12 - 1
@@ -66,6 +66,10 @@ struct Date : Root
     {}
   static bool isYear (short n)
     { return n > 1000 && n < 2500; }  // PAR
+  static bool isMonth  (short n)
+    { return between<short> (n, 0, 12); }  
+  static bool isDay  (short n)
+    { return between<short> (n, 0, 31); }   // Must depend on month ??
   static Date parse (const string &s,
                      Format fmt);
     // Return: !empty() <=> success
@@ -94,9 +98,20 @@ struct Date : Root
              && month == other. month
              && day   == other. day;
     }
-  bool operator<= (const Date &other) const;
+  bool less (const Date &other,
+             bool equal) const;
+  bool operator<= (const Date &other) const
+    { return less (other, true); }
+  bool operator< (const Date &other) const
+    { return less (other, false); }
   Date operator- (const Date &other) const;
     // Requires: other <= *this
+  bool year_divisible () const
+    { return ! month && ! day; }
+  bool quarter_divisible () const
+    { return ! (month % 3) && ! day; }
+  bool month_divisible () const
+    { return ! day; }
 };
   
   
@@ -117,9 +132,11 @@ struct TextTable : Named
     bool scientific {false};
     streamsize decimals {0};
     bool null {false};
+      // = can be empty()
     static constexpr size_t choices_max {7};  // PAR
     Set<string> choices;
       // size() <= choices_max + 1
+    Header () = default;
     explicit Header (const string &name_arg)
       : Named (name_arg)
       {}
@@ -168,8 +185,9 @@ struct TextTable : Named
 
   explicit TextTable (const string &tableFName,
                       const string &columnSynonymsFName = noString);
-    // columnSynonymsFName: syn_format
-    // Top lines starting with '#': comment + header
+    // Input: tableFName: format: [{'#' <comment> <EOL>}* '#'] <header> <EOL> {<row> <EOL>}*
+    //                    empty lines are skipped
+    //        columnSynonymsFName: <syn_format>
     // Rows where number of columns < header size are added empty values
   static constexpr const char* syn_format {"Column synonyms file with the format: {<main synonym> <eol> {<synonym> <eol>}* {<eol>|<eof>}}*"};
   TextTable () = default;
@@ -217,6 +235,7 @@ struct TextTable : Named
     // Date column is not empty and has the same format fmt in all rows
     // Return: no_index <=> not found
     // Output: fmt, valid if return != no_index
+  bool isKey (ColNum colNum) const;
 private:
   int compare (const StringVector& row1,
                const StringVector& row2,
@@ -232,6 +251,7 @@ struct TextTable : Named
               const StringVector &minV,
               const StringVector &maxV,
               const StringVector &aggr);
+    // aggr: slow
     // Invokes: filterColumns(by + sum + aggr)
 private:
   void merge (RowNum toRowNum,
diff --git a/version.txt b/version.txt
index c2320f5..b112f91 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-1.0.20
+1.0.21