Skip to content

Commit

Permalink
PD-5038 --ncleotide_output
Browse files Browse the repository at this point in the history
  • Loading branch information
Vyacheslav Brover committed Jul 15, 2024
1 parent ec5dfe5 commit 9a6edb1
Show file tree
Hide file tree
Showing 9 changed files with 1,127 additions and 21 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
*.o
stxtyper
fasta_check
fasta_extract
stx.prot.*
*.got
8 changes: 6 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ COMPILE.cpp= $(CXX) $(CPPFLAGS) $(SVNREV) $(DBDIR) $(TEST_UPDATE_DB) -c

.PHONY: all clean install release test

BINARIES= stxtyper fasta_check
BINARIES= stxtyper fasta_check fasta_extract
DATABASE= stx.prot

all: $(BINARIES)
Expand All @@ -90,7 +90,7 @@ all: $(BINARIES)
common.o: common.hpp common.inc

stxtyper.o: common.hpp common.inc
stxtyperOBJS=stxtyper.o common.o
stxtyperOBJS=stxtyper.o common.o tsv.o
stxtyper: $(stxtyperOBJS)
$(CXX) -o $@ $(stxtyperOBJS) -pthread $(DBDIR)

Expand All @@ -99,6 +99,10 @@ fasta_checkOBJS=fasta_check.o common.o
fasta_check: $(fasta_checkOBJS)
$(CXX) -o $@ $(fasta_checkOBJS)

fasta_extract.o: common.hpp common.inc
fasta_extractOBJS=fasta_extract.o common.o
fasta_extract: $(fasta_extractOBJS)
$(CXX) -o $@ $(fasta_extractOBJS)

clean:
rm -f *.o
Expand Down
2 changes: 1 addition & 1 deletion common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1710,7 +1710,7 @@ void Xml::TextFile::tagStart (const string &tag)
string tag_ (tag);
replace (tag_, ':', '_');
if (! isIdentifier (tag_, true))
throw runtime_error (FUNC "Bad tag name: " + strQuote (tag));
throw runtime_error (FUNC "Bad textual XML tag name: " + strQuote (tag));

printRaw ("<" + tag + ">");
}
Expand Down
22 changes: 14 additions & 8 deletions common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ void errorExitStr (const string &msg);

void beep ();
// Requires: !isRedirected()
// SHLVL = 1 ??



Expand Down Expand Up @@ -744,9 +745,11 @@ inline void trim (string &s)

void trimLeading (string &s,
char c);
// Invokes: isSpace()

void trimTrailing (string &s,
char c);
// Invokes: isSpace()

inline void trim (string &s,
char c)
Expand Down Expand Up @@ -1295,13 +1298,13 @@ void copyText (const string &inFName,
#ifndef _MSC_VER
inline void moveFile (const string &from,
const string &to)
{ if (::rename (from. c_str (), to. c_str ()))
throw runtime_error ("Cannot move file + " + shellQuote (from) + " to " + shellQuote (to));
{ if (const int code = ::rename (from. c_str (), to. c_str ()))
throw runtime_error ("Cannot move file + " + shellQuote (from) + " to " + shellQuote (to) + " (" + to_string (code) + ")");
}

inline void removeFile (const string &fName)
{ if (::remove (fName. c_str ()))
throw runtime_error ("Cannot remove file + " + shellQuote (fName));
{ if (const int code = ::remove (fName. c_str ()))
throw runtime_error ("Cannot remove file + " + shellQuote (fName) + " (" + to_string (code) + ")");
}


Expand Down Expand Up @@ -1892,6 +1895,7 @@ struct Xml


struct TextFile : File
// Tag::name: idenifier with possible '-'
{
private:
struct XmlStream : OFStream
Expand Down Expand Up @@ -1925,7 +1929,7 @@ struct Xml
// <Data> ::= <nameIndex> <Data>* 0 0 <text> 0
// <nameIndex> ::= <byte> <byte>
// Number of different Tag::name's <= 2^16
// Tag::name has no: '\0', '\n'
// Tag::name: no '\0', '\n'
{
private:
ofstream os;
Expand Down Expand Up @@ -3913,8 +3917,9 @@ struct TokenInput : Root


[[noreturn]] void error (const Token &wrongToken,
const string &expected) const
{ throw TextPos::Error (wrongToken. tp, expected, true); }
const string &what,
bool expected = true) const
{ throw TextPos::Error (wrongToken. tp, what, expected); }
[[noreturn]] void error (const string &what,
bool expected = true) const
{ ci. error (what, expected); }
Expand Down Expand Up @@ -3948,7 +3953,8 @@ struct TokenInput : Root
error (t, Token::type2str (Token::eDouble) + " " + toString (expected));
}
void get (char expected)
{ if (getNextChar (false) != expected)
{ const Token t (get ());
if (! t. isDelimiter (expected))
error (Token::type2str (Token::eDelimiter) + " " + strQuote (toString (expected), '\''));
}
void setLast (Token &&t)
Expand Down
267 changes: 267 additions & 0 deletions fasta_extract.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
// fasta_check.cpp

/*===========================================================================
*
* PUBLIC DOMAIN NOTICE
* National Center for Biotechnology Information
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
* the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
*
* Although all reasonable efforts have been taken to ensure the accuracy
* and reliability of the software and data, the NLM and the U.S.
* Government do not and cannot warrant the performance or results that
* may be obtained by using this software or data. The NLM and the U.S.
* Government disclaim all warranties, express or implied, including
* warranties of performance, merchantability or fitness for any particular
* purpose.
*
* Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
* Author: Vyacheslav Brover
*
* File Description:
* Extract sequences out of a FASTA file
*
*/


#undef NDEBUG

#include "common.hpp"
using namespace Common_sp;

#include "common.inc"



namespace
{



struct Segment
// not circular
{
size_t start {0};
size_t stop {0};
bool strand {true};
// false <=> negative
string genesymbol;
string name;


bool isDna () const
{ return stop; }
size_t size () const
{ return stop - start; }
void saveText (ostream &os) const
{ os << start
<< '\t' << stop
<< '\t' << strand
<< '\t' << genesymbol
<< '\t' << name
<< endl;
}
};



char complementaryNucleotide (char wildNucleotide)
{
char r = ' ';
switch (toLower (wildNucleotide))
{
case 'a': r = 't'; break;
case 'c': r = 'g'; break;
case 'g': r = 'c'; break;
case 't': r = 'a'; break;
case 'm': r = 'k'; break;
case 'r': r = 'y'; break;
case 'w': r = 'w'; break;
case 's': r = 's'; break;
case 'y': r = 'r'; break;
case 'k': r = 'm'; break;
case 'v': r = 'b'; break;
case 'h': r = 'd'; break;
case 'd': r = 'h'; break;
case 'b': r = 'v'; break;
case 'n': r = 'n'; break;
case '-': r = '-'; break;
default:
throw runtime_error ("Bad nucleotide " + to_string (wildNucleotide));
}
if (isupper (wildNucleotide))
r = toUpper (r);

return r;
}



bool process (const string &id,
string &seq,
const map<string/*id*/,Vector<Segment>> &id2segments)
{
if (id. empty ())
return false;
const Vector<Segment>* segments = findPtr (id2segments, id);
if (! segments)
return false;

replaceStr (seq, "-", "");
QC_ASSERT (! seq. empty ());

for (Segment& seg : var_cast (*segments))
{
cout << '>' << id;
if (seg. isDna ())
{
QC_ASSERT (seg. start <= seq. size ());
minimize (seg. stop, seq. size ());
QC_ASSERT (seg. start < seg. stop);
cout << ':' << seg. start + 1 << '-' << seg. stop << ' ' << "strand:" << (seg. strand ? '+' : '-');
}
cout << ' ' << seg. genesymbol << ' ' << seg. name << endl;
string seq1 (seq);
if (seg. isDna ())
{
ASSERT (seg. stop <= seq1. size ());
seq1 = seq1. substr (seg. start, seg. size ());
if (! seg. strand)
{
reverse (seq1);
for (char &c : seq1)
c = complementaryNucleotide (c);
}
//strLower (seq1); // Letter case can indicate nucleotide quality
}
//else
//strUpper (seq1);
constexpr size_t line_len = 60; // PAR
for (size_t i = 0; i < seq1. size (); i += line_len)
cout << seq1. substr (i, line_len) << endl;
}

return true;
}



struct ThisApplication : Application
{
ThisApplication ()
: Application ("Extract sequences out of a FASTA file")
{
addPositional ("fasta", "FASTA file");
addPositional ("target", "Target identifiers in the FASTA file to extract.\n\
Line format for amino acid sequences : <id> <gene symbol> <product name>\n\
Line format for nucleotide sequences : <id> <start (>=1)> <stop (>= start)> <strand (+/-)> <gene symbol> <product name>\
");
addFlag ("aa", "Amino acid sequenes, otherwise nucleotide");
version = SVN_REV;
}



void body () const final
{
const string fName = getArg ("fasta");
const string targetFName = getArg ("target");
const bool aa = getFlag ("aa");


map<string/*id*/,Vector<Segment>> id2segments;
{
LineInput f (targetFName);
string id;
Istringstream iss;
while (f. nextLine ())
{
iss. reset (f. line);
Segment seg;
iss >> id;
if (! aa)
{
char strand = '\0';
iss >> seg. start >> seg. stop >> strand;
QC_ASSERT (seg. start);
QC_ASSERT (seg. start <= seg. stop);
seg. start--;
QC_ASSERT ( strand == '+'
|| strand == '-'
);
seg. strand = (strand == '+');
}
iss >> seg. genesymbol;
seg. name = f. line. substr ((size_t) iss. tellg ());
trim (seg. name);
QC_ASSERT (aa == ! seg. isDna ());
id2segments [id] << std::move (seg);
}
}
if (verbose ())
for (const auto& it : id2segments)
{
cout << it. first << ": " << endl;
for (const Segment& seg : it. second)
{
cout << " ";
seg. saveText (cout);
}
}
if (id2segments. empty ())
return;


size_t processed = 0;
{
LineInput f (fName);
string id;
string seq;
while (f. nextLine ())
{
trimTrailing (f. line);
if (f. line. empty ())
continue;
if (f. line [0] == '>')
{
processed += process (id, seq, id2segments);
size_t pos = 1;
while (pos < f. line. size () && ! isspace (f. line [pos]))
pos++;
id = f. line. substr (1, pos - 1);
seq. clear ();
}
else
seq += f. line;
}
processed += process (id, seq, id2segments);
}
if (processed != id2segments. size ())
throw runtime_error ("Requested identifiers: " + to_string (id2segments. size ()) + ", but processed: " + to_string (processed));
// Assumed: no duplicate identifiers in FASTA
}
};



} // namespace



int main (int argc,
const char* argv[])
{
ThisApplication app;
return app. run (argc, argv);
}



Loading

0 comments on commit 9a6edb1

Please sign in to comment.