From 061e1d4310cd0ff3e1efacd5a6898d6dccf11a3f Mon Sep 17 00:00:00 2001 From: Maxim Koltsov <kolmax94@gmail.com> Date: Tue, 4 Oct 2022 10:54:34 +0300 Subject: [PATCH] version 0.1.5.0 rewrite FASTA parser to Megaparsec (#67) Co-authored-by: Grigoriy <mikheevg@Grigoriys-MacBook-Pro.local> Co-authored-by: Grigoriy Mikheev <grigoriymihtt@gmail.com> Co-authored-by: Grigoriy Mikheev <mikheevg.@biocad.ru> --- ChangeLog.md | 4 + default.nix | 12 ++ package.yaml | 3 +- shell.nix | 1 + src/Bio/FASTA.hs | 7 +- src/Bio/FASTA/Parser.hs | 259 +++++++++++++++++++++------------------- src/Bio/FASTA/Type.hs | 12 +- test/FASTA/order1.fasta | 14 +++ test/FASTA/order2.fasta | 5 + test/FASTA/order3.fasta | 2 + test/FASTA/order4.fasta | 27 +++++ test/FASTA/order5.fasta | 9 ++ test/FASTA/order6.fasta | 26 ++++ test/FASTA/order7.fasta | 11 ++ test/FASTA/order8.fasta | 22 ++++ test/FASTASpec.hs | 86 +++++++++---- test/FastaParserSpec.hs | 70 +++++++---- 17 files changed, 394 insertions(+), 176 deletions(-) create mode 100644 default.nix create mode 100644 shell.nix create mode 100644 test/FASTA/order1.fasta create mode 100644 test/FASTA/order2.fasta create mode 100644 test/FASTA/order3.fasta create mode 100644 test/FASTA/order4.fasta create mode 100644 test/FASTA/order5.fasta create mode 100644 test/FASTA/order6.fasta create mode 100644 test/FASTA/order7.fasta create mode 100644 test/FASTA/order8.fasta diff --git a/ChangeLog.md b/ChangeLog.md index 841c1a7..3cec239 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -2,6 +2,10 @@ ## [Unreleased] +## [0.1.5.0] - 2022-09-30 +### Changed +- Update FASTA parser to megaparsec. 
+ ## [0.1.4.4] - 2022-06-02 ### Changed - Update more dependencies; diff --git a/default.nix b/default.nix new file mode 100644 index 0000000..314ae3b --- /dev/null +++ b/default.nix @@ -0,0 +1,12 @@ +let + bcd-lts = import (builtins.fetchGit { + url = "git@github.com:biocad/nix-lts.git"; + ref = "master"; + }); +in +bcd-lts.mkBiocadProject { + src = bcd-lts.pkgs.haskell-nix.haskellLib.cleanGit { name = "cobot-io"; src = ./.; }; + shellArgs = { + buildInputs = [ bcd-lts.pkgs.RNA ]; + }; +} diff --git a/package.yaml b/package.yaml index 0d4cde8..21dd9d0 100644 --- a/package.yaml +++ b/package.yaml @@ -1,5 +1,5 @@ name: cobot-io -version: 0.1.4.4 +version: 0.1.5.0 github: "biocad/cobot-io" license: BSD3 category: Bio @@ -41,6 +41,7 @@ dependencies: - containers >= 0.5.7.1 && < 0.7 - data-msgpack >= 0.0.9 && < 0.1 - deepseq >= 1.4 && < 1.5 +- filepath - http-conduit >= 2.3 && < 2.4 - hyraxAbif >= 0.2.3.27 && < 0.2.4.0 - lens >= 4.16 && < 5.2 diff --git a/shell.nix b/shell.nix new file mode 100644 index 0000000..76918f6 --- /dev/null +++ b/shell.nix @@ -0,0 +1 @@ +(import ./default.nix).shellFor diff --git a/src/Bio/FASTA.hs b/src/Bio/FASTA.hs index d9dfbba..0c2ff4b 100644 --- a/src/Bio/FASTA.hs +++ b/src/Bio/FASTA.hs @@ -6,14 +6,15 @@ module Bio.FASTA , fromFile , toFile , fastaP - , fastaPGeneric , fastaLine , modificationP + , Parser ) where import Control.Monad.IO.Class (MonadIO, liftIO) -import Data.Attoparsec.Text (parseOnly) import Data.Text.IO (readFile, writeFile) +import System.FilePath (takeBaseName) +import Text.Megaparsec (errorBundlePretty, parse) #if !MIN_VERSION_base(4,13,0) import Control.Monad.Fail (MonadFail (..)) import Prelude hiding (fail, readFile, writeFile) @@ -28,7 +29,7 @@ import Bio.FASTA.Writer (WritableFastaToken (..), fastaToText) -- | Reads 'FastaSequence' from given file. -- fromFile :: (MonadFail m, MonadIO m) => FilePath -> m (Fasta Char) -fromFile f = liftIO (readFile f) >>= either fail pure . 
parseOnly fastaP +fromFile f = liftIO (readFile f) >>= either (fail . errorBundlePretty) pure . parse fastaP (takeBaseName f) -- | Writes 'FastaSequence' to file. -- diff --git a/src/Bio/FASTA/Parser.hs b/src/Bio/FASTA/Parser.hs index a5aa3a1..4eb7cc0 100644 --- a/src/Bio/FASTA/Parser.hs +++ b/src/Bio/FASTA/Parser.hs @@ -2,151 +2,168 @@ module Bio.FASTA.Parser ( fastaP - , fastaPGeneric - , fastaLine + , fastaLine + , parseOnly , modificationP + , fastaPGeneric + , Parser ) where -import Bio.FASTA.Type (Fasta, FastaItem (..), ModItem (..), Modification (..), - ParsableFastaToken (..)) -import Bio.Sequence (BareSequence, bareSequence) -import Control.Applicative ((<|>)) -import Data.Attoparsec.Text (Parser, char, choice, endOfInput, endOfLine, many', many1', satisfy, - skipWhile, space, string, takeWhile, try) -import Data.Char (isAlphaNum, isLetter, isSpace) -import Data.Text (Text, strip) -import Prelude hiding (takeWhile) +import Bio.FASTA.Type (Fasta, FastaItem (..), + ModItem (..), Modification (..), + ParsableFastaToken (..)) +import Bio.Sequence (BareSequence, bareSequence) +import Data.Bifunctor (first) +import Data.Char (isLetter) +import Data.Functor (void, ($>)) +import Data.Text (Text, pack, strip) +import Data.Void (Void) +import Text.Megaparsec +import Text.Megaparsec.Char +import qualified Text.Megaparsec.Char.Lexer as L instance ParsableFastaToken Char where - parseToken = satisfy + parseToken p = satisfy p <?> "letter" instance ParsableFastaToken ModItem where - parseToken predicate = (Mod <$> modificationP) <|> (Letter <$> satisfy predicate) + parseToken p = (Mod <$> modificationP <?> "fasta item modification") <|> (Letter <$> satisfy p <?> "letter") +type Parser = Parsec Void Text -- | Parser of .fasta file. 
-- + +parseOnly :: Parsec Void Text a -> Text -> Either String a +parseOnly p s = first errorBundlePretty $ parse p "input.fasta" s + +sc :: Parser () +sc = L.space hspace1 empty empty + +lexeme :: Parser a -> Parser a +lexeme = L.lexeme sc + +symbol :: Text -> Parser Text +symbol = L.symbol sc + fastaP :: ParsableFastaToken a => Parser (Fasta a) -fastaP = many' space *> fastaPGeneric isLetter +fastaP = hidden space *> many (item isLetter) <* hidden space <* eof fastaPGeneric :: ParsableFastaToken a => (Char -> Bool) -> Parser (Fasta a) -fastaPGeneric = many' . item +fastaPGeneric p = many (item p) <* hidden space <* eof item :: ParsableFastaToken a => (Char -> Bool) -> Parser (FastaItem a) -item predicate = (FastaItem <$> seqName <*> fastaSeq predicate) <* skipWhile isSpace +item p = + FastaItem + <$> seqName + <*> (fastaSeq p <?> "sequence") seqName :: Parser Text -seqName = strip <$> (char '>' *> tabs *> takeWhile (`notElem` ['\n', '\r']) <* tabs <* eol) +seqName = strip . pack <$> (symbol ">" *> (manyTill anySingle myEnd <?> "sequence name")) fastaSeq :: ParsableFastaToken a => (Char -> Bool) -> Parser (BareSequence a) -fastaSeq predicate = bareSequence . mconcat <$> many' (fastaLine predicate) +fastaSeq p = bareSequence . 
concat <$> many (fastaLine p) <* hidden space fastaLine :: ParsableFastaToken a => (Char -> Bool) -> Parser [a] -fastaLine predicate = concat <$> many1' (many1' (parseToken predicate) <* many' (char ' ')) <* eol - -eol :: Parser () -eol = tabs *> choice [slashN, endOfInput] - -slashN :: Parser () -slashN = () <$ many1' endOfLine +fastaLine p = concat <$> some (some (parseToken p) <* hidden hspace) <* myEnd -tabs :: Parser () -tabs = () <$ many' (char '\t') +myEnd :: Parser () +myEnd = void (some eol) <|> eof modificationP :: Parser Modification -modificationP - = string "[A*]" *> pure Mod_A_Star - <|> string "[C*]" *> pure Mod_C_Star - <|> string "[G*]" *> pure Mod_G_Star - <|> string "[T*]" *> pure Mod_T_Star - <|> string "[rA]" *> pure Mod_rA - <|> string "[rC]" *> pure Mod_rC - <|> string "[rG]" *> pure Mod_rG - <|> string "[rU]" *> pure Mod_rU - <|> string "[+A]" *> pure Mod_Plus_A - <|> string "[+C]" *> pure Mod_Plus_C - <|> string "[+G]" *> pure Mod_Plus_G - <|> string "[+T]" *> pure Mod_Plus_T - <|> string "[rAf]" *> pure Mod_rAf - <|> string "[rCf]" *> pure Mod_rCf - <|> string "[rGf]" *> pure Mod_rGf - <|> string "[rUf]" *> pure Mod_rUf - <|> string "[mA]" *> pure Mod_mA - <|> string "[mC]" *> pure Mod_mC - <|> string "[mG]" *> pure Mod_mG - <|> string "[mU]" *> pure Mod_mU - <|> string "[mA*]" *> pure Mod_mA_Star - <|> string "[mC*]" *> pure Mod_mC_Star - <|> string "[mG*]" *> pure Mod_mG_Star - <|> string "[mU*]" *> pure Mod_mU_Star - <|> string "[dU]" *> pure Mod_dU - <|> string "[5Bio]" *> pure Mod_5Bio - <|> string "[iBio]" *> pure Mod_iBio - <|> string "[56FAM]" *> pure Mod_56FAM - <|> string "[36FAM]" *> pure Mod_36FAM - <|> string "[5HEX]" *> pure Mod_5HEX - <|> string "[5TMR]" *> pure Mod_5TMR - <|> string "[3BHQ1]" *> pure Mod_3BHQ1 - <|> string "[3BHQ2]" *> pure Mod_3BHQ2 - <|> string "[5NH2]" *> pure Mod_5NH2 - <|> string "[3NH2]" *> pure Mod_3NH2 - <|> string "[5PO4]" *> pure Mod_5PO4 - <|> string "[3PO4]" *> pure Mod_3PO4 - <|> string 
"[3BioTEG]" *> pure Mod_3BioTEG - <|> string "[C12]" *> pure Mod_C12 - <|> string "[NHSdT]" *> pure Mod_NHSdT - <|> string "[5Mal]" *> pure Mod_5Mal - <|> string "[5thio]" *> pure Mod_5thio - <|> string "[3thio]" *> pure Mod_3thio - <|> string "[3azide]" *> pure Mod_3azide - <|> string "[3alkine]" *> pure Mod_3alkine - <|> string "[5CholTEG]" *> pure Mod_5CholTEG - <|> string "[3CholTEG]" *> pure Mod_3CholTEG - <|> string "[5C10]" *> pure Mod_5C10 - <|> string "[5Alk]" *> pure Mod_5Alk - <|> string "[GC]" *> pure Mod_GC - <|> string "[GT]" *> pure Mod_GT - <|> string "[AT]" *> pure Mod_AT - <|> string "[TG]" *> pure Mod_TG - <|> string "[AC]" *> pure Mod_AC - <|> string "[CC]" *> pure Mod_CC - <|> string "[AA]" *> pure Mod_AA - <|> string "[TC]" *> pure Mod_TC - <|> string "[TT]" *> pure Mod_TT - <|> string "[CG]" *> pure Mod_CG - <|> string "[GG]" *> pure Mod_GG - <|> string "[AG]" *> pure Mod_AG - <|> string "[GA]" *> pure Mod_GA - <|> string "[CA]" *> pure Mod_CA - <|> string "[CT]" *> pure Mod_CT - <|> string "[TA]" *> pure Mod_TA - <|> string "[AAA]" *> pure Mod_AAA - <|> string "[AAC]" *> pure Mod_AAC - <|> string "[ACT]" *> pure Mod_ACT - <|> string "[ATC]" *> pure Mod_ATC - <|> string "[ATG]" *> pure Mod_ATG - <|> string "[CAG]" *> pure Mod_CAG - <|> string "[AGA]" *> pure Mod_AGA - <|> string "[CAT]" *> pure Mod_CAT - <|> string "[CCG]" *> pure Mod_CCG - <|> string "[CGT]" *> pure Mod_CGT - <|> string "[CTG]" *> pure Mod_CTG - <|> string "[GAA]" *> pure Mod_GAA - <|> string "[GAC]" *> pure Mod_GAC - <|> string "[GCT]" *> pure Mod_GCT - <|> string "[GGT]" *> pure Mod_GGT - <|> string "[GTT]" *> pure Mod_GTT - <|> string "[TAC]" *> pure Mod_TAC - <|> string "[TCT]" *> pure Mod_TCT - <|> string "[TGC]" *> pure Mod_TGC - <|> string "[TGG]" *> pure Mod_TGG - <|> string "[TTC]" *> pure Mod_TTC - <|> string "[TTT]" *> pure Mod_TTT - <|> unknownP +modificationP + = choice + [ string "[A*]" $> Mod_A_Star + , string "[C*]" $> Mod_C_Star + , string "[G*]" $> 
Mod_G_Star + , string "[T*]" $> Mod_T_Star + , string "[rA]" $> Mod_rA + , string "[rC]" $> Mod_rC + , string "[rG]" $> Mod_rG + , string "[rU]" $> Mod_rU + , string "[+A]" $> Mod_Plus_A + , string "[+C]" $> Mod_Plus_C + , string "[+G]" $> Mod_Plus_G + , string "[+T]" $> Mod_Plus_T + , string "[rAf]" $> Mod_rAf + , string "[rCf]" $> Mod_rCf + , string "[rGf]" $> Mod_rGf + , string "[rUf]" $> Mod_rUf + , string "[mA]" $> Mod_mA + , string "[mC]" $> Mod_mC + , string "[mG]" $> Mod_mG + , string "[mU]" $> Mod_mU + , string "[mA*]" $> Mod_mA_Star + , string "[mC*]" $> Mod_mC_Star + , string "[mG*]" $> Mod_mG_Star + , string "[mU*]" $> Mod_mU_Star + , string "[dU]" $> Mod_dU + , string "[5Bio]" $> Mod_5Bio + , string "[iBio]" $> Mod_iBio + , string "[56FAM]" $> Mod_56FAM + , string "[36FAM]" $> Mod_36FAM + , string "[5HEX]" $> Mod_5HEX + , string "[5TMR]" $> Mod_5TMR + , string "[3BHQ1]" $> Mod_3BHQ1 + , string "[3BHQ2]" $> Mod_3BHQ2 + , string "[5NH2]" $> Mod_5NH2 + , string "[3NH2]" $> Mod_3NH2 + , string "[5PO4]" $> Mod_5PO4 + , string "[3PO4]" $> Mod_3PO4 + , string "[3BioTEG]" $> Mod_3BioTEG + , string "[C12]" $> Mod_C12 + , string "[NHSdT]" $> Mod_NHSdT + , string "[5Mal]" $> Mod_5Mal + , string "[5thio]" $> Mod_5thio + , string "[3thio]" $> Mod_3thio + , string "[3azide]" $> Mod_3azide + , string "[3alkine]" $> Mod_3alkine + , string "[5CholTEG]" $> Mod_5CholTEG + , string "[3CholTEG]" $> Mod_3CholTEG + , string "[5C10]" $> Mod_5C10 + , string "[5Alk]" $> Mod_5Alk + , string "[GC]" $> Mod_GC + , string "[GT]" $> Mod_GT + , string "[AT]" $> Mod_AT + , string "[TG]" $> Mod_TG + , string "[AC]" $> Mod_AC + , string "[CC]" $> Mod_CC + , string "[AA]" $> Mod_AA + , string "[TC]" $> Mod_TC + , string "[TT]" $> Mod_TT + , string "[CG]" $> Mod_CG + , string "[GG]" $> Mod_GG + , string "[AG]" $> Mod_AG + , string "[GA]" $> Mod_GA + , string "[CA]" $> Mod_CA + , string "[CT]" $> Mod_CT + , string "[TA]" $> Mod_TA + , string "[AAA]" $> Mod_AAA + , string "[AAC]" $> Mod_AAC 
+ , string "[ACT]" $> Mod_ACT + , string "[ATC]" $> Mod_ATC + , string "[ATG]" $> Mod_ATG + , string "[CAG]" $> Mod_CAG + , string "[AGA]" $> Mod_AGA + , string "[CAT]" $> Mod_CAT + , string "[CCG]" $> Mod_CCG + , string "[CGT]" $> Mod_CGT + , string "[CTG]" $> Mod_CTG + , string "[GAA]" $> Mod_GAA + , string "[GAC]" $> Mod_GAC + , string "[GCT]" $> Mod_GCT + , string "[GGT]" $> Mod_GGT + , string "[GTT]" $> Mod_GTT + , string "[TAC]" $> Mod_TAC + , string "[TCT]" $> Mod_TCT + , string "[TGC]" $> Mod_TGC + , string "[TGG]" $> Mod_TGG + , string "[TTC]" $> Mod_TTC + , string "[TTT]" $> Mod_TTT + , unknownP + ] unknownP :: Parser Modification -unknownP = try $ do - _ <- char '[' - m <- many1' $ satisfy (\c -> isAlphaNum c || c `elem` ['+', '-', '*', '_']) - _ <- char ']' - pure $ Unknown ("[" <> m <> "]") +unknownP = do + res <- between (symbol "[") (symbol "]") + (lexeme (some (alphaNumChar <|> choice (char <$> ['+', '-', '*', '_'])) <?> "modification name")) + pure $ Unknown ("[" <> res <> "]") diff --git a/src/Bio/FASTA/Type.hs b/src/Bio/FASTA/Type.hs index bb5a86f..f599e5c 100644 --- a/src/Bio/FASTA/Type.hs +++ b/src/Bio/FASTA/Type.hs @@ -7,10 +7,11 @@ module Bio.FASTA.Type , modificationToString ) where -import Bio.Sequence (BareSequence) -import Data.Attoparsec.Text (Parser) -import Data.Text (Text) -import GHC.Generics (Generic) +import Bio.Sequence (BareSequence) +import Data.Text (Text) +import Data.Void +import GHC.Generics (Generic) +import Text.Megaparsec -- | Type alias for FASTA file. 
-- satisfies the following format : >(\s|\t)*[^\n\r]+(\s|\t)*(\n|\r)*((\w|\s)(\n|\r)*)* @@ -29,7 +30,7 @@ data FastaItem a deriving (Eq, Show, Functor) class ParsableFastaToken a where - parseToken :: (Char -> Bool) -> Parser a + parseToken :: (Char -> Bool) -> Parsec Void Text a data ModItem = Mod Modification @@ -216,4 +217,3 @@ modificationToString Mod_TGG = "[TGG]" modificationToString Mod_TTC = "[TTC]" modificationToString Mod_TTT = "[TTT]" modificationToString (Unknown s) = s - diff --git a/test/FASTA/order1.fasta b/test/FASTA/order1.fasta new file mode 100644 index 0000000..927b811 --- /dev/null +++ b/test/FASTA/order1.fasta @@ -0,0 +1,14 @@ +>3HMX:A|PDBID|CHAIN|SEQUENCE +IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSE +VLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL + +>7HMX:A|PDBID|CHAIN|SEQUENCE +EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE +VLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL + +> With_spaces +MDFFDLDIEI KQERLPAECS LNSPLNYSLS AQLTDRMTPR TENVRRQRER +MDFFDLDIEI KQERLPAECS LNSPLNYSLS AQLTDRMTPR TENVRRQRER +MDFFDLDIEI KQERLPAECS LNSPLNYSLS AQLTDRMTPR TENVRRQRER + +> Empty_ha_ha_ha diff --git a/test/FASTA/order2.fasta b/test/FASTA/order2.fasta new file mode 100644 index 0000000..3f70512 --- /dev/null +++ b/test/FASTA/order2.fasta @@ -0,0 +1,5 @@ +>Sample_name1 +ACGT....TCG + +>Sample_name2 +GTCA....TGC \ No newline at end of file diff --git a/test/FASTA/order3.fasta b/test/FASTA/order3.fasta new file mode 100644 index 0000000..e690170 --- /dev/null +++ b/test/FASTA/order3.fasta @@ -0,0 +1,2 @@ +>N-His-E4Orf6-7-R2(115) +TGATGGTGATGGTGATGcatGTGGTAAACTCGACTTTCACTTTTCTCTATCACTGATAGGGAGTGGTAAACTCGACTTTCACTTTTCTCTATCACTGATAGGGAaacagtcagcc \ No newline at end of file diff --git a/test/FASTA/order4.fasta b/test/FASTA/order4.fasta new file mode 100644 index 0000000..38de7ad --- /dev/null +++ b/test/FASTA/order4.fasta @@ -0,0 +1,27 @@ +>CMV-Lox2272-HindIII_R +aatcatAAGCTTataacttcgtataaagtatcctatacgaagttatagctctgcttatatagacctcccacc + + +HindIII-BFP_F +aagttatAAGCTTatgagcgagctgattaaggagaacatgc + 
+>sPA-lox2272_R +taatTGGCTAGCATataacttcgtataaagtatcctatacgaagttatgctgcatcacacaaaaaaccaacac + +>NheI-GFP_F +ttatATGCTAGCCAatggtgagcaagggcgagg + +>NotI-pMP_F +tataatGCGGCCGCAGGTGGCac + +>CMV-LoxP-HindIII_R +attaatAAGCTTataacttcgtataatgtatgctatacgaagttatagctctgcttatatagacctcccacc + +>sPA-NotI_R +aattaaTGCGGCCGCgctgcatcacacaaaaaaccaacacac + +sPA-LoxP-NheI_R +taatTGGCTAGCATataacttcgtataatgtatgctatacgaagttatgctgcatcacacaaaaaaccaacac + + + \ No newline at end of file diff --git a/test/FASTA/order5.fasta b/test/FASTA/order5.fasta new file mode 100644 index 0000000..5979123 --- /dev/null +++ b/test/FASTA/order5.fasta @@ -0,0 +1,9 @@ +>qCHO49 F +TGGAGAGATGGCTCGAGGTT + + + + + +qCHO R +TGGTTGCTGGGAATTGAACTC diff --git a/test/FASTA/order6.fasta b/test/FASTA/order6.fasta new file mode 100644 index 0000000..156488b --- /dev/null +++ b/test/FASTA/order6.fasta @@ -0,0 +1,26 @@ +>CMV-Lox2272-HindIII_R +aatcatAAGCTTataacttcgtataaagtatcctatacgaagttatagctctgcttatatagacctcccacc + +>HindIII-BFP_F +aagttatAAGCTTatgagcgagctgattaaggagaacatgc + +>sPA-lox2272_R +taatTGGCTAGCATataacttcgtataaagtatcctatacgaagttatgctgcatcacacaaaaaaccaacac + +>NheI-GFP_F +ttatATGCTAGCCAatggtgagcaagggcgagg + +>NotI-pMP_F +tataatGCGGCCGCAGGTGGCac + +>CMV-LoxP-HindIII_R +attaatAAGCTTataacttcgtataatgtatgctatacgaagttatagctctgcttatatagacctcccacc + +>sPA-NotI_R +aattaaTGCGGCCGCgctgcatcacacaaaaaaccaacacac + +sPA-LoxP-NheI_R +taatTGGCTAGCATataacttcgtataatgtatgctatacgaagttatgctgcatcacacaaaaaaccaacac + + + \ No newline at end of file diff --git a/test/FASTA/order7.fasta b/test/FASTA/order7.fasta new file mode 100644 index 0000000..0a097f5 --- /dev/null +++ b/test/FASTA/order7.fasta @@ -0,0 +1,11 @@ +>GB_F +5’-CTTCAAGAGAGAGACCTGCGT-3’ + +>GB_R +5’-GATGTTGTTGGCCACCTCG-3’ + +>F8_GB20_F +GCTACACCTTCAAGCACA + +>F8_GB20_R +GGGTTCTCCATGCTCA diff --git a/test/FASTA/order8.fasta b/test/FASTA/order8.fasta new file mode 100644 index 0000000..f0fe91c --- /dev/null +++ b/test/FASTA/order8.fasta @@ -0,0 +1,22 @@ +>Ampl_prcTnT_del 
+tttttACGCGTtaatagtaatcaattacggggtcattagttcatagcccatatatggagttccggctgccttatcagcgtctcgggcactcacgtatctccgtccgacgggtttaaaatagcaaaactctgagcgctgctgccaaaatagcagctcacaagtgttgcattcctctctgggcgccgggcacattcctgctggctctgcccgccccccatatatggagttccgcgttacataacttacggtaaatgg +>Ampl_MHCK7-1 +tttttACGCGTtaatagtaatcaattacggggtcattagttcatagcccatatatggagttccgccttcagattaaaaataactgaggtaagggcctgggtaggggaggtggtgtgagacgctcctgtctctcctctatctgcccatcggccctttggggaggaggaatgtgcccaaggactaaaaaaaggccatggagccagaggggcgagggcaacagacctttcatgggcaaaccttggggccctgctgtctagcatgcccc +>Ampl_MHCK7-2 +accttggggccctgctgtctagcatgccccactacgggtctaggctgcccatgtaaggaggcaaggcctggggacacccgagatgcctggttataattaacccagacatgtggctgcccccccccccccaacacctgctgcctctaaaaataaccctgtccctggtggatcccctgcatgcgaagatcttcgaaccatatatggagttccgcgttacataacttacggtaaatgg +>Amplicon1_MH +tttttACGCGTtaatagtaatcaattacggggtcattagttcatagcccatatatggagttccgGTGCTGTCAGCCTTCCTTGACACCTCTGTCTCCTCAGGTGCCTGGCTCCCAGTCCCCAGAACGCCTCTCCTGTACCTTGCTTCCTAGCTGGGCCTTTCCTTCTCCTCTATAAATACCAGCTCTGGTATTTCGCCTTGGCAGCTGTagcagccactacgggtctaggctgcccatgtaaggaggcaaggcctgggga +>Amplicon2_MH +gctgcccatgtaaggaggcaaggcctggggacacccgagatgcctggttataattaacccagacatgtggctgcccccccccccccaacacctgctgcctgagcctcacccccaccccggtgcctgggtcttaggctctgtacaccatggaggagaagctcgctctaaaaataaccctgtcccccatatatggagttccgcgttacataacttacggtaaatgg +>Ampl_MHCK7-1 +tttttACGCGTtaatagtaatcaattacggggtcattagttcatagcccatatatggagttccgccttcagattaaaaataactgaggtaagggcctgggtaggggaggtggtgtgagacgctcctgtctctcctctatctgcccatcggccctttggggaggaggaatgtgcccaaggactaaaaaaaggccatggagccagaggggcgagggcaacagacctttcatgggcaaaccttggggccctgctgtctagcatgcccc +>Ampl_MHCK7-2 +accttggggccctgctgtctagcatgccccactacgggtctaggctgcccatgtaaggaggcaaggcctggggacacccgagatgcctggttataattaacccagacatgtggctgcccccccccccccaacacctgctgcctctaaaaataaccctgtccctggtggatcccctgcatgcgaagatcttcgaaccatatatggagttccgcgttacataacttacggtaaatgg +>CMV + enhMH-1 
+GTGCTGTCAGCCTTCCTTGACACCTCTGTCTCCTCAGGTGCCTGGCTCCCAGTCCCCAGAACGCCTCTCCTGTACCTTGCTTCCTAGCTGGGCCTTTCCTTCTCCTCTATAAATACCAGCTCTGGTATTTCGCCTTGGCAGCTGTagcagccactacgggtctaggctgcccatgtaaggaggcaaggcctggggacacccgagatgcctggttataattaacccagacatgtggctgcccccccccccccaacacctgctgcctgagcctcacccccaccccggtgcctgggtcttaggctctgtacac +>CMV + enhMH-2 +cccggtgcctgggtcttaggctctgtacaccatggaggagaagctcgctctaaaaataaccctgtcccgtgatgcggttttggcagtacatcaatgggcgtggatagcggtttgactcacggggatttccaagtctccaccccattgacgtcaatgggagtttgttttggcaccaaaatcaacgggactttccaaaatgtcgtaacaactccgccccattgacgcaaatgggcggtaggcgtgtacggtgggaggtctatataagcagagct +>CMV + enhMCK + prcTnT-1 +ccactacgggtctaggctgcccatgtaaggaggcaaggcctggggacacccgagatgcctggttataattaaccccaacacctgctgcccccccccccccaacacctgctgcctgagcctgagcggttaccccaccccggtgcctgggtcttaggctctgtacaccatggaggagaagctcgctctaaaaataaccctgtccctggtgggtgccttatcagcgtccccagccctgggaggtgacagctggctggcttgtgtcagcccctcgggcactcacgtatctccgt +CMV + enhMCK + prcTnT-2 +tcagcccctcgggcactcacgtatctccgtccgacgggtttaaaatagcaaaactgtgatgcggttttggcagtacatcaatgggcgtggatagcggtttgactcacggggatttccaagtctccaccccattgacgtcaatgggagtttgttttggcaccaaaatcaacgggactttccaaaatgtcgtaacaactccgccccattgacgcaaatgggcggtaggcgtgtacggtgggaggtctatataagcagagct \ No newline at end of file diff --git a/test/FASTASpec.hs b/test/FASTASpec.hs index c4f5b0a..01199ff 100644 --- a/test/FASTASpec.hs +++ b/test/FASTASpec.hs @@ -2,37 +2,79 @@ module FASTASpec where -import Bio.FASTA (fromFile, toFile) -import Bio.FASTA.Type (Fasta, FastaItem (..)) -import Bio.Sequence (bareSequence) -import Prelude hiding (readFile, writeFile) -import System.Directory (removeFile) +import Bio.FASTA (fastaP, fromFile, toFile) +import Bio.FASTA.Parser (parseOnly) +import Bio.FASTA.Type (Fasta, FastaItem (..)) +import Bio.Sequence (bareSequence) +import Control.Monad.IO.Class (liftIO) +import Data.Text.IO (readFile) +import Prelude hiding (readFile, writeFile) +import System.Directory (removeFile) import Test.Hspec -correctFasta :: Fasta Char -correctFasta = [ 
FastaItem "3HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL") - , FastaItem "7HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL") - , FastaItem "With_spaces" (bareSequence "MDFFDLDIEIKQERLPAECSLNSPLNYSLSAQLTDRMTPRTENVRRQRERMDFFDLDIEIKQERLPAECSLNSPLNYSLSAQLTDRMTPRTENVRRQRERMDFFDLDIEIKQERLPAECSLNSPLNYSLSAQLTDRMTPRTENVRRQRER") - , FastaItem "Empty_ha_ha_ha" (bareSequence "") - ] +correctFasta1 :: Fasta Char +correctFasta1 = [ FastaItem "3HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL") + , FastaItem "7HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL") + , FastaItem "With_spaces" (bareSequence "MDFFDLDIEIKQERLPAECSLNSPLNYSLSAQLTDRMTPRTENVRRQRERMDFFDLDIEIKQERLPAECSLNSPLNYSLSAQLTDRMTPRTENVRRQRERMDFFDLDIEIKQERLPAECSLNSPLNYSLSAQLTDRMTPRTENVRRQRER") + , FastaItem "Empty_ha_ha_ha" (bareSequence "") + ] + +badFasta2 :: Either String (Fasta Char) +badFasta2 = Left "input.fasta:2:5:\n |\n2 | ACGT....TCG\r\n | ^^\nunexpected \"..\"\nexpecting end of input, end of line, or letter\n" + + +correctFasta3 :: Fasta Char +correctFasta3 = [ FastaItem "N-His-E4Orf6-7-R2(115)" (bareSequence "TGATGGTGATGGTGATGcatGTGGTAAACTCGACTTTCACTTTTCTCTATCACTGATAGGGAGTGGTAAACTCGACTTTCACTTTTCTCTATCACTGATAGGGAaacagtcagcc") + ] + +badFasta4 :: Either String (Fasta Char) +badFasta4 = Left "input.fasta:5:8:\n |\n5 | HindIII-BFP_F \r\n | ^^\nunexpected \"-B\"\nexpecting end of input, end of line, or letter\n" + +correctFasta5 :: Fasta Char +correctFasta5 = [FastaItem "qCHO49 F" (bareSequence "TGGAGAGATGGCTCGAGGTTqCHORTGGTTGCTGGGAATTGAACTC")] + +badFasta6 :: Either String (Fasta Char) +badFasta6 = Left "input.fasta:22:1:\n |\n22 | sPA-LoxP-NheI_R \r\n | ^\nunexpected 's'\nexpecting '>' or end of input\n" + +badFasta7 :: Either String 
(Fasta Char) +badFasta7 = Left "input.fasta:2:1:\n |\n2 | 5\8217-CTTCAAGAGAGAGACCTGCGT-3\8217\r\n | ^\nunexpected '5'\nexpecting '>', end of input, end of line, or sequence\n" + +badFasta8 :: Either String (Fasta Char) +badFasta8 = Left "input.fasta:21:5:\n |\n21 | CMV + enhMCK + prcTnT-2\r\n | ^^\nunexpected \"+ \"\nexpecting end of input, end of line, or letter\n" fastaSpec :: Spec -fastaSpec = describe "Fasta file parser." $ do - parseFile "test/FASTA/correct.fasta" - writeFile "test/FASTA/test.fasta" +fastaSpec = describe "Fasta files parser." $ do + parseFile "test/FASTA/order1.fasta" correctFasta1 + writeFile "test/FASTA/input.fasta" correctFasta1 + parseBadFile "test/FASTA/order2.fasta" badFasta2 + parseFile "test/FASTA/order3.fasta" correctFasta3 + writeFile "test/FASTA/input.fasta" correctFasta3 + parseBadFile "test/FASTA/order4.fasta" badFasta4 + parseFile "test/FASTA/order5.fasta" correctFasta5 + writeFile "test/FASTA/input.fasta" correctFasta5 + parseBadFile "test/FASTA/order6.fasta" badFasta6 + parseBadFile "test/FASTA/order7.fasta" badFasta7 + parseBadFile "test/FASTA/order8.fasta" badFasta8 -parseFile :: FilePath -> Spec -parseFile path = do +parseFile :: FilePath -> Fasta Char -> Spec +parseFile path cf = do describe "fromFile" $ do it "correctly parses fasta from file" $ do fasta <- fromFile path - fasta `shouldBe` correctFasta + fasta `shouldBe` cf -writeFile :: FilePath -> Spec -writeFile path = describe "writeFile" $ do +parseBadFile :: FilePath -> Either String (Fasta Char) -> Spec +parseBadFile path cf = do + describe "fromFile" $ do + it "correctly parses fasta from file" $ do + res <- liftIO (readFile path) + let badRes = parseOnly fastaP res + badRes `shouldBe` cf + +writeFile :: FilePath -> Fasta Char -> Spec +writeFile path cf = describe "writeFile" $ do it "correctly write fasta into file" $ do - toFile correctFasta path + toFile cf path fasta <- fromFile path removeFile path - fasta `shouldBe` correctFasta - + fasta `shouldBe` cf diff 
--git a/test/FastaParserSpec.hs b/test/FastaParserSpec.hs index 25cda17..5378595 100644 --- a/test/FastaParserSpec.hs +++ b/test/FastaParserSpec.hs @@ -1,18 +1,19 @@ {-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE TypeApplications #-} +{-# LANGUAGE TypeApplications #-} module FastaParserSpec where -import Bio.FASTA.Parser (fastaP) -import Bio.FASTA.Type (Fasta, FastaItem (..), ModItem (..), Modification (..)) -import Bio.Sequence (bareSequence) -import Data.Attoparsec.Text (endOfInput, parseOnly) -import Data.Text (Text) -import qualified Data.Text as T +import Bio.FASTA.Parser (fastaP, parseOnly) +import Bio.FASTA.Type (Fasta, FastaItem (..), ModItem (..), Modification (..)) +import Bio.Sequence (bareSequence) +import Data.Bifunctor +import Data.Text (Text) +import qualified Data.Text as T import Test.Hspec +import Text.Megaparsec (eof, errorBundlePretty, parse) fastaParserSpec :: Spec -fastaParserSpec = describe "Fasta format parser." $ do +fastaParserSpec = describe "Fasta format parser" $ do emptyFasta onlyName oneSequence @@ -25,6 +26,7 @@ fastaParserSpec = describe "Fasta format parser." 
$ do sequenceWithTabsInName sequenceWithTabsInSequence sequenceWithModifications + sequenceWithSpaces toughParserTests emptyFasta :: Spec @@ -49,19 +51,22 @@ twoSequences :: Spec twoSequences = describe "twoSequences" $ do it "correctly parses two correct sequences" $ do let res = parseOnly fastaP ">3HMX:A|PDBID|CHAIN|SEQUENCE\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSE\nVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL\n>7HMX:A|PDBID|CHAIN|SEQUENCE\nEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE\nVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL" - res `shouldBe` Right [FastaItem @Char "3HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL"), FastaItem @Char "7HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL")] + res `shouldBe` Right + [ FastaItem @Char "3HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL") + , FastaItem @Char "7HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL") + ] sequenceWithDigit :: Spec sequenceWithDigit = describe "sequenceWithDigit" $ do it "correctly parses incorrect sequence with digit" $ do - let res = parseOnly fastaP ">123\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEE4GITWTLDQSSE" - res `shouldBe` Right [FastaItem @Char "123" (bareSequence "")] + let res = parseOnly (fastaP @Char) ">123\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEE4GITWTLDQSSE" + res `shouldBe` Left "input.fasta:2:34:\n |\n2 | IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEE4GITWTLDQSSE\n | ^^\nunexpected \"4G\"\nexpecting end of input, end of line, or letter\n" sequenceWithWrongName :: Spec sequenceWithWrongName = describe "sequenceWithWrongName" $ do it "correctly parses incorrect sequence with wrong name" $ do - let res = parseOnly fastaP "123\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE" - res `shouldBe` Right ([] :: Fasta Char) + let res = parseOnly (fastaP @Char) 
"123\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE" + res `shouldBe` Left "input.fasta:1:1:\n |\n1 | 123\n | ^\nunexpected '1'\nexpecting '>' or end of input\n" sequenceWithSpacesInName :: Spec sequenceWithSpacesInName = describe "sequenceWithSpacesInName" $ do @@ -71,15 +76,15 @@ sequenceWithSpacesInName = describe "sequenceWithSpacesInName" $ do sequenceWithSeveralEndOfLine :: Spec sequenceWithSeveralEndOfLine = describe "sequenceWithSeveralEndOfLine" $ do - it "correctly parses sequence with several \n after name" $ do - let res = parseOnly fastaP ">this is my sequence\n\n\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE" - res `shouldBe` Right [FastaItem @Char "this is my sequence" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE")] + it "correctly parses sequence with several \\n after name" $ do + let res = parseOnly (fastaP @Char) ">this is my sequence\n\n\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE" + res `shouldBe` Right [FastaItem "this is my sequence" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE")] sequenceWithSeveralEndOfLineInSequence :: Spec sequenceWithSeveralEndOfLineInSequence = describe "sequenceWithSeveralEndOfLineInSequence" $ do - it "correctly parses sequence with several \n between sequence parts" $ do - let res = parseOnly fastaP ">this is my sequence\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE\n\n\nYYYYYYYYYYYYYYYYYYYYYYYY" - res `shouldBe` Right [FastaItem @Char "this is my sequence" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSEYYYYYYYYYYYYYYYYYYYYYYYY")] + it "correctly parses sequence with several \\n between sequence parts" $ do + let res = parseOnly (fastaP @Char) ">this is my sequence\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE\n\n\nYYYYYYYYYYYYYYYYYYYYYYYY" + res `shouldBe` Right [FastaItem "this is my sequence" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSEYYYYYYYYYYYYYYYYYYYYYYYY")] sequenceWithTabsInName :: Spec sequenceWithTabsInName = describe "sequenceWithTabsInName" $ do 
@@ -99,13 +104,22 @@ sequenceWithModifications = describe "sequenceWithModifications" $ do let res = parseOnly fastaP ">this is my sequence\nIWEL[mU*]KKDVYV\t\t\nYY[56FAM]YY[Trololo]YY\t\n" res `shouldBe` Right [FastaItem "this is my sequence" (bareSequence [Letter 'I', Letter 'W', Letter 'E', Letter 'L', Mod Mod_mU_Star, Letter 'K', Letter 'K', Letter 'D', Letter 'V', Letter 'Y', Letter 'V', Letter 'Y', Letter 'Y', Mod Mod_56FAM, Letter 'Y', Letter 'Y', Mod (Unknown "[Trololo]"), Letter 'Y', Letter 'Y'])] +sequenceWithSpaces :: Spec +sequenceWithSpaces = describe "sequenceWithSpaces" $ do + it "correctly parses sequence with spaces" $ do + let res = parseOnly fastaP ">test1\nAAAA TTTT GGGG ccA\n" + res `shouldBe` Right [FastaItem @Char "test1" (bareSequence "AAAATTTTGGGGccA")] + toughParserTests :: Spec toughParserTests = describe "various parser tests" $ do it "correctly parses empty lines" $ checkParser correctTest1 (Right correctAnswer) it "correctly parses empty lines with spaces" $ checkParser correctTest2 (Right correctAnswer) it "correctly parses empty lines with tabs" $ checkParser correctTest3 (Right correctAnswer) - it "correctly fails to parse a name without >" $ checkParser incorrectTest1 (Left "endOfInput") - it "correctly fails to parse a new sequence at the same line" $ checkParser incorrectTest2 (Left "endOfInput") + it "correctly parses empty lines with trailing tabs" $ checkParser correctTest4 (Right correctAnswer4) + it "correctly fails to parse a name without >" $ checkParser incorrectTest1 + (Left "input.fasta:1:1:\n |\n1 | test1\n | ^\nunexpected 't'\nexpecting '>' or end of input\n") + it "correctly fails to parse a new sequence at the same line" $ checkParser incorrectTest2 + (Left "input.fasta:3:8:\n |\n3 | GHIJKL >test2\n | ^^\nunexpected \">t\"\nexpecting end of input, end of line, or letter\n") correctTest1 :: Text correctTest1 = T.unlines @@ -137,6 +151,14 @@ correctTest3 = T.unlines , "ABCDEF" ] +correctTest4 :: Text +correctTest4 = 
"> test4\nTTTAGGTactTGT\t\t \t\n" + +correctAnswer4 :: [FastaItem Char] +correctAnswer4 = + [ FastaItem "test4" (bareSequence "TTTAGGTactTGT") + ] + incorrectTest1 :: Text incorrectTest1 = T.unlines [ "test1" @@ -157,5 +179,7 @@ incorrectTest2 = T.unlines correctAnswer :: Fasta Char correctAnswer = [FastaItem "test1" (bareSequence "ABCDEFGHIJKL"), FastaItem "test2" (bareSequence "ABCDEF")] -checkParser :: Text -> Either String (Fasta Char) -> Expectation -checkParser source expectation = parseOnly (fastaP <* endOfInput) source `shouldBe` expectation +checkParser :: HasCallStack => Text -> Either String (Fasta Char) -> Expectation +checkParser source expectation = + first errorBundlePretty (parse (fastaP <* eof) "input.fasta" source) + `shouldBe` expectation