pdb parser version 1 (#19)

biocad · Dec 24, 2019 · 3c5e1c4 · 3c5e1c4
1 parent b7b4393
commit 3c5e1c4
Show file tree

Hide file tree

Showing 8 changed files with 14,910 additions and 6 deletions.
diff --git a/ChangeLog.md b/ChangeLog.md
@@ -2,6 +2,11 @@
 
 ## [Unreleased]
 
+## [0.1.2.4] - 2019-12-23
+### Added
+- Preprocessing for pdb-files.
+- Pdb parser.
+
 ## [0.1.2.3] - 2019-12-12
 ### Fixed
 - Fixes for .mae pasrser.

diff --git a/package.yaml b/package.yaml
@@ -1,5 +1,5 @@
 name:                cobot-io
-version:             0.1.2.3
+version:             0.1.2.4
 github:              "less-wrong/cobot-io"
 license:             BSD3
 category:            Bio 

diff --git a/src/Bio/PDB/Parser.hs b/src/Bio/PDB/Parser.hs
@@ -0,0 +1,157 @@
+{-# LANGUAGE OverloadedStrings   #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+
+module Bio.PDB.Parser
+  ( pdbP )
+where
+
+
+import           Bio.PDB.Type         (Atom (..), Chain, FieldType, Model,
+                                       PDB (..), RemarkCode)
+import           Control.Applicative  (many, some, (<|>))
+import           Control.DeepSeq      ()
+import           Data.Attoparsec.Text (Parser, choice, count, endOfInput,
+                                       endOfLine, isEndOfLine, satisfy,
+                                       skipWhile, space, string, takeWhile)
+import qualified Data.List            as L (groupBy)
+import           Data.Map.Strict      (Map, fromListWithKey)
+import           Data.Maybe           (catMaybes)
+import           Data.Monoid          ((<>))
+import           Data.Text            as T (Text, concat, pack, stripEnd)
+import qualified Data.Vector          as V (Vector, concat, fromList, singleton)
+import           GHC.Generics         ()
+import           Text.Read            (readMaybe)
+
+pdbP :: Parser PDB
+pdbP = do
+    pdbData <- many (choice [titleP, remarkStringP, manyModelsP, otherFieldP]) -- parser order is important
+    let
+        models         = V.concat $ catMaybes (getModels <$> pdbData)
+        otherFieldsMap = fromRevListWith (<>) $ catMaybes (getOtherField <$> pdbData)
+        title          = T.concat $ catMaybes (getTitle <$> pdbData)
+        remarks        = fromRevListWith (<>) $ catMaybes (getRemarks <$> pdbData)
+
+    return $ PDB title models remarks otherFieldsMap
+  where
+    getModels :: PdbData -> Maybe (V.Vector Model)
+    getModels item = case item of
+      ModelData x -> Just x
+      _           -> Nothing
+    getOtherField :: PdbData -> Maybe (FieldType, V.Vector Text)
+    getOtherField item = case item of
+      OtherFieldData (Just x, y) -> Just (x, V.singleton y)
+      _                          -> Nothing
+    getTitle :: PdbData -> Maybe Text
+    getTitle item = case item of
+      TitleData x -> Just x
+      _           -> Nothing
+    getRemarks :: PdbData -> Maybe (RemarkCode, V.Vector Text)
+    getRemarks item = case item of
+      RemarkData (x, y) -> Just (x, V.singleton y)
+      _                 -> Nothing
+
+data PdbData = ModelData (V.Vector Model)
+             | OtherFieldData (Maybe FieldType, Text)
+             | RemarkData (RemarkCode, Text)
+             | TitleData Text
+  deriving (Show)
+
+notEndLineChar :: Parser Char
+notEndLineChar = satisfy $ not . isEndOfLine
+
+takeText :: Parser Text
+takeText = Data.Attoparsec.Text.takeWhile $ not . isEndOfLine
+
+atomP :: Parser CoordLike
+atomP = let atom = Atom <$>
+                    (string "ATOM " *>                                       -- (1 -  6) # we increased atomSerial field for one symbol
+                    (read <$> count 6 notEndLineChar) <* space)              -- (7 - 11)  atomSerial
+                    <*> (T.pack <$> count 4 notEndLineChar)                  -- (13 - 16) atomName
+                    <*> notEndLineChar                                       -- (17)      atomAltLoc
+                    <*> (T.pack <$> count 3 notEndLineChar) <* space         -- (18 - 20) atomResName
+                    <*> notEndLineChar                                       -- (22)      atomChainID
+                    <*> (read <$> count 4 notEndLineChar)                    -- (23 - 26) atomResSeq
+                    <*> notEndLineChar <* count 3 space                      -- (27)      atomICode
+                    <*> (read <$> count 8 notEndLineChar)                    -- (31 - 38) atomX
+                    <*> (read <$> count 8 notEndLineChar)                    -- (39 - 46) atomY
+                    <*> (read <$> count 8 notEndLineChar)                    -- (47 - 54) atomZ
+                    <*> (read <$> count 6 notEndLineChar)                    -- (55 - 60) atomOccupancy
+                    <*> (read <$> count 6 notEndLineChar) <* count 10 space  -- (61 - 66) atomTempFactor
+                    <*> (T.pack <$> count 2 notEndLineChar)                  -- (77 - 78) atomElement
+                    <*> (T.pack <$> count 2 notEndLineChar)                  -- (79 - 80) atomCharge
+                    <* (endOfLine <|> endOfInput)
+        in AtomLine <$> atom
+
+coordNotAtomP :: Parser CoordLike
+coordNotAtomP = do
+    _ <- string "HETATM" <|> string "TER " <|> string "ANISOU" <|> string "CONECT"
+    skipWhile $ not . isEndOfLine
+    endOfLine
+    return CoordNotAtomLine
+
+data CoordLike = AtomLine Atom | CoordNotAtomLine
+  deriving (Show)
+
+coordLikeP :: Parser [CoordLike]
+coordLikeP = some (coordNotAtomP <|> atomP)
+
+chainsP :: Parser (V.Vector Chain)
+chainsP = do
+    coordLikeLines <- coordLikeP
+    let atoms  = catMaybes (getAtom <$> coordLikeLines)
+        chains = V.fromList (map V.fromList $ groupByChains atoms)
+    pure chains
+  where
+    getAtom :: CoordLike -> Maybe Atom
+    getAtom line = case line of
+      AtomLine x -> Just x
+      _          -> Nothing
+    groupByChains :: [Atom]-> [[Atom]]
+    groupByChains = L.groupBy (\x y ->  atomChainID x == atomChainID y)
+
+modelP :: Parser Model
+modelP = do
+    _ <- string "MODEL"
+    skipWhile $ not . isEndOfLine
+    endOfLine
+    chains <- chainsP
+    string "ENDMDL" >> skipWhile (not . isEndOfLine)
+    endOfLine <|> endOfInput
+    pure chains
+
+manyModelsP :: Parser PdbData
+manyModelsP = do
+    models <- (:[]) <$> chainsP <|> some modelP
+    return $ ModelData (V.fromList models)
+
+titleStringP :: Parser Text
+titleStringP = do
+    _ <- string "TITLE "
+    titleText <- takeText
+    endOfLine
+    pure $ T.stripEnd titleText
+
+titleP :: Parser PdbData
+titleP =  do
+    titleText <- T.concat <$> some titleStringP
+    return $ TitleData titleText
+
+remarkStringP :: Parser PdbData
+remarkStringP = do
+    _ <- string "REMARK"
+    _ <- space
+    (remarkCode :: RemarkCode) <- readMaybe <$> count 3 notEndLineChar
+    _ <- notEndLineChar
+    remarkText <- takeText
+    endOfLine
+    pure $ RemarkData (remarkCode, T.stripEnd remarkText)
+
+fromRevListWith :: Ord k => (a -> a -> a) -> [(k,a)] -> Map k a
+fromRevListWith f xs = fromListWithKey (\_ x y -> f y x) xs
+
+otherFieldP :: Parser PdbData
+otherFieldP = do
+    (fieldType :: Maybe FieldType) <- readMaybe <$> count 6 notEndLineChar
+    fieldTypeText <- takeText
+    endOfLine <|> endOfInput
+    return $ OtherFieldData (fieldType,  T.stripEnd fieldTypeText)
diff --git a/src/Bio/PDB/Reader.hs b/src/Bio/PDB/Reader.hs
@@ -0,0 +1,73 @@
+module Bio.PDB.Reader
+  ( fromTextPDB
+  , fromFilePDB
+  ) where
+
+import           Bio.PDB.Parser         (pdbP)
+import           Bio.PDB.Type           (PDB (..))
+import           Control.Monad.IO.Class (MonadIO, liftIO)
+import           Data.Attoparsec.Text   (parseOnly)
+import           Data.Bifunctor         (first)
+import           Data.List              as L (findIndices, length)
+import           Data.Maybe             (catMaybes)
+import           Data.Text              as T (Text, length, lines, pack,
+                                              replicate, take, unlines)
+import qualified Data.Text.IO           as TIO (readFile)
+
+
+type LineNumber = Int
+
+data PDBWarnings = LineTooLong LineNumber
+                 | LineTooShort LineNumber
+  deriving (Show, Eq)
+
+standardizeText :: Text -> ([PDBWarnings], Text)
+standardizeText text = (textWarnings, T.unlines standardizedLines)
+  where
+    textLines = T.lines text
+    desiredLength = 80  -- cause it is max length in standart pdb file
+
+    warnings'n'text = map standardizeLine $ zip [0..] textLines
+    textWarnings = catMaybes (fst <$> warnings'n'text)
+    standardizedLines = snd <$> warnings'n'text
+
+    standardizeLine :: (Int, Text) -> (Maybe PDBWarnings, Text)
+    standardizeLine (lineNumber,line) | lineLength < desiredLength = (Just (LineTooShort lineNumber), line <> T.replicate spacesCount " ")
+                                      | lineLength > desiredLength = (Just (LineTooLong lineNumber), T.take desiredLength line)
+                                      | otherwise = (Nothing, line)
+      where
+        lineLength = T.length line
+        spacesCount = desiredLength - lineLength
+
+
+isMdlLine :: Text -> Bool
+isMdlLine line = elem (T.take 6 line) modelStrings || elem (T.take 5 line) modelStrings
+  where
+    modelStrings = ["MODEL ", "ENDMDL", "ATOM ", "TER   ", "HETATM", "ANISOU", "CONECT"]
+
+checkRow :: [Int] -> Bool
+checkRow [] = True
+checkRow xs = last xs - head xs + 1 == L.length xs
+
+checkMdlLines :: ([PDBWarnings], Text) -> Bool
+checkMdlLines warnings'n'text = checkRow mdlLineNumbers
+  where
+    mdlLineNumbers = findIndices isMdlLine $ T.lines (snd warnings'n'text)
+
+preprocess :: Text -> Either Text ([PDBWarnings], Text)
+preprocess text = do
+  let standardizedText = standardizeText text
+  if checkMdlLines standardizedText
+  then Right standardizedText
+  else Left "There are trash strings between model strings"
+
+fromFilePDB :: MonadIO m => FilePath -> m (Either Text ([PDBWarnings], Either Text PDB))
+fromFilePDB f = do
+  content <- liftIO (TIO.readFile f)
+  let preprocessed = preprocess content
+  pure $ fmap (first pack . parseOnly pdbP) <$> preprocessed
+
+fromTextPDB :: Text -> Either Text ([PDBWarnings], Either Text PDB)
+fromTextPDB text = fmap (first pack . parseOnly pdbP) <$> preprocessed
+  where
+    preprocessed = preprocess text
diff --git a/src/Bio/PDB/Type.hs b/src/Bio/PDB/Type.hs
@@ -24,7 +24,7 @@ data FieldType
    -- Title Section (except TITLE and REMARKS)
      HEADER
    | OBSLTE
-   | SPLT
+   | SPLIT
    | CAVEAT
    | COMPND
    | SOURCE
@@ -59,12 +59,18 @@ data FieldType
    | SITE
    -- Crystallographic and Coordinate Transformation Section
    | CRYST1
-   | MTRIXn
-   | ORIGXn
-   | SCALEn
+   | MTRIX1
+   | MTRIX2
+   | MTRIX3
+   | ORIGX1
+   | ORIGX2
+   | ORIGX3
+   | SCALE1
+   | SCALE2
+   | SCALE3
    -- Bookkeeping Section
    | MASTER
-  deriving (Show, Eq, Read, Generic, NFData)
+  deriving (Show, Eq, Read, Generic, NFData, Ord)
 
 type Model = Vector Chain