Skip to content

Commit

Permalink
pdb parser version 1 (#19)
Browse files Browse the repository at this point in the history
  • Loading branch information
CarrollNew authored and ozzzzz committed Dec 24, 2019
1 parent b7b4393 commit 3c5e1c4
Show file tree
Hide file tree
Showing 8 changed files with 14,910 additions and 6 deletions.
5 changes: 5 additions & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

## [Unreleased]

## [0.1.2.4] - 2019-12-23
### Added
- Preprocessing for pdb-files.
- Pdb parser.

## [0.1.2.3] - 2019-12-12
### Fixed
- Fixes for .mae pasrser.
Expand Down
2 changes: 1 addition & 1 deletion package.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: cobot-io
version: 0.1.2.3
version: 0.1.2.4
github: "less-wrong/cobot-io"
license: BSD3
category: Bio
Expand Down
157 changes: 157 additions & 0 deletions src/Bio/PDB/Parser.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}

module Bio.PDB.Parser
( pdbP )
where


import Bio.PDB.Type (Atom (..), Chain, FieldType, Model,
PDB (..), RemarkCode)
import Control.Applicative (many, some, (<|>))
import Control.DeepSeq ()
import Data.Attoparsec.Text (Parser, choice, count, endOfInput,
endOfLine, isEndOfLine, satisfy,
skipWhile, space, string, takeWhile)
import qualified Data.List as L (groupBy)
import Data.Map.Strict (Map, fromListWithKey)
import Data.Maybe (catMaybes)
import Data.Monoid ((<>))
import Data.Text as T (Text, concat, pack, stripEnd)
import qualified Data.Vector as V (Vector, concat, fromList, singleton)
import GHC.Generics ()
import Text.Read (readMaybe)

pdbP :: Parser PDB
pdbP = do
pdbData <- many (choice [titleP, remarkStringP, manyModelsP, otherFieldP]) -- parser order is important
let
models = V.concat $ catMaybes (getModels <$> pdbData)
otherFieldsMap = fromRevListWith (<>) $ catMaybes (getOtherField <$> pdbData)
title = T.concat $ catMaybes (getTitle <$> pdbData)
remarks = fromRevListWith (<>) $ catMaybes (getRemarks <$> pdbData)

return $ PDB title models remarks otherFieldsMap
where
getModels :: PdbData -> Maybe (V.Vector Model)
getModels item = case item of
ModelData x -> Just x
_ -> Nothing
getOtherField :: PdbData -> Maybe (FieldType, V.Vector Text)
getOtherField item = case item of
OtherFieldData (Just x, y) -> Just (x, V.singleton y)
_ -> Nothing
getTitle :: PdbData -> Maybe Text
getTitle item = case item of
TitleData x -> Just x
_ -> Nothing
getRemarks :: PdbData -> Maybe (RemarkCode, V.Vector Text)
getRemarks item = case item of
RemarkData (x, y) -> Just (x, V.singleton y)
_ -> Nothing

data PdbData = ModelData (V.Vector Model)
| OtherFieldData (Maybe FieldType, Text)
| RemarkData (RemarkCode, Text)
| TitleData Text
deriving (Show)

notEndLineChar :: Parser Char
notEndLineChar = satisfy $ not . isEndOfLine

takeText :: Parser Text
takeText = Data.Attoparsec.Text.takeWhile $ not . isEndOfLine

atomP :: Parser CoordLike
atomP = let atom = Atom <$>
(string "ATOM " *> -- (1 - 6) # we increased atomSerial field for one symbol
(read <$> count 6 notEndLineChar) <* space) -- (7 - 11) atomSerial
<*> (T.pack <$> count 4 notEndLineChar) -- (13 - 16) atomName
<*> notEndLineChar -- (17) atomAltLoc
<*> (T.pack <$> count 3 notEndLineChar) <* space -- (18 - 20) atomResName
<*> notEndLineChar -- (22) atomChainID
<*> (read <$> count 4 notEndLineChar) -- (23 - 26) atomResSeq
<*> notEndLineChar <* count 3 space -- (27) atomICode
<*> (read <$> count 8 notEndLineChar) -- (31 - 38) atomX
<*> (read <$> count 8 notEndLineChar) -- (39 - 46) atomY
<*> (read <$> count 8 notEndLineChar) -- (47 - 54) atomZ
<*> (read <$> count 6 notEndLineChar) -- (55 - 60) atomOccupancy
<*> (read <$> count 6 notEndLineChar) <* count 10 space -- (61 - 66) atomTempFactor
<*> (T.pack <$> count 2 notEndLineChar) -- (77 - 78) atomElement
<*> (T.pack <$> count 2 notEndLineChar) -- (79 - 80) atomCharge
<* (endOfLine <|> endOfInput)
in AtomLine <$> atom

coordNotAtomP :: Parser CoordLike
coordNotAtomP = do
_ <- string "HETATM" <|> string "TER " <|> string "ANISOU" <|> string "CONECT"
skipWhile $ not . isEndOfLine
endOfLine
return CoordNotAtomLine

data CoordLike = AtomLine Atom | CoordNotAtomLine
deriving (Show)

coordLikeP :: Parser [CoordLike]
coordLikeP = some (coordNotAtomP <|> atomP)

chainsP :: Parser (V.Vector Chain)
chainsP = do
coordLikeLines <- coordLikeP
let atoms = catMaybes (getAtom <$> coordLikeLines)
chains = V.fromList (map V.fromList $ groupByChains atoms)
pure chains
where
getAtom :: CoordLike -> Maybe Atom
getAtom line = case line of
AtomLine x -> Just x
_ -> Nothing
groupByChains :: [Atom]-> [[Atom]]
groupByChains = L.groupBy (\x y -> atomChainID x == atomChainID y)

modelP :: Parser Model
modelP = do
_ <- string "MODEL"
skipWhile $ not . isEndOfLine
endOfLine
chains <- chainsP
string "ENDMDL" >> skipWhile (not . isEndOfLine)
endOfLine <|> endOfInput
pure chains

manyModelsP :: Parser PdbData
manyModelsP = do
models <- (:[]) <$> chainsP <|> some modelP
return $ ModelData (V.fromList models)

titleStringP :: Parser Text
titleStringP = do
_ <- string "TITLE "
titleText <- takeText
endOfLine
pure $ T.stripEnd titleText

titleP :: Parser PdbData
titleP = do
titleText <- T.concat <$> some titleStringP
return $ TitleData titleText

remarkStringP :: Parser PdbData
remarkStringP = do
_ <- string "REMARK"
_ <- space
(remarkCode :: RemarkCode) <- readMaybe <$> count 3 notEndLineChar
_ <- notEndLineChar
remarkText <- takeText
endOfLine
pure $ RemarkData (remarkCode, T.stripEnd remarkText)

fromRevListWith :: Ord k => (a -> a -> a) -> [(k,a)] -> Map k a
fromRevListWith f xs = fromListWithKey (\_ x y -> f y x) xs

otherFieldP :: Parser PdbData
otherFieldP = do
(fieldType :: Maybe FieldType) <- readMaybe <$> count 6 notEndLineChar
fieldTypeText <- takeText
endOfLine <|> endOfInput
return $ OtherFieldData (fieldType, T.stripEnd fieldTypeText)
73 changes: 73 additions & 0 deletions src/Bio/PDB/Reader.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
module Bio.PDB.Reader
( fromTextPDB
, fromFilePDB
) where

import Bio.PDB.Parser (pdbP)
import Bio.PDB.Type (PDB (..))
import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.Attoparsec.Text (parseOnly)
import Data.Bifunctor (first)
import Data.List as L (findIndices, length)
import Data.Maybe (catMaybes)
import Data.Text as T (Text, length, lines, pack,
replicate, take, unlines)
import qualified Data.Text.IO as TIO (readFile)


type LineNumber = Int

data PDBWarnings = LineTooLong LineNumber
| LineTooShort LineNumber
deriving (Show, Eq)

standardizeText :: Text -> ([PDBWarnings], Text)
standardizeText text = (textWarnings, T.unlines standardizedLines)
where
textLines = T.lines text
desiredLength = 80 -- cause it is max length in standart pdb file

warnings'n'text = map standardizeLine $ zip [0..] textLines
textWarnings = catMaybes (fst <$> warnings'n'text)
standardizedLines = snd <$> warnings'n'text

standardizeLine :: (Int, Text) -> (Maybe PDBWarnings, Text)
standardizeLine (lineNumber,line) | lineLength < desiredLength = (Just (LineTooShort lineNumber), line <> T.replicate spacesCount " ")
| lineLength > desiredLength = (Just (LineTooLong lineNumber), T.take desiredLength line)
| otherwise = (Nothing, line)
where
lineLength = T.length line
spacesCount = desiredLength - lineLength


isMdlLine :: Text -> Bool
isMdlLine line = elem (T.take 6 line) modelStrings || elem (T.take 5 line) modelStrings
where
modelStrings = ["MODEL ", "ENDMDL", "ATOM ", "TER ", "HETATM", "ANISOU", "CONECT"]

checkRow :: [Int] -> Bool
checkRow [] = True
checkRow xs = last xs - head xs + 1 == L.length xs

checkMdlLines :: ([PDBWarnings], Text) -> Bool
checkMdlLines warnings'n'text = checkRow mdlLineNumbers
where
mdlLineNumbers = findIndices isMdlLine $ T.lines (snd warnings'n'text)

preprocess :: Text -> Either Text ([PDBWarnings], Text)
preprocess text = do
let standardizedText = standardizeText text
if checkMdlLines standardizedText
then Right standardizedText
else Left "There are trash strings between model strings"

fromFilePDB :: MonadIO m => FilePath -> m (Either Text ([PDBWarnings], Either Text PDB))
fromFilePDB f = do
content <- liftIO (TIO.readFile f)
let preprocessed = preprocess content
pure $ fmap (first pack . parseOnly pdbP) <$> preprocessed

fromTextPDB :: Text -> Either Text ([PDBWarnings], Either Text PDB)
fromTextPDB text = fmap (first pack . parseOnly pdbP) <$> preprocessed
where
preprocessed = preprocess text
16 changes: 11 additions & 5 deletions src/Bio/PDB/Type.hs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ data FieldType
-- Title Section (except TITLE and REMARKS)
HEADER
| OBSLTE
| SPLT
| SPLIT
| CAVEAT
| COMPND
| SOURCE
Expand Down Expand Up @@ -59,12 +59,18 @@ data FieldType
| SITE
-- Crystallographic and Coordinate Transformation Section
| CRYST1
| MTRIXn
| ORIGXn
| SCALEn
| MTRIX1
| MTRIX2
| MTRIX3
| ORIGX1
| ORIGX2
| ORIGX3
| SCALE1
| SCALE2
| SCALE3
-- Bookkeeping Section
| MASTER
deriving (Show, Eq, Read, Generic, NFData)
deriving (Show, Eq, Read, Generic, NFData, Ord)

type Model = Vector Chain

Expand Down
Loading

0 comments on commit 3c5e1c4

Please sign in to comment.