-
Notifications
You must be signed in to change notification settings - Fork 1
/
process_predicted_models_adaptive.py
executable file
·127 lines (102 loc) · 3.55 KB
/
process_predicted_models_adaptive.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python3
# # -*- coding: utf-8 -*-
"""
@Authors Max Tong & HB
@Require phenix 1.20
@v0.2 In this version there is no flexibility in choosing this option: maximum_domains = no_of_AA/100,
assuming ~100AA per domain
TO DO: remove *.remainder_seq in source PDB folder
"""
import sys,os,time,platform,math
import Bio
import os.path,re
from Bio.PDB import PDBParser
from Bio.PDB import PDBIO
from datetime import datetime
import subprocess, multiprocessing
# Absolute directory containing this script (unused in the visible code;
# presumably kept for locating bundled resources — TODO confirm).
script_dir=os.path.dirname(os.path.realpath(__file__))
def print_usage():
    """Print command-line usage and terminate the script via sys.exit()."""
    usage_lines = (
        "usage: process_predicted_models_adaptive.py <inputDir> <outputDir> <noProc> options",
        "eg: process_predicted_models_adaptive.py input domains 10 maximum_rmsd=.8",
    )
    for line in usage_lines:
        print(line)
    sys.exit()
def execute(cmd):
    """Run *cmd* through the shell, echoing a start timestamp first.

    Returns the command's exit status (as from subprocess.call).
    """
    print(f'start {cmd}', datetime.now())
    # NOTE(review): shell=True interprets the raw command string; paths
    # containing shell metacharacters would be expanded by the shell.
    status = subprocess.call(cmd, shell=True)
    return status
""" Write domain info"""
def write_domains(input_pdb, output):
parser=PDBParser(QUIET=True)
protein = parser.get_structure('Y', input_pdb)
log = open(output, "w")
chain_id = 1
for chain in protein[0]:
#print ("Chain " + str(chain))
# Print first & last residues
first = last = next(chain.get_residues(), None)
for last in chain.get_residues():
pass
#log.write("D{:s}\t{:d}-{:d}\n".format(chain.get_id(), first.get_id()[1], last.get_id()[1]))
log.write("D{:d}\t{:d}-{:d}\n".format(chain_id, first.get_id()[1], last.get_id()[1]))
chain_id += 1
log.close()
""" Retrieve protein sequence length"""
def get_PDB_len(input_pdb):
parser=PDBParser(QUIET=True)
protein = parser.get_structure('Y', input_pdb)
for last in protein[0].get_residues():
pass
return last.get_id()[1]
if __name__ == "__main__":
# Default option seems to work very well
# Adaptive doesn't seem to do much
default_options = "split_model_by_compact_regions = True"
if len(sys.argv) < 4 :
print_usage()
if len(sys.argv) == 4 :
options = default_options
else:
logs=[]
options = ' '.join(sys.argv[4 : ])
input_dir = sys.argv[1]
output_dir = sys.argv[2]
threads = int(sys.argv[3])
os.makedirs(output_dir, exist_ok=True)
print(f'Process_predicted_model options: {options}')
cmds=[]
for pdb in os.listdir(input_dir):
if pdb.endswith(".pdb"):
ext = ".pdb"
elif pdb.endswith(".cif"):
ext = ".cif"
else:
continue
# Adaptive adjust maximum_domains
no_residues = get_PDB_len(os.path.join(input_dir, pdb))
no_domains = math.ceil(no_residues/100)
print(f'{pdb} has {no_residues} residues')
if "maximum_domains" not in options:
adaptive_options = options + " maximum_domains=" + '{:d}'.format(no_domains)
print(f'Running with adaptive option of {no_domains}')
else:
adaptive_options = options
# Add them to the command list
out_pdb = re.sub("{}$".format(ext), '_domains', pdb)
cmds.append(f'phenix.process_predicted_model {input_dir}/{pdb} processed_model_prefix={output_dir}/{out_pdb} {adaptive_options}')
# Execute command list
count = threads
with multiprocessing.Pool(processes=count) as pool:
results = pool.map(execute, cmds)
# Writing info
for pdb in os.listdir(output_dir):
# Clean up the single PDB files
if "domains_A" in pdb:
os.remove(os.path.join(output_dir, pdb))
print("Delete " + pdb)
if pdb.endswith("domains.pdb"):
print("Writing info for " + pdb)
protein_basename = os.path.basename(pdb).replace('_domains.pdb','')
write_domains(os.path.join(output_dir, pdb), os.path.join(output_dir, f"{protein_basename}.domains"))
# Clean up .seq (not always produced)
for file in os.listdir("."):
if file.endswith("remainder.seq"):
os.remove(file)
print("Delete ", file)