-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkiln.py
98 lines (76 loc) · 2.67 KB
/
kiln.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""
CTB TODO: change ident to identprefix, full_ident to ident.
"""
import csv
import os
def check_dna(seq):
    """Return True if *seq* looks like a nucleotide sequence.

    A sequence is accepted as DNA when at most 10% of its characters fall
    outside the alphabet A, C, G, T, N.  Matching is case-insensitive so that
    soft-masked (lowercase) sequence is still recognised as DNA.

    Args:
        seq: The sequence to inspect, as a ``str``.

    Returns:
        False if more than 10% of the characters are not A/C/G/T/N (in
        either case), True otherwise.  An empty sequence returns True.
    """
    # Delete every nucleotide character in a single pass; whatever is left
    # over is the non-DNA residue.
    residue = seq.translate(str.maketrans('', '', 'ACGTNacgtn'))
    return len(residue) <= 0.1 * len(seq)
def remove_extension(basepath, extra=()):
    """Strip recognised sequence-file extensions from the end of *basepath*.

    Extensions are removed repeatedly, so ``genome.fa.gz`` becomes
    ``genome``.  Stripping stops at the first trailing extension that is not
    in the recognised set.

    Args:
        basepath: File name or path to trim.
        extra: Optional iterable of additional extensions to strip, each
            including its leading dot (e.g. ``'.fasta'``).  A single string
            is treated as one extension.

    Returns:
        *basepath* with all recognised trailing extensions removed.
    """
    # A bare string would otherwise be split into single characters by
    # set.update(), which is never what the caller meant.
    if isinstance(extra, str):
        extra = (extra,)

    exts = {'.fa', '.gz', '.faa', '.fna'}
    exts.update(extra)

    name, ext = os.path.splitext(basepath)
    # `ext` is '' once nothing is left to strip; stop there explicitly so an
    # empty string in `extra` cannot make this loop spin forever.
    while ext and ext in exts:
        basepath = name
        name, ext = os.path.splitext(basepath)

    return basepath
class OutputRecords:
    """Write records to a CSV file with a fixed set of columns.

    The columns are ``identprefix``, ``ident``, ``name``,
    ``genome_filename`` and ``protein_filename``.  Call ``open()`` before
    writing and ``close()`` when done, or use the instance as a context
    manager, which does both.
    """

    def __init__(self, filename):
        """Remember the output path; no file is opened yet.

        Args:
            filename: Path of the CSV file to create.
        """
        self.filename = filename
        self.fp = None
        # Set up front so that write_record()/close() before open() see a
        # well-defined state instead of a missing attribute.
        self.writer = None

    def open(self):
        """Open the output file, write the CSV header, return the writer.

        Any file left open by a previous ``open()`` call is closed first so
        the handle is not leaked.

        Returns:
            The ``csv.DictWriter`` used for subsequent rows.
        """
        if self.fp is not None:
            self.fp.close()

        # newline='' lets the csv module control line endings itself.
        self.fp = open(self.filename, 'w', newline='')
        self.writer = csv.DictWriter(self.fp,
                                     fieldnames=['identprefix',
                                                 'ident',
                                                 'name',
                                                 'genome_filename',
                                                 'protein_filename'])
        self.writer.writeheader()
        return self.writer

    def write_record(self, input_file_obj):
        """Write one record by calling its ``to_csv(writer)`` method.

        Args:
            input_file_obj: Object exposing ``to_csv(writer)``.

        Raises:
            ValueError: If the output has not been opened, or was closed.
        """
        if self.writer is None:
            raise ValueError("OutputRecords is not open; call open() first")
        input_file_obj.to_csv(self.writer)

    def close(self):
        """Close the output file; safe to call repeatedly or before open()."""
        if self.fp is not None:
            self.fp.close()
        self.writer = None

    def __enter__(self):
        """Open the output file and return this object."""
        self.open()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the output file; exceptions are not suppressed."""
        self.close()
        return False
class InputFile:
    """Record linking an identifier to a genome and/or protein file.

    Every field defaults to None at class level; callers fill them in by
    assigning instance attributes.
    """

    name = None
    # Written to the 'identprefix' CSV column by to_csv().
    ident = None
    # Written to the 'ident' CSV column by to_csv().
    full_ident = None
    genome_filename = None
    protein_filename = None

    def merge(self, other):
        """Fold the file that *other* carries into this record.

        One record is expected to hold the genome file and the other the
        protein file.  Both records must share the same ``ident``.

        Returns:
            This record, updated in place.

        Raises:
            ValueError: If both records carry a genome filename, or both
                carry a protein filename.
        """
        assert self.ident == other.ident
        # NOTE: full_ident and name are not compared between the records.

        both_have_genome = self.genome_filename and other.genome_filename
        if both_have_genome:
            raise ValueError("duplicate genome filename")

        both_have_protein = self.protein_filename and other.protein_filename
        if both_have_protein:
            raise ValueError("duplicate protein filename")

        if not self.genome_filename:
            # This record must be the protein side; take other's genome.
            assert self.protein_filename
            self.genome_filename = other.genome_filename
            return self

        # This record is the genome side; take other's protein file.
        assert other.protein_filename
        self.protein_filename = other.protein_filename
        return self

    def is_empty(self):
        """Return True if a required field is unset or no file is attached.

        ``name``, ``ident`` and ``full_ident`` must all be set, and at least
        one of ``genome_filename`` / ``protein_filename`` must be set.
        """
        required = (self.name, self.ident, self.full_ident)
        if any(value is None for value in required):
            return True

        return self.genome_filename is None and self.protein_filename is None

    def to_csv(self, w):
        """Write this record as one row through the dict-style writer *w*.

        Missing filenames are written as empty strings.
        """
        row = {
            'identprefix': self.ident,
            'ident': self.full_ident,
            'name': self.name,
            'genome_filename': self.genome_filename or "",
            'protein_filename': self.protein_filename or "",
        }
        w.writerow(row)