-
Notifications
You must be signed in to change notification settings - Fork 1
/
test_data_helpers.py
86 lines (56 loc) · 2.81 KB
/
test_data_helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
from data_helpers import SpeciesCDSSource, CDSHelper, countSpeciesCDS, getCrc
import mysql_rnafold as db
from runningstats import RunningStats
from rate_limit import RateLimit
# Configuration
taxId = 9606
statsLength = RunningStats()
statsShuffles = RunningStats()
recordsCount = 0
warningsCount = 0
rl = RateLimit(30)
total = countSpeciesCDS(taxId)
for protId in SpeciesCDSSource(taxId):
cds = CDSHelper( taxId, protId )
recordsCount += 1
statsLength.push( cds.length())
if( len(cds.sequence()) != cds.length() ):
print("WARNING: incorrect sequence length detected for record (taxid=%d, protId=%s); real-length=%d, recorded-length=%d." % (taxId, protId, len(cds.sequence()), cds.length()))
warningsCount += 1
recomputedCrc = getCrc( cds.sequence() )
annotatedCrc = cds.crc()
assert( recomputedCrc == annotatedCrc )
print(cds.sequence()[:15])
seq1trans = Seq(cds.sequence(), generic_dna).translate()
crc1 = getCrc(seq1trans)
shuffles = cds.shuffledSeqIds( shuffleType = db.Sources.ShuffleCDSv2_python )
unique = len(frozenset(shuffles))
if( unique != len(shuffles)):
print("WARNING: duplicate shuffles found in record (taxid=%d, protId=%s); count=%d, unique=%d." % (taxId, protId, len(shuffles), unique))
warningsCount += 1
statsShuffles.push( len(shuffles))
shuffledCrcs = set()
shuffledTransCrcs = set()
for shuffledSeqId in shuffles:
shufSeq = cds.getShuffledSeq2(shuffledSeqId)
shuffledCrcs.add( getCrc(shufSeq) )
seq1shufftrans = Seq(shufSeq, generic_dna).translate()
shuffledTransCrcs.add( getCrc( seq1shufftrans ) )
print("Shuffled distinct: %d" % (len(shuffledCrcs)))
print("Shuffled trans distinct: %d" % len(shuffledTransCrcs))
#for s in range(len(shuffles)):
# shuf = cds.getShuffledSeq(s)
# if( len(cds.sequence()) != cds.length() ):
# print("WARNING: incorrect shuffled sequence length detected for record (taxid=%d, protId=%s, seqId=%d); real-length=%d, recorded-length=%d." % (taxId, protId, shuffles[s], len(shuff.sequence()), cds.length()))
# warningsCount += 1
if(rl()):
print("processed %d records (%.2g%%)" % (recordsCount, float(recordsCount)/total*100))
print(statsLength.count())
print("%.3g %.3g +-%.3g %.3g" % (statsLength.min(), statsLength.mean(), 2*statsLength.stdev(), statsLength.max()))
print(statsShuffles.count())
print("%.3g %.3g +-%.3g %.3g" % (statsShuffles.min(), statsShuffles.mean(), 2*statsShuffles.stdev(), statsShuffles.max()))
print("Done - Processed %d records, found %d warnings" % (recordsCount, warningsCount))
if( warningsCount > 0 ):
print("%d warnings found!" % (warningsCount,))