Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 8ab1ba4

Browse files
committedOct 9, 2018
Update docs
0 parents  commit 8ab1ba4

9 files changed

+315
-0
lines changed
 

‎.gitignore

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
__pycache__
2+
*.pyc
3+
*.egg-info
4+
.tox

‎README.md

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Small Python and Bioinformatics interview test code
2+
3+
This is a small **(broken, incomplete)** code project to test your overall python ecosystem and knowledge about algorithms, complexity and general data handling.
4+
5+
You will be questioned and guided on several aspects during the code interview. Valuable
6+
experience is:
7+
8+
1) Being able to work in a collaborative way using Github.
9+
2) Use [test driven development][TDD] methodology to reason about inputs/outputs of code pipelines, unit tests.
10+
3) Identify bad coding practices.
11+
4) Improve the performance, usefulness, cleanness and generality of the code.
12+
13+
## Install process
14+
15+
git clone https://github.com/brainstorm/telotest && cd telotest
16+
conda create -n telotest -python=3
17+
pip install -r requirements.txt
18+
19+
## Testing loop
20+
21+
In order to run the testsuite under `tests`, the [tox][tox] tool will be used to run them all:
22+
23+
tox
24+
25+
## Download datasets
26+
27+
When tests pass, the input dataset for this code is the [2013 human reference genome (hg38)][hg38] (~950MB compressed), which can be downloaded to the `data` folder for further testing.
28+
29+
[TDD]: https://en.wikipedia.org/wiki/Test-driven_development
30+
[tox]: https://tox.readthedocs.io/en/latest/
31+
[hg38]: http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz

‎__init__.py

Whitespace-only changes.

‎requirements.txt

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
tox
2+
pytest
3+
discover
4+
biopython

‎setup.py

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# !/usr/bin/env python
2+
3+
from distutils.core import setup
4+
5+
setup(
6+
name='telotest',
7+
packages=[],
8+
version='0.0.1',
9+
description='Fun small telomeric bioinformatic coding test',
10+
author='Roman Valls Guimera',
11+
license='GPLv3',
12+
author_email='roman.vallsguimera@unimelb.edu.au',
13+
url='https://github.com/brainstorm/telotest',
14+
keywords=['codeinterview', 'genomics'],
15+
classifiers=[
16+
'Development Status :: 1 - Alpha',
17+
'Environment :: Console',
18+
'Intended Audience :: Developers',
19+
'License :: OSI Approved :: GPLv3 License',
20+
'Programming Language :: Python',
21+
'Programming Language :: Python :: 3.6'
22+
],
23+
)

‎telotest.py

+182
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
#!/usr/bin/env python
2+
3+
import math
4+
import gzip
5+
from typing import List
6+
from pathlib import Path
7+
from collections import defaultdict, deque
8+
from itertools import islice, tee
9+
from Bio import SeqIO
10+
from Bio.Seq import MutableSeq, Seq
11+
from Bio.SeqRecord import SeqRecord
12+
from Bio.Alphabet import generic_dna
13+
14+
import logging
15+
16+
# Set log level
17+
loglevel = logging.INFO
18+
logging.basicConfig(level=loglevel)
19+
log = logging.getLogger(__name__)
20+
21+
22+
## Global
23+
FA_IDX = "hg38.fa.idx"
24+
25+
# Telomeric hexamer
26+
KMER_K = 6
27+
28+
# Human telomeric hexamers and complementary sequences
29+
TELO_HEXAMERS = ['CCCTAA', 'TTAGGG', 'TAACCC']
30+
31+
32+
def find_N_boundaries(seq: str):
33+
''' Returns all N-boundaries in a sequence via tuple: (first, second)
34+
'''
35+
pos = first = second = 0
36+
37+
# first N stretch
38+
for base in seq:
39+
if 'N' in base:
40+
pos = pos + 1
41+
else:
42+
first = pos
43+
break
44+
45+
base = None
46+
pos = 0
47+
48+
# last N stretch
49+
for base in reversed(seq):
50+
if 'N' in base:
51+
pos = pos + 1
52+
else:
53+
second = len(seq) - pos - 1
54+
break
55+
56+
return (first, second)
57+
58+
59+
# Elongate forward and backward N's, respecting telomeric patterns
60+
def elongate_forward_sequence(seq):
61+
# Determine N boundaries in the sequence
62+
boundary, boundary_r = find_N_boundaries(seq)
63+
64+
# K-mer telomeric sequence right after the N boundary
65+
kmer_seq = seq[boundary:boundary + KMER_K]
66+
67+
# How many chunks to elongate and remainder
68+
chunks = len(seq[0:boundary]) % KMER_K
69+
chunks_r = len(seq[0:boundary]) / KMER_K
70+
71+
# Capture remainder of the pattern to fit in sequence
72+
kmer_seq_r = kmer_seq[math.floor(chunks_r):]
73+
tmp_seq = kmer_seq_r
74+
75+
# Build forward sequence
76+
for _ in range(0, chunks - 2): # XXX 2?
77+
tmp_seq = tmp_seq + kmer_seq
78+
79+
# Attach inner pattern
80+
tmp_seq = tmp_seq + seq[boundary:boundary_r] + seq[boundary_r:]
81+
82+
return tmp_seq
83+
84+
def elongate_reverse_sequence(seq):
85+
# Determine N boundaries in the sequence
86+
boundary, boundary_r = find_N_boundaries(seq)
87+
88+
# K-mer telomeric sequence right before the N boundary
89+
kmer_seq = seq[boundary_r - KMER_K:boundary_r]
90+
91+
# How many chunks to elongate and remainder
92+
chunks = len(seq[boundary_r:]) % KMER_K
93+
chunks_r = len(seq[boundary_r:]) / KMER_K
94+
95+
# Start with the N boundary
96+
tst_seq = seq[0:boundary]
97+
98+
# Attach inner pattern
99+
tst_seq = tst_seq + seq[boundary:boundary_r]
100+
101+
# Build reverse sequence
102+
for _ in range(0, chunks):
103+
tst_seq = tst_seq + kmer_seq
104+
105+
# Capture remainder of the pattern to fit in sequence
106+
kmer_seq_r = kmer_seq[0:math.floor(chunks_r)]
107+
tst_seq = tst_seq + kmer_seq_r
108+
109+
return tst_seq
110+
111+
112+
def determine_hexamer(seq: str):
113+
'''
114+
Builds a table containing hexamers and all its possible rotations.
115+
116+
Useful to determine boundary conditions between N-regions and telomeric
117+
repeats on the reference genome(s).
118+
119+
Also takes the sequence seq and tries to find which hexamer pattern it has
120+
'''
121+
hexamer_table = defaultdict(list)
122+
rotated = []
123+
124+
# Seed table with first non-rotated "canonical" hexamer
125+
for pattern in TELO_HEXAMERS:
126+
hexamer_table[pattern] = pattern
127+
128+
# Rotate the telomeric pattern to match boundaries
129+
for pattern in TELO_HEXAMERS:
130+
dq = deque(pattern)
131+
for rot in range(1, len(pattern)):
132+
dq.rotate(rot)
133+
rotated.append(''.join(dq))
134+
135+
hexamer_table[pattern] = rotated
136+
137+
for k, v in hexamer_table.items():
138+
for kmer in v:
139+
if kmer in str.upper(str(seq)):
140+
return k
141+
else:
142+
return None
143+
144+
return None
145+
146+
def fasta_idx(filename):
147+
''' Indexes a fasta filename, since SeqIO.to_dict is not very efficient for
148+
big files, see: https://github.com/biopython/biopython/pull/959 and
149+
related issues.
150+
'''
151+
with gzip.open(filename, 'wt') as hg38_idx:
152+
SeqIO.index_db(filename, hg38_idx, 'fasta')
153+
154+
155+
def main(genome_build='/Users/romanvg/dev/10x/telomeres/data/external/hg38.fa.gz'):
156+
#def main(genome_build='../../data/processed/hg38_synthetic/new_hg38.fa.gz'):
157+
#def main(genome_build='../../data/external/chr11.fa.gz'):
158+
with gzip.open(genome_build, "rt") as hg38_fa:
159+
record_dict = SeqIO.to_dict(SeqIO.parse(hg38_fa, "fasta"))
160+
for _, chrom_attrs in record_dict.items():
161+
sequence = chrom_attrs.seq
162+
seq_id = chrom_attrs.id
163+
164+
# Discard _KI_random and _alt assemblies, disregard chrM too
165+
# since there are no relevant telomeres there (circular sequence)
166+
if "_" not in seq_id:
167+
if "chrM" not in seq_id:
168+
#print(chrom_attrs)
169+
boundaries = find_N_boundaries(sequence)
170+
detected_hexamer = determine_hexamer(sequence)
171+
print("{}\t{}:\t\t{}\t...\t{}\t...\t{}\t{}".format(seq_id.split(':')[0], boundaries, sequence[boundaries[0]:boundaries[0] + KMER_K + 30], sequence[boundaries[1] - KMER_K - 30:boundaries[1]], len(sequence), detected_hexamer))
172+
173+
174+
175+
#final_seq = elongate_forward_sequence(sequence)
176+
#final_seq = elongate_reverse_sequence(final_seq)
177+
178+
# with open("hg38_elongated_telomeres.fasta", "w") as output_handle:
179+
# SeqIO.write(new_hg38, output_handle, "fasta")
180+
181+
if __name__ == "__main__":
182+
main()

‎tests/__init__.py

Whitespace-only changes.

‎tests/test_telomeres.py

+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#!/usr/bin/env python
2+
3+
import logging
4+
import unittest
5+
6+
from telotest.telotest import TELO_HEXAMERS, find_N_boundaries, elongate_forward_sequence, elongate_reverse_sequence, determine_hexamer
7+
8+
# Set log level
9+
loglevel = logging.INFO
10+
logging.basicConfig(level=loglevel)
11+
log = logging.getLogger(__name__)
12+
13+
class TestStringMethods(unittest.TestCase):
14+
15+
def setUp(self):
16+
# 0 16 33 46
17+
self.src_seq = 'NNNNNNNNNNNNNNNNTAACCCTAACCCTAACCCNNNNNNNNNNNNN'
18+
self.fwd_seq = 'ACCCTAACCCTAACCCTAACCCTAACCCTAACCCNNNNNNNNNNNNN'
19+
self.rev_seq = 'NNNNNNNNNNNNNNNNTAACCCTAACCCTAACCCTAACCCTAACCCT'
20+
self.no_patt = 'NNNNNNNNNNNNNNNNATCATAAtAaaaaccCCANNNNNNNNNNNNN'
21+
self.mdl_nnn = 'NNNNNNNNNNNNNNNNCACACANNNNNNCACACANNNNNNNNNNNNN'
22+
self.kmer_k = 6
23+
24+
def test_N_boundary_positions(self):
25+
boundaries = find_N_boundaries(self.src_seq)
26+
self.assertTupleEqual(boundaries, (16, 33))
27+
28+
boundaries = find_N_boundaries(self.fwd_seq)
29+
self.assertTupleEqual(boundaries, (0, 33))
30+
31+
boundaries = find_N_boundaries(self.rev_seq)
32+
self.assertTupleEqual(boundaries, (16, 46))
33+
34+
def test_elongate_forward_sequence(self):
35+
tst_seq = elongate_forward_sequence(self.src_seq)
36+
37+
self.assertEqual(len(tst_seq), len(self.fwd_seq))
38+
self.assertEqual(tst_seq, self.fwd_seq)
39+
40+
def test_elongate_reverse_sequence(self):
41+
tst_seq = elongate_reverse_sequence(self.src_seq)
42+
43+
self.assertEqual(len(tst_seq), len(self.rev_seq))
44+
self.assertEqual(tst_seq, self.rev_seq)
45+
46+
def test_determine_hexamer(self):
47+
# Will find known pattern CCCTAA in sequence
48+
tst_seq = determine_hexamer(self.src_seq)
49+
self.assertEqual(TELO_HEXAMERS[1], tst_seq)
50+
51+
# Should not find any pattern in sequence
52+
tst_seq = determine_hexamer(self.no_patt)
53+
self.assertNotIn(tst_seq, TELO_HEXAMERS)
54+
self.assertEqual(tst_seq, None)
55+
56+
if __name__ == '__main__':
57+
unittest.main()

‎tox.ini

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
[tox]
2+
envlist = py36
3+
skipsdist = true
4+
5+
[travis]
6+
python =
7+
3.6: py36
8+
9+
[testenv]
10+
deps = -r {toxinidir}/requirements.txt
11+
commands =
12+
pip install -e {toxinidir}
13+
#py.test {toxinidir}/tests
14+
py.test -s {toxinidir}/tests

0 commit comments

Comments
 (0)
Please sign in to comment.