From d13e68da3b95774c3727184eb8f5fdc764521c2e Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Thu, 27 Oct 2016 19:42:44 -0400
Subject: [PATCH 01/11] assemble reads and then trimmed assembled sequences
 down to desired levels of coverage

---
 isovar/allele_reads.py                        |  26 +--
 isovar/assembly.py                            |  55 ++----
 isovar/variant_sequences.py                   | 176 ++++++++++++++++--
 script/isovar-variant-sequences.py            |   4 +-
 .../isovar-translate-variants-results.csv     |  12 +-
 test/test_assembly.py                         |  11 +-
 6 files changed, 203 insertions(+), 81 deletions(-)

diff --git a/isovar/allele_reads.py b/isovar/allele_reads.py
index 4508559..9d76460 100644
--- a/isovar/allele_reads.py
+++ b/isovar/allele_reads.py
@@ -84,7 +84,7 @@ def allele_read_from_locus_read(locus_read, n_ref):
     if ref_pos_before is None:
         logger.warn(
             "Missing reference pos for nucleotide before variant on read: %s",
-                locus_read)
+            locus_read)
         return None
 
     ref_pos_after = reference_positions[read_pos_after]
@@ -92,7 +92,7 @@ def allele_read_from_locus_read(locus_read, n_ref):
     if ref_pos_after is None:
         logger.warn(
             "Missing reference pos for nucleotide after variant on read: %s",
-                locus_read)
+            locus_read)
         return None
 
     if n_ref == 0:
@@ -102,9 +102,9 @@ def allele_read_from_locus_read(locus_read, n_ref):
             # don't use this read
             logger.debug(
                 "Positions before (%d) and after (%d) variant should be adjacent on read %s",
-                    ref_pos_before,
-                    ref_pos_after,
-                    locus_read)
+                ref_pos_before,
+                ref_pos_after,
+                locus_read)
             return None
 
         # insertions require a sequence of non-aligned bases
@@ -125,9 +125,9 @@ def allele_read_from_locus_read(locus_read, n_ref):
             # don't use this read
             logger.debug(
                 "Positions before (%d) and after (%d) variant should be adjacent on read %s",
-                    ref_pos_before,
-                    ref_pos_after,
-                    locus_read)
+                ref_pos_before,
+                ref_pos_after,
+                locus_read)
             return None
 
     nucleotides_at_variant_locus = sequence[read_pos_before + 1:read_pos_after]
@@ -206,9 +206,11 @@ def reads_overlapping_variant(
     if chromosome is None:
         chromosome = variant.contig
 
-    logger.info("Gathering variant reads for variant %s (chromosome=%s)",
+    logger.info(
+        "Gathering variant reads for variant %s (chromosome = %s, gene names = %s)",
         variant,
-        chromosome)
+        chromosome,
+        variant.gene_names)
 
     base1_position, ref, alt = trim_variant(variant)
 
@@ -271,7 +273,9 @@ def reads_overlapping_variants(variants, samfile, **kwargs):
         else:
             logger.warn(
                 "Chromosome '%s' from variant %s not in alignment file %s",
-                    chromosome, variant, samfile.filename)
+                chromosome,
+                variant,
+                samfile.filename)
             continue
         allele_reads = reads_overlapping_variant(
             samfile=samfile,
diff --git a/isovar/assembly.py b/isovar/assembly.py
index c19156f..6edd2b6 100644
--- a/isovar/assembly.py
+++ b/isovar/assembly.py
@@ -19,9 +19,6 @@
 # I'd rather just use list.copy but Python 2.7 doesn't support it
 from copy import copy
 
-from .read_helpers import group_unique_sequences
-from .variant_sequences import VariantSequence
-
 
 logger = logging.getLogger(__name__)
 
@@ -68,14 +65,6 @@ def sort_by_decreasing_total_length(seq):
     """
     return -len(seq.sequence)
 
-def combine_variant_sequences(variant_sequence1, variant_sequence2):
-    return VariantSequence(
-        prefix=variant_sequence1.prefix,
-        alt=variant_sequence1.alt,
-        suffix=variant_sequence2.suffix,
-        reads=set(
-            variant_sequence1.reads).union(set(variant_sequence2.reads)))
-
 def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE):
     """
     Greedily merge overlapping sequences into longer sequences.
@@ -103,10 +92,15 @@ def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE):
                     variant_sequence2,
                     min_overlap_size=min_overlap_size):
                 continue
-            combined = combine_variant_sequences(
-                variant_sequence1, variant_sequence2)
+            combined = variant_sequence1.combine(variant_sequence2)
             if len(combined) <= max(len(variant_sequence1), len(variant_sequence2)):
                 # if the combined sequence isn't any longer, then why bother?
+                logger.info(
+                    "Skipping %s since doesn't increase length beyond max(%d, %d)",
+                    combined,
+                    len(variant_sequence1),
+                    len(variant_sequence2)
+                )
                 continue
             if combined.sequence in merged_variant_sequences:
                 # it's possible to get the same merged sequence from distinct
@@ -120,8 +114,7 @@ def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE):
                 # sequence is supported by any one read.
                 existing_record_with_same_sequence = merged_variant_sequences[
                     combined.sequence]
-                combined_with_more_reads = combine_variant_sequences(
-                    existing_record_with_same_sequence,
+                combined_with_more_reads = existing_record_with_same_sequence.combine(
                     combined)
                 merged_variant_sequences[combined.sequence] = combined_with_more_reads
             else:
@@ -168,7 +161,7 @@ def sort_by_decreasing_read_count_and_sequence_lenth(variant_sequence):
     """
     return -len(variant_sequence.reads), -len(variant_sequence.sequence)
 
-def iterative_assembly(
+def iterative_overlap_assembly(
         variant_sequences,
         min_overlap_size=30,
         n_merge_iters=2):
@@ -185,9 +178,9 @@ def iterative_assembly(
 
         logger.info(
             "Iteration #%d of assembly: generated %d sequences (from %d)",
-                i + 1,
-                len(variant_sequences),
-                len(previous_sequences))
+            i + 1,
+            len(variant_sequences),
+            len(previous_sequences))
 
         if len(variant_sequences) == 0:
             # if the greedy merge procedure fails for all pairs of candidate
@@ -199,31 +192,9 @@ def iterative_assembly(
         variant_sequences = collapse_substrings(variant_sequences)
         logger.info(
             "After collapsing subsequences, %d distinct sequences left",
-                len(variant_sequences))
+            len(variant_sequences))
         if len(variant_sequences) == 1:
             # once we have only one sequence then there's no point trying
             # to further combine sequences
             break
     return variant_sequences
-
-def assemble_reads_into_variant_sequences(
-        variant_reads,
-        min_overlap_size=30,
-        n_merge_iters=2):
-    """
-    Turn a collection of AlleleRead objects into a collection of VariantSequence
-    objects, which are returned in order of (# supported reads, sequence length)
-    """
-    initial_variant_sequences = [
-        VariantSequence(
-            prefix=prefix,
-            alt=alt,
-            suffix=suffix,
-            reads=reads)
-        for (prefix, alt, suffix), reads in
-        group_unique_sequences(variant_reads).items()
-    ]
-    return iterative_assembly(
-        initial_variant_sequences,
-        min_overlap_size=min_overlap_size,
-        n_merge_iters=n_merge_iters)
diff --git a/isovar/variant_sequences.py b/isovar/variant_sequences.py
index a12fa0c..d62fc8e 100644
--- a/isovar/variant_sequences.py
+++ b/isovar/variant_sequences.py
@@ -13,8 +13,10 @@
 # limitations under the License.
 
 from __future__ import print_function, division, absolute_import
-from collections import namedtuple
 import logging
+from collections import namedtuple
+
+import numpy as np
 
 from .read_helpers import (
     get_single_allele_from_reads,
@@ -26,11 +28,10 @@
     VARIANT_CDNA_SEQUENCE_LENGTH,
 )
 from .dataframe_builder import dataframe_from_generator
-
+from .assembly import iterative_overlap_assembly, collapse_substrings
 
 logger = logging.getLogger(__name__)
 
-
 # using this base class to define the core fields of a VariantSequence
 # but inheriting it from it to allow the addition of helper methods
 VariantSequenceFields = namedtuple(
@@ -81,8 +82,6 @@ def contains(self, other):
     def left_overlaps(self, other, min_overlap_size=1):
         """
         Does this VariantSequence overlap another on the left side?
-        The alleles must match and they must share at least `min_overlap_size`
-        nucleotides in their prefix or suffix.
         """
 
         if self.alt != other.alt:
@@ -107,10 +106,14 @@ def left_overlaps(self, other, min_overlap_size=1):
         # p2 a2 s2 =       XX Y ZZZZZZZZZ
         # ...
         # then we can combine them into a longer sequence
-        return (
+        sequence_overlaps = (
             self.prefix.endswith(other.prefix) and
             other.suffix.startswith(self.suffix)
         )
+        prefix_overlap_size = len(self.prefix) - len(other.prefix)
+        suffix_overlap_size = len(other.suffix) - len(self.suffix)
+        return sequence_overlaps and (
+            (prefix_overlap_size + suffix_overlap_size) >= min_overlap_size)
 
     def add_reads(self, reads):
         """
@@ -122,6 +125,106 @@ def add_reads(self, reads):
             suffix=self.suffix,
             reads=self.reads.union(reads))
 
+    def combine(self, other_sequence):
+        """
+        If this sequence is the prefix of another sequence, combine
+        them into a single VariantSequence object.
+        """
+        if other_sequence.alt != self.alt:
+            raise ValueError(
+                "Cannot combine %s and %s with mismatching alt sequences" % (
+                    self,
+                    other_sequence))
+        elif self.left_overlaps(other_sequence):
+            # If sequences are like AABC and ABCC
+            return VariantSequence(
+                prefix=self.prefix,
+                alt=self.alt,
+                suffix=other_sequence.suffix,
+                reads=self.reads.union(other_sequence.reads))
+        elif other_sequence.left_overlaps(self):
+            return other_sequence.combine(self)
+        elif self.contains(other_sequence):
+            return self.add_reads(other_sequence.reads)
+        elif other_sequence.contains(self):
+            return other_sequence.add_reads(self.reads)
+        else:
+            raise ValueError("%s does not overlap with %s" % (self, other_sequence))
+
+    def variant_indices(self):
+        """
+        When we combine prefix + alt + suffix into a single string,
+        what are is base-0 index interval which gets us back the alt
+        sequence? First returned index is inclusive, the second is exclusive.
+        """
+        variant_start_index = len(self.prefix)
+        variant_len = len(self.alt)
+        variant_end_index = variant_start_index + variant_len
+        return variant_start_index, variant_end_index
+
+    def coverage(self):
+        """
+        Returns NumPy array indicating number of reads covering each
+        nucleotides of this sequence.
+        """
+        variant_start_index, variant_end_index = self.variant_indices()
+        n_nucleotides = len(self)
+        coverage_array = np.zeros(n_nucleotides, dtype="int32")
+        for read in self.reads:
+            coverage_array[
+                max(0, variant_start_index - len(read.prefix)):
+                min(n_nucleotides, variant_end_index + len(read.suffix))] += 1
+        return coverage_array
+
+    def min_coverage(self):
+        return np.min(self.coverage())
+
+    def mean_coverage(self):
+        return np.mean(self.coverage())
+
+    def trim_by_coverage(self, min_reads):
+        """
+        Given the min number of reads overlapping each nucleotide of
+        a variant sequence, trim this sequence by getting rid of positions
+        which are overlapped by fewer reads than specified.
+        """
+        read_count_array = self.coverage()
+
+        sufficient_coverage_mask = read_count_array >= min_reads
+        sufficient_coverage_indices = np.argwhere(sufficient_coverage_mask)
+        if len(sufficient_coverage_indices) == 0:
+            logger.debug("No bases in %s have coverage >= %d" % (self, min_reads))
+            return VariantSequence(prefix="", alt="", suffix="", reads=self.reads)
+        variant_start_index, variant_end_index = self.variant_indices()
+        # assuming that coverage drops off monotonically away from
+        # variant nucleotides
+        first_covered_index = sufficient_coverage_indices.min()
+        last_covered_index = sufficient_coverage_indices.max()
+        # adding 1 to last_covered_index since it's an inclusive index
+        # whereas variant_end_index is the end of a half-open interval
+        if (first_covered_index > variant_start_index or
+                last_covered_index + 1 < variant_end_index):
+            # Example:
+            #   Nucleotide sequence:
+            #       ACCCTTTT|AA|GGCGCGCC
+            #   Coverage:
+            #       12222333|44|33333211
+            # Then the mask for bases covered >= 4x would be:
+            #       ________|**|________
+            # with indices:
+            #       first_covered_index = 9
+            #       last_covered_index = 10
+            #       variant_start_index = 9
+            #       variant_end_index = 11
+            logger.debug("Some variant bases in %s don't have coverage >= %d" % (
+                self, min_reads))
+            return VariantSequence(prefix="", alt="", suffix="", reads=self.reads)
+        return VariantSequence(
+            prefix=self.prefix[first_covered_index:],
+            alt=self.alt,
+            suffix=self.suffix[:last_covered_index - variant_end_index + 1],
+            reads=self.reads)
+
 def sort_key_decreasing_read_count(variant_sequence):
     return -len(variant_sequence.reads)
 
@@ -148,6 +251,7 @@ def all_variant_sequences_supported_by_variant_reads(
         for ((prefix, alt, suffix), reads)
         in unique_sequence_groups.items()
     ]
+
 def filter_variant_sequences_by_read_support(
         variant_sequences,
         min_reads_supporting_cdna_sequence):
@@ -155,11 +259,12 @@ def filter_variant_sequences_by_read_support(
     variant_sequences = [
         s
         for s in variant_sequences
-        if len(s.reads) >= min_reads_supporting_cdna_sequence
+        if s.min_coverage() >= min_reads_supporting_cdna_sequence
     ]
     n_dropped = n_total - len(variant_sequences)
     if n_dropped > 0:
-        logger.info("Dropped %d/%d variant sequences less than %d supporting reads",
+        logger.info(
+            "Dropped %d/%d variant sequences less than %d supporting reads",
             n_dropped,
             n_total,
             min_reads_supporting_cdna_sequence)
@@ -179,19 +284,38 @@ def filter_variant_sequences_by_length(
     min_required_sequence_length = min(
         max_observed_sequence_length,
         preferred_sequence_length)
-
     variant_sequences = [
         s for s in variant_sequences
         if len(s.sequence) >= min_required_sequence_length
     ]
     n_dropped = n_total - len(variant_sequences)
     if n_dropped > 0:
-        logger.info("Dropped %d/%d variant sequences shorter than %d",
+        logger.info(
+            "Dropped %d/%d variant sequences shorter than %d",
             n_dropped,
             n_total,
             min_required_sequence_length)
     return variant_sequences
 
+def trim_variant_sequences(variant_sequences, min_reads_supporting_cdna_sequence):
+    """
+    Trim VariantSequences to desired coverage and then combine any
+    subsequences which get generated.
+    """
+    n_total = len(variant_sequences)
+    trimmed_variant_sequences = [
+        variant_sequence.trim_by_coverage(min_reads_supporting_cdna_sequence)
+        for variant_sequence in variant_sequences
+    ]
+    collapsed_variant_sequences = collapse_substrings(trimmed_variant_sequences)
+    n_after_trimming = len(collapsed_variant_sequences)
+    logger.info(
+        "Kept %d/%d variant sequences after read coverage trimming to >=%dx",
+        n_after_trimming,
+        n_total,
+        min_reads_supporting_cdna_sequence)
+    return collapsed_variant_sequences
+
 def filter_variant_sequences(
         variant_sequences,
         preferred_sequence_length,
@@ -200,9 +324,8 @@ def filter_variant_sequences(
     Drop variant sequences which are shorter than request or don't have
     enough supporting reads.
     """
-    variant_sequences = filter_variant_sequences_by_read_support(
-        variant_sequences=variant_sequences,
-        min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence)
+    variant_sequences = trim_variant_sequences(
+        variant_sequences, min_reads_supporting_cdna_sequence)
     return filter_variant_sequences_by_length(
         variant_sequences=variant_sequences,
         preferred_sequence_length=preferred_sequence_length)
@@ -259,11 +382,38 @@ def supporting_reads_to_variant_sequences(
         max_nucleotides_before_variant=max_nucleotides_before_variant,
         max_nucleotides_after_variant=max_nucleotides_after_variant)
 
+    logger.info(
+        "Initial pool of %d variant sequences (min length=%d, max length=%d)",
+        len(variant_sequences),
+        min(len(s) for s in variant_sequences),
+        max(len(s) for s in variant_sequences))
+
+    variant_sequences = iterative_overlap_assembly(variant_sequences)
+
+    if variant_sequences:
+        logger.info(
+            "After overlap assembly: %d variant sequences (min length=%d, max length=%d)",
+            len(variant_sequences),
+            min(len(s) for s in variant_sequences),
+            max(len(s) for s in variant_sequences))
+    else:
+        logger.info("After overlap assembly: 0 variant sequences")
+
     variant_sequences = filter_variant_sequences(
         variant_sequences=variant_sequences,
         preferred_sequence_length=preferred_sequence_length,
         min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence)
 
+    if variant_sequences:
+        logger.info(
+            ("After coverage & length filtering: %d variant sequences "
+             "(min length=%d, max length=%d)"),
+            len(variant_sequences),
+            min(len(s) for s in variant_sequences),
+            max(len(s) for s in variant_sequences))
+    else:
+        logger.info("After coverage & length filtering: 0 variant sequences")
+
     # sort VariantSequence objects by decreasing order of supporting read
     # counts
     variant_sequences.sort(key=sort_key_decreasing_read_count)
diff --git a/script/isovar-variant-sequences.py b/script/isovar-variant-sequences.py
index 122908c..397fc69 100644
--- a/script/isovar-variant-sequences.py
+++ b/script/isovar-variant-sequences.py
@@ -24,8 +24,8 @@
     variant_sequences_dataframe_from_args
 )
 
-
-logging.config.fileConfig(pkg_resources.resource_filename('isovar.cli', 'logging.conf'))
+logging.config.fileConfig(
+    pkg_resources.resource_filename('isovar.cli', 'logging.conf'))
 logger = logging.getLogger(__name__)
 
 parser = make_variant_sequences_arg_parser(add_sequence_length_arg=True)
diff --git a/test/data/b16.f10/isovar-translate-variants-results.csv b/test/data/b16.f10/isovar-translate-variants-results.csv
index bc59adb..f70f03f 100644
--- a/test/data/b16.f10/isovar-translate-variants-results.csv
+++ b/test/data/b16.f10/isovar-translate-variants-results.csv
@@ -1,9 +1,3 @@
-,chr,pos,ref,alt,amino_acids,variant_aa_interval_start,variant_aa_interval_end,ends_with_stop_codon,frameshift,translations,supporting_variant_reads,total_variant_reads,supporting_transcripts,total_transcripts,gene
-0,13,5864876,C,CG,DSQEDLWTKFILARGGEEGGIRTEDFF,14,27,False,True,1,16,27,1,1,Klf6
-1,13,5864876,C,CG,DSQEDLWTKFILARGGRRRRNQN,14,23,True,True,1,8,27,1,1,Klf6
-2,13,5864876,C,CG,DSQEDLWTKFILARGGEEGGTTDCARN,14,27,False,True,1,2,27,1,1,Klf6
-3,13,5864876,C,CG,DSQEDLWTKFILARGGEEGGIRTDDFF,14,27,False,True,1,1,27,1,1,Klf6
-4,4,45802539,G,C,CLQGRTTSYSTAAPLPNPIPNPEICY,14,15,False,False,1,3,3,1,1,Aldh1b1
-5,9,82927102,G,T,TMVAWDRHDNTVINYNCVVMSIPSYHS,14,15,False,False,2,11,13,1,1,Phip
-6,9,82927102,G,T,TMVAWDRHDNTVIQYNCVVMSIPSYHS,14,15,False,False,1,2,13,1,1,Phip
-7,X,8125624,C,A,NKLQGHSAPVLDVIDVKHGRAVALQLV,14,15,False,False,2,8,8,3,3,Wdr13
+,chr,pos,ref,alt,amino_acids,variant_aa_interval_start,variant_aa_interval_end,ends_with_stop_codon,frameshift,translations,overlapping_reads,ref_reads,alt_reads,alt_reads_supporting_protein_sequence,transcripts_overlapping_variant,transcripts_supporting_protein_sequence,gene
+0,9,82927102,G,T,AWDRHDNTVIHAVNNMTLKV,10,11,False,False,1,56,39,17,5,1,1,Phip
+1,X,8125624,C,A,QGHSAPVLDVIVNCDESLLA,10,11,False,False,1,71,46,25,8,3,3,Wdr13
diff --git a/test/test_assembly.py b/test/test_assembly.py
index 3a8fda1..a5cf7c6 100644
--- a/test/test_assembly.py
+++ b/test/test_assembly.py
@@ -14,8 +14,11 @@
 
 from __future__ import print_function, division, absolute_import
 
-from isovar.variant_reads import reads_supporting_variant
-from isovar.assembly import assemble_reads_into_variant_sequences
+from isovar.variant_reads import (
+    reads_supporting_variant,
+    all_variant_sequences_supported_by_variant_reads,
+)
+from isovar.assembly import iterative_overlap_assembly
 from pyensembl import ensembl_grch38
 from varcode import Variant
 from nose.tools import eq_
@@ -39,8 +42,8 @@ def test_assemble_transcript_fragments_snv():
         samfile=samfile,
         chromosome=chromosome,)
 
-    sequences = assemble_reads_into_variant_sequences(
-        variant_reads,
+    sequences = iterative_overlap_assembly(
+        all_variant_sequences_supported_by_variant_reads(variant_reads),
         min_overlap_size=30)
 
     assert len(sequences) > 0

From 0e653dd04f65421c7fe4cc7b11fc342d4a805cb9 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Thu, 27 Oct 2016 21:18:16 -0400
Subject: [PATCH 02/11] testing with and without read overlap assembly

---
 isovar/assembly.py              | 10 ----------
 isovar/cli/variant_sequences.py |  8 +++++++-
 isovar/default_parameters.py    |  5 +++++
 isovar/protein_sequences.py     |  8 ++++++--
 isovar/translation.py           | 21 +++++++++++++--------
 isovar/variant_sequences.py     | 31 ++++++++++++++++++++++---------
 test/test_assembly.py           |  8 +++-----
 test/test_protein_sequences.py  | 26 ++++++++++++++------------
 8 files changed, 70 insertions(+), 47 deletions(-)

diff --git a/isovar/assembly.py b/isovar/assembly.py
index 6edd2b6..86c78fc 100644
--- a/isovar/assembly.py
+++ b/isovar/assembly.py
@@ -22,7 +22,6 @@
 
 logger = logging.getLogger(__name__)
 
-
 MIN_OVERLAP_SIZE = 30
 
 def sort_by_decreasing_prefix_length(seq):
@@ -93,15 +92,6 @@ def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE):
                     min_overlap_size=min_overlap_size):
                 continue
             combined = variant_sequence1.combine(variant_sequence2)
-            if len(combined) <= max(len(variant_sequence1), len(variant_sequence2)):
-                # if the combined sequence isn't any longer, then why bother?
-                logger.info(
-                    "Skipping %s since doesn't increase length beyond max(%d, %d)",
-                    combined,
-                    len(variant_sequence1),
-                    len(variant_sequence2)
-                )
-                continue
             if combined.sequence in merged_variant_sequences:
                 # it's possible to get the same merged sequence from distinct
                 # input sequences
diff --git a/isovar/cli/variant_sequences.py b/isovar/cli/variant_sequences.py
index b27fd89..14579a5 100644
--- a/isovar/cli/variant_sequences.py
+++ b/isovar/cli/variant_sequences.py
@@ -38,6 +38,11 @@ def add_variant_sequence_args(
         default=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE,
         help="Minimum number of reads supporting a variant sequence (default %(default)s)")
 
+    rna_sequence_group.add_argument(
+        "--overlap-assembly",
+        default=False,
+        action="store_true")
+
     # when cDNA sequence length can be inferred from a protein length then
     # we may want to omit this arg
     if add_sequence_length_arg:
@@ -76,7 +81,8 @@ def variant_sequences_generator_from_args(args):
     return reads_generator_to_sequences_generator(
         allele_reads_generator,
         min_reads_supporting_cdna_sequence=args.min_reads_supporting_variant_sequence,
-        preferred_sequence_length=args.cdna_sequence_length)
+        preferred_sequence_length=args.cdna_sequence_length,
+        overlap_assembly=args.overlap_assembly)
 
 def variant_sequences_dataframe_from_args(args):
     variant_sequences_generator = variant_sequences_generator_from_args(args)
diff --git a/isovar/default_parameters.py b/isovar/default_parameters.py
index b279b05..1bbab69 100644
--- a/isovar/default_parameters.py
+++ b/isovar/default_parameters.py
@@ -72,3 +72,8 @@
 
 # number of protein sequences we want to return per variant
 MAX_PROTEIN_SEQUENCES_PER_VARIANT = 1
+
+# run overlap assembly algorithm to construct variant
+# sequences from multiple reads which only partially
+# overlap (rather than fully spanning a coding sequence)
+VARIANT_CDNA_SEQUENCE_ASSEMBLY = False
diff --git a/isovar/protein_sequences.py b/isovar/protein_sequences.py
index ff3b158..af55d21 100644
--- a/isovar/protein_sequences.py
+++ b/isovar/protein_sequences.py
@@ -28,6 +28,7 @@
     PROTEIN_SEQUENCE_LENGTH,
     MAX_PROTEIN_SEQUENCES_PER_VARIANT,
     MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE,
+    VARIANT_CDNA_SEQUENCE_ASSEMBLY
 )
 from .dataframe_builder import dataframe_from_generator
 from .translation import translate_variant_reads, TranslationKey
@@ -158,7 +159,8 @@ def reads_generator_to_protein_sequences_generator(
         min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE,
         min_transcript_prefix_length=MIN_TRANSCRIPT_PREFIX_LENGTH,
         max_transcript_mismatches=MAX_REFERENCE_TRANSCRIPT_MISMATCHES,
-        max_protein_sequences_per_variant=MAX_PROTEIN_SEQUENCES_PER_VARIANT):
+        max_protein_sequences_per_variant=MAX_PROTEIN_SEQUENCES_PER_VARIANT,
+        variant_cdna_sequence_assembly=VARIANT_CDNA_SEQUENCE_ASSEMBLY):
     """"
     Translates each coding variant in a collection to one or more
     Translation objects, which are then aggregated into equivalent
@@ -190,6 +192,7 @@ def reads_generator_to_protein_sequences_generator(
     max_protein_sequences_per_variant : int
         Number of protein sequences to return for each ProteinSequence
 
+    variant_cdna_sequence_assembly : bool
 
     Yields pairs of a Variant and a list of ProteinSequence objects
     """
@@ -214,7 +217,8 @@ def reads_generator_to_protein_sequences_generator(
             protein_sequence_length=protein_sequence_length,
             min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence,
             min_transcript_prefix_length=min_transcript_prefix_length,
-            max_transcript_mismatches=max_transcript_mismatches)
+            max_transcript_mismatches=max_transcript_mismatches,
+            variant_cdna_sequence_assembly=variant_cdna_sequence_assembly)
 
         protein_sequences = []
         for (key, equivalent_translations) in group_translations(translations).items():
diff --git a/isovar/translation.py b/isovar/translation.py
index fc87fed..d1ec571 100644
--- a/isovar/translation.py
+++ b/isovar/translation.py
@@ -33,6 +33,7 @@
     MAX_REFERENCE_TRANSCRIPT_MISMATCHES,
     PROTEIN_SEQUENCE_LENGTH,
     MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE,
+    VARIANT_CDNA_SEQUENCE_ASSEMBLY,
 )
 from .dataframe_builder import dataframe_from_generator
 
@@ -467,9 +468,9 @@ def translate_variant_sequence(
             logger.warn(
                 ("Truncating amino acid sequence %s from variant sequence %s "
                  "to only %d elements loses all variant residues"),
-                    variant_amino_acids,
-                    variant_sequence,
-                    protein_sequence_length)
+                variant_amino_acids,
+                variant_sequence,
+                protein_sequence_length)
             return None
         # if the protein is too long then short it, which implies we're no longer
         # stopping due to a stop codon and that the variant amino acids might
@@ -542,7 +543,8 @@ def translate_variant_reads(
         transcript_id_whitelist=None,
         min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE,
         min_transcript_prefix_length=MIN_TRANSCRIPT_PREFIX_LENGTH,
-        max_transcript_mismatches=MAX_REFERENCE_TRANSCRIPT_MISMATCHES):
+        max_transcript_mismatches=MAX_REFERENCE_TRANSCRIPT_MISMATCHES,
+        variant_cdna_sequence_assembly=VARIANT_CDNA_SEQUENCE_ASSEMBLY):
     if len(variant_reads) == 0:
         logger.info("No supporting reads for variant %s", variant)
         return []
@@ -554,7 +556,8 @@ def translate_variant_reads(
     variant_sequences = supporting_reads_to_variant_sequences(
         variant_reads=variant_reads,
         preferred_sequence_length=cdna_sequence_length,
-        min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence)
+        min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence,
+        variant_cdna_sequence_assembly=variant_cdna_sequence_assembly)
 
     if not variant_sequences:
         logger.info("No spanning cDNA sequences for variant %s", variant)
@@ -572,7 +575,7 @@ def translate_variant_reads(
     if context_size < min_transcript_prefix_length:
         logger.info(
             "Skipping variant %s, none of the cDNA sequences have sufficient context",
-                variant)
+            variant)
         return []
 
     reference_contexts = reference_contexts_for_variant(
@@ -592,7 +595,8 @@ def translate_variants(
         protein_sequence_length=PROTEIN_SEQUENCE_LENGTH,
         min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE,
         min_transcript_prefix_length=MIN_TRANSCRIPT_PREFIX_LENGTH,
-        max_transcript_mismatches=MAX_REFERENCE_TRANSCRIPT_MISMATCHES):
+        max_transcript_mismatches=MAX_REFERENCE_TRANSCRIPT_MISMATCHES,
+        variant_cdna_sequence_assembly=VARIANT_CDNA_SEQUENCE_ASSEMBLY):
     """
     Translates each coding variant in a collection to one or more protein
     fragment sequences (if the variant is not filtered and its spanning RNA
@@ -636,7 +640,8 @@ def translate_variants(
             transcript_id_whitelist=transcript_id_whitelist,
             min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence,
             min_transcript_prefix_length=min_transcript_prefix_length,
-            max_transcript_mismatches=max_transcript_mismatches)
+            max_transcript_mismatches=max_transcript_mismatches,
+            variant_cdna_sequence_assembly=variant_cdna_sequence_assembly)
         yield variant, translations
 
 def translations_generator_to_dataframe(translations_generator):
diff --git a/isovar/variant_sequences.py b/isovar/variant_sequences.py
index d62fc8e..ba02b58 100644
--- a/isovar/variant_sequences.py
+++ b/isovar/variant_sequences.py
@@ -26,6 +26,7 @@
 from .default_parameters import (
     MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE,
     VARIANT_CDNA_SEQUENCE_LENGTH,
+    VARIANT_CDNA_SEQUENCE_ASSEMBLY,
 )
 from .dataframe_builder import dataframe_from_generator
 from .assembly import iterative_overlap_assembly, collapse_substrings
@@ -112,8 +113,8 @@ def left_overlaps(self, other, min_overlap_size=1):
         )
         prefix_overlap_size = len(self.prefix) - len(other.prefix)
         suffix_overlap_size = len(other.suffix) - len(self.suffix)
-        return sequence_overlaps and (
-            (prefix_overlap_size + suffix_overlap_size) >= min_overlap_size)
+        overlap_size = prefix_overlap_size + suffix_overlap_size + len(self.alt)
+        return sequence_overlaps and overlap_size >= min_overlap_size
 
     def add_reads(self, reads):
         """
@@ -189,7 +190,7 @@ def trim_by_coverage(self, min_reads):
         which are overlapped by fewer reads than specified.
         """
         read_count_array = self.coverage()
-
+        logger.info("%s (len=%d)" % (read_count_array, len(read_count_array)))
         sufficient_coverage_mask = read_count_array >= min_reads
         sufficient_coverage_indices = np.argwhere(sufficient_coverage_mask)
         if len(sufficient_coverage_indices) == 0:
@@ -228,7 +229,7 @@ def trim_by_coverage(self, min_reads):
 def sort_key_decreasing_read_count(variant_sequence):
     return -len(variant_sequence.reads)
 
-def all_variant_sequences_supported_by_variant_reads(
+def initial_variant_sequences_from_reads(
         variant_reads,
         max_nucleotides_before_variant=None,
         max_nucleotides_after_variant=None):
@@ -333,7 +334,8 @@ def filter_variant_sequences(
 def supporting_reads_to_variant_sequences(
         variant_reads,
         preferred_sequence_length,
-        min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE):
+        min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE,
+        variant_cdna_sequence_assembly=VARIANT_CDNA_SEQUENCE_ASSEMBLY):
     """
     Collapse variant-support RNA reads into consensus sequences of approximately
     the preferred length (may differ at the ends of transcripts), filter
@@ -355,6 +357,10 @@ def supporting_reads_to_variant_sequences(
         Drop sequences which don't at least have this number of fully spanning
         reads.
 
+    overlap_assembly : bool
+        Construct variant sequences by merging overlapping reads. If False
+        then variant sequences must be fully spanned by cDNA reads.
+
     Returns a collection of VariantSequence objects
     """
     # just in case variant_reads is a generator, convert it to a list
@@ -377,7 +383,7 @@ def supporting_reads_to_variant_sequences(
     max_nucleotides_before_variant = (
         n_surrounding_nucleotides - max_nucleotides_after_variant)
 
-    variant_sequences = all_variant_sequences_supported_by_variant_reads(
+    variant_sequences = initial_variant_sequences_from_reads(
         variant_reads=variant_reads,
         max_nucleotides_before_variant=max_nucleotides_before_variant,
         max_nucleotides_after_variant=max_nucleotides_after_variant)
@@ -388,7 +394,8 @@ def supporting_reads_to_variant_sequences(
         min(len(s) for s in variant_sequences),
         max(len(s) for s in variant_sequences))
 
-    variant_sequences = iterative_overlap_assembly(variant_sequences)
+    if variant_cdna_sequence_assembly:
+        variant_sequences = iterative_overlap_assembly(variant_sequences)
 
     if variant_sequences:
         logger.info(
@@ -433,7 +440,8 @@ def overlapping_reads_to_variant_sequences(
 def reads_generator_to_sequences_generator(
         variant_and_reads_generator,
         min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE,
-        preferred_sequence_length=VARIANT_CDNA_SEQUENCE_LENGTH):
+        preferred_sequence_length=VARIANT_CDNA_SEQUENCE_LENGTH,
+        variant_cdna_sequence_assembly=VARIANT_CDNA_SEQUENCE_ASSEMBLY):
     """
     For each variant, collect all possible sequence contexts around the
     variant which are spanned by at least min_reads.
@@ -450,6 +458,10 @@ def reads_generator_to_sequences_generator(
     sequence_length : int
         Desired sequence length, including variant nucleotides
 
+    overlap_assembly : bool
+        Construct variant sequences by merging overlapping reads. If False
+        then variant sequences must be fully spanned by cDNA reads.
+
     Yields pairs with the following fields:
         - Variant
         - list of VariantSequence objects
@@ -459,7 +471,8 @@ def reads_generator_to_sequences_generator(
             variant=variant,
             overlapping_reads=variant_reads,
             min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence,
-            preferred_sequence_length=preferred_sequence_length)
+            preferred_sequence_length=preferred_sequence_length,
+            variant_cdna_sequence_assembly=variant_cdna_sequence_assembly)
         yield variant, variant_sequences
 
 def variant_sequences_generator_to_dataframe(variant_sequences_generator):
diff --git a/test/test_assembly.py b/test/test_assembly.py
index a5cf7c6..6929e0c 100644
--- a/test/test_assembly.py
+++ b/test/test_assembly.py
@@ -14,10 +14,8 @@
 
 from __future__ import print_function, division, absolute_import
 
-from isovar.variant_reads import (
-    reads_supporting_variant,
-    all_variant_sequences_supported_by_variant_reads,
-)
+from isovar.variant_reads import reads_supporting_variant
+from isovar.variant_sequences import initial_variant_sequences_from_reads
 from isovar.assembly import iterative_overlap_assembly
 from pyensembl import ensembl_grch38
 from varcode import Variant
@@ -43,7 +41,7 @@ def test_assemble_transcript_fragments_snv():
         chromosome=chromosome,)
 
     sequences = iterative_overlap_assembly(
-        all_variant_sequences_supported_by_variant_reads(variant_reads),
+        initial_variant_sequences_from_reads(variant_reads),
         min_overlap_size=30)
 
     assert len(sequences) > 0
diff --git a/test/test_protein_sequences.py b/test/test_protein_sequences.py
index caf129a..08bb822 100644
--- a/test/test_protein_sequences.py
+++ b/test/test_protein_sequences.py
@@ -166,18 +166,20 @@ def test_variants_to_protein_sequences_dataframe_one_sequence_per_variant():
         samfile=samfile,
         min_mapping_quality=0)
 
-    protein_sequences_generator = reads_generator_to_protein_sequences_generator(
-        allele_reads_generator,
-        max_protein_sequences_per_variant=1)
-    df = protein_sequences_generator_to_dataframe(protein_sequences_generator)
-    print(df)
-    eq_(
-        len(df),
-        len(expressed_variants),
-        "Expected %d/%d entries to have RNA support, got %d" % (
+    for overlap_assembly in [False, True]:
+        protein_sequences_generator = reads_generator_to_protein_sequences_generator(
+            allele_reads_generator,
+            max_protein_sequences_per_variant=1,
+            variant_cdna_sequence_assembly=overlap_assembly)
+        df = protein_sequences_generator_to_dataframe(protein_sequences_generator)
+        print(df)
+        eq_(
+            len(df),
             len(expressed_variants),
-            len(combined_variants),
-            len(df),))
+            "Expected %d/%d entries to have RNA support, got %d" % (
+                len(expressed_variants),
+                len(combined_variants),
+                len(df),))
 
 def test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality():
     # since the B16 BAM has all MAPQ=255 values then all the reads should get dropped
@@ -207,7 +209,7 @@ def test_variants_to_protein_sequences_dataframe_protein_sequence_length():
             "--vcf", data_path("data/b16.f10/b16.vcf"),
             "--bam", data_path("data/b16.f10/b16.combined.sorted.bam"),
             "--max-protein-sequences-per-variant", "1",
-            "--protein-sequence-length", str(desired_length)
+            "--protein-sequence-length", str(desired_length),
         ])
         df = protein_sequences_dataframe_from_args(args)
         eq_(

From b80d3d47d97fe499aaadfc23ddcdb526a7318448 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Mon, 31 Oct 2016 17:51:43 -0400
Subject: [PATCH 03/11] got unit tests with assmembly passing, working on
 adding more

---
 isovar/protein_sequences.py    | 20 ++++++++--
 isovar/variant_sequences.py    |  3 +-
 test/test_assembly.py          |  7 ++++
 test/test_protein_sequences.py | 70 +++++++++++++++++++++++-----------
 4 files changed, 74 insertions(+), 26 deletions(-)

diff --git a/isovar/protein_sequences.py b/isovar/protein_sequences.py
index af55d21..34429c2 100644
--- a/isovar/protein_sequences.py
+++ b/isovar/protein_sequences.py
@@ -21,6 +21,7 @@
 
 from __future__ import print_function, division, absolute_import
 from collections import namedtuple, defaultdict
+import logging
 
 from .default_parameters import (
     MIN_TRANSCRIPT_PREFIX_LENGTH,
@@ -48,6 +49,8 @@
 #
 ##########################
 
+logger = logging.getLogger(__name__)
+
 ProteinSequence = namedtuple(
     "ProteinSequence",
     TranslationKey._fields + (
@@ -66,10 +69,10 @@
         # number of unique read names from all the VariantSequence objects
         # from each translation
         "alt_reads_supporting_protein_sequence",
-        # how many reference transcripts overlap the variant locus?
+        # IDs of transcripts overlapping the variant locus
         "transcripts_overlapping_variant",
-        # how many reference transcripts were used to establish the
-        # reading frame for this protein sequence
+        # IDs of reference transcripts used to establish the reading frame for
+        # this protein sequence
         "transcripts_supporting_protein_sequence",
         # name of gene of the reference transcripts used in Translation
         # objects
@@ -193,6 +196,9 @@ def reads_generator_to_protein_sequences_generator(
         Number of protein sequences to return for each ProteinSequence
 
     variant_cdna_sequence_assembly : bool
+        If True, then assemble variant cDNA sequences based on overlap of
+        RNA reads. If False, then variant cDNA sequences must be fully spanned
+        and contained within RNA reads.
 
     Yields pairs of a Variant and a list of ProteinSequence objects
     """
@@ -222,11 +228,18 @@ def reads_generator_to_protein_sequences_generator(
 
         protein_sequences = []
         for (key, equivalent_translations) in group_translations(translations).items():
+
             # get the variant read names, transcript IDs and gene names for
             # protein sequence we're about to construct
             alt_reads_supporting_protein_sequence, group_transcript_ids, group_gene_names = \
                 summarize_translations(equivalent_translations)
 
+            logger.info(
+                "%s: %s alt reads supporting protein sequence (gene names = %s)",
+                key,
+                len(alt_reads_supporting_protein_sequence),
+                group_gene_names)
+
             protein_sequence = translation_key_to_protein_sequence(
                 translation_key=key,
                 translations=equivalent_translations,
@@ -237,6 +250,7 @@ def reads_generator_to_protein_sequences_generator(
                 transcripts_supporting_protein_sequence=group_transcript_ids,
                 transcripts_overlapping_variant=overlapping_transcript_ids,
                 gene=list(group_gene_names))
+            logger.info("%s: protein sequence = %s" % (key, protein_sequence.amino_acids))
             protein_sequences.append(protein_sequence)
 
         # sort protein sequences before returning the top results
diff --git a/isovar/variant_sequences.py b/isovar/variant_sequences.py
index ba02b58..822607e 100644
--- a/isovar/variant_sequences.py
+++ b/isovar/variant_sequences.py
@@ -190,7 +190,8 @@ def trim_by_coverage(self, min_reads):
         which are overlapped by fewer reads than specified.
         """
         read_count_array = self.coverage()
-        logger.info("%s (len=%d)" % (read_count_array, len(read_count_array)))
+        logger.info("Coverage: %s (len=%d)" % (
+            read_count_array, len(read_count_array)))
         sufficient_coverage_mask = read_count_array >= min_reads
         sufficient_coverage_indices = np.argwhere(sufficient_coverage_mask)
         if len(sufficient_coverage_indices) == 0:
diff --git a/test/test_assembly.py b/test/test_assembly.py
index 6929e0c..1bfe127 100644
--- a/test/test_assembly.py
+++ b/test/test_assembly.py
@@ -16,6 +16,8 @@
 
 from isovar.variant_reads import reads_supporting_variant
 from isovar.variant_sequences import initial_variant_sequences_from_reads
+from isovar.allele_reads import AlleleRead
+from isovar.variant_sequences import VariantSequence
 from isovar.assembly import iterative_overlap_assembly
 from pyensembl import ensembl_grch38
 from varcode import Variant
@@ -57,6 +59,11 @@ def test_assemble_transcript_fragments_snv():
         assert len(s) > max_read_length, \
             "Expected assembled sequences to be longer than read length (%d)" % (
                 max_read_length,)
+"""
+def test_assemble_simple_sequences():
+    AlleleRead
+    VariantSequence
+"""
 
 if __name__ == "__main__":
     test_assemble_transcript_fragments_snv()
diff --git a/test/test_protein_sequences.py b/test/test_protein_sequences.py
index 08bb822..5973203 100644
--- a/test/test_protein_sequences.py
+++ b/test/test_protein_sequences.py
@@ -152,34 +152,58 @@ def test_sort_protein_sequences():
     ]
     eq_(sort_protein_sequences(unsorted_protein_sequences), expected_order)
 
-
-def test_variants_to_protein_sequences_dataframe_one_sequence_per_variant():
-    expressed_variants = load_vcf("data/b16.f10/b16.expressed.vcf")
-    not_expressed_variants = load_vcf("data/b16.f10/b16.not-expressed.vcf")
+def variants_to_protein_sequences_dataframe(
+        expressed_vcf="data/b16.f10/b16.expressed.vcf",
+        not_expressed_vcf="data/b16.f10/b16.not-expressed.vcf",
+        tumor_rna_bam="data/b16.f10/b16.combined.sorted.bam",
+        min_mapping_quality=0,
+        max_protein_sequences_per_variant=1,
+        variant_cdna_sequence_assembly=False):
+    """
+    Helper function to load pair of VCFs and tumor RNA BAM
+    and use them to generate a DataFrame of expressed variant protein
+    sequences.
+    """
+    expressed_variants = load_vcf(expressed_vcf)
+    not_expressed_variants = load_vcf(not_expressed_vcf)
 
     combined_variants = VariantCollection(
         list(expressed_variants) + list(not_expressed_variants))
-    samfile = load_bam("data/b16.f10/b16.combined.sorted.bam")
+    samfile = load_bam(tumor_rna_bam)
 
     allele_reads_generator = reads_overlapping_variants(
         variants=combined_variants,
         samfile=samfile,
-        min_mapping_quality=0)
-
-    for overlap_assembly in [False, True]:
-        protein_sequences_generator = reads_generator_to_protein_sequences_generator(
-            allele_reads_generator,
-            max_protein_sequences_per_variant=1,
-            variant_cdna_sequence_assembly=overlap_assembly)
-        df = protein_sequences_generator_to_dataframe(protein_sequences_generator)
-        print(df)
-        eq_(
-            len(df),
+        min_mapping_quality=min_mapping_quality)
+
+    protein_sequences_generator = reads_generator_to_protein_sequences_generator(
+        allele_reads_generator,
+        max_protein_sequences_per_variant=max_protein_sequences_per_variant,
+        variant_cdna_sequence_assembly=variant_cdna_sequence_assembly)
+    df = protein_sequences_generator_to_dataframe(protein_sequences_generator)
+    return df, expressed_variants, combined_variants
+
+def test_variants_to_protein_sequences_dataframe_one_sequence_per_variant_with_assembly():
+    df, expressed_variants, combined_variants = \
+        variants_to_protein_sequences_dataframe(variant_cdna_sequence_assembly=True)
+    print(df)
+    eq_(len(df),
+        len(expressed_variants),
+        "Expected %d/%d entries to have RNA support, got %d" % (
             len(expressed_variants),
-            "Expected %d/%d entries to have RNA support, got %d" % (
-                len(expressed_variants),
-                len(combined_variants),
-                len(df),))
+            len(combined_variants),
+            len(df)))
+
+def test_variants_to_protein_sequences_dataframe_one_sequence_per_variant_without_assembly():
+    df, expressed_variants, combined_variants = \
+        variants_to_protein_sequences_dataframe(variant_cdna_sequence_assembly=False)
+    print(df)
+    eq_(len(df),
+        len(expressed_variants),
+        "Expected %d/%d entries to have RNA support, got %d" % (
+            len(expressed_variants),
+            len(combined_variants),
+            len(df)))
 
 def test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality():
     # since the B16 BAM has all MAPQ=255 values then all the reads should get dropped
@@ -222,10 +246,12 @@ def test_variants_to_protein_sequences_dataframe_protein_sequence_length():
         protein_sequences = df["amino_acids"]
         print(protein_sequences)
         protien_sequence_lengths = protein_sequences.str.len()
-        assert (protien_sequence_lengths == desired_length).all(), protien_sequence_lengths
+        assert (protien_sequence_lengths == desired_length).all(), (
+            protien_sequence_lengths,)
 
 if __name__ == "__main__":
     test_sort_protein_sequences()
-    test_variants_to_protein_sequences_dataframe_one_sequence_per_variant()
+    test_variants_to_protein_sequences_dataframe_one_sequence_per_variant_with_assembly()
+    test_variants_to_protein_sequences_dataframe_one_sequence_per_variant_without_assembly()
     test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality()
     test_variants_to_protein_sequences_dataframe_protein_sequence_length()

From a8ae1601a49b75672e97acfa9ac827dcd1c7cb9c Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Fri, 4 Nov 2016 13:20:55 -0400
Subject: [PATCH 04/11] splitting reference_context into three modules

---
 isovar/allele_reads.py                        | 195 ++++++------
 isovar/locus_reads.py                         | 274 ++++++++--------
 isovar/protein_sequences.py                   |   4 +-
 isovar/reference_context.py                   | 299 +-----------------
 isovar/reference_sequence_key.py              | 132 ++++++++
 ...ference_sequence_key_with_reading_frame.py | 201 ++++++++++++
 test/test_reference_sequence_key.py           |  29 +-
 7 files changed, 584 insertions(+), 550 deletions(-)
 create mode 100644 isovar/reference_sequence_key.py
 create mode 100644 isovar/reference_sequence_key_with_reading_frame.py

diff --git a/isovar/allele_reads.py b/isovar/allele_reads.py
index 9d76460..37d5a76 100644
--- a/isovar/allele_reads.py
+++ b/isovar/allele_reads.py
@@ -34,121 +34,114 @@
 
 logger = logging.getLogger(__name__)
 
-
 # subclassing from namedtuple to get a lightweight object with built-in
 # hashing and comparison while also being able to add methods
-AlleleReadFields = namedtuple(
-    "AlleleRead",
-    "prefix allele suffix name sequence")
-
-class AlleleRead(AlleleReadFields):
-    def __new__(cls, prefix, allele, suffix, name):
-        # construct sequence from prefix + alt + suffix
-        return AlleleReadFields.__new__(
-            cls,
-            prefix=prefix,
-            allele=allele,
-            suffix=suffix,
-            name=name,
-            sequence=prefix + allele + suffix)
+class AlleleRead(
+        namedtuple("AlleleRead", "prefix allele suffix name sequence")):
+
+    @property
+    def sequence(self):
+        return self.prefix + self.allele + self.suffix
 
     def __len__(self):
         return len(self.prefix) + len(self.allele) + len(self.suffix)
 
-def allele_read_from_locus_read(locus_read, n_ref):
-    """
-    Given a single ReadAtLocus object, return either an AlleleRead or None
-
-    Parameters
-    ----------
-    locus_read : LocusRead
-        Read which overlaps a variant locus but doesn't necessarily contain the
-        alternate nucleotides
-
-    n_ref : int
-        Number of reference positions we are expecting to be modified or
-        deleted (for insertions this should be 0)
-    """
-    sequence = locus_read.sequence
-    reference_positions = locus_read.reference_positions
-
-    # positions of the nucleotides before and after the variant within
-    # the read sequence
-    read_pos_before = locus_read.base0_read_position_before_variant
-    read_pos_after = locus_read.base0_read_position_after_variant
-
-    # positions of the nucleotides before and after the variant on the
-    # reference genome
-    ref_pos_before = reference_positions[read_pos_before]
-
-    if ref_pos_before is None:
-        logger.warn(
-            "Missing reference pos for nucleotide before variant on read: %s",
-            locus_read)
-        return None
-
-    ref_pos_after = reference_positions[read_pos_after]
-
-    if ref_pos_after is None:
-        logger.warn(
-            "Missing reference pos for nucleotide after variant on read: %s",
-            locus_read)
-        return None
-
-    if n_ref == 0:
-        if ref_pos_after - ref_pos_before != 1:
-            # if the number of nucleotides skipped isn't the same
-            # as the number of reference nucleotides in the variant then
-            # don't use this read
-            logger.debug(
-                "Positions before (%d) and after (%d) variant should be adjacent on read %s",
-                ref_pos_before,
-                ref_pos_after,
+    @classmethod
+    def from_locus_read(cls, locus_read, n_ref):
+        """
+        Given a single LocusRead object, return either an AlleleRead or None
+
+        Parameters
+        ----------
+        locus_read : LocusRead
+            Read which overlaps a variant locus but doesn't necessarily contain the
+            alternate nucleotides
+
+        n_ref : int
+            Number of reference positions we are expecting to be modified or
+            deleted (for insertions this should be 0)
+        """
+        sequence = locus_read.sequence
+        reference_positions = locus_read.reference_positions
+
+        # positions of the nucleotides before and after the variant within
+        # the read sequence
+        read_pos_before = locus_read.base0_read_position_before_variant
+        read_pos_after = locus_read.base0_read_position_after_variant
+
+        # positions of the nucleotides before and after the variant on the
+        # reference genome
+        ref_pos_before = reference_positions[read_pos_before]
+
+        if ref_pos_before is None:
+            logger.warn(
+                "Missing reference pos for nucleotide before variant on read: %s",
                 locus_read)
             return None
 
-        # insertions require a sequence of non-aligned bases
-        # followed by the subsequence reference position
-        ref_positions_for_inserted = reference_positions[
-            read_pos_before + 1:read_pos_after]
-        if any(insert_pos is not None for insert_pos in ref_positions_for_inserted):
-            # all these inserted nucleotides should *not* align to the
-            # reference
-            logger.debug(
-                "Skipping read, inserted nucleotides shouldn't map to reference")
-            return None
-    else:
-        # substitutions and deletions
-        if ref_pos_after - ref_pos_before != n_ref + 1:
-            # if the number of nucleotides skipped isn't the same
-            # as the number of reference nucleotides in the variant then
-            # don't use this read
-            logger.debug(
-                "Positions before (%d) and after (%d) variant should be adjacent on read %s",
-                ref_pos_before,
-                ref_pos_after,
+        ref_pos_after = reference_positions[read_pos_after]
+
+        if ref_pos_after is None:
+            logger.warn(
+                "Missing reference pos for nucleotide after variant on read: %s",
                 locus_read)
             return None
 
-    nucleotides_at_variant_locus = sequence[read_pos_before + 1:read_pos_after]
-
-    prefix = sequence[:read_pos_before + 1]
-    suffix = sequence[read_pos_after:]
-
-    prefix, suffix = convert_from_bytes_if_necessary(prefix, suffix)
-    prefix, suffix = trim_N_nucleotides(prefix, suffix)
-
-    return AlleleRead(
-        prefix,
-        nucleotides_at_variant_locus,
-        suffix,
-        name=locus_read.name)
+        if n_ref == 0:
+            if ref_pos_after - ref_pos_before != 1:
+                # if the number of nucleotides skipped isn't the same
+                # as the number of reference nucleotides in the variant then
+                # don't use this read
+                logger.debug(
+                    "Positions before (%d) and after (%d) variant should be adjacent on read %s",
+                    ref_pos_before,
+                    ref_pos_after,
+                    locus_read)
+                return None
+
+            # insertions require a sequence of non-aligned bases
+            # followed by the subsequence reference position
+            ref_positions_for_inserted = reference_positions[
+                read_pos_before + 1:read_pos_after]
+            if any(insert_pos is not None for insert_pos in ref_positions_for_inserted):
+                # all these inserted nucleotides should *not* align to the
+                # reference
+                logger.debug(
+                    "Skipping read, inserted nucleotides shouldn't map to reference")
+                return None
+        else:
+            # substitutions and deletions
+            if ref_pos_after - ref_pos_before != n_ref + 1:
+                # if the number of nucleotides skipped isn't the same
+                # as the number of reference nucleotides in the variant then
+                # don't use this read
+                logger.debug(
+                    ("Positions before (%d) and after (%d) variant should be "
+                     "adjacent on read %s"),
+                    ref_pos_before,
+                    ref_pos_after,
+                    locus_read)
+                return None
+
+        nucleotides_at_variant_locus = sequence[read_pos_before + 1:read_pos_after]
+
+        prefix = sequence[:read_pos_before + 1]
+        suffix = sequence[read_pos_after:]
+
+        prefix, suffix = convert_from_bytes_if_necessary(prefix, suffix)
+        prefix, suffix = trim_N_nucleotides(prefix, suffix)
+
+        return cls(
+            prefix,
+            nucleotides_at_variant_locus,
+            suffix,
+            name=locus_read.name)
 
 def allele_reads_from_locus_reads(locus_reads, n_ref):
     """
-    Given a collection of ReadAtLocus objects, returns a
-    list of VariantRead objects (which are split into prefix/variant/suffix
-    nucleotides).
+    Given a collection of LocusRead objects, returns a
+    list of AlleleRead objects
+    (which are split into prefix/allele/suffix nucleotide strings).
 
     Parameters
     ----------
@@ -161,7 +154,7 @@ def allele_reads_from_locus_reads(locus_reads, n_ref):
     """
 
     for locus_read in locus_reads:
-        allele_read = allele_read_from_locus_read(locus_read, n_ref)
+        allele_read = AlleleRead.from_locus_read(locus_read, n_ref)
         if allele_read is None:
             continue
         else:
diff --git a/isovar/locus_reads.py b/isovar/locus_reads.py
index 971595b..f5f8a81 100644
--- a/isovar/locus_reads.py
+++ b/isovar/locus_reads.py
@@ -33,154 +33,154 @@
 
 logger = logging.getLogger(__name__)
 
-
-LocusRead = namedtuple(
-    "LocusRead",
-    [
+class LocusRead(namedtuple("LocusRead", [
         "name",
         "sequence",
         "reference_positions",
         "quality_scores",
         "base0_read_position_before_variant",
-        "base0_read_position_after_variant",
-    ])
-
-def create_locus_read_from_pysam_pileup_element(
-        pileup_element,
-        base0_position_before_variant,
-        base0_position_after_variant,
-        use_secondary_alignments,
-        use_duplicate_reads,
-        min_mapping_quality):
-    """
-    Parameters
-    ----------
-    pileup_element : pysam.PileupRead
-
-    base0_position_before_variant : int
+        "base0_read_position_after_variant"])):
 
-    base0_position_after_variant : int
-
-    use_secondary_alignments : bool
-
-    use_duplicate_reads : bool
-
-    min_mapping_quality : int
-
-    Returns LocusRead or None
-    """
-    read = pileup_element.alignment
-
-    # For future reference,  may get overlapping reads
-    # which can be identified by having the same name
-    name = read.query_name
-
-    if name is None:
-        logger.warn("Read missing name at position %d",
-            base0_position_before_variant + 1)
-        return None
-
-    if read.is_unmapped:
-        logger.warn(
-            "How did we get unmapped read '%s' in a pileup?", name)
-        return None
-
-    if pileup_element.is_refskip:
-        # if read sequence doesn't actually align to the reference
-        # base before a variant, skip it
-        logger.debug("Skipping pileup element with CIGAR alignment N (intron)")
-        return None
-    elif pileup_element.is_del:
-        logger.debug(
-            "Skipping deletion at position %d (read name = %s)",
+    @classmethod
+    def from_pysam_pileup_element(
+            cls,
+            pileup_element,
+            base0_position_before_variant,
+            base0_position_after_variant,
+            use_secondary_alignments,
+            use_duplicate_reads,
+            min_mapping_quality):
+        """
+        Parameters
+        ----------
+        pileup_element : pysam.PileupRead
+
+        base0_position_before_variant : int
+
+        base0_position_after_variant : int
+
+        use_secondary_alignments : bool
+
+        use_duplicate_reads : bool
+
+        min_mapping_quality : int
+
+        Returns LocusRead or None
+        """
+        read = pileup_element.alignment
+
+        # For future reference,  may get overlapping reads
+        # which can be identified by having the same name
+        name = read.query_name
+
+        if name is None:
+            logger.warn(
+                "Read missing name at position %d",
+                base0_position_before_variant + 1)
+            return None
+
+        if read.is_unmapped:
+            logger.warn(
+                "How did we get unmapped read '%s' in a pileup?", name)
+            return None
+
+        if pileup_element.is_refskip:
+            # if read sequence doesn't actually align to the reference
+            # base before a variant, skip it
+            logger.debug("Skipping pileup element with CIGAR alignment N (intron)")
+            return None
+        elif pileup_element.is_del:
+            logger.debug(
+                "Skipping deletion at position %d (read name = %s)",
                 base0_position_before_variant + 1,
                 name)
-        return None
+            return None
 
-    if read.is_secondary and not use_secondary_alignments:
-        logger.debug("Skipping secondary alignment of read '%s'", name)
-        return None
+        if read.is_secondary and not use_secondary_alignments:
+            logger.debug("Skipping secondary alignment of read '%s'", name)
+            return None
 
-    if read.is_duplicate and not use_duplicate_reads:
-        logger.debug("Skipping duplicate read '%s'", name)
-        return None
+        if read.is_duplicate and not use_duplicate_reads:
+            logger.debug("Skipping duplicate read '%s'", name)
+            return None
 
-    mapping_quality = read.mapping_quality
+        mapping_quality = read.mapping_quality
 
-    missing_mapping_quality = mapping_quality is None
-    if min_mapping_quality > 0 and missing_mapping_quality:
-        logger.debug("Skipping read '%s' due to missing MAPQ", name)
-        return None
-    elif mapping_quality < min_mapping_quality:
-        logger.debug(
-            "Skipping read '%s' due to low MAPQ: %d < %d",
+        missing_mapping_quality = mapping_quality is None
+        if min_mapping_quality > 0 and missing_mapping_quality:
+            logger.debug("Skipping read '%s' due to missing MAPQ", name)
+            return None
+        elif mapping_quality < min_mapping_quality:
+            logger.debug(
+                "Skipping read '%s' due to low MAPQ: %d < %d",
                 read.mapping_quality,
                 mapping_quality,
                 min_mapping_quality)
-        return None
-
-    sequence = read.query_sequence
-
-    if sequence is None:
-        logger.warn("Read '%s' missing sequence", name)
-        return None
-
-    base_qualities = read.query_qualities
-
-    if base_qualities is None:
-        logger.warn("Read '%s' missing base qualities", name)
-        return None
-
-    #
-    # Documentation for pysam.AlignedSegment.get_reference_positions:
-    # ------------------------------------------------------------------
-    # By default, this method only returns positions in the reference
-    # that are within the alignment. If full_length is set, None values
-    # will be included for any soft-clipped or unaligned positions
-    # within the read. The returned list will thus be of the same length
-    # as the read.
-    #
-    # Source:
-    # http://pysam.readthedocs.org/en/latest/
-    # api.html#pysam.AlignedSegment.get_reference_positions
-    #
-    # We want a None value for every read position that does not have a
-    # corresponding reference position.
-    reference_positions = read.get_reference_positions(
-        full_length=True)
-
-    # pysam uses base-0 positions everywhere except region strings
-    # Source:
-    # http://pysam.readthedocs.org/en/latest/faq.html#pysam-coordinates-are-wrong
-    if base0_position_before_variant not in reference_positions:
-        logger.debug(
-            "Skipping read '%s' because first position %d not mapped",
+            return None
+
+        sequence = read.query_sequence
+
+        if sequence is None:
+            logger.warn("Read '%s' missing sequence", name)
+            return None
+
+        base_qualities = read.query_qualities
+
+        if base_qualities is None:
+            logger.warn("Read '%s' missing base qualities", name)
+            return None
+
+        #
+        # Documentation for pysam.AlignedSegment.get_reference_positions:
+        # ------------------------------------------------------------------
+        # By default, this method only returns positions in the reference
+        # that are within the alignment. If full_length is set, None values
+        # will be included for any soft-clipped or unaligned positions
+        # within the read. The returned list will thus be of the same length
+        # as the read.
+        #
+        # Source:
+        # http://pysam.readthedocs.org/en/latest/
+        # api.html#pysam.AlignedSegment.get_reference_positions
+        #
+        # We want a None value for every read position that does not have a
+        # corresponding reference position.
+        reference_positions = read.get_reference_positions(
+            full_length=True)
+
+        # pysam uses base-0 positions everywhere except region strings
+        # Source:
+        # http://pysam.readthedocs.org/en/latest/faq.html#pysam-coordinates-are-wrong
+        if base0_position_before_variant not in reference_positions:
+            logger.debug(
+                "Skipping read '%s' because first position %d not mapped",
                 name,
                 base0_position_before_variant)
-        return None
-    else:
-        base0_read_position_before_variant = reference_positions.index(
-            base0_position_before_variant)
-
-    if base0_position_after_variant not in reference_positions:
-        logger.debug(
-            "Skipping read '%s' because last position %d not mapped",
+            return None
+        else:
+            base0_read_position_before_variant = reference_positions.index(
+                base0_position_before_variant)
+
+        if base0_position_after_variant not in reference_positions:
+            logger.debug(
+                "Skipping read '%s' because last position %d not mapped",
                 name,
                 base0_position_after_variant)
-        return None
-    else:
-        base0_read_position_after_variant = reference_positions.index(
-            base0_position_after_variant)
-
-    if isinstance(sequence, bytes):
-        sequence = sequence.decode('ascii')
-    return LocusRead(
-        name=name,
-        sequence=sequence,
-        reference_positions=reference_positions,
-        quality_scores=base_qualities,
-        base0_read_position_before_variant=base0_read_position_before_variant,
-        base0_read_position_after_variant=base0_read_position_after_variant)
+            return None
+        else:
+            base0_read_position_after_variant = reference_positions.index(
+                base0_position_after_variant)
+
+        if isinstance(sequence, bytes):
+            sequence = sequence.decode('ascii')
+
+        return cls(
+            name=name,
+            sequence=sequence,
+            reference_positions=reference_positions,
+            quality_scores=base_qualities,
+            base0_read_position_before_variant=base0_read_position_before_variant,
+            base0_read_position_after_variant=base0_read_position_after_variant)
 
 def pileup_reads_at_position(samfile, chromosome, base0_position):
     """
@@ -250,9 +250,9 @@ def locus_read_generator(
 
     logger.debug(
         "Gathering reads at locus %s: %d-%d",
-            chromosome,
-            base1_position_before_variant,
-            base1_position_after_variant)
+        chromosome,
+        base1_position_before_variant,
+        base1_position_after_variant)
     base0_position_before_variant = base1_position_before_variant - 1
     base0_position_after_variant = base1_position_after_variant - 1
 
@@ -267,7 +267,7 @@ def locus_read_generator(
             samfile=samfile,
             chromosome=chromosome,
             base0_position=base0_position_before_variant):
-        read = create_locus_read_from_pysam_pileup_element(
+        read = LocusRead.from_pysam_pileup_element(
             pileup_element,
             base0_position_before_variant=base0_position_before_variant,
             base0_position_after_variant=base0_position_after_variant,
@@ -281,10 +281,10 @@ def locus_read_generator(
 
     logger.info(
         "Found %d reads overlapping locus %s: %d-%d",
-            count,
-            chromosome,
-            base1_position_before_variant,
-            base1_position_after_variant)
+        count,
+        chromosome,
+        base1_position_before_variant,
+        base1_position_after_variant)
 
 def locus_reads_dataframe(*args, **kwargs):
     """
diff --git a/isovar/protein_sequences.py b/isovar/protein_sequences.py
index 34429c2..0934310 100644
--- a/isovar/protein_sequences.py
+++ b/isovar/protein_sequences.py
@@ -135,7 +135,9 @@ def summarize_translations(translations):
 
 def protein_sequence_sort_key(protein_sequence):
     """
-    Sort protein sequences lexicographically by three criteria:
+    Sort protein sequences lexicographically by six criteria:
+        - TODO: min number of reads covering each nucleotide of
+          the protein sequence >= 2
         - number of unique supporting reads
         - minimum mismatch versus a supporting reference transcript
         - number of supporting reference transcripts
diff --git a/isovar/reference_context.py b/isovar/reference_context.py
index 2e8dbd3..a023984 100644
--- a/isovar/reference_context.py
+++ b/isovar/reference_context.py
@@ -16,66 +16,14 @@
 from collections import namedtuple, OrderedDict, defaultdict
 import logging
 
-from skbio import DNA
-
 from .effect_prediction import reference_transcripts_for_variant
-from .variant_helpers import interbase_range_affected_by_variant_on_transcript
 from .dataframe_builder import DataFrameBuilder
-
+from .reference_sequence_key_with_reading_frame import (
+    ReferenceSequenceKeyWithReadingFrame,)
 
 logger = logging.getLogger(__name__)
 
 
-##########################
-#
-# SequenceKey
-# -----------
-#
-# Used to identify and group the distinct sequences occurring  on a set of
-# transcripts overlapping a variant locus
-#
-##########################
-
-SequenceKey = namedtuple(
-    "SequenceKey", [
-        "strand",
-        "sequence_before_variant_locus",
-        "sequence_at_variant_locus",
-        "sequence_after_variant_locus"
-    ]
-)
-
-
-##########################
-#
-# SequenceKeyWithReadingFrame
-# ---------------------------
-#
-# Includes all the fields of a SequenceKey, but also includes a reading frame
-# at the start of the reference sequence.
-#
-#
-##########################
-
-SequenceKeyWithReadingFrame = namedtuple(
-    "ReferenceContext", SequenceKey._fields + (
-        # if the reference context includes the 5' UTR then
-        # this is the offset to the start codon, otherwise it's the
-        # offset needed to get the first base of a codon
-        "offset_to_first_complete_codon",
-        # does this context overlap a start codon?
-        "overlaps_start_codon",
-        # does this context contain the whole trinucleotide start codon?
-        "contains_start_codon",
-        # does this context contain any UTR bases?
-        "contains_five_prime_utr",
-        # translation of complete codons in the reference context
-        # before the variant
-        "amino_acids_before_variant",
-    )
-)
-
-
 ##########################
 #
 # ReferenceContext
@@ -89,250 +37,11 @@
 
 ReferenceContext = namedtuple(
     "ReferenceContext",
-    SequenceKeyWithReadingFrame._fields + (
+    ReferenceSequenceKeyWithReadingFrame._fields + (
         "variant",
         "transcripts")
 )
 
-def variant_matches_reference_sequence(variant, ref_seq_on_transcript, strand):
-    """
-    Make sure that reference nucleotides we expect to see on the reference
-    transcript from a variant are the same ones we encounter.
-    """
-    if strand == "-":
-        ref_seq_on_transcript = str(
-            DNA(ref_seq_on_transcript).reverse_complement())
-
-    return ref_seq_on_transcript == variant.ref
-
-def sequence_key_for_variant_on_transcript(variant, transcript, context_size):
-    """
-    Extracts the reference sequence around a variant locus on a particular
-    transcript.
-
-    Parameters
-    ----------
-    variant : varcode.Variant
-
-    transcript : pyensembl.Transcript
-
-    context_size : int
-
-    Returns SequenceKey object with the following fields:
-        - strand
-        - sequence_before_variant_locus
-        - sequence_at_variant_locus
-        - sequence_after_variant_locus
-
-    Can also return None if Transcript lacks sufficiently long sequence
-    """
-
-    full_sequence = transcript.sequence
-
-    if full_sequence is None:
-        logger.warn(
-            "Expected transcript %s (overlapping %s) to have sequence",
-                transcript,
-                variant)
-        return None
-
-    if len(full_sequence) < 6:
-        # need at least 6 nucleotides for a start and stop codon
-        logger.warn(
-            "Sequence of %s (overlapping %s) too short: %d",
-                transcript,
-                variant,
-                len(full_sequence))
-        return None
-
-    # get the interbase range of offsets which capture all reference
-    # bases modified by the variant
-    variant_start_offset, variant_end_offset = \
-        interbase_range_affected_by_variant_on_transcript(
-            variant=variant,
-            transcript=transcript)
-
-    logger.info("Interbase offset range on %s for variant %s = %d:%d",
-        transcript,
-        variant,
-        variant_start_offset,
-        variant_end_offset)
-
-    prefix = full_sequence[
-        max(0, variant_start_offset - context_size):
-        variant_start_offset]
-
-    suffix = full_sequence[
-        variant_end_offset:
-        variant_end_offset + context_size]
-
-    ref_nucleotides_at_variant = full_sequence[
-        variant_start_offset:variant_end_offset]
-    if not variant_matches_reference_sequence(
-            variant=variant,
-            strand=transcript.strand,
-            ref_seq_on_transcript=ref_nucleotides_at_variant):
-        # TODO: once we're more confident about other logic in isovar,
-        # change this to a warning and return None to allow for modest
-        # differences between reference sequence patches, since
-        # GRCh38.p1 may differ at some positions from GRCh38.p5
-        raise ValueError(
-            "Wrong reference sequence for variant %s on transcript %s" % (
-                variant,
-                transcript))
-    return SequenceKey(
-        strand=transcript.strand,
-        sequence_before_variant_locus=prefix,
-        sequence_at_variant_locus=ref_nucleotides_at_variant,
-        sequence_after_variant_locus=suffix)
-
-
-def reading_frame_to_offset(reading_frame_at_start_of_sequence):
-    """
-    Given a reading frame (how many nucleotides into a codon) at the
-    start of a cDNA sequence, return the number of nucleotides which need
-    to be trimmed to start on a complete codon.
-
-    Parameters
-    ----------
-
-    reading_frame_at_start_of_sequence : int
-
-    Returns an int
-    """
-    if reading_frame_at_start_of_sequence < 0:
-        raise ValueError("Reading frame can't be negative: %d" % (
-            reading_frame_at_start_of_sequence,))
-    elif reading_frame_at_start_of_sequence > 2:
-        raise ValueError("Reading frame must be within 0 and 2, not %d" % (
-            reading_frame_at_start_of_sequence,))
-    # If we're 1 nucleotide into the codon then we need to shift
-    # over two more to restore the ORF. Likewise, if we're 2 nucleotides in
-    # then we have to shift over one more.
-    return (3 - reading_frame_at_start_of_sequence) % 3
-
-
-def sequence_key_with_reading_frame_for_variant_on_transcript(
-        variant,
-        transcript,
-        context_size):
-    """
-    Extracts the reference sequence around a variant locus on a particular
-    transcript and determines the reading frame at the start of that
-    sequence context.
-
-    Parameters
-    ----------
-    variant : varcode.Variant
-
-    transcript : pyensembl.Transcript
-
-    context_size : int
-
-    Returns SequenceKeyWithReadingFrame object or None if Transcript lacks
-    coding sequence, protein sequence or annotated start/stop codons.
-    """
-    if not transcript.contains_start_codon:
-        logger.info(
-            "Expected transcript %s for variant %s to have start codon",
-                transcript,
-                variant)
-        return None
-
-    if not transcript.contains_stop_codon:
-        logger.info(
-            "Expected transcript %s for variant %s to have stop codon",
-                transcript,
-                variant)
-        return None
-
-    if not transcript.protein_sequence:
-        logger.info(
-            "Expected transript %s for variant %s to have protein sequence",
-                transcript,
-                variant)
-        return None
-
-    sequence_key = sequence_key_for_variant_on_transcript(
-        variant=variant,
-        transcript=transcript,
-        context_size=context_size)
-
-    if sequence_key is None:
-        logger.info("No sequence key for variant %s on transcript %s",
-            variant,
-            transcript)
-        return None
-
-    # get the interbase range of offsets which capture all reference
-    # bases modified by the variant
-    variant_start_offset, variant_end_offset = \
-        interbase_range_affected_by_variant_on_transcript(
-            variant=variant,
-            transcript=transcript)
-
-    start_codon_idx = min(transcript.start_codon_spliced_offsets)
-
-    # skip any variants which occur in the 5' UTR or overlap the start codon
-    # since
-    #   (1) UTR variants have unclear coding effects and
-    #   (2) start-loss variants may result in a new start codon / reading
-    #       frame but we're not sure which one!
-    if variant_start_offset < start_codon_idx + 3:
-        logger.info(
-            "Skipping transcript %s for variant %s, must be after start codon",
-                transcript,
-                variant)
-        return None
-
-    stop_codon_idx = min(transcript.stop_codon_spliced_offsets)
-
-    # skip variants which affect the 3' UTR of the transcript since
-    # they don't have obvious coding effects on the protein sequence
-    if variant_start_offset >= stop_codon_idx + 3:
-        logger.info(
-            "Skipping transcript %s for variant %s, occurs in 3' UTR",
-                transcript,
-                variant)
-        return None
-
-    n_prefix = len(sequence_key.sequence_before_variant_locus)
-    prefix_start_idx = variant_start_offset - n_prefix
-    n_bases_between_start_and_variant = variant_start_offset - start_codon_idx
-    n_full_codons_before_variant = n_bases_between_start_and_variant // 3
-
-    # if the sequence before the variant contains more bases than the
-    # distance to the start codon, then by definition it must contain
-    # some untranslated bases
-    contains_five_prime_utr = (n_prefix > n_bases_between_start_and_variant)
-    # allows for the possibility that the first base in the sequence might
-    # be the first nucleotide of the start codon
-    contains_start_codon = (n_prefix >= n_bases_between_start_and_variant)
-    # the sequence context might only include the 2nd or 3rd bases of
-    # the start codon
-    overlaps_start_codon = (n_prefix > n_bases_between_start_and_variant - 3)
-
-    if contains_start_codon:
-        offset_to_first_complete_codon = start_codon_idx - prefix_start_idx
-        amino_acids_before_variant = transcript.protein_sequence[:n_full_codons_before_variant]
-    else:
-        reading_frame = (prefix_start_idx - start_codon_idx) % 3
-        offset_to_first_complete_codon = reading_frame_to_offset(reading_frame)
-        n_codons_in_prefix = (n_prefix - offset_to_first_complete_codon) // 3
-        amino_acids_before_variant = transcript.protein_sequence[
-            n_full_codons_before_variant - n_codons_in_prefix:
-            n_full_codons_before_variant]
-    return SequenceKeyWithReadingFrame(
-        strand=sequence_key.strand,
-        sequence_before_variant_locus=sequence_key.sequence_before_variant_locus,
-        sequence_at_variant_locus=sequence_key.sequence_at_variant_locus,
-        sequence_after_variant_locus=sequence_key.sequence_after_variant_locus,
-        offset_to_first_complete_codon=offset_to_first_complete_codon,
-        contains_start_codon=contains_start_codon,
-        overlaps_start_codon=overlaps_start_codon,
-        contains_five_prime_utr=contains_five_prime_utr,
-        amino_acids_before_variant=amino_acids_before_variant)
-
 def sort_key_decreasing_max_length_transcript_cds(reference_context):
     """
     Used to sort a sequence of ReferenceContext objects by the longest CDS
@@ -367,7 +76,7 @@ def reference_contexts_for_variant(
 
     for transcript in overlapping_transcripts:
         sequence_key_with_reading_frame = \
-            sequence_key_with_reading_frame_for_variant_on_transcript(
+            ReferenceSequenceKeyWithReadingFrame.from_variant_and_transcript(
                 variant=variant,
                 transcript=transcript,
                 context_size=context_size)
diff --git a/isovar/reference_sequence_key.py b/isovar/reference_sequence_key.py
new file mode 100644
index 0000000..5ade65d
--- /dev/null
+++ b/isovar/reference_sequence_key.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2016. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function, division, absolute_import
+import logging
+from collections import namedtuple
+
+from skbio import DNA
+
+from .variant_helpers import interbase_range_affected_by_variant_on_transcript
+
+logger = logging.getLogger(__name__)
+
+##########################
+#
+# ReferenceSequenceKey
+# -----------
+#
+# Used to identify and group the distinct sequences occurring on a set of
+# transcripts overlapping a variant locus
+#
+##########################
+
+class ReferenceSequenceKey(namedtuple("ReferenceSequenceKey", [
+        "strand",
+        "sequence_before_variant_locus",
+        "sequence_at_variant_locus",
+        "sequence_after_variant_locus"])):
+
+    @classmethod
+    def from_variant_and_transcript(cls, variant, transcript, context_size):
+        """
+        Extracts the reference sequence around a variant locus on a particular
+        transcript.
+
+        Parameters
+        ----------
+        variant : varcode.Variant
+
+        transcript : pyensembl.Transcript
+
+        context_size : int
+
+        Returns SequenceKey object with the following fields:
+            - strand
+            - sequence_before_variant_locus
+            - sequence_at_variant_locus
+            - sequence_after_variant_locus
+
+        Can also return None if Transcript lacks sufficiently long sequence
+        """
+        full_transcript_sequence = transcript.sequence
+
+        if full_transcript_sequence is None:
+            logger.warn(
+                "Expected transcript %s (overlapping %s) to have sequence",
+                transcript,
+                variant)
+            return None
+
+        if len(full_transcript_sequence) < 6:
+            # need at least 6 nucleotides for a start and stop codon
+            logger.warn(
+                "Sequence of %s (overlapping %s) too short: %d",
+                transcript,
+                variant,
+                len(full_transcript_sequence))
+            return None
+
+        # get the interbase range of offsets which capture all reference
+        # bases modified by the variant
+        variant_start_offset, variant_end_offset = \
+            interbase_range_affected_by_variant_on_transcript(
+                variant=variant,
+                transcript=transcript)
+
+        logger.info(
+            "Interbase offset range on %s for variant %s = %d:%d",
+            transcript,
+            variant,
+            variant_start_offset,
+            variant_end_offset)
+
+        prefix = full_transcript_sequence[
+            max(0, variant_start_offset - context_size):
+            variant_start_offset]
+
+        suffix = full_transcript_sequence[
+            variant_end_offset:
+            variant_end_offset + context_size]
+
+        ref_nucleotides_at_variant = full_transcript_sequence[
+            variant_start_offset:variant_end_offset]
+        if not variant_matches_reference_sequence(
+                variant=variant,
+                strand=transcript.strand,
+                ref_seq_on_transcript=ref_nucleotides_at_variant):
+            # TODO: once we're more confident about other logic in isovar,
+            # change this to a warning and return None to allow for modest
+            # differences between reference sequence patches, since
+            # GRCh38.p1 may differ at some positions from GRCh38.p5
+            raise ValueError(
+                "Wrong reference sequence for variant %s on transcript %s" % (
+                    variant,
+                    transcript))
+        return cls(
+            strand=transcript.strand,
+            sequence_before_variant_locus=prefix,
+            sequence_at_variant_locus=ref_nucleotides_at_variant,
+            sequence_after_variant_locus=suffix)
+
+
+def variant_matches_reference_sequence(variant, ref_seq_on_transcript, strand):
+    """
+    Make sure that reference nucleotides we expect to see on the reference
+    transcript from a variant are the same ones we encounter.
+    """
+    if strand == "-":
+        ref_seq_on_transcript = str(
+            DNA(ref_seq_on_transcript).reverse_complement())
+    return ref_seq_on_transcript == variant.ref
diff --git a/isovar/reference_sequence_key_with_reading_frame.py b/isovar/reference_sequence_key_with_reading_frame.py
new file mode 100644
index 0000000..d79d2a5
--- /dev/null
+++ b/isovar/reference_sequence_key_with_reading_frame.py
@@ -0,0 +1,201 @@
+# Copyright (c) 2016. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function, division, absolute_import
+import logging
+from collections import namedtuple
+
+from .variant_helpers import interbase_range_affected_by_variant_on_transcript
+from .reference_sequence_key import ReferenceSequenceKey
+
+logger = logging.getLogger(__name__)
+
+
+##########################
+#
+# ReferenceSequenceKeyWithReadingFrame
+# ---------------------------
+#
+# Includes all the fields of a ReferenceSequenceKey, but also includes a reading frame
+# at the start of the reference sequence.
+#
+#
+##########################
+
+class SequenceKeyWithReadingFrame(
+        namedtuple(
+            "ReferenceSequenceKeyWithReadingFrame",
+            ReferenceSequenceKey._fields + (
+                # if the reference context includes the 5' UTR then
+                # this is the offset to the start codon, otherwise it's the
+                # offset needed to get the first base of a codon
+                "offset_to_first_complete_codon",
+                # does this context overlap a start codon?
+                "overlaps_start_codon",
+                # does this context contain the whole trinucleotide start codon?
+                "contains_start_codon",
+                # does this context contain any UTR bases?
+                "contains_five_prime_utr",
+                # translation of complete codons in the reference context
+                # before the variant
+                "amino_acids_before_variant"))):
+
+    @classmethod
+    def from_variant_and_transcript(
+            cls,
+            variant,
+            transcript,
+            context_size):
+        """
+        Extracts the reference sequence around a variant locus on a particular
+        transcript and determines the reading frame at the start of that
+        sequence context.
+
+        Parameters
+        ----------
+        variant : varcode.Variant
+
+        transcript : pyensembl.Transcript
+
+        context_size : int
+
+        Returns SequenceKeyWithReadingFrame object or None if Transcript lacks
+        coding sequence, protein sequence or annotated start/stop codons.
+        """
+        if not transcript.contains_start_codon:
+            logger.info(
+                "Expected transcript %s for variant %s to have start codon",
+                transcript,
+                variant)
+            return None
+
+        if not transcript.contains_stop_codon:
+            logger.info(
+                "Expected transcript %s for variant %s to have stop codon",
+                transcript,
+                variant)
+            return None
+
+        if not transcript.protein_sequence:
+            logger.info(
+                "Expected transript %s for variant %s to have protein sequence",
+                transcript,
+                variant)
+            return None
+
+        sequence_key = ReferenceSequenceKey.from_variant_and_transcript(
+            variant=variant,
+            transcript=transcript,
+            context_size=context_size)
+
+        if sequence_key is None:
+            logger.info(
+                "No sequence key for variant %s on transcript %s",
+                variant,
+                transcript)
+            return None
+
+        # get the interbase range of offsets which capture all reference
+        # bases modified by the variant
+        variant_start_offset, variant_end_offset = \
+            interbase_range_affected_by_variant_on_transcript(
+                variant=variant,
+                transcript=transcript)
+
+        start_codon_idx = min(transcript.start_codon_spliced_offsets)
+
+        # skip any variants which occur in the 5' UTR or overlap the start codon
+        # since
+        #   (1) UTR variants have unclear coding effects and
+        #   (2) start-loss variants may result in a new start codon / reading
+        #       frame but we're not sure which one!
+        if variant_start_offset < start_codon_idx + 3:
+            logger.info(
+                "Skipping transcript %s for variant %s, must be after start codon",
+                transcript,
+                variant)
+            return None
+
+        stop_codon_idx = min(transcript.stop_codon_spliced_offsets)
+
+        # skip variants which affect the 3' UTR of the transcript since
+        # they don't have obvious coding effects on the protein sequence
+        if variant_start_offset >= stop_codon_idx + 3:
+            logger.info(
+                "Skipping transcript %s for variant %s, occurs in 3' UTR",
+                transcript,
+                variant)
+            return None
+
+        n_prefix = len(sequence_key.sequence_before_variant_locus)
+        prefix_start_idx = variant_start_offset - n_prefix
+        n_bases_between_start_and_variant = variant_start_offset - start_codon_idx
+        n_full_codons_before_variant = n_bases_between_start_and_variant // 3
+
+        # if the sequence before the variant contains more bases than the
+        # distance to the start codon, then by definition it must contain
+        # some untranslated bases
+        contains_five_prime_utr = (n_prefix > n_bases_between_start_and_variant)
+        # allows for the possibility that the first base in the sequence might
+        # be the first nucleotide of the start codon
+        contains_start_codon = (n_prefix >= n_bases_between_start_and_variant)
+        # the sequence context might only include the 2nd or 3rd bases of
+        # the start codon
+        overlaps_start_codon = (n_prefix > n_bases_between_start_and_variant - 3)
+
+        if contains_start_codon:
+            offset_to_first_complete_codon = start_codon_idx - prefix_start_idx
+            amino_acids_before_variant = transcript.protein_sequence[:n_full_codons_before_variant]
+        else:
+            reading_frame = (prefix_start_idx - start_codon_idx) % 3
+            offset_to_first_complete_codon = reading_frame_to_offset(reading_frame)
+            n_codons_in_prefix = (n_prefix - offset_to_first_complete_codon) // 3
+            amino_acids_before_variant = transcript.protein_sequence[
+                n_full_codons_before_variant - n_codons_in_prefix:
+                n_full_codons_before_variant]
+        return SequenceKeyWithReadingFrame(
+            strand=sequence_key.strand,
+            sequence_before_variant_locus=sequence_key.sequence_before_variant_locus,
+            sequence_at_variant_locus=sequence_key.sequence_at_variant_locus,
+            sequence_after_variant_locus=sequence_key.sequence_after_variant_locus,
+            offset_to_first_complete_codon=offset_to_first_complete_codon,
+            contains_start_codon=contains_start_codon,
+            overlaps_start_codon=overlaps_start_codon,
+            contains_five_prime_utr=contains_five_prime_utr,
+            amino_acids_before_variant=amino_acids_before_variant)
+
+
+def reading_frame_to_offset(reading_frame_at_start_of_sequence):
+    """
+    Given a reading frame (how many nucleotides into a codon) at the
+    start of a cDNA sequence, return the number of nucleotides which need
+    to be trimmed to start on a complete codon.
+
+    Parameters
+    ----------
+
+    reading_frame_at_start_of_sequence : int
+
+    Returns an int
+    """
+    if reading_frame_at_start_of_sequence < 0:
+        raise ValueError("Reading frame can't be negative: %d" % (
+            reading_frame_at_start_of_sequence,))
+    elif reading_frame_at_start_of_sequence > 2:
+        raise ValueError("Reading frame must be within 0 and 2, not %d" % (
+            reading_frame_at_start_of_sequence,))
+    # If we're 1 nucleotide into the codon then we need to shift
+    # over two more to restore the ORF. Likewise, if we're 2 nucleotides in
+    # then we have to shift over one more.
+    return (3 - reading_frame_at_start_of_sequence) % 3
diff --git a/test/test_reference_sequence_key.py b/test/test_reference_sequence_key.py
index fa40b94..766b7d0 100644
--- a/test/test_reference_sequence_key.py
+++ b/test/test_reference_sequence_key.py
@@ -14,10 +14,7 @@
 
 from __future__ import print_function, division, absolute_import
 
-from isovar.reference_context import (
-    sequence_key_for_variant_on_transcript,
-    SequenceKey,
-)
+from isovar.reference_sequence_key import ReferenceSequenceKey
 from varcode import Variant
 from pyensembl import ensembl_grch38
 from nose.tools import eq_
@@ -36,11 +33,11 @@ def test_sequence_key_for_variant_on_transcript_substitution():
     eq_(brca2_ref_seq, "GGGCTTGTGGCGCGAGCTTCTGAAACTAGGCGGCAGAGGCGGAGCCGCTG")
     print(brca2_ref_seq)
     # get the 5 nucleotides before the variant and 10 nucleotides after
-    sequence_key = sequence_key_for_variant_on_transcript(
+    sequence_key = ReferenceSequenceKey.from_variant_and_transcript(
         variant=brca2_variant_rs769125639,
         transcript=brca2_001,
         context_size=10)
-    expected_sequence_key = SequenceKey(
+    expected_sequence_key = ReferenceSequenceKey(
         strand="+",
         sequence_before_variant_locus=brca2_ref_seq[:5],
         sequence_at_variant_locus="T",
@@ -59,11 +56,11 @@ def test_sequence_key_for_variant_on_transcript_deletion():
     eq_(brca2_ref_seq, "GGGCTTGTGGCGCGAGCTTCTGAAACTAGGCGGCAGAGGCGGAGCCGCTG")
     print(brca2_ref_seq)
     # get the 5 nucleotides before the variant and 10 nucleotides after
-    sequence_key = sequence_key_for_variant_on_transcript(
+    sequence_key = ReferenceSequenceKey.from_variant_and_transcript(
         variant=brca2_variant_deletion,
         transcript=brca2_001,
         context_size=10)
-    expected_sequence_key = SequenceKey(
+    expected_sequence_key = ReferenceSequenceKey(
         strand="+",
         sequence_before_variant_locus=brca2_ref_seq[:5],
         sequence_at_variant_locus="T",
@@ -81,14 +78,14 @@ def test_sequence_key_for_variant_on_transcript_insertion():
     eq_(brca2_ref_seq, "GGGCTTGTGGCGCGAGCTTCTGAAACTAGGCGGCAGAGGCGGAGCCGCTG")
     print(brca2_ref_seq)
     # get the 5 nucleotides before the variant and 10 nucleotides after
-    sequence_key = sequence_key_for_variant_on_transcript(
+    sequence_key = ReferenceSequenceKey.from_variant_and_transcript(
         variant=brca2_variant_insertion,
         transcript=brca2_001,
         context_size=10)
 
     # expecting nothing at the variant locus since we're inserting between
     # two reference nucleotides
-    expected_sequence_key = SequenceKey(
+    expected_sequence_key = ReferenceSequenceKey(
         strand="+",
         sequence_before_variant_locus=brca2_ref_seq[:6],
         sequence_at_variant_locus="",
@@ -108,12 +105,12 @@ def test_sequence_key_for_variant_on_transcript_substitution_reverse_strand():
     eq_(tp53_001.sequence[190 - 10:190 + 13], "GGTCACTGCCATGGAGGAGCCGC")
 
     # get the 5 nucleotides before the variant and 10 nucleotides after
-    sequence_key = sequence_key_for_variant_on_transcript(
+    sequence_key = ReferenceSequenceKey.from_variant_and_transcript(
         variant=tp53_substitution,
         transcript=tp53_001,
         context_size=10)
 
-    expected_sequence_key = SequenceKey(
+    expected_sequence_key = ReferenceSequenceKey(
         strand="-",
         sequence_before_variant_locus="GGTCACTGCC",
         sequence_at_variant_locus="ATG",
@@ -132,12 +129,12 @@ def test_sequence_key_for_variant_on_transcript_deletion_reverse_strand():
     eq_(tp53_001.sequence[190 - 10:190 + 13], "GGTCACTGCCATGGAGGAGCCGC")
 
     # get the 5 nucleotides before the variant and 10 nucleotides after
-    sequence_key = sequence_key_for_variant_on_transcript(
+    sequence_key = ReferenceSequenceKey.from_variant_and_transcript(
         variant=tp53_deletion,
         transcript=tp53_001,
         context_size=10)
 
-    expected_sequence_key = SequenceKey(
+    expected_sequence_key = ReferenceSequenceKey(
         strand="-",
         sequence_before_variant_locus="GGTCACTGCC",
         sequence_at_variant_locus="ATG",
@@ -160,12 +157,12 @@ def test_sequence_key_for_variant_on_transcript_insertion_reverse_strand():
     #    GCGGCTCCTC_CAT_GGCAGTGACC
 
     # get the 5 nucleotides before the variant and 10 nucleotides after
-    sequence_key = sequence_key_for_variant_on_transcript(
+    sequence_key = ReferenceSequenceKey.from_variant_and_transcript(
         variant=tp53_insertion,
         transcript=tp53_001,
         context_size=10)
 
-    expected_sequence_key = SequenceKey(
+    expected_sequence_key = ReferenceSequenceKey(
         strand="-",
         sequence_before_variant_locus="CACTGCCATG",
         sequence_at_variant_locus="",

From d48b88e1d7b0776a4833b3b33405c908b72f53af Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Fri, 4 Nov 2016 13:59:44 -0400
Subject: [PATCH 05/11] fixed unit tests with ReferenceSequenceKey and
 ReferenceCodingSequenceKey

---
 isovar/allele_reads.py                        |  18 +-
 isovar/reference_context.py                   |   7 +-
 isovar/reference_sequence_key.py              |  13 +-
 ...ference_sequence_key_with_reading_frame.py | 201 -------------
 test/test_allele_reads.py                     |   4 +-
 ...ference_sequence_key_with_reading_frame.py | 283 ------------------
 6 files changed, 21 insertions(+), 505 deletions(-)
 delete mode 100644 isovar/reference_sequence_key_with_reading_frame.py
 delete mode 100644 test/test_reference_sequence_key_with_reading_frame.py

diff --git a/isovar/allele_reads.py b/isovar/allele_reads.py
index 37d5a76..f6fb32e 100644
--- a/isovar/allele_reads.py
+++ b/isovar/allele_reads.py
@@ -36,12 +36,18 @@
 
 # subclassing from namedtuple to get a lightweight object with built-in
 # hashing and comparison while also being able to add methods
-class AlleleRead(
-        namedtuple("AlleleRead", "prefix allele suffix name sequence")):
-
-    @property
-    def sequence(self):
-        return self.prefix + self.allele + self.suffix
+AlleleReadBase = namedtuple(
+    "AlleleRead",
+    "prefix allele suffix name sequence")
+
+class AlleleRead(AlleleReadBase):
+    def __new__(cls, prefix, allele, suffix, name):
+        return AlleleReadBase(
+            prefix=prefix,
+            allele=allele,
+            suffix=suffix,
+            name=name,
+            sequence=prefix + allele + suffix)
 
     def __len__(self):
         return len(self.prefix) + len(self.allele) + len(self.suffix)
diff --git a/isovar/reference_context.py b/isovar/reference_context.py
index a023984..b90e3d2 100644
--- a/isovar/reference_context.py
+++ b/isovar/reference_context.py
@@ -18,8 +18,7 @@
 
 from .effect_prediction import reference_transcripts_for_variant
 from .dataframe_builder import DataFrameBuilder
-from .reference_sequence_key_with_reading_frame import (
-    ReferenceSequenceKeyWithReadingFrame,)
+from .reference_coding_sequence_key import ReferenceCodingSequenceKey
 
 logger = logging.getLogger(__name__)
 
@@ -37,7 +36,7 @@
 
 ReferenceContext = namedtuple(
     "ReferenceContext",
-    ReferenceSequenceKeyWithReadingFrame._fields + (
+    ReferenceCodingSequenceKey._fields + (
         "variant",
         "transcripts")
 )
@@ -76,7 +75,7 @@ def reference_contexts_for_variant(
 
     for transcript in overlapping_transcripts:
         sequence_key_with_reading_frame = \
-            ReferenceSequenceKeyWithReadingFrame.from_variant_and_transcript(
+            ReferenceCodingSequenceKey.from_variant_and_transcript(
                 variant=variant,
                 transcript=transcript,
                 context_size=context_size)
diff --git a/isovar/reference_sequence_key.py b/isovar/reference_sequence_key.py
index 5ade65d..d7f1548 100644
--- a/isovar/reference_sequence_key.py
+++ b/isovar/reference_sequence_key.py
@@ -22,21 +22,16 @@
 
 logger = logging.getLogger(__name__)
 
-##########################
-#
-# ReferenceSequenceKey
-# -----------
-#
-# Used to identify and group the distinct sequences occurring on a set of
-# transcripts overlapping a variant locus
-#
-##########################
 
 class ReferenceSequenceKey(namedtuple("ReferenceSequenceKey", [
         "strand",
         "sequence_before_variant_locus",
         "sequence_at_variant_locus",
         "sequence_after_variant_locus"])):
+    """
+    Used to identify and group the distinct sequences occurring on a set of
+    transcripts overlapping a variant locus
+    """
 
     @classmethod
     def from_variant_and_transcript(cls, variant, transcript, context_size):
diff --git a/isovar/reference_sequence_key_with_reading_frame.py b/isovar/reference_sequence_key_with_reading_frame.py
deleted file mode 100644
index d79d2a5..0000000
--- a/isovar/reference_sequence_key_with_reading_frame.py
+++ /dev/null
@@ -1,201 +0,0 @@
-# Copyright (c) 2016. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function, division, absolute_import
-import logging
-from collections import namedtuple
-
-from .variant_helpers import interbase_range_affected_by_variant_on_transcript
-from .reference_sequence_key import ReferenceSequenceKey
-
-logger = logging.getLogger(__name__)
-
-
-##########################
-#
-# ReferenceSequenceKeyWithReadingFrame
-# ---------------------------
-#
-# Includes all the fields of a ReferenceSequenceKey, but also includes a reading frame
-# at the start of the reference sequence.
-#
-#
-##########################
-
-class SequenceKeyWithReadingFrame(
-        namedtuple(
-            "ReferenceSequenceKeyWithReadingFrame",
-            ReferenceSequenceKey._fields + (
-                # if the reference context includes the 5' UTR then
-                # this is the offset to the start codon, otherwise it's the
-                # offset needed to get the first base of a codon
-                "offset_to_first_complete_codon",
-                # does this context overlap a start codon?
-                "overlaps_start_codon",
-                # does this context contain the whole trinucleotide start codon?
-                "contains_start_codon",
-                # does this context contain any UTR bases?
-                "contains_five_prime_utr",
-                # translation of complete codons in the reference context
-                # before the variant
-                "amino_acids_before_variant"))):
-
-    @classmethod
-    def from_variant_and_transcript(
-            cls,
-            variant,
-            transcript,
-            context_size):
-        """
-        Extracts the reference sequence around a variant locus on a particular
-        transcript and determines the reading frame at the start of that
-        sequence context.
-
-        Parameters
-        ----------
-        variant : varcode.Variant
-
-        transcript : pyensembl.Transcript
-
-        context_size : int
-
-        Returns SequenceKeyWithReadingFrame object or None if Transcript lacks
-        coding sequence, protein sequence or annotated start/stop codons.
-        """
-        if not transcript.contains_start_codon:
-            logger.info(
-                "Expected transcript %s for variant %s to have start codon",
-                transcript,
-                variant)
-            return None
-
-        if not transcript.contains_stop_codon:
-            logger.info(
-                "Expected transcript %s for variant %s to have stop codon",
-                transcript,
-                variant)
-            return None
-
-        if not transcript.protein_sequence:
-            logger.info(
-                "Expected transript %s for variant %s to have protein sequence",
-                transcript,
-                variant)
-            return None
-
-        sequence_key = ReferenceSequenceKey.from_variant_and_transcript(
-            variant=variant,
-            transcript=transcript,
-            context_size=context_size)
-
-        if sequence_key is None:
-            logger.info(
-                "No sequence key for variant %s on transcript %s",
-                variant,
-                transcript)
-            return None
-
-        # get the interbase range of offsets which capture all reference
-        # bases modified by the variant
-        variant_start_offset, variant_end_offset = \
-            interbase_range_affected_by_variant_on_transcript(
-                variant=variant,
-                transcript=transcript)
-
-        start_codon_idx = min(transcript.start_codon_spliced_offsets)
-
-        # skip any variants which occur in the 5' UTR or overlap the start codon
-        # since
-        #   (1) UTR variants have unclear coding effects and
-        #   (2) start-loss variants may result in a new start codon / reading
-        #       frame but we're not sure which one!
-        if variant_start_offset < start_codon_idx + 3:
-            logger.info(
-                "Skipping transcript %s for variant %s, must be after start codon",
-                transcript,
-                variant)
-            return None
-
-        stop_codon_idx = min(transcript.stop_codon_spliced_offsets)
-
-        # skip variants which affect the 3' UTR of the transcript since
-        # they don't have obvious coding effects on the protein sequence
-        if variant_start_offset >= stop_codon_idx + 3:
-            logger.info(
-                "Skipping transcript %s for variant %s, occurs in 3' UTR",
-                transcript,
-                variant)
-            return None
-
-        n_prefix = len(sequence_key.sequence_before_variant_locus)
-        prefix_start_idx = variant_start_offset - n_prefix
-        n_bases_between_start_and_variant = variant_start_offset - start_codon_idx
-        n_full_codons_before_variant = n_bases_between_start_and_variant // 3
-
-        # if the sequence before the variant contains more bases than the
-        # distance to the start codon, then by definition it must contain
-        # some untranslated bases
-        contains_five_prime_utr = (n_prefix > n_bases_between_start_and_variant)
-        # allows for the possibility that the first base in the sequence might
-        # be the first nucleotide of the start codon
-        contains_start_codon = (n_prefix >= n_bases_between_start_and_variant)
-        # the sequence context might only include the 2nd or 3rd bases of
-        # the start codon
-        overlaps_start_codon = (n_prefix > n_bases_between_start_and_variant - 3)
-
-        if contains_start_codon:
-            offset_to_first_complete_codon = start_codon_idx - prefix_start_idx
-            amino_acids_before_variant = transcript.protein_sequence[:n_full_codons_before_variant]
-        else:
-            reading_frame = (prefix_start_idx - start_codon_idx) % 3
-            offset_to_first_complete_codon = reading_frame_to_offset(reading_frame)
-            n_codons_in_prefix = (n_prefix - offset_to_first_complete_codon) // 3
-            amino_acids_before_variant = transcript.protein_sequence[
-                n_full_codons_before_variant - n_codons_in_prefix:
-                n_full_codons_before_variant]
-        return SequenceKeyWithReadingFrame(
-            strand=sequence_key.strand,
-            sequence_before_variant_locus=sequence_key.sequence_before_variant_locus,
-            sequence_at_variant_locus=sequence_key.sequence_at_variant_locus,
-            sequence_after_variant_locus=sequence_key.sequence_after_variant_locus,
-            offset_to_first_complete_codon=offset_to_first_complete_codon,
-            contains_start_codon=contains_start_codon,
-            overlaps_start_codon=overlaps_start_codon,
-            contains_five_prime_utr=contains_five_prime_utr,
-            amino_acids_before_variant=amino_acids_before_variant)
-
-
-def reading_frame_to_offset(reading_frame_at_start_of_sequence):
-    """
-    Given a reading frame (how many nucleotides into a codon) at the
-    start of a cDNA sequence, return the number of nucleotides which need
-    to be trimmed to start on a complete codon.
-
-    Parameters
-    ----------
-
-    reading_frame_at_start_of_sequence : int
-
-    Returns an int
-    """
-    if reading_frame_at_start_of_sequence < 0:
-        raise ValueError("Reading frame can't be negative: %d" % (
-            reading_frame_at_start_of_sequence,))
-    elif reading_frame_at_start_of_sequence > 2:
-        raise ValueError("Reading frame must be within 0 and 2, not %d" % (
-            reading_frame_at_start_of_sequence,))
-    # If we're 1 nucleotide into the codon then we need to shift
-    # over two more to restore the ORF. Likewise, if we're 2 nucleotides in
-    # then we have to shift over one more.
-    return (3 - reading_frame_at_start_of_sequence) % 3
diff --git a/test/test_allele_reads.py b/test/test_allele_reads.py
index b2864ce..1c4aab8 100644
--- a/test/test_allele_reads.py
+++ b/test/test_allele_reads.py
@@ -1,5 +1,5 @@
 
-from isovar.allele_reads import AlleleRead, allele_read_from_locus_read
+from isovar.allele_reads import AlleleRead
 from isovar.locus_reads import LocusRead
 from nose.tools import eq_
 
@@ -16,7 +16,7 @@ def make_read_at_locus(prefix, alt, suffix, base_quality=30, name="dummy"):
 
 def test_allele_read_from_single_read_at_locus_trim_N_nucleotides():
     read_at_locus = make_read_at_locus(prefix="NCCN", alt="A", suffix="TNNA")
-    allele_read = allele_read_from_locus_read(read_at_locus, n_ref=1)
+    allele_read = AlleleRead.from_locus_read(read_at_locus, n_ref=1)
     print(allele_read)
     expected = AlleleRead(prefix="", allele="A", suffix="T", name="dummy")
     eq_(allele_read, expected)
diff --git a/test/test_reference_sequence_key_with_reading_frame.py b/test/test_reference_sequence_key_with_reading_frame.py
deleted file mode 100644
index 79f9ed9..0000000
--- a/test/test_reference_sequence_key_with_reading_frame.py
+++ /dev/null
@@ -1,283 +0,0 @@
-# Copyright (c) 2016. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function, division, absolute_import
-
-from isovar.reference_context import (
-    reading_frame_to_offset,
-    sequence_key_with_reading_frame_for_variant_on_transcript,
-    SequenceKeyWithReadingFrame,
-)
-from varcode import Variant
-from pyensembl import ensembl_grch38
-from nose.tools import eq_
-
-from testing_helpers import assert_equal_fields
-
-def test_reading_frame_to_offset():
-    eq_(reading_frame_to_offset(0), 0)
-    eq_(reading_frame_to_offset(1), 2)
-    eq_(reading_frame_to_offset(2), 1)
-
-
-def test_sequence_key_with_reading_frame_substitution_with_five_prime_utr():
-    # Replace second codon of TP53-001 with 'CCC', the surrounding context
-    # includes nucleotides from the 5' UTR. Since TP53 is on the negative
-    # strand we have to take the reverse complement of the variant which turns
-    # it into CTC>GGG
-    tp53_substitution = Variant(
-        "17", 7676589, "CTC", "GGG", ensembl_grch38)
-    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]
-
-    # Sequence of TP53 around second codon with 10 context nucleotides:
-    # In [51]: t.sequence[193-10:193+13]
-    # Out[51]: 'CACTGCCATGGAGGAGCCGCAGT'
-    # Which can be split into the following parts:
-    #  last 7 nt of 5' UTR: CACTGCC
-    #  start codon: ATG (translates to M)
-    #  2nd codon: GAG    <---- variant occurs here
-    #  3rd codon: GAG
-    #  4th codon: CCG
-    #  5th codon:  CAG
-    #  first nt of 6th codon: T
-    result = \
-        sequence_key_with_reading_frame_for_variant_on_transcript(
-            variant=tp53_substitution,
-            transcript=tp53_001,
-            context_size=10)
-    expected = SequenceKeyWithReadingFrame(
-        strand="-",
-        sequence_before_variant_locus="CACTGCCATG",
-        sequence_at_variant_locus="GAG",
-        sequence_after_variant_locus="GAGCCGCAGT",
-        offset_to_first_complete_codon=7,
-        contains_start_codon=True,
-        overlaps_start_codon=True,
-        contains_five_prime_utr=True,
-        amino_acids_before_variant="M")
-    assert_equal_fields(result, expected)
-
-def test_sequence_key_with_reading_frame_deletion_with_five_prime_utr():
-    # Delete second codon of TP53-001, the surrounding context
-    # includes nucleotides from the 5' UTR. Since TP53 is on the negative
-    # strand we have to take the reverse complement of the variant which turns
-    # it into 'CTC'>''
-    tp53_deletion = Variant(
-        "17", 7676589, "CTC", "", ensembl_grch38)
-    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]
-
-    # Sequence of TP53 around second codon with 10 context nucleotides:
-    # In [51]: t.sequence[193-10:193+13]
-    # Out[51]: 'CACTGCCATGGAGGAGCCGCAGT'
-    # Which can be split into the following parts:
-    #  last 7 nt of 5' UTR: CACTGCC
-    #  start codon: ATG (translates to M)
-    #  2nd codon: GAG    <---- variant occurs here
-    #  3rd codon: GAG
-    #  4th codon: CCG
-    #  5th codon:  CAG
-    #  first nt of 6th codon: T
-
-    result = \
-        sequence_key_with_reading_frame_for_variant_on_transcript(
-            variant=tp53_deletion,
-            transcript=tp53_001,
-            context_size=10)
-    expected = SequenceKeyWithReadingFrame(
-        strand="-",
-        sequence_before_variant_locus="CACTGCCATG",
-        sequence_at_variant_locus="GAG",
-        sequence_after_variant_locus="GAGCCGCAGT",
-        offset_to_first_complete_codon=7,
-        contains_start_codon=True,
-        overlaps_start_codon=True,
-        contains_five_prime_utr=True,
-        amino_acids_before_variant="M")
-    assert_equal_fields(result, expected)
-
-
-def test_sequence_key_with_reading_frame_insertion():
-    # Insert nucleotide "T" after second codon of TP53-001, the
-    # surrounding context includes nucleotides from the 5' UTR. Since TP53 is on
-    # the negative strand we have to take the reverse complement of the variant
-    # which turns it into 'CTC'>'CTCA'
-    tp53_insertion = Variant(
-        "17", 7676586, "CTC", "CTCA", ensembl_grch38)
-
-    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]
-    # Sequence of TP53 around boundary of 2nd/3rd codons
-    # with 10 context nucleotides:
-    #   last 4 nt of 5' UTR: TGCC
-    #   start codon: ATG (translates to M)
-    #   2nd codon: GAG (translates to E)
-    #   <---- insertion variant occurs between these two codons
-    #   3rd codon: GAG
-    #   4th codon: CCG
-    #   5th codon:  CAG
-    #   first nt of 6th codon: T
-
-    result = \
-        sequence_key_with_reading_frame_for_variant_on_transcript(
-            variant=tp53_insertion,
-            transcript=tp53_001,
-            context_size=10)
-
-    expected = SequenceKeyWithReadingFrame(
-        strand="-",
-        sequence_before_variant_locus="TGCCATGGAG",
-        sequence_at_variant_locus="",
-        sequence_after_variant_locus="GAGCCGCAGT",
-        offset_to_first_complete_codon=4,
-        contains_start_codon=True,
-        overlaps_start_codon=True,
-        contains_five_prime_utr=True,
-        amino_acids_before_variant="ME")
-    assert_equal_fields(result, expected)
-
-def test_sequence_key_with_reading_frame_insertion_inside_start_codon():
-    # insert nucleotide "C" in the middle of the start codon of TP53-001,
-    # keeping only 1 nucleotide of context. In the reverse complement this
-    # becomes 'T'>'TG'
-    tp53_insertion = Variant(
-        "17", 7676592, "T", "TG", ensembl_grch38)
-
-    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]
-
-    result = \
-        sequence_key_with_reading_frame_for_variant_on_transcript(
-            variant=tp53_insertion,
-            transcript=tp53_001,
-            context_size=1)
-    assert result is None, "Expected result to be None when variant affects start codon"
-
-def test_sequence_key_with_reading_frame_insertion_before_start_codon():
-    # insert nucleotide "T" before of the start codon of TP53-001,
-    tp53_insertion = Variant("17", 7676593, "C", "CT", ensembl_grch38)
-
-    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]
-
-    result = \
-        sequence_key_with_reading_frame_for_variant_on_transcript(
-            variant=tp53_insertion,
-            transcript=tp53_001,
-            context_size=1)
-    assert result is None, "Expected result to be None when variant before start codon"
-
-
-def test_sequence_key_with_reading_frame_insertion_context_6nt_contains_start():
-    # Insert nucleotide "T" after second codon of TP53-001,
-    # but in this test we're going to only keep enough context to see
-    # the start codon but none of the 5' UTR. In the reverse complement this
-    # variant becomes CTC>CTCA
-    tp53_insertion = Variant(
-        "17", 7676586, "CTC", "CTCA", ensembl_grch38)
-
-    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]
-    # Sequence of TP53 around boundary of 2nd/3rd codons
-    # with 6 context nucleotides:
-    #   start codon: ATG (translates to M)
-    #   2nd codon: GAG (translates to E)
-    #   <---- insertion variant occurs between these two codons
-    #   3rd codon: GAG
-    #   4th codon: CCG
-
-    result = \
-        sequence_key_with_reading_frame_for_variant_on_transcript(
-            variant=tp53_insertion,
-            transcript=tp53_001,
-            context_size=6)
-
-    expected = SequenceKeyWithReadingFrame(
-        strand="-",
-        sequence_before_variant_locus="ATGGAG",
-        sequence_at_variant_locus="",
-        sequence_after_variant_locus="GAGCCG",
-        offset_to_first_complete_codon=0,
-        contains_start_codon=True,
-        overlaps_start_codon=True,
-        contains_five_prime_utr=False,
-        amino_acids_before_variant="ME")
-    assert_equal_fields(result, expected)
-
-
-def test_sequence_key_with_reading_frame_insertion_context_5nt_overlaps_start():
-    # Insert nucleotide "T" after second codon of TP53-001,
-    # but in this test we're going to only keep enough context to see
-    # a part of the start codon, thus the result shouldn't "contain"
-    # the start codon but does "overlap" it. In the reverse complement
-    # this variant becomes CTC>CTCA
-    tp53_insertion = Variant(
-        "17", 7676586, "CTC", "CTCA", ensembl_grch38)
-
-    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]
-    # Sequence of TP53 around boundary of 2nd/3rd codons
-    # with 6 context nucleotides:
-    #   last two nt of start codon: TG
-    #   2nd codon: GAG (translates to E)
-    #   <---- insertion variant occurs between these two codons
-    #   3rd codon: GAG
-    #   first two nt of 4th codon: CC
-
-    result = \
-        sequence_key_with_reading_frame_for_variant_on_transcript(
-            variant=tp53_insertion,
-            transcript=tp53_001,
-            context_size=5)
-
-    expected = SequenceKeyWithReadingFrame(
-        strand="-",
-        sequence_before_variant_locus="TGGAG",
-        sequence_at_variant_locus="",
-        sequence_after_variant_locus="GAGCC",
-        offset_to_first_complete_codon=2,
-        contains_start_codon=False,
-        overlaps_start_codon=True,
-        contains_five_prime_utr=False,
-        amino_acids_before_variant="E")
-    assert_equal_fields(result, expected)
-
-
-def test_sequence_key_with_reading_frame_insertion_context_3nt_no_start():
-    # Insert nucleotide "T" after second codon of TP53-001,
-    # but in this test we're going to only keep enough context to see
-    # the second codon (and no nucleotides from the start). In the reverse
-    # complement this variant becomes CTC>CTCA.
-
-    tp53_insertion = Variant(
-        "17", 7676586, "CTC", "CTCA", ensembl_grch38)
-
-    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]
-    # Sequence of TP53 around boundary of 2nd/3rd codons
-    # with 6 context nucleotides:
-    #   2nd codon: GAG (translates to E)
-    #   <---- insertion variant occurs between these two codons
-    #   3rd codon: GAG
-
-    result = \
-        sequence_key_with_reading_frame_for_variant_on_transcript(
-            variant=tp53_insertion,
-            transcript=tp53_001,
-            context_size=3)
-
-    expected = SequenceKeyWithReadingFrame(
-        strand="-",
-        sequence_before_variant_locus="GAG",
-        sequence_at_variant_locus="",
-        sequence_after_variant_locus="GAG",
-        offset_to_first_complete_codon=0,
-        contains_start_codon=False,
-        overlaps_start_codon=False,
-        contains_five_prime_utr=False,
-        amino_acids_before_variant="E")
-    assert_equal_fields(result, expected)

From 1c88072520b0003a346b8e1d49272e8029ca25e9 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Fri, 4 Nov 2016 15:07:00 -0400
Subject: [PATCH 06/11] added reference coding sequence module and tests

---
 isovar/reference_coding_sequence_key.py    | 195 +++++++++++++++
 test/test_reference_coding_sequence_key.py | 274 +++++++++++++++++++++
 2 files changed, 469 insertions(+)
 create mode 100644 isovar/reference_coding_sequence_key.py
 create mode 100644 test/test_reference_coding_sequence_key.py

diff --git a/isovar/reference_coding_sequence_key.py b/isovar/reference_coding_sequence_key.py
new file mode 100644
index 0000000..7a9029e
--- /dev/null
+++ b/isovar/reference_coding_sequence_key.py
@@ -0,0 +1,195 @@
+# Copyright (c) 2016. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function, division, absolute_import
+import logging
+from collections import namedtuple
+
+from .variant_helpers import interbase_range_affected_by_variant_on_transcript
+from .reference_sequence_key import ReferenceSequenceKey
+
+logger = logging.getLogger(__name__)
+
+class ReferenceCodingSequenceKey(
+        namedtuple(
+            "ReferenceCodingSequenceKey",
+            ReferenceSequenceKey._fields + (
+                # if the reference context includes the 5' UTR then
+                # this is the offset to the start codon, otherwise it's the
+                # offset needed to get the first base of a codon
+                "offset_to_first_complete_codon",
+                # does this context overlap a start codon?
+                "overlaps_start_codon",
+                # does this context contain the whole trinucleotide start codon?
+                "contains_start_codon",
+                # does this context contain any UTR bases?
+                "contains_five_prime_utr",
+                # translation of complete codons in the reference context
+                # before the variant
+                "amino_acids_before_variant"))):
+
+    """
+    Includes all the fields of a ReferenceSequenceKey, and adds on top of those
+    a reading frame and information about where the start codon / 5' UTR are
+    relative to this sequence fragment.
+    """
+
+    @classmethod
+    def from_variant_and_transcript(
+            cls,
+            variant,
+            transcript,
+            context_size):
+        """
+        Extracts the reference sequence around a variant locus on a particular
+        transcript and determines the reading frame at the start of that
+        sequence context.
+
+        Parameters
+        ----------
+        variant : varcode.Variant
+
+        transcript : pyensembl.Transcript
+
+        context_size : int
+
+        Returns SequenceKeyWithReadingFrame object or None if Transcript lacks
+        coding sequence, protein sequence or annotated start/stop codons.
+        """
+        if not transcript.contains_start_codon:
+            logger.info(
+                "Expected transcript %s for variant %s to have start codon",
+                transcript,
+                variant)
+            return None
+
+        if not transcript.contains_stop_codon:
+            logger.info(
+                "Expected transcript %s for variant %s to have stop codon",
+                transcript,
+                variant)
+            return None
+
+        if not transcript.protein_sequence:
+            logger.info(
+                "Expected transript %s for variant %s to have protein sequence",
+                transcript,
+                variant)
+            return None
+
+        sequence_key = ReferenceSequenceKey.from_variant_and_transcript(
+            variant=variant,
+            transcript=transcript,
+            context_size=context_size)
+
+        if sequence_key is None:
+            logger.info(
+                "No sequence key for variant %s on transcript %s",
+                variant,
+                transcript)
+            return None
+
+        # get the interbase range of offsets which capture all reference
+        # bases modified by the variant
+        variant_start_offset, variant_end_offset = \
+            interbase_range_affected_by_variant_on_transcript(
+                variant=variant,
+                transcript=transcript)
+
+        start_codon_idx = min(transcript.start_codon_spliced_offsets)
+
+        # skip any variants which occur in the 5' UTR or overlap the start codon
+        # since
+        #   (1) UTR variants have unclear coding effects and
+        #   (2) start-loss variants may result in a new start codon / reading
+        #       frame but we're not sure which one!
+        if variant_start_offset < start_codon_idx + 3:
+            logger.info(
+                "Skipping transcript %s for variant %s, must be after start codon",
+                transcript,
+                variant)
+            return None
+
+        stop_codon_idx = min(transcript.stop_codon_spliced_offsets)
+
+        # skip variants which affect the 3' UTR of the transcript since
+        # they don't have obvious coding effects on the protein sequence
+        if variant_start_offset >= stop_codon_idx + 3:
+            logger.info(
+                "Skipping transcript %s for variant %s, occurs in 3' UTR",
+                transcript,
+                variant)
+            return None
+
+        n_prefix = len(sequence_key.sequence_before_variant_locus)
+        prefix_start_idx = variant_start_offset - n_prefix
+        n_bases_between_start_and_variant = variant_start_offset - start_codon_idx
+        n_full_codons_before_variant = n_bases_between_start_and_variant // 3
+
+        # if the sequence before the variant contains more bases than the
+        # distance to the start codon, then by definition it must contain
+        # some untranslated bases
+        contains_five_prime_utr = (n_prefix > n_bases_between_start_and_variant)
+        # allows for the possibility that the first base in the sequence might
+        # be the first nucleotide of the start codon
+        contains_start_codon = (n_prefix >= n_bases_between_start_and_variant)
+        # the sequence context might only include the 2nd or 3rd bases of
+        # the start codon
+        overlaps_start_codon = (n_prefix > n_bases_between_start_and_variant - 3)
+
+        if contains_start_codon:
+            offset_to_first_complete_codon = start_codon_idx - prefix_start_idx
+            amino_acids_before_variant = transcript.protein_sequence[:n_full_codons_before_variant]
+        else:
+            reading_frame = (prefix_start_idx - start_codon_idx) % 3
+            offset_to_first_complete_codon = reading_frame_to_offset(reading_frame)
+            n_codons_in_prefix = (n_prefix - offset_to_first_complete_codon) // 3
+            amino_acids_before_variant = transcript.protein_sequence[
+                n_full_codons_before_variant - n_codons_in_prefix:
+                n_full_codons_before_variant]
+        return cls(
+            strand=sequence_key.strand,
+            sequence_before_variant_locus=sequence_key.sequence_before_variant_locus,
+            sequence_at_variant_locus=sequence_key.sequence_at_variant_locus,
+            sequence_after_variant_locus=sequence_key.sequence_after_variant_locus,
+            offset_to_first_complete_codon=offset_to_first_complete_codon,
+            contains_start_codon=contains_start_codon,
+            overlaps_start_codon=overlaps_start_codon,
+            contains_five_prime_utr=contains_five_prime_utr,
+            amino_acids_before_variant=amino_acids_before_variant)
+
+
+def reading_frame_to_offset(reading_frame_at_start_of_sequence):
+    """
+    Given a reading frame (how many nucleotides into a codon) at the
+    start of a cDNA sequence, return the number of nucleotides which need
+    to be trimmed to start on a complete codon.
+
+    Parameters
+    ----------
+
+    reading_frame_at_start_of_sequence : int
+
+    Returns an int
+    """
+    if reading_frame_at_start_of_sequence < 0:
+        raise ValueError("Reading frame can't be negative: %d" % (
+            reading_frame_at_start_of_sequence,))
+    elif reading_frame_at_start_of_sequence > 2:
+        raise ValueError("Reading frame must be within 0 and 2, not %d" % (
+            reading_frame_at_start_of_sequence,))
+    # If we're 1 nucleotide into the codon then we need to shift
+    # over two more to restore the ORF. Likewise, if we're 2 nucleotides in
+    # then we have to shift over one more.
+    return (3 - reading_frame_at_start_of_sequence) % 3
diff --git a/test/test_reference_coding_sequence_key.py b/test/test_reference_coding_sequence_key.py
new file mode 100644
index 0000000..f60dabd
--- /dev/null
+++ b/test/test_reference_coding_sequence_key.py
@@ -0,0 +1,274 @@
+# Copyright (c) 2016. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function, division, absolute_import
+
+from isovar.reference_coding_sequence_key import (
+    reading_frame_to_offset,
+    ReferenceCodingSequenceKey,
+)
+from varcode import Variant
+from pyensembl import ensembl_grch38
+from nose.tools import eq_
+
+from testing_helpers import assert_equal_fields
+
+def test_reading_frame_to_offset():
+    eq_(reading_frame_to_offset(0), 0)
+    eq_(reading_frame_to_offset(1), 2)
+    eq_(reading_frame_to_offset(2), 1)
+
+
+def test_sequence_key_with_reading_frame_substitution_with_five_prime_utr():
+    # Replace second codon of TP53-001 with 'CCC', the surrounding context
+    # includes nucleotides from the 5' UTR. Since TP53 is on the negative
+    # strand we have to take the reverse complement of the variant which turns
+    # it into CTC>GGG
+    tp53_substitution = Variant(
+        "17", 7676589, "CTC", "GGG", ensembl_grch38)
+    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]
+
+    # Sequence of TP53 around second codon with 10 context nucleotides:
+    # In [51]: t.sequence[193-10:193+13]
+    # Out[51]: 'CACTGCCATGGAGGAGCCGCAGT'
+    # Which can be split into the following parts:
+    #  last 7 nt of 5' UTR: CACTGCC
+    #  start codon: ATG (translates to M)
+    #  2nd codon: GAG    <---- variant occurs here
+    #  3rd codon: GAG
+    #  4th codon: CCG
+    #  5th codon:  CAG
+    #  first nt of 6th codon: T
+    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
+        variant=tp53_substitution,
+        transcript=tp53_001,
+        context_size=10)
+    expected = ReferenceCodingSequenceKey(
+        strand="-",
+        sequence_before_variant_locus="CACTGCCATG",
+        sequence_at_variant_locus="GAG",
+        sequence_after_variant_locus="GAGCCGCAGT",
+        offset_to_first_complete_codon=7,
+        contains_start_codon=True,
+        overlaps_start_codon=True,
+        contains_five_prime_utr=True,
+        amino_acids_before_variant="M")
+    assert_equal_fields(result, expected)
+
+def test_sequence_key_with_reading_frame_deletion_with_five_prime_utr():
+    # Delete second codon of TP53-001, the surrounding context
+    # includes nucleotides from the 5' UTR. Since TP53 is on the negative
+    # strand we have to take the reverse complement of the variant which turns
+    # it into 'CTC'>''
+    tp53_deletion = Variant(
+        "17", 7676589, "CTC", "", ensembl_grch38)
+    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]
+
+    # Sequence of TP53 around second codon with 10 context nucleotides:
+    # In [51]: t.sequence[193-10:193+13]
+    # Out[51]: 'CACTGCCATGGAGGAGCCGCAGT'
+    # Which can be split into the following parts:
+    #  last 7 nt of 5' UTR: CACTGCC
+    #  start codon: ATG (translates to M)
+    #  2nd codon: GAG    <---- variant occurs here
+    #  3rd codon: GAG
+    #  4th codon: CCG
+    #  5th codon:  CAG
+    #  first nt of 6th codon: T
+
+    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
+        variant=tp53_deletion,
+        transcript=tp53_001,
+        context_size=10)
+    expected = ReferenceCodingSequenceKey(
+        strand="-",
+        sequence_before_variant_locus="CACTGCCATG",
+        sequence_at_variant_locus="GAG",
+        sequence_after_variant_locus="GAGCCGCAGT",
+        offset_to_first_complete_codon=7,
+        contains_start_codon=True,
+        overlaps_start_codon=True,
+        contains_five_prime_utr=True,
+        amino_acids_before_variant="M")
+    assert_equal_fields(result, expected)
+
+
+def test_sequence_key_with_reading_frame_insertion():
+    # Insert nucleotide "T" after second codon of TP53-001, the
+    # surrounding context includes nucleotides from the 5' UTR. Since TP53 is on
+    # the negative strand we have to take the reverse complement of the variant
+    # which turns it into 'CTC'>'CTCA'
+    tp53_insertion = Variant(
+        "17", 7676586, "CTC", "CTCA", ensembl_grch38)
+
+    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]
+    # Sequence of TP53 around boundary of 2nd/3rd codons
+    # with 10 context nucleotides:
+    #   last 4 nt of 5' UTR: TGCC
+    #   start codon: ATG (translates to M)
+    #   2nd codon: GAG (translates to E)
+    #   <---- insertion variant occurs between these two codons
+    #   3rd codon: GAG
+    #   4th codon: CCG
+    #   5th codon:  CAG
+    #   first nt of 6th codon: T
+
+    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
+        variant=tp53_insertion,
+        transcript=tp53_001,
+        context_size=10)
+
+    expected = ReferenceCodingSequenceKey(
+        strand="-",
+        sequence_before_variant_locus="TGCCATGGAG",
+        sequence_at_variant_locus="",
+        sequence_after_variant_locus="GAGCCGCAGT",
+        offset_to_first_complete_codon=4,
+        contains_start_codon=True,
+        overlaps_start_codon=True,
+        contains_five_prime_utr=True,
+        amino_acids_before_variant="ME")
+    assert_equal_fields(result, expected)
+
+def test_reference_coding_sequence_key_insertion_inside_start_codon():
+    # insert nucleotide "C" in the middle of the start codon of TP53-001,
+    # keeping only 1 nucleotide of context. In the reverse complement this
+    # becomes 'T'>'TG'
+    tp53_insertion = Variant(
+        "17", 7676592, "T", "TG", ensembl_grch38)
+
+    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]
+
+    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
+        variant=tp53_insertion,
+        transcript=tp53_001,
+        context_size=1)
+    assert result is None, "Expected result to be None when variant affects start codon"
+
+def test_sequence_key_with_reading_frame_insertion_before_start_codon():
+    # insert nucleotide "T" before of the start codon of TP53-001,
+    tp53_insertion = Variant("17", 7676593, "C", "CT", ensembl_grch38)
+
+    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]
+
+    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
+        variant=tp53_insertion,
+        transcript=tp53_001,
+        context_size=1)
+    assert result is None, "Expected result to be None when variant before start codon"
+
+
+def test_sequence_key_with_reading_frame_insertion_context_6nt_contains_start():
+    # Insert nucleotide "T" after second codon of TP53-001,
+    # but in this test we're going to only keep enough context to see
+    # the start codon but none of the 5' UTR. In the reverse complement this
+    # variant becomes CTC>CTCA
+    tp53_insertion = Variant(
+        "17", 7676586, "CTC", "CTCA", ensembl_grch38)
+
+    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]
+    # Sequence of TP53 around boundary of 2nd/3rd codons
+    # with 6 context nucleotides:
+    #   start codon: ATG (translates to M)
+    #   2nd codon: GAG (translates to E)
+    #   <---- insertion variant occurs between these two codons
+    #   3rd codon: GAG
+    #   4th codon: CCG
+
+    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
+        variant=tp53_insertion,
+        transcript=tp53_001,
+        context_size=6)
+
+    expected = ReferenceCodingSequenceKey(
+        strand="-",
+        sequence_before_variant_locus="ATGGAG",
+        sequence_at_variant_locus="",
+        sequence_after_variant_locus="GAGCCG",
+        offset_to_first_complete_codon=0,
+        contains_start_codon=True,
+        overlaps_start_codon=True,
+        contains_five_prime_utr=False,
+        amino_acids_before_variant="ME")
+    assert_equal_fields(result, expected)
+
+
+def test_sequence_key_with_reading_frame_insertion_context_5nt_overlaps_start():
+    # Insert nucleotide "T" after second codon of TP53-001,
+    # but in this test we're going to only keep enough context to see
+    # a part of the start codon, thus the result shouldn't "contain"
+    # the start codon but does "overlap" it. In the reverse complement
+    # this variant becomes CTC>CTCA
+    tp53_insertion = Variant(
+        "17", 7676586, "CTC", "CTCA", ensembl_grch38)
+
+    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]
+    # Sequence of TP53 around boundary of 2nd/3rd codons
+    # with 6 context nucleotides:
+    #   last two nt of start codon: TG
+    #   2nd codon: GAG (translates to E)
+    #   <---- insertion variant occurs between these two codons
+    #   3rd codon: GAG
+    #   first two nt of 4th codon: CC
+
+    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
+        variant=tp53_insertion,
+        transcript=tp53_001,
+        context_size=5)
+
+    expected = ReferenceCodingSequenceKey(
+        strand="-",
+        sequence_before_variant_locus="TGGAG",
+        sequence_at_variant_locus="",
+        sequence_after_variant_locus="GAGCC",
+        offset_to_first_complete_codon=2,
+        contains_start_codon=False,
+        overlaps_start_codon=True,
+        contains_five_prime_utr=False,
+        amino_acids_before_variant="E")
+    assert_equal_fields(result, expected)
+
+
+def test_sequence_key_with_reading_frame_insertion_context_3nt_no_start():
+    # Insert nucleotide "T" after second codon of TP53-001,
+    # but in this test we're going to only keep enough context to see
+    # the second codon (and no nucleotides from the start). In the reverse
+    # complement this variant becomes CTC>CTCA.
+
+    tp53_insertion = Variant(
+        "17", 7676586, "CTC", "CTCA", ensembl_grch38)
+
+    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]
+    # Sequence of TP53 around boundary of 2nd/3rd codons
+    # with 6 context nucleotides:
+    #   2nd codon: GAG (translates to E)
+    #   <---- insertion variant occurs between these two codons
+    #   3rd codon: GAG
+
+    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
+        variant=tp53_insertion,
+        transcript=tp53_001,
+        context_size=3)
+
+    expected = ReferenceCodingSequenceKey(
+        strand="-",
+        sequence_before_variant_locus="GAG",
+        sequence_at_variant_locus="",
+        sequence_after_variant_locus="GAG",
+        offset_to_first_complete_codon=0,
+        contains_start_codon=False,
+        overlaps_start_codon=False,
+        contains_five_prime_utr=False,
+        amino_acids_before_variant="E")
+    assert_equal_fields(result, expected)

From 78b41804a2b2b76b9e55411f0523824f8da39d6d Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Fri, 4 Nov 2016 16:20:41 -0400
Subject: [PATCH 07/11] renamed CLI arg for assembly

---
 isovar/cli/variant_sequences.py |  4 ++--
 isovar/variant_sequences.py     | 36 ++++++++++++++-------------------
 2 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/isovar/cli/variant_sequences.py b/isovar/cli/variant_sequences.py
index 14579a5..0f85713 100644
--- a/isovar/cli/variant_sequences.py
+++ b/isovar/cli/variant_sequences.py
@@ -39,7 +39,7 @@ def add_variant_sequence_args(
         help="Minimum number of reads supporting a variant sequence (default %(default)s)")
 
     rna_sequence_group.add_argument(
-        "--overlap-assembly",
+        "--variant-sequence-assembly",
         default=False,
         action="store_true")
 
@@ -82,7 +82,7 @@ def variant_sequences_generator_from_args(args):
         allele_reads_generator,
         min_reads_supporting_cdna_sequence=args.min_reads_supporting_variant_sequence,
         preferred_sequence_length=args.cdna_sequence_length,
-        overlap_assembly=args.overlap_assembly)
+        variant_cdna_sequence_assembly=args.variant_sequence_assembly)
 
 def variant_sequences_dataframe_from_args(args):
     variant_sequences_generator = variant_sequences_generator_from_args(args)
diff --git a/isovar/variant_sequences.py b/isovar/variant_sequences.py
index 822607e..e882cbf 100644
--- a/isovar/variant_sequences.py
+++ b/isovar/variant_sequences.py
@@ -332,19 +332,22 @@ def filter_variant_sequences(
         variant_sequences=variant_sequences,
         preferred_sequence_length=preferred_sequence_length)
 
-def supporting_reads_to_variant_sequences(
-        variant_reads,
+def reads_to_variant_sequences(
+        variant,
+        reads,
         preferred_sequence_length,
         min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE,
         variant_cdna_sequence_assembly=VARIANT_CDNA_SEQUENCE_ASSEMBLY):
     """
-    Collapse variant-support RNA reads into consensus sequences of approximately
-    the preferred length (may differ at the ends of transcripts), filter
-    consensus sequences by length and number of supporting RNA reads.
+    Collapse variant-supporting RNA reads into consensus sequences of
+    approximately the preferred length (may differ at the ends of transcripts),
+    filter consensus sequences by length and number of supporting RNA reads.
 
     Parameters
     ----------
-    variant_reads : list of AlleleRead objects
+    variant : varcode.Variant
+
+    reads : list of AlleleRead objects
         Should all support the same variant allele nucleotides.
 
     preferred_sequence_length : int
@@ -358,14 +361,14 @@ def supporting_reads_to_variant_sequences(
         Drop sequences which don't at least have this number of fully spanning
         reads.
 
-    overlap_assembly : bool
+    reads_to_variant_sequences : bool
         Construct variant sequences by merging overlapping reads. If False
         then variant sequences must be fully spanned by cDNA reads.
 
     Returns a collection of VariantSequence objects
     """
     # just in case variant_reads is a generator, convert it to a list
-    variant_reads = list(variant_reads)
+    variant_reads = list(filter_non_alt_reads_for_variant(variant, reads))
 
     if len(variant_reads) == 0:
         return []
@@ -406,6 +409,7 @@ def supporting_reads_to_variant_sequences(
             max(len(s) for s in variant_sequences))
     else:
         logger.info("After overlap assembly: 0 variant sequences")
+        return []
 
     variant_sequences = filter_variant_sequences(
         variant_sequences=variant_sequences,
@@ -421,23 +425,13 @@ def supporting_reads_to_variant_sequences(
             max(len(s) for s in variant_sequences))
     else:
         logger.info("After coverage & length filtering: 0 variant sequences")
+        return []
 
     # sort VariantSequence objects by decreasing order of supporting read
     # counts
     variant_sequences.sort(key=sort_key_decreasing_read_count)
     return variant_sequences
 
-def overlapping_reads_to_variant_sequences(
-        variant,
-        overlapping_reads,
-        min_reads_supporting_cdna_sequence,
-        preferred_sequence_length):
-    variant_reads = filter_non_alt_reads_for_variant(variant, overlapping_reads)
-    return supporting_reads_to_variant_sequences(
-        variant_reads,
-        preferred_sequence_length=preferred_sequence_length,
-        min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence)
-
 def reads_generator_to_sequences_generator(
         variant_and_reads_generator,
         min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE,
@@ -468,9 +462,9 @@ def reads_generator_to_sequences_generator(
         - list of VariantSequence objects
     """
     for variant, variant_reads in variant_and_reads_generator:
-        variant_sequences = overlapping_reads_to_variant_sequences(
+        variant_sequences = reads_to_variant_sequences(
             variant=variant,
-            overlapping_reads=variant_reads,
+            reads=variant_reads,
             min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence,
             preferred_sequence_length=preferred_sequence_length,
             variant_cdna_sequence_assembly=variant_cdna_sequence_assembly)

From e14e01938ab6e9011c4acd93a7d5db6a124f8b0c Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Fri, 4 Nov 2016 16:34:50 -0400
Subject: [PATCH 08/11] added caching of pyensembl

---
 .travis.yml                 |  6 ++++++
 isovar/variant_sequences.py | 15 +++++++--------
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 01c9b9c..d15bdef 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,6 +3,12 @@ language: python
 python:
   - "2.7"
   - "3.4"
+cache:
+  pip: true
+  # cache directory used for Ensembl downloads of GTF and FASTA files
+  # along with the indexed db of intervals and ID mappings and pickles
+  # of sequence dictionaries
+  directories: /home/travis/.cache/pyensembl/
 before_install:
   # Commands below copied from: http://conda.pydata.org/docs/travis.html
   # We do this conditionally because it saves us some downloading if the
diff --git a/isovar/variant_sequences.py b/isovar/variant_sequences.py
index e882cbf..586749e 100644
--- a/isovar/variant_sequences.py
+++ b/isovar/variant_sequences.py
@@ -35,7 +35,7 @@
 
 # using this base class to define the core fields of a VariantSequence
 # but inheriting it from it to allow the addition of helper methods
-VariantSequenceFields = namedtuple(
+VariantSequenceBase = namedtuple(
     "VariantSequence",
     [
         # nucleotides before a variant
@@ -47,13 +47,12 @@
         # since we often want to look at prefix+alt+suffix, let's cache it
         "sequence",
         # reads which were used to determine this sequences
-        "reads",
-    ])
+        "reads"])
 
-class VariantSequence(VariantSequenceFields):
+class VariantSequence(VariantSequenceBase):
     def __new__(cls, prefix, alt, suffix, reads):
         # construct sequence from prefix + alt + suffix
-        return VariantSequenceFields.__new__(
+        return VariantSequenceBase.__new__(
             cls,
             prefix=prefix,
             alt=alt,
@@ -227,8 +226,8 @@ def trim_by_coverage(self, min_reads):
             suffix=self.suffix[:last_covered_index - variant_end_index + 1],
             reads=self.reads)
 
-def sort_key_decreasing_read_count(variant_sequence):
-    return -len(variant_sequence.reads)
+    def sort_key_decreasing_read_count(self):
+        return -len(self.reads)
 
 def initial_variant_sequences_from_reads(
         variant_reads,
@@ -429,7 +428,7 @@ def reads_to_variant_sequences(
 
     # sort VariantSequence objects by decreasing order of supporting read
     # counts
-    variant_sequences.sort(key=sort_key_decreasing_read_count)
+    variant_sequences.sort(key=VariantSequence.sort_key_decreasing_read_count)
     return variant_sequences
 
 def reads_generator_to_sequences_generator(

From 881aa04810a9b8077da6521b58783577468b5010 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Fri, 4 Nov 2016 16:36:51 -0400
Subject: [PATCH 09/11] version bump due to large change in logic

---
 isovar/__init__.py          | 2 +-
 isovar/variant_sequences.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/isovar/__init__.py b/isovar/__init__.py
index 42419f9..f4cac02 100644
--- a/isovar/__init__.py
+++ b/isovar/__init__.py
@@ -14,4 +14,4 @@
 
 from __future__ import print_function, division, absolute_import
 
-__version__ = "0.2.4"
+__version__ = "0.3.0"
diff --git a/isovar/variant_sequences.py b/isovar/variant_sequences.py
index 586749e..4fa52db 100644
--- a/isovar/variant_sequences.py
+++ b/isovar/variant_sequences.py
@@ -452,7 +452,7 @@ def reads_generator_to_sequences_generator(
     sequence_length : int
         Desired sequence length, including variant nucleotides
 
-    overlap_assembly : bool
+    variant_cdna_sequence_assembly : bool
         Construct variant sequences by merging overlapping reads. If False
         then variant sequences must be fully spanned by cDNA reads.
 

From bb757cbe04d3c3d5d3e1063ec6fdd1c79fa39994 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Fri, 4 Nov 2016 16:48:52 -0400
Subject: [PATCH 10/11] assembly unit test with mock reads

---
 test/test_assembly.py | 49 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 42 insertions(+), 7 deletions(-)

diff --git a/test/test_assembly.py b/test/test_assembly.py
index 1bfe127..3ea5d4c 100644
--- a/test/test_assembly.py
+++ b/test/test_assembly.py
@@ -59,11 +59,46 @@ def test_assemble_transcript_fragments_snv():
         assert len(s) > max_read_length, \
             "Expected assembled sequences to be longer than read length (%d)" % (
                 max_read_length,)
-"""
-def test_assemble_simple_sequences():
-    AlleleRead
-    VariantSequence
-"""
 
-if __name__ == "__main__":
-    test_assemble_transcript_fragments_snv()
+def test_assembly_of_simple_sequence_from_mock_reads():
+    #
+    # Read sequences:
+    #    AAAAA|CC|TTTTT
+    #    AAAAA|CC|TTTTT
+    #   GAAAAA|CC|TTTTTG
+    #     AAAA|CC|TTTT
+    reads = [
+        # two identical reads with sequence AAAAA|CC|TTTTT
+        AlleleRead(prefix="A" * 5, allele="CC", suffix="T" * 5, name="dup1"),
+        AlleleRead(prefix="A" * 5, allele="CC", suffix="T" * 5, name="dup2"),
+        # longer sequence GAAAAA|CC|TTTTTG
+        AlleleRead(prefix="G" + "A" * 5, allele="CC", suffix="T" * 5 + "G", name="longer"),
+        # shorter sequence AAAA|CC|TTTT
+        AlleleRead(prefix="A" * 4, allele="CC", suffix="T" * 4, name="shorter"),
+    ]
+    expected_variant_sequence = VariantSequence(
+        prefix="G" + "A" * 5, alt="CC", suffix="T" * 5 + "G", reads=reads)
+    initial_variant_sequences = initial_variant_sequences_from_reads(reads)
+    # expecting one fewer sequence than reads since two of the reads are
+    # duplicates
+    eq_(len(initial_variant_sequences), len(reads) - 1)
+
+    assembled_variant_sequences = iterative_overlap_assembly(
+        initial_variant_sequences,
+        min_overlap_size=1)
+
+    # since no reads contradict each other then we should get back a single
+    # assembled sequence
+    eq_(len(assembled_variant_sequences),
+        1,
+        "Unexpected number of variant sequences: %s" % (assembled_variant_sequences,))
+    assembled_variant_sequence = assembled_variant_sequences[0]
+    eq_(assembled_variant_sequence, expected_variant_sequence)
+
+    eq_(len(assembled_variant_sequence.reads), len(reads))
+
+    eq_(assembled_variant_sequence.min_coverage(), 1)
+    # 2 bases with 1/4 reads, 2 bases with 3/4 reads, remaining 10 bases with
+    # all 4/4 reads
+    expected_mean_coverage = (2 * 1 + 2 * 3 + 10 * 4) / 14
+    eq_(assembled_variant_sequence.mean_coverage(), expected_mean_coverage)

From dcfd2ed6a0d21209b125851d5e68b03cad972e18 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Fri, 4 Nov 2016 18:03:04 -0400
Subject: [PATCH 11/11] fixed name of reads arg

---
 isovar/reference_coding_sequence_key.py |  6 +--
 isovar/reference_context.py             | 55 +++++++++++++------------
 isovar/translation.py                   |  7 ++--
 test/test_allele_counts.py              |  2 +-
 test/test_variant_sequences.py          |  7 ++--
 5 files changed, 41 insertions(+), 36 deletions(-)

diff --git a/isovar/reference_coding_sequence_key.py b/isovar/reference_coding_sequence_key.py
index 7a9029e..d37bc04 100644
--- a/isovar/reference_coding_sequence_key.py
+++ b/isovar/reference_coding_sequence_key.py
@@ -40,9 +40,9 @@ class ReferenceCodingSequenceKey(
                 "amino_acids_before_variant"))):
 
     """
-    Includes all the fields of a ReferenceSequenceKey, and adds on top of those
-    a reading frame and information about where the start codon / 5' UTR are
-    relative to this sequence fragment.
+    ReferenceCodingSequenceKey includes all the fields of a ReferenceSequenceKey,
+    and additionally tracks the reading frame and information about where the
+    start codon and 5' UTR are relative to this sequence fragment.
     """
 
     @classmethod
diff --git a/isovar/reference_context.py b/isovar/reference_context.py
index b90e3d2..1b0d262 100644
--- a/isovar/reference_context.py
+++ b/isovar/reference_context.py
@@ -34,19 +34,31 @@
 #
 ##########################
 
-ReferenceContext = namedtuple(
-    "ReferenceContext",
-    ReferenceCodingSequenceKey._fields + (
-        "variant",
-        "transcripts")
-)
-
-def sort_key_decreasing_max_length_transcript_cds(reference_context):
-    """
-    Used to sort a sequence of ReferenceContext objects by the longest CDS
-    in each context's list of transcripts.
-    """
-    return -max(len(t.coding_sequence) for t in reference_context.transcripts)
+class ReferenceContext(namedtuple(
+        "ReferenceContext",
+        ReferenceCodingSequenceKey._fields + ("variant", "transcripts"))):
+
+    @classmethod
+    def from_reference_coding_sequence_key(cls, key, variant, transcripts):
+        return cls(
+            strand=key.strand,
+            sequence_before_variant_locus=key.sequence_before_variant_locus,
+            sequence_at_variant_locus=key.sequence_at_variant_locus,
+            sequence_after_variant_locus=key.sequence_after_variant_locus,
+            offset_to_first_complete_codon=key.offset_to_first_complete_codon,
+            contains_start_codon=key.contains_start_codon,
+            overlaps_start_codon=key.overlaps_start_codon,
+            contains_five_prime_utr=key.contains_five_prime_utr,
+            amino_acids_before_variant=key.amino_acids_before_variant,
+            variant=variant,
+            transcripts=transcripts)
+
+    def sort_key_decreasing_max_length_transcript_cds(self):
+        """
+        Used to sort a sequence of ReferenceContext objects by the longest CDS
+        in each context's list of transcripts.
+        """
+        return -max(len(t.coding_sequence) for t in self.transcripts)
 
 def reference_contexts_for_variant(
         variant,
@@ -83,21 +95,12 @@ def reference_contexts_for_variant(
             sequence_groups[sequence_key_with_reading_frame].append(transcript)
 
     reference_contexts = [
-        ReferenceContext(
-            strand=key.strand,
-            sequence_before_variant_locus=key.sequence_before_variant_locus,
-            sequence_at_variant_locus=key.sequence_at_variant_locus,
-            sequence_after_variant_locus=key.sequence_after_variant_locus,
-            offset_to_first_complete_codon=key.offset_to_first_complete_codon,
-            contains_start_codon=key.contains_start_codon,
-            overlaps_start_codon=key.overlaps_start_codon,
-            contains_five_prime_utr=key.contains_five_prime_utr,
-            amino_acids_before_variant=key.amino_acids_before_variant,
-            variant=variant,
-            transcripts=matching_transcripts)
+        ReferenceContext.from_reference_coding_sequence_key(
+            key, variant, matching_transcripts)
         for (key, matching_transcripts) in sequence_groups.items()
     ]
-    reference_contexts.sort(key=sort_key_decreasing_max_length_transcript_cds)
+    reference_contexts.sort(
+        key=ReferenceContext.sort_key_decreasing_max_length_transcript_cds)
     return reference_contexts
 
 def reference_contexts_for_variants(
diff --git a/isovar/translation.py b/isovar/translation.py
index d1ec571..a246201 100644
--- a/isovar/translation.py
+++ b/isovar/translation.py
@@ -26,7 +26,7 @@
 from skbio import DNA
 
 from .reference_context import reference_contexts_for_variant
-from .variant_sequences import supporting_reads_to_variant_sequences
+from .variant_sequences import reads_to_variant_sequences
 
 from .default_parameters import (
     MIN_TRANSCRIPT_PREFIX_LENGTH,
@@ -553,8 +553,9 @@ def translate_variant_reads(
     # need to clip nucleotides at the start/end of the sequence
     cdna_sequence_length = (protein_sequence_length + 1) * 3
 
-    variant_sequences = supporting_reads_to_variant_sequences(
-        variant_reads=variant_reads,
+    variant_sequences = reads_to_variant_sequences(
+        variant=variant,
+        reads=variant_reads,
         preferred_sequence_length=cdna_sequence_length,
         min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence,
         variant_cdna_sequence_assembly=variant_cdna_sequence_assembly)
diff --git a/test/test_allele_counts.py b/test/test_allele_counts.py
index 4d76abe..777b286 100644
--- a/test/test_allele_counts.py
+++ b/test/test_allele_counts.py
@@ -12,7 +12,7 @@ def test_allele_count_dataframe():
     ]
     df = allele_counts_dataframe([(variant, reads)])
     assert len(df) == 1, "Wrong number of rows in DataFrame: %s" % (df,)
-    row = df.irow(0)
+    row = df.iloc[0]
     eq_(row.n_ref, 2)
     eq_(row.n_alt, 1)
     eq_(row.n_other, 0)
diff --git a/test/test_variant_sequences.py b/test/test_variant_sequences.py
index 2191bdf..42ea0a1 100644
--- a/test/test_variant_sequences.py
+++ b/test/test_variant_sequences.py
@@ -16,7 +16,7 @@
 
 from nose.tools import eq_
 from varcode import Variant
-from isovar.variant_sequences import supporting_reads_to_variant_sequences
+from isovar.variant_sequences import reads_to_variant_sequences
 from isovar.variant_reads import reads_supporting_variant
 
 from testing_helpers import load_bam
@@ -34,8 +34,9 @@ def test_sequence_counts_snv():
         chromosome=chromosome,
         variant=variant)
 
-    variant_sequences = supporting_reads_to_variant_sequences(
-        variant_reads,
+    variant_sequences = reads_to_variant_sequences(
+        variant=variant,
+        reads=variant_reads,
         preferred_sequence_length=61)
     assert len(variant_sequences) == 1
     for variant_sequence in variant_sequences: