From d13e68da3b95774c3727184eb8f5fdc764521c2e Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Thu, 27 Oct 2016 19:42:44 -0400 Subject: [PATCH 01/11] assemble reads and then trimmed assembled sequences down to desired levels of coverage --- isovar/allele_reads.py | 26 +-- isovar/assembly.py | 55 ++---- isovar/variant_sequences.py | 176 ++++++++++++++++-- script/isovar-variant-sequences.py | 4 +- .../isovar-translate-variants-results.csv | 12 +- test/test_assembly.py | 11 +- 6 files changed, 203 insertions(+), 81 deletions(-) diff --git a/isovar/allele_reads.py b/isovar/allele_reads.py index 4508559..9d76460 100644 --- a/isovar/allele_reads.py +++ b/isovar/allele_reads.py @@ -84,7 +84,7 @@ def allele_read_from_locus_read(locus_read, n_ref): if ref_pos_before is None: logger.warn( "Missing reference pos for nucleotide before variant on read: %s", - locus_read) + locus_read) return None ref_pos_after = reference_positions[read_pos_after] @@ -92,7 +92,7 @@ def allele_read_from_locus_read(locus_read, n_ref): if ref_pos_after is None: logger.warn( "Missing reference pos for nucleotide after variant on read: %s", - locus_read) + locus_read) return None if n_ref == 0: @@ -102,9 +102,9 @@ def allele_read_from_locus_read(locus_read, n_ref): # don't use this read logger.debug( "Positions before (%d) and after (%d) variant should be adjacent on read %s", - ref_pos_before, - ref_pos_after, - locus_read) + ref_pos_before, + ref_pos_after, + locus_read) return None # insertions require a sequence of non-aligned bases @@ -125,9 +125,9 @@ def allele_read_from_locus_read(locus_read, n_ref): # don't use this read logger.debug( "Positions before (%d) and after (%d) variant should be adjacent on read %s", - ref_pos_before, - ref_pos_after, - locus_read) + ref_pos_before, + ref_pos_after, + locus_read) return None nucleotides_at_variant_locus = sequence[read_pos_before + 1:read_pos_after] @@ -206,9 +206,11 @@ def reads_overlapping_variant( if chromosome is None: chromosome = variant.contig - logger.info("Gathering variant reads for variant %s (chromosome=%s)", + logger.info( + "Gathering variant reads for variant %s (chromosome = %s, gene names = %s)", variant, - chromosome) + chromosome, + variant.gene_names) base1_position, ref, alt = trim_variant(variant) @@ -271,7 +273,9 @@ def reads_overlapping_variants(variants, samfile, **kwargs): else: logger.warn( "Chromosome '%s' from variant %s not in alignment file %s", - chromosome, variant, samfile.filename) + chromosome, + variant, + samfile.filename) continue allele_reads = reads_overlapping_variant( samfile=samfile, diff --git a/isovar/assembly.py b/isovar/assembly.py index c19156f..6edd2b6 100644 --- a/isovar/assembly.py +++ b/isovar/assembly.py @@ -19,9 +19,6 @@ # I'd rather just use list.copy but Python 2.7 doesn't support it from copy import copy -from .read_helpers import group_unique_sequences -from .variant_sequences import VariantSequence - logger = logging.getLogger(__name__) @@ -68,14 +65,6 @@ def sort_by_decreasing_total_length(seq): """ return -len(seq.sequence) -def combine_variant_sequences(variant_sequence1, variant_sequence2): - return VariantSequence( - prefix=variant_sequence1.prefix, - alt=variant_sequence1.alt, - suffix=variant_sequence2.suffix, - reads=set( - variant_sequence1.reads).union(set(variant_sequence2.reads))) - def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE): """ Greedily merge overlapping sequences into longer sequences. @@ -103,10 +92,15 @@ def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE): variant_sequence2, min_overlap_size=min_overlap_size): continue - combined = combine_variant_sequences( - variant_sequence1, variant_sequence2) + combined = variant_sequence1.combine(variant_sequence2) if len(combined) <= max(len(variant_sequence1), len(variant_sequence2)): # if the combined sequence isn't any longer, then why bother? + logger.info( + "Skipping %s since doesn't increase length beyond max(%d, %d)", + combined, + len(variant_sequence1), + len(variant_sequence2) + ) continue if combined.sequence in merged_variant_sequences: # it's possible to get the same merged sequence from distinct @@ -120,8 +114,7 @@ def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE): # sequence is supported by any one read. existing_record_with_same_sequence = merged_variant_sequences[ combined.sequence] - combined_with_more_reads = combine_variant_sequences( - existing_record_with_same_sequence, + combined_with_more_reads = existing_record_with_same_sequence.combine( combined) merged_variant_sequences[combined.sequence] = combined_with_more_reads else: @@ -168,7 +161,7 @@ def sort_by_decreasing_read_count_and_sequence_lenth(variant_sequence): """ return -len(variant_sequence.reads), -len(variant_sequence.sequence) -def iterative_assembly( +def iterative_overlap_assembly( variant_sequences, min_overlap_size=30, n_merge_iters=2): @@ -185,9 +178,9 @@ def iterative_assembly( logger.info( "Iteration #%d of assembly: generated %d sequences (from %d)", - i + 1, - len(variant_sequences), - len(previous_sequences)) + i + 1, + len(variant_sequences), + len(previous_sequences)) if len(variant_sequences) == 0: # if the greedy merge procedure fails for all pairs of candidate @@ -199,31 +192,9 @@ def iterative_assembly( variant_sequences = collapse_substrings(variant_sequences) logger.info( "After collapsing subsequences, %d distinct sequences left", - len(variant_sequences)) + len(variant_sequences)) if len(variant_sequences) == 1: # once we have only one sequence then there's no point trying # to further combine sequences break return variant_sequences - -def assemble_reads_into_variant_sequences( - variant_reads, - min_overlap_size=30, - n_merge_iters=2): - """ - Turn a collection of AlleleRead objects into a collection of VariantSequence - objects, which are returned in order of (# supported reads, sequence length) - """ - initial_variant_sequences = [ - VariantSequence( - prefix=prefix, - alt=alt, - suffix=suffix, - reads=reads) - for (prefix, alt, suffix), reads in - group_unique_sequences(variant_reads).items() - ] - return iterative_assembly( - initial_variant_sequences, - min_overlap_size=min_overlap_size, - n_merge_iters=n_merge_iters) diff --git a/isovar/variant_sequences.py b/isovar/variant_sequences.py index a12fa0c..d62fc8e 100644 --- a/isovar/variant_sequences.py +++ b/isovar/variant_sequences.py @@ -13,8 +13,10 @@ # limitations under the License. from __future__ import print_function, division, absolute_import -from collections import namedtuple import logging +from collections import namedtuple + +import numpy as np from .read_helpers import ( get_single_allele_from_reads, @@ -26,11 +28,10 @@ VARIANT_CDNA_SEQUENCE_LENGTH, ) from .dataframe_builder import dataframe_from_generator - +from .assembly import iterative_overlap_assembly, collapse_substrings logger = logging.getLogger(__name__) - # using this base class to define the core fields of a VariantSequence # but inheriting it from it to allow the addition of helper methods VariantSequenceFields = namedtuple( @@ -81,8 +82,6 @@ def contains(self, other): def left_overlaps(self, other, min_overlap_size=1): """ Does this VariantSequence overlap another on the left side? - The alleles must match and they must share at least `min_overlap_size` - nucleotides in their prefix or suffix. """ if self.alt != other.alt: @@ -107,10 +106,14 @@ def left_overlaps(self, other, min_overlap_size=1): # p2 a2 s2 = XX Y ZZZZZZZZZ # ... # then we can combine them into a longer sequence - return ( + sequence_overlaps = ( self.prefix.endswith(other.prefix) and other.suffix.startswith(self.suffix) ) + prefix_overlap_size = len(self.prefix) - len(other.prefix) + suffix_overlap_size = len(other.suffix) - len(self.suffix) + return sequence_overlaps and ( + (prefix_overlap_size + suffix_overlap_size) >= min_overlap_size) def add_reads(self, reads): """ @@ -122,6 +125,106 @@ def add_reads(self, reads): suffix=self.suffix, reads=self.reads.union(reads)) + def combine(self, other_sequence): + """ + If this sequence is the prefix of another sequence, combine + them into a single VariantSequence object. + """ + if other_sequence.alt != self.alt: + raise ValueError( + "Cannot combine %s and %s with mismatching alt sequences" % ( + self, + other_sequence)) + elif self.left_overlaps(other_sequence): + # If sequences are like AABC and ABCC + return VariantSequence( + prefix=self.prefix, + alt=self.alt, + suffix=other_sequence.suffix, + reads=self.reads.union(other_sequence.reads)) + elif other_sequence.left_overlaps(self): + return other_sequence.combine(self) + elif self.contains(other_sequence): + return self.add_reads(other_sequence.reads) + elif other_sequence.contains(self): + return other_sequence.add_reads(self.reads) + else: + raise ValueError("%s does not overlap with %s" % (self, other_sequence)) + + def variant_indices(self): + """ + When we combine prefix + alt + suffix into a single string, + what are is base-0 index interval which gets us back the alt + sequence? First returned index is inclusive, the second is exclusive. + """ + variant_start_index = len(self.prefix) + variant_len = len(self.alt) + variant_end_index = variant_start_index + variant_len + return variant_start_index, variant_end_index + + def coverage(self): + """ + Returns NumPy array indicating number of reads covering each + nucleotides of this sequence. + """ + variant_start_index, variant_end_index = self.variant_indices() + n_nucleotides = len(self) + coverage_array = np.zeros(n_nucleotides, dtype="int32") + for read in self.reads: + coverage_array[ + max(0, variant_start_index - len(read.prefix)): + min(n_nucleotides, variant_end_index + len(read.suffix))] += 1 + return coverage_array + + def min_coverage(self): + return np.min(self.coverage()) + + def mean_coverage(self): + return np.mean(self.coverage()) + + def trim_by_coverage(self, min_reads): + """ + Given the min number of reads overlapping each nucleotide of + a variant sequence, trim this sequence by getting rid of positions + which are overlapped by fewer reads than specified. + """ + read_count_array = self.coverage() + + sufficient_coverage_mask = read_count_array >= min_reads + sufficient_coverage_indices = np.argwhere(sufficient_coverage_mask) + if len(sufficient_coverage_indices) == 0: + logger.debug("No bases in %s have coverage >= %d" % (self, min_reads)) + return VariantSequence(prefix="", alt="", suffix="", reads=self.reads) + variant_start_index, variant_end_index = self.variant_indices() + # assuming that coverage drops off monotonically away from + # variant nucleotides + first_covered_index = sufficient_coverage_indices.min() + last_covered_index = sufficient_coverage_indices.max() + # adding 1 to last_covered_index since it's an inclusive index + # whereas variant_end_index is the end of a half-open interval + if (first_covered_index > variant_start_index or + last_covered_index + 1 < variant_end_index): + # Example: + # Nucleotide sequence: + # ACCCTTTT|AA|GGCGCGCC + # Coverage: + # 12222333|44|33333211 + # Then the mask for bases covered >= 4x would be: + # ________|**|________ + # with indices: + # first_covered_index = 9 + # last_covered_index = 10 + # variant_start_index = 9 + # variant_end_index = 11 + logger.debug("Some variant bases in %s don't have coverage >= %d" % ( + self, min_reads)) + return VariantSequence(prefix="", alt="", suffix="", reads=self.reads) + return VariantSequence( + prefix=self.prefix[first_covered_index:], + alt=self.alt, + suffix=self.suffix[:last_covered_index - variant_end_index + 1], + reads=self.reads) + def sort_key_decreasing_read_count(variant_sequence): return -len(variant_sequence.reads) @@ -148,6 +251,7 @@ def all_variant_sequences_supported_by_variant_reads( for ((prefix, alt, suffix), reads) in unique_sequence_groups.items() ] + def filter_variant_sequences_by_read_support( variant_sequences, min_reads_supporting_cdna_sequence): @@ -155,11 +259,12 @@ def filter_variant_sequences_by_read_support( variant_sequences = [ s for s in variant_sequences - if len(s.reads) >= min_reads_supporting_cdna_sequence + if s.min_coverage() >= min_reads_supporting_cdna_sequence ] n_dropped = n_total - len(variant_sequences) if n_dropped > 0: - logger.info("Dropped %d/%d variant sequences less than %d supporting reads", + logger.info( + "Dropped %d/%d variant sequences less than %d supporting reads", n_dropped, n_total, min_reads_supporting_cdna_sequence) @@ -179,19 +284,38 @@ def filter_variant_sequences_by_length( min_required_sequence_length = min( max_observed_sequence_length, preferred_sequence_length) - variant_sequences = [ s for s in variant_sequences if len(s.sequence) >= min_required_sequence_length ] n_dropped = n_total - len(variant_sequences) if n_dropped > 0: - logger.info("Dropped %d/%d variant sequences shorter than %d", + logger.info( + "Dropped %d/%d variant sequences shorter than %d", n_dropped, n_total, min_required_sequence_length) return variant_sequences +def trim_variant_sequences(variant_sequences, min_reads_supporting_cdna_sequence): + """ + Trim VariantSequences to desired coverage and then combine any + subsequences which get generated. + """ + n_total = len(variant_sequences) + trimmed_variant_sequences = [ + variant_sequence.trim_by_coverage(min_reads_supporting_cdna_sequence) + for variant_sequence in variant_sequences + ] + collapsed_variant_sequences = collapse_substrings(trimmed_variant_sequences) + n_after_trimming = len(collapsed_variant_sequences) + logger.info( + "Kept %d/%d variant sequences after read coverage trimming to >=%dx", + n_after_trimming, + n_total, + min_reads_supporting_cdna_sequence) + return collapsed_variant_sequences + def filter_variant_sequences( variant_sequences, preferred_sequence_length, @@ -200,9 +324,8 @@ def filter_variant_sequences( Drop variant sequences which are shorter than request or don't have enough supporting reads. """ - variant_sequences = filter_variant_sequences_by_read_support( - variant_sequences=variant_sequences, - min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence) + variant_sequences = trim_variant_sequences( + variant_sequences, min_reads_supporting_cdna_sequence) return filter_variant_sequences_by_length( variant_sequences=variant_sequences, preferred_sequence_length=preferred_sequence_length) @@ -259,11 +382,38 @@ def supporting_reads_to_variant_sequences( max_nucleotides_before_variant=max_nucleotides_before_variant, max_nucleotides_after_variant=max_nucleotides_after_variant) + logger.info( + "Initial pool of %d variant sequences (min length=%d, max length=%d)", + len(variant_sequences), + min(len(s) for s in variant_sequences), + max(len(s) for s in variant_sequences)) + + variant_sequences = iterative_overlap_assembly(variant_sequences) + + if variant_sequences: + logger.info( + "After overlap assembly: %d variant sequences (min length=%d, max length=%d)", + len(variant_sequences), + min(len(s) for s in variant_sequences), + max(len(s) for s in variant_sequences)) + else: + logger.info("After overlap assembly: 0 variant sequences") + variant_sequences = filter_variant_sequences( variant_sequences=variant_sequences, preferred_sequence_length=preferred_sequence_length, min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence) + if variant_sequences: + logger.info( + ("After coverage & length filtering: %d variant sequences " + "(min length=%d, max length=%d)"), + len(variant_sequences), + min(len(s) for s in variant_sequences), + max(len(s) for s in variant_sequences)) + else: + logger.info("After coverage & length filtering: 0 variant sequences") + # sort VariantSequence objects by decreasing order of supporting read # counts variant_sequences.sort(key=sort_key_decreasing_read_count) diff --git a/script/isovar-variant-sequences.py b/script/isovar-variant-sequences.py index 122908c..397fc69 100644 --- a/script/isovar-variant-sequences.py +++ b/script/isovar-variant-sequences.py @@ -24,8 +24,8 @@ variant_sequences_dataframe_from_args ) - -logging.config.fileConfig(pkg_resources.resource_filename('isovar.cli', 'logging.conf')) +logging.config.fileConfig( + pkg_resources.resource_filename('isovar.cli', 'logging.conf')) logger = logging.getLogger(__name__) parser = make_variant_sequences_arg_parser(add_sequence_length_arg=True) diff --git a/test/data/b16.f10/isovar-translate-variants-results.csv b/test/data/b16.f10/isovar-translate-variants-results.csv index bc59adb..f70f03f 100644 --- a/test/data/b16.f10/isovar-translate-variants-results.csv +++ b/test/data/b16.f10/isovar-translate-variants-results.csv @@ -1,9 +1,3 @@ -,chr,pos,ref,alt,amino_acids,variant_aa_interval_start,variant_aa_interval_end,ends_with_stop_codon,frameshift,translations,supporting_variant_reads,total_variant_reads,supporting_transcripts,total_transcripts,gene -0,13,5864876,C,CG,DSQEDLWTKFILARGGEEGGIRTEDFF,14,27,False,True,1,16,27,1,1,Klf6 -1,13,5864876,C,CG,DSQEDLWTKFILARGGRRRRNQN,14,23,True,True,1,8,27,1,1,Klf6 -2,13,5864876,C,CG,DSQEDLWTKFILARGGEEGGTTDCARN,14,27,False,True,1,2,27,1,1,Klf6 -3,13,5864876,C,CG,DSQEDLWTKFILARGGEEGGIRTDDFF,14,27,False,True,1,1,27,1,1,Klf6 -4,4,45802539,G,C,CLQGRTTSYSTAAPLPNPIPNPEICY,14,15,False,False,1,3,3,1,1,Aldh1b1 -5,9,82927102,G,T,TMVAWDRHDNTVINYNCVVMSIPSYHS,14,15,False,False,2,11,13,1,1,Phip -6,9,82927102,G,T,TMVAWDRHDNTVIQYNCVVMSIPSYHS,14,15,False,False,1,2,13,1,1,Phip -7,X,8125624,C,A,NKLQGHSAPVLDVIDVKHGRAVALQLV,14,15,False,False,2,8,8,3,3,Wdr13 +,chr,pos,ref,alt,amino_acids,variant_aa_interval_start,variant_aa_interval_end,ends_with_stop_codon,frameshift,translations,overlapping_reads,ref_reads,alt_reads,alt_reads_supporting_protein_sequence,transcripts_overlapping_variant,transcripts_supporting_protein_sequence,gene +0,9,82927102,G,T,AWDRHDNTVIHAVNNMTLKV,10,11,False,False,1,56,39,17,5,1,1,Phip +1,X,8125624,C,A,QGHSAPVLDVIVNCDESLLA,10,11,False,False,1,71,46,25,8,3,3,Wdr13 diff --git a/test/test_assembly.py b/test/test_assembly.py index 3a8fda1..a5cf7c6 100644 --- a/test/test_assembly.py +++ b/test/test_assembly.py @@ -14,8 +14,11 @@ from __future__ import print_function, division, absolute_import -from isovar.variant_reads import reads_supporting_variant -from isovar.assembly import assemble_reads_into_variant_sequences +from isovar.variant_reads import ( + reads_supporting_variant, + all_variant_sequences_supported_by_variant_reads, +) +from isovar.assembly import iterative_overlap_assembly from pyensembl import ensembl_grch38 from varcode import Variant from nose.tools import eq_ @@ -39,8 +42,8 @@ def test_assemble_transcript_fragments_snv(): samfile=samfile, chromosome=chromosome,) - sequences = assemble_reads_into_variant_sequences( - variant_reads, + sequences = iterative_overlap_assembly( + all_variant_sequences_supported_by_variant_reads(variant_reads), min_overlap_size=30) assert len(sequences) > 0 From 0e653dd04f65421c7fe4cc7b11fc342d4a805cb9 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Thu, 27 Oct 2016 21:18:16 -0400 Subject: [PATCH 02/11] testing with and without read overlap assembly --- isovar/assembly.py | 10 ---------- isovar/cli/variant_sequences.py | 8 +++++++- isovar/default_parameters.py | 5 +++++ isovar/protein_sequences.py | 8 ++++++-- isovar/translation.py | 21 +++++++++++++-------- isovar/variant_sequences.py | 31 ++++++++++++++++++++++--------- test/test_assembly.py | 8 +++----- test/test_protein_sequences.py | 26 ++++++++++++++------------ 8 files changed, 70 insertions(+), 47 deletions(-) diff --git a/isovar/assembly.py b/isovar/assembly.py index 6edd2b6..86c78fc 100644 --- a/isovar/assembly.py +++ b/isovar/assembly.py @@ -22,7 +22,6 @@ logger = logging.getLogger(__name__) - MIN_OVERLAP_SIZE = 30 def sort_by_decreasing_prefix_length(seq): @@ -93,15 +92,6 @@ def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE): min_overlap_size=min_overlap_size): continue combined = variant_sequence1.combine(variant_sequence2) - if len(combined) <= max(len(variant_sequence1), len(variant_sequence2)): - # if the combined sequence isn't any longer, then why bother? - logger.info( - "Skipping %s since doesn't increase length beyond max(%d, %d)", - combined, - len(variant_sequence1), - len(variant_sequence2) - ) - continue if combined.sequence in merged_variant_sequences: # it's possible to get the same merged sequence from distinct # input sequences diff --git a/isovar/cli/variant_sequences.py b/isovar/cli/variant_sequences.py index b27fd89..14579a5 100644 --- a/isovar/cli/variant_sequences.py +++ b/isovar/cli/variant_sequences.py @@ -38,6 +38,11 @@ def add_variant_sequence_args( default=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, help="Minimum number of reads supporting a variant sequence (default %(default)s)") + rna_sequence_group.add_argument( + "--overlap-assembly", + default=False, + action="store_true") + # when cDNA sequence length can be inferred from a protein length then # we may want to omit this arg if add_sequence_length_arg: @@ -76,7 +81,8 @@ def variant_sequences_generator_from_args(args): return reads_generator_to_sequences_generator( allele_reads_generator, min_reads_supporting_cdna_sequence=args.min_reads_supporting_variant_sequence, - preferred_sequence_length=args.cdna_sequence_length) + preferred_sequence_length=args.cdna_sequence_length, + overlap_assembly=args.overlap_assembly) def variant_sequences_dataframe_from_args(args): variant_sequences_generator = variant_sequences_generator_from_args(args) diff --git a/isovar/default_parameters.py b/isovar/default_parameters.py index b279b05..1bbab69 100644 --- a/isovar/default_parameters.py +++ b/isovar/default_parameters.py @@ -72,3 +72,8 @@ # number of protein sequences we want to return per variant MAX_PROTEIN_SEQUENCES_PER_VARIANT = 1 + +# run overlap assembly algorithm to construct variant +# sequences from multiple reads which only partially +# overlap (rather than fully spanning a coding sequence) +VARIANT_CDNA_SEQUENCE_ASSEMBLY = False diff --git a/isovar/protein_sequences.py b/isovar/protein_sequences.py index ff3b158..af55d21 100644 --- a/isovar/protein_sequences.py +++ b/isovar/protein_sequences.py @@ -28,6 +28,7 @@ PROTEIN_SEQUENCE_LENGTH, MAX_PROTEIN_SEQUENCES_PER_VARIANT, MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, + VARIANT_CDNA_SEQUENCE_ASSEMBLY ) from .dataframe_builder import dataframe_from_generator from .translation import translate_variant_reads, TranslationKey @@ -158,7 +159,8 @@ def reads_generator_to_protein_sequences_generator( min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, min_transcript_prefix_length=MIN_TRANSCRIPT_PREFIX_LENGTH, max_transcript_mismatches=MAX_REFERENCE_TRANSCRIPT_MISMATCHES, - max_protein_sequences_per_variant=MAX_PROTEIN_SEQUENCES_PER_VARIANT): + max_protein_sequences_per_variant=MAX_PROTEIN_SEQUENCES_PER_VARIANT, + variant_cdna_sequence_assembly=VARIANT_CDNA_SEQUENCE_ASSEMBLY): """" Translates each coding variant in a collection to one or more Translation objects, which are then aggregated into equivalent @@ -190,6 +192,7 @@ def reads_generator_to_protein_sequences_generator( max_protein_sequences_per_variant : int Number of protein sequences to return for each ProteinSequence + variant_cdna_sequence_assembly : bool Yields pairs of a Variant and a list of ProteinSequence objects """ @@ -214,7 +217,8 @@ def reads_generator_to_protein_sequences_generator( protein_sequence_length=protein_sequence_length, min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence, min_transcript_prefix_length=min_transcript_prefix_length, - max_transcript_mismatches=max_transcript_mismatches) + max_transcript_mismatches=max_transcript_mismatches, + variant_cdna_sequence_assembly=variant_cdna_sequence_assembly) protein_sequences = [] for (key, equivalent_translations) in group_translations(translations).items(): diff --git a/isovar/translation.py b/isovar/translation.py index fc87fed..d1ec571 100644 --- a/isovar/translation.py +++ b/isovar/translation.py @@ -33,6 +33,7 @@ MAX_REFERENCE_TRANSCRIPT_MISMATCHES, PROTEIN_SEQUENCE_LENGTH, MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, + VARIANT_CDNA_SEQUENCE_ASSEMBLY, ) from .dataframe_builder import dataframe_from_generator @@ -467,9 +468,9 @@ def translate_variant_sequence( logger.warn( ("Truncating amino acid sequence %s from variant sequence %s " "to only %d elements loses all variant residues"), - variant_amino_acids, - variant_sequence, - protein_sequence_length) + variant_amino_acids, + variant_sequence, + protein_sequence_length) return None # if the protein is too long then short it, which implies we're no longer # stopping due to a stop codon and that the variant amino acids might @@ -542,7 +543,8 @@ def translate_variant_reads( transcript_id_whitelist=None, min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, min_transcript_prefix_length=MIN_TRANSCRIPT_PREFIX_LENGTH, - max_transcript_mismatches=MAX_REFERENCE_TRANSCRIPT_MISMATCHES): + max_transcript_mismatches=MAX_REFERENCE_TRANSCRIPT_MISMATCHES, + variant_cdna_sequence_assembly=VARIANT_CDNA_SEQUENCE_ASSEMBLY): if len(variant_reads) == 0: logger.info("No supporting reads for variant %s", variant) return [] @@ -554,7 +556,8 @@ def translate_variant_reads( variant_sequences = supporting_reads_to_variant_sequences( variant_reads=variant_reads, preferred_sequence_length=cdna_sequence_length, - min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence) + min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence, + variant_cdna_sequence_assembly=variant_cdna_sequence_assembly) if not variant_sequences: logger.info("No spanning cDNA sequences for variant %s", variant) @@ -572,7 +575,7 @@ def translate_variant_reads( if context_size < min_transcript_prefix_length: logger.info( "Skipping variant %s, none of the cDNA sequences have sufficient context", - variant) + variant) return [] reference_contexts = reference_contexts_for_variant( @@ -592,7 +595,8 @@ def translate_variants( protein_sequence_length=PROTEIN_SEQUENCE_LENGTH, min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, min_transcript_prefix_length=MIN_TRANSCRIPT_PREFIX_LENGTH, - max_transcript_mismatches=MAX_REFERENCE_TRANSCRIPT_MISMATCHES): + max_transcript_mismatches=MAX_REFERENCE_TRANSCRIPT_MISMATCHES, + variant_cdna_sequence_assembly=VARIANT_CDNA_SEQUENCE_ASSEMBLY): """ Translates each coding variant in a collection to one or more protein fragment sequences (if the variant is not filtered and its spanning RNA @@ -636,7 +640,8 @@ def translate_variants( transcript_id_whitelist=transcript_id_whitelist, min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence, min_transcript_prefix_length=min_transcript_prefix_length, - max_transcript_mismatches=max_transcript_mismatches) + max_transcript_mismatches=max_transcript_mismatches, + variant_cdna_sequence_assembly=variant_cdna_sequence_assembly) yield variant, translations def translations_generator_to_dataframe(translations_generator): diff --git a/isovar/variant_sequences.py b/isovar/variant_sequences.py index d62fc8e..ba02b58 100644 --- a/isovar/variant_sequences.py +++ b/isovar/variant_sequences.py @@ -26,6 +26,7 @@ from .default_parameters import ( MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, VARIANT_CDNA_SEQUENCE_LENGTH, + VARIANT_CDNA_SEQUENCE_ASSEMBLY, ) from .dataframe_builder import dataframe_from_generator from .assembly import iterative_overlap_assembly, collapse_substrings @@ -112,8 +113,8 @@ def left_overlaps(self, other, min_overlap_size=1): ) prefix_overlap_size = len(self.prefix) - len(other.prefix) suffix_overlap_size = len(other.suffix) - len(self.suffix) - return sequence_overlaps and ( - (prefix_overlap_size + suffix_overlap_size) >= min_overlap_size) + overlap_size = prefix_overlap_size + suffix_overlap_size + len(self.alt) + return sequence_overlaps and overlap_size >= min_overlap_size def add_reads(self, reads): """ @@ -189,7 +190,7 @@ def trim_by_coverage(self, min_reads): which are overlapped by fewer reads than specified. """ read_count_array = self.coverage() - + logger.info("%s (len=%d)" % (read_count_array, len(read_count_array))) sufficient_coverage_mask = read_count_array >= min_reads sufficient_coverage_indices = np.argwhere(sufficient_coverage_mask) if len(sufficient_coverage_indices) == 0: @@ -228,7 +229,7 @@ def trim_by_coverage(self, min_reads): def sort_key_decreasing_read_count(variant_sequence): return -len(variant_sequence.reads) -def all_variant_sequences_supported_by_variant_reads( +def initial_variant_sequences_from_reads( variant_reads, max_nucleotides_before_variant=None, max_nucleotides_after_variant=None): @@ -333,7 +334,8 @@ def filter_variant_sequences( def supporting_reads_to_variant_sequences( variant_reads, preferred_sequence_length, - min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE): + min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, + variant_cdna_sequence_assembly=VARIANT_CDNA_SEQUENCE_ASSEMBLY): """ Collapse variant-support RNA reads into consensus sequences of approximately the preferred length (may differ at the ends of transcripts), filter @@ -355,6 +357,10 @@ def supporting_reads_to_variant_sequences( Drop sequences which don't at least have this number of fully spanning reads. + overlap_assembly : bool + Construct variant sequences by merging overlapping reads. If False + then variant sequences must be fully spanned by cDNA reads. + Returns a collection of VariantSequence objects """ # just in case variant_reads is a generator, convert it to a list @@ -377,7 +383,7 @@ def supporting_reads_to_variant_sequences( max_nucleotides_before_variant = ( n_surrounding_nucleotides - max_nucleotides_after_variant) - variant_sequences = all_variant_sequences_supported_by_variant_reads( + variant_sequences = initial_variant_sequences_from_reads( variant_reads=variant_reads, max_nucleotides_before_variant=max_nucleotides_before_variant, max_nucleotides_after_variant=max_nucleotides_after_variant) @@ -388,7 +394,8 @@ def supporting_reads_to_variant_sequences( min(len(s) for s in variant_sequences), max(len(s) for s in variant_sequences)) - variant_sequences = iterative_overlap_assembly(variant_sequences) + if variant_cdna_sequence_assembly: + variant_sequences = iterative_overlap_assembly(variant_sequences) if variant_sequences: logger.info( @@ -433,7 +440,8 @@ def overlapping_reads_to_variant_sequences( def reads_generator_to_sequences_generator( variant_and_reads_generator, min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, - preferred_sequence_length=VARIANT_CDNA_SEQUENCE_LENGTH): + preferred_sequence_length=VARIANT_CDNA_SEQUENCE_LENGTH, + variant_cdna_sequence_assembly=VARIANT_CDNA_SEQUENCE_ASSEMBLY): """ For each variant, collect all possible sequence contexts around the variant which are spanned by at least min_reads. @@ -450,6 +458,10 @@ def reads_generator_to_sequences_generator( sequence_length : int Desired sequence length, including variant nucleotides + overlap_assembly : bool + Construct variant sequences by merging overlapping reads. If False + then variant sequences must be fully spanned by cDNA reads. + Yields pairs with the following fields: - Variant - list of VariantSequence objects @@ -459,7 +471,8 @@ def reads_generator_to_sequences_generator( variant=variant, overlapping_reads=variant_reads, min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence, - preferred_sequence_length=preferred_sequence_length) + preferred_sequence_length=preferred_sequence_length, + variant_cdna_sequence_assembly=variant_cdna_sequence_assembly) yield variant, variant_sequences def variant_sequences_generator_to_dataframe(variant_sequences_generator): diff --git a/test/test_assembly.py b/test/test_assembly.py index a5cf7c6..6929e0c 100644 --- a/test/test_assembly.py +++ b/test/test_assembly.py @@ -14,10 +14,8 @@ from __future__ import print_function, division, absolute_import -from isovar.variant_reads import ( - reads_supporting_variant, - all_variant_sequences_supported_by_variant_reads, -) +from isovar.variant_reads import reads_supporting_variant +from isovar.variant_sequences import initial_variant_sequences_from_reads from isovar.assembly import iterative_overlap_assembly from pyensembl import ensembl_grch38 from varcode import Variant @@ -43,7 +41,7 @@ def test_assemble_transcript_fragments_snv(): chromosome=chromosome,) sequences = iterative_overlap_assembly( - all_variant_sequences_supported_by_variant_reads(variant_reads), + initial_variant_sequences_from_reads(variant_reads), min_overlap_size=30) assert len(sequences) > 0 diff --git a/test/test_protein_sequences.py b/test/test_protein_sequences.py index caf129a..08bb822 100644 --- a/test/test_protein_sequences.py +++ b/test/test_protein_sequences.py @@ -166,18 +166,20 @@ def test_variants_to_protein_sequences_dataframe_one_sequence_per_variant(): samfile=samfile, min_mapping_quality=0) - protein_sequences_generator = reads_generator_to_protein_sequences_generator( - allele_reads_generator, - max_protein_sequences_per_variant=1) - df = protein_sequences_generator_to_dataframe(protein_sequences_generator) - print(df) - eq_( - len(df), - len(expressed_variants), - "Expected %d/%d entries to have RNA support, got %d" % ( + for overlap_assembly in [False, True]: + protein_sequences_generator = reads_generator_to_protein_sequences_generator( + allele_reads_generator, + max_protein_sequences_per_variant=1, + variant_cdna_sequence_assembly=overlap_assembly) + df = protein_sequences_generator_to_dataframe(protein_sequences_generator) + print(df) + eq_( + len(df), len(expressed_variants), - len(combined_variants), - len(df),)) + "Expected %d/%d entries to have RNA support, got %d" % ( + len(expressed_variants), + len(combined_variants), + len(df),)) def test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality(): # since the B16 BAM has all MAPQ=255 values then all the reads should get dropped @@ -207,7 +209,7 @@ def test_variants_to_protein_sequences_dataframe_protein_sequence_length(): "--vcf", data_path("data/b16.f10/b16.vcf"), "--bam", data_path("data/b16.f10/b16.combined.sorted.bam"), "--max-protein-sequences-per-variant", "1", - "--protein-sequence-length", str(desired_length) + "--protein-sequence-length", str(desired_length), ]) df = protein_sequences_dataframe_from_args(args) eq_( From b80d3d47d97fe499aaadfc23ddcdb526a7318448 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Mon, 31 Oct 2016 17:51:43 -0400 Subject: [PATCH 03/11] got unit tests with assmembly passing, working on adding more --- isovar/protein_sequences.py | 20 ++++++++-- isovar/variant_sequences.py | 3 +- test/test_assembly.py | 7 ++++ test/test_protein_sequences.py | 70 +++++++++++++++++++++++----------- 4 files changed, 74 insertions(+), 26 deletions(-) diff --git a/isovar/protein_sequences.py b/isovar/protein_sequences.py index af55d21..34429c2 100644 --- a/isovar/protein_sequences.py +++ b/isovar/protein_sequences.py @@ -21,6 +21,7 @@ from __future__ import print_function, division, absolute_import from collections import namedtuple, defaultdict +import logging from .default_parameters import ( MIN_TRANSCRIPT_PREFIX_LENGTH, @@ -48,6 +49,8 @@ # ########################## +logger = logging.getLogger(__name__) + ProteinSequence = namedtuple( "ProteinSequence", TranslationKey._fields + ( @@ -66,10 +69,10 @@ # number of unique read names from all the VariantSequence objects # from each translation "alt_reads_supporting_protein_sequence", - # how many reference transcripts overlap the variant locus? + # IDs of transcripts overlapping the variant locus "transcripts_overlapping_variant", - # how many reference transcripts were used to establish the - # reading frame for this protein sequence + # IDs of reference transcripts used to establish the reading frame for + # this protein sequence "transcripts_supporting_protein_sequence", # name of gene of the reference transcripts used in Translation # objects @@ -193,6 +196,9 @@ def reads_generator_to_protein_sequences_generator( Number of protein sequences to return for each ProteinSequence variant_cdna_sequence_assembly : bool + If True, then assemble variant cDNA sequences based on overlap of + RNA reads. If False, then variant cDNA sequences must be fully spanned + and contained within RNA reads. Yields pairs of a Variant and a list of ProteinSequence objects """ @@ -222,11 +228,18 @@ def reads_generator_to_protein_sequences_generator( protein_sequences = [] for (key, equivalent_translations) in group_translations(translations).items(): + # get the variant read names, transcript IDs and gene names for # protein sequence we're about to construct alt_reads_supporting_protein_sequence, group_transcript_ids, group_gene_names = \ summarize_translations(equivalent_translations) + logger.info( + "%s: %s alt reads supporting protein sequence (gene names = %s)", + key, + len(alt_reads_supporting_protein_sequence), + group_gene_names) + protein_sequence = translation_key_to_protein_sequence( translation_key=key, translations=equivalent_translations, @@ -237,6 +250,7 @@ def reads_generator_to_protein_sequences_generator( transcripts_supporting_protein_sequence=group_transcript_ids, transcripts_overlapping_variant=overlapping_transcript_ids, gene=list(group_gene_names)) + logger.info("%s: protein sequence = %s" % (key, protein_sequence.amino_acids)) protein_sequences.append(protein_sequence) # sort protein sequences before returning the top results diff --git a/isovar/variant_sequences.py b/isovar/variant_sequences.py index ba02b58..822607e 100644 --- a/isovar/variant_sequences.py +++ b/isovar/variant_sequences.py @@ -190,7 +190,8 @@ def trim_by_coverage(self, min_reads): which are overlapped by fewer reads than specified. """ read_count_array = self.coverage() - logger.info("%s (len=%d)" % (read_count_array, len(read_count_array))) + logger.info("Coverage: %s (len=%d)" % ( + read_count_array, len(read_count_array))) sufficient_coverage_mask = read_count_array >= min_reads sufficient_coverage_indices = np.argwhere(sufficient_coverage_mask) if len(sufficient_coverage_indices) == 0: diff --git a/test/test_assembly.py b/test/test_assembly.py index 6929e0c..1bfe127 100644 --- a/test/test_assembly.py +++ b/test/test_assembly.py @@ -16,6 +16,8 @@ from isovar.variant_reads import reads_supporting_variant from isovar.variant_sequences import initial_variant_sequences_from_reads +from isovar.allele_reads import AlleleRead +from isovar.variant_sequences import VariantSequence from isovar.assembly import iterative_overlap_assembly from pyensembl import ensembl_grch38 from varcode import Variant @@ -57,6 +59,11 @@ def test_assemble_transcript_fragments_snv(): assert len(s) > max_read_length, \ "Expected assembled sequences to be longer than read length (%d)" % ( max_read_length,) +""" +def test_assemble_simple_sequences(): + AlleleRead + VariantSequence +""" if __name__ == "__main__": test_assemble_transcript_fragments_snv() diff --git a/test/test_protein_sequences.py b/test/test_protein_sequences.py index 08bb822..5973203 100644 --- a/test/test_protein_sequences.py +++ b/test/test_protein_sequences.py @@ -152,34 +152,58 @@ def test_sort_protein_sequences(): ] eq_(sort_protein_sequences(unsorted_protein_sequences), expected_order) - -def test_variants_to_protein_sequences_dataframe_one_sequence_per_variant(): - expressed_variants = load_vcf("data/b16.f10/b16.expressed.vcf") - not_expressed_variants = load_vcf("data/b16.f10/b16.not-expressed.vcf") +def variants_to_protein_sequences_dataframe( + expressed_vcf="data/b16.f10/b16.expressed.vcf", + not_expressed_vcf="data/b16.f10/b16.not-expressed.vcf", + tumor_rna_bam="data/b16.f10/b16.combined.sorted.bam", + min_mapping_quality=0, + max_protein_sequences_per_variant=1, + variant_cdna_sequence_assembly=False): + """ + Helper function to load pair of VCFs and tumor RNA BAM + and use them to generate a DataFrame of expressed variant protein + sequences. + """ + expressed_variants = load_vcf(expressed_vcf) + not_expressed_variants = load_vcf(not_expressed_vcf) combined_variants = VariantCollection( list(expressed_variants) + list(not_expressed_variants)) - samfile = load_bam("data/b16.f10/b16.combined.sorted.bam") + samfile = load_bam(tumor_rna_bam) allele_reads_generator = reads_overlapping_variants( variants=combined_variants, samfile=samfile, - min_mapping_quality=0) - - for overlap_assembly in [False, True]: - protein_sequences_generator = reads_generator_to_protein_sequences_generator( - allele_reads_generator, - max_protein_sequences_per_variant=1, - variant_cdna_sequence_assembly=overlap_assembly) - df = protein_sequences_generator_to_dataframe(protein_sequences_generator) - print(df) - eq_( - len(df), + min_mapping_quality=min_mapping_quality) + + protein_sequences_generator = reads_generator_to_protein_sequences_generator( + allele_reads_generator, + max_protein_sequences_per_variant=max_protein_sequences_per_variant, + variant_cdna_sequence_assembly=variant_cdna_sequence_assembly) + df = protein_sequences_generator_to_dataframe(protein_sequences_generator) + return df, expressed_variants, combined_variants + +def test_variants_to_protein_sequences_dataframe_one_sequence_per_variant_with_assembly(): + df, expressed_variants, combined_variants = \ + variants_to_protein_sequences_dataframe(variant_cdna_sequence_assembly=True) + print(df) + eq_(len(df), + len(expressed_variants), + "Expected %d/%d entries to have RNA support, got %d" % ( len(expressed_variants), - "Expected %d/%d entries to have RNA support, got %d" % ( - len(expressed_variants), - len(combined_variants), - len(df),)) + len(combined_variants), + len(df))) + +def test_variants_to_protein_sequences_dataframe_one_sequence_per_variant_without_assembly(): + df, expressed_variants, combined_variants = \ + variants_to_protein_sequences_dataframe(variant_cdna_sequence_assembly=False) + print(df) + eq_(len(df), + len(expressed_variants), + "Expected %d/%d entries to have RNA support, got %d" % ( + len(expressed_variants), + len(combined_variants), + len(df))) def test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality(): # since the B16 BAM has all MAPQ=255 values then all the reads should get dropped @@ -222,10 +246,12 @@ def test_variants_to_protein_sequences_dataframe_protein_sequence_length(): protein_sequences = df["amino_acids"] print(protein_sequences) protien_sequence_lengths = protein_sequences.str.len() - assert (protien_sequence_lengths == desired_length).all(), protien_sequence_lengths + assert (protien_sequence_lengths == desired_length).all(), ( + protien_sequence_lengths,) if __name__ == "__main__": test_sort_protein_sequences() - test_variants_to_protein_sequences_dataframe_one_sequence_per_variant() + test_variants_to_protein_sequences_dataframe_one_sequence_per_variant_with_assembly() + test_variants_to_protein_sequences_dataframe_one_sequence_per_variant_without_assembly() test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality() test_variants_to_protein_sequences_dataframe_protein_sequence_length() From a8ae1601a49b75672e97acfa9ac827dcd1c7cb9c Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Fri, 4 Nov 2016 13:20:55 -0400 Subject: [PATCH 04/11] splitting reference_context into three modules --- isovar/allele_reads.py | 195 ++++++------ isovar/locus_reads.py | 274 ++++++++-------- isovar/protein_sequences.py | 4 +- isovar/reference_context.py | 299 +----------------- isovar/reference_sequence_key.py | 132 ++++++++ ...ference_sequence_key_with_reading_frame.py | 201 ++++++++++++ test/test_reference_sequence_key.py | 29 +- 7 files changed, 584 insertions(+), 550 deletions(-) create mode 100644 isovar/reference_sequence_key.py create mode 100644 isovar/reference_sequence_key_with_reading_frame.py diff --git a/isovar/allele_reads.py b/isovar/allele_reads.py index 9d76460..37d5a76 100644 --- a/isovar/allele_reads.py +++ b/isovar/allele_reads.py @@ -34,121 +34,114 @@ logger = logging.getLogger(__name__) - # subclassing from namedtuple to get a lightweight object with built-in # hashing and comparison while also being able to add methods -AlleleReadFields = namedtuple( - "AlleleRead", - "prefix allele suffix name sequence") - -class AlleleRead(AlleleReadFields): - def __new__(cls, prefix, allele, suffix, name): - # construct sequence from prefix + alt + suffix - return AlleleReadFields.__new__( - cls, - prefix=prefix, - allele=allele, - suffix=suffix, - name=name, - sequence=prefix + allele + suffix) +class AlleleRead( + namedtuple("AlleleRead", "prefix allele suffix name sequence")): + + @property + def sequence(self): + return self.prefix + self.allele + self.suffix def __len__(self): return len(self.prefix) + len(self.allele) + len(self.suffix) -def allele_read_from_locus_read(locus_read, n_ref): - """ - Given a single ReadAtLocus object, return either an AlleleRead or None - - Parameters - ---------- - locus_read : LocusRead - Read which overlaps a variant locus but doesn't necessarily contain the - alternate nucleotides - - n_ref : int - Number of reference positions we are expecting to be modified or - deleted (for insertions this should be 0) - """ - sequence = locus_read.sequence - reference_positions = locus_read.reference_positions - - # positions of the nucleotides before and after the variant within - # the read sequence - read_pos_before = locus_read.base0_read_position_before_variant - read_pos_after = locus_read.base0_read_position_after_variant - - # positions of the nucleotides before and after the variant on the - # reference genome - ref_pos_before = reference_positions[read_pos_before] - - if ref_pos_before is None: - logger.warn( - "Missing reference pos for nucleotide before variant on read: %s", - locus_read) - return None - - ref_pos_after = reference_positions[read_pos_after] - - if ref_pos_after is None: - logger.warn( - "Missing reference pos for nucleotide after variant on read: %s", - locus_read) - return None - - if n_ref == 0: - if ref_pos_after - ref_pos_before != 1: - # if the number of nucleotides skipped isn't the same - # as the number of reference nucleotides in the variant then - # don't use this read - logger.debug( - "Positions before (%d) and after (%d) variant should be adjacent on read %s", - ref_pos_before, - ref_pos_after, + @classmethod + def from_locus_read(cls, locus_read, n_ref): + """ + Given a single LocusRead object, return either an AlleleRead or None + + Parameters + ---------- + locus_read : LocusRead + Read which overlaps a variant locus but doesn't necessarily contain the + alternate nucleotides + + n_ref : int + Number of reference positions we are expecting to be modified or + deleted (for insertions this should be 0) + """ + sequence = locus_read.sequence + reference_positions = locus_read.reference_positions + + # positions of the nucleotides before and after the variant within + # the read sequence + read_pos_before = locus_read.base0_read_position_before_variant + read_pos_after = locus_read.base0_read_position_after_variant + + # positions of the nucleotides before and after the variant on the + # reference genome + ref_pos_before = reference_positions[read_pos_before] + + if ref_pos_before is None: + logger.warn( + "Missing reference pos for nucleotide before variant on read: %s", locus_read) return None - # insertions require a sequence of non-aligned bases - # followed by the subsequence reference position - ref_positions_for_inserted = reference_positions[ - read_pos_before + 1:read_pos_after] - if any(insert_pos is not None for insert_pos in ref_positions_for_inserted): - # all these inserted nucleotides should *not* align to the - # reference - logger.debug( - "Skipping read, inserted nucleotides shouldn't map to reference") - return None - else: - # substitutions and deletions - if ref_pos_after - ref_pos_before != n_ref + 1: - # if the number of nucleotides skipped isn't the same - # as the number of reference nucleotides in the variant then - # don't use this read - logger.debug( - "Positions before (%d) and after (%d) variant should be adjacent on read %s", - ref_pos_before, - ref_pos_after, + ref_pos_after = reference_positions[read_pos_after] + + if ref_pos_after is None: + logger.warn( + "Missing reference pos for nucleotide after variant on read: %s", locus_read) return None - nucleotides_at_variant_locus = sequence[read_pos_before + 1:read_pos_after] - - prefix = sequence[:read_pos_before + 1] - suffix = sequence[read_pos_after:] - - prefix, suffix = convert_from_bytes_if_necessary(prefix, suffix) - prefix, suffix = trim_N_nucleotides(prefix, suffix) - - return AlleleRead( - prefix, - nucleotides_at_variant_locus, - suffix, - name=locus_read.name) + if n_ref == 0: + if ref_pos_after - ref_pos_before != 1: + # if the number of nucleotides skipped isn't the same + # as the number of reference nucleotides in the variant then + # don't use this read + logger.debug( + "Positions before (%d) and after (%d) variant should be adjacent on read %s", + ref_pos_before, + ref_pos_after, + locus_read) + return None + + # insertions require a sequence of non-aligned bases + # followed by the subsequence reference position + ref_positions_for_inserted = reference_positions[ + read_pos_before + 1:read_pos_after] + if any(insert_pos is not None for insert_pos in ref_positions_for_inserted): + # all these inserted nucleotides should *not* align to the + # reference + logger.debug( + "Skipping read, inserted nucleotides shouldn't map to reference") + return None + else: + # substitutions and deletions + if ref_pos_after - ref_pos_before != n_ref + 1: + # if the number of nucleotides skipped isn't the same + # as the number of reference nucleotides in the variant then + # don't use this read + logger.debug( + ("Positions before (%d) and after (%d) variant should be " + "adjacent on read %s"), + ref_pos_before, + ref_pos_after, + locus_read) + return None + + nucleotides_at_variant_locus = sequence[read_pos_before + 1:read_pos_after] + + prefix = sequence[:read_pos_before + 1] + suffix = sequence[read_pos_after:] + + prefix, suffix = convert_from_bytes_if_necessary(prefix, suffix) + prefix, suffix = trim_N_nucleotides(prefix, suffix) + + return cls( + prefix, + nucleotides_at_variant_locus, + suffix, + name=locus_read.name) def allele_reads_from_locus_reads(locus_reads, n_ref): """ - Given a collection of ReadAtLocus objects, returns a - list of VariantRead objects (which are split into prefix/variant/suffix - nucleotides). + Given a collection of LocusRead objects, returns a + list of AlleleRead objects + (which are split into prefix/allele/suffix nucleotide strings). Parameters ---------- @@ -161,7 +154,7 @@ def allele_reads_from_locus_reads(locus_reads, n_ref): """ for locus_read in locus_reads: - allele_read = allele_read_from_locus_read(locus_read, n_ref) + allele_read = AlleleRead.from_locus_read(locus_read, n_ref) if allele_read is None: continue else: diff --git a/isovar/locus_reads.py b/isovar/locus_reads.py index 971595b..f5f8a81 100644 --- a/isovar/locus_reads.py +++ b/isovar/locus_reads.py @@ -33,154 +33,154 @@ logger = logging.getLogger(__name__) - -LocusRead = namedtuple( - "LocusRead", - [ +class LocusRead(namedtuple("LocusRead", [ "name", "sequence", "reference_positions", "quality_scores", "base0_read_position_before_variant", - "base0_read_position_after_variant", - ]) - -def create_locus_read_from_pysam_pileup_element( - pileup_element, - base0_position_before_variant, - base0_position_after_variant, - use_secondary_alignments, - use_duplicate_reads, - min_mapping_quality): - """ - Parameters - ---------- - pileup_element : pysam.PileupRead - - base0_position_before_variant : int + "base0_read_position_after_variant"])): - base0_position_after_variant : int - - use_secondary_alignments : bool - - use_duplicate_reads : bool - - min_mapping_quality : int - - Returns LocusRead or None - """ - read = pileup_element.alignment - - # For future reference, may get overlapping reads - # which can be identified by having the same name - name = read.query_name - - if name is None: - logger.warn("Read missing name at position %d", - base0_position_before_variant + 1) - return None - - if read.is_unmapped: - logger.warn( - "How did we get unmapped read '%s' in a pileup?", name) - return None - - if pileup_element.is_refskip: - # if read sequence doesn't actually align to the reference - # base before a variant, skip it - logger.debug("Skipping pileup element with CIGAR alignment N (intron)") - return None - elif pileup_element.is_del: - logger.debug( - "Skipping deletion at position %d (read name = %s)", + @classmethod + def from_pysam_pileup_element( + cls, + pileup_element, + base0_position_before_variant, + base0_position_after_variant, + use_secondary_alignments, + use_duplicate_reads, + min_mapping_quality): + """ + Parameters + ---------- + pileup_element : pysam.PileupRead + + base0_position_before_variant : int + + base0_position_after_variant : int + + use_secondary_alignments : bool + + use_duplicate_reads : bool + + min_mapping_quality : int + + Returns LocusRead or None + """ + read = pileup_element.alignment + + # For future reference, may get overlapping reads + # which can be identified by having the same name + name = read.query_name + + if name is None: + logger.warn( + "Read missing name at position %d", + base0_position_before_variant + 1) + return None + + if read.is_unmapped: + logger.warn( + "How did we get unmapped read '%s' in a pileup?", name) + return None + + if pileup_element.is_refskip: + # if read sequence doesn't actually align to the reference + # base before a variant, skip it + logger.debug("Skipping pileup element with CIGAR alignment N (intron)") + return None + elif pileup_element.is_del: + logger.debug( + "Skipping deletion at position %d (read name = %s)", base0_position_before_variant + 1, name) - return None + return None - if read.is_secondary and not use_secondary_alignments: - logger.debug("Skipping secondary alignment of read '%s'", name) - return None + if read.is_secondary and not use_secondary_alignments: + logger.debug("Skipping secondary alignment of read '%s'", name) + return None - if read.is_duplicate and not use_duplicate_reads: - logger.debug("Skipping duplicate read '%s'", name) - return None + if read.is_duplicate and not use_duplicate_reads: + logger.debug("Skipping duplicate read '%s'", name) + return None - mapping_quality = read.mapping_quality + mapping_quality = read.mapping_quality - missing_mapping_quality = mapping_quality is None - if min_mapping_quality > 0 and missing_mapping_quality: - logger.debug("Skipping read '%s' due to missing MAPQ", name) - return None - elif mapping_quality < min_mapping_quality: - logger.debug( - "Skipping read '%s' due to low MAPQ: %d < %d", + missing_mapping_quality = mapping_quality is None + if min_mapping_quality > 0 and missing_mapping_quality: + logger.debug("Skipping read '%s' due to missing MAPQ", name) + return None + elif mapping_quality < min_mapping_quality: + logger.debug( + "Skipping read '%s' due to low MAPQ: %d < %d", read.mapping_quality, mapping_quality, min_mapping_quality) - return None - - sequence = read.query_sequence - - if sequence is None: - logger.warn("Read '%s' missing sequence", name) - return None - - base_qualities = read.query_qualities - - if base_qualities is None: - logger.warn("Read '%s' missing base qualities", name) - return None - - # - # Documentation for pysam.AlignedSegment.get_reference_positions: - # ------------------------------------------------------------------ - # By default, this method only returns positions in the reference - # that are within the alignment. If full_length is set, None values - # will be included for any soft-clipped or unaligned positions - # within the read. The returned list will thus be of the same length - # as the read. - # - # Source: - # http://pysam.readthedocs.org/en/latest/ - # api.html#pysam.AlignedSegment.get_reference_positions - # - # We want a None value for every read position that does not have a - # corresponding reference position. - reference_positions = read.get_reference_positions( - full_length=True) - - # pysam uses base-0 positions everywhere except region strings - # Source: - # http://pysam.readthedocs.org/en/latest/faq.html#pysam-coordinates-are-wrong - if base0_position_before_variant not in reference_positions: - logger.debug( - "Skipping read '%s' because first position %d not mapped", + return None + + sequence = read.query_sequence + + if sequence is None: + logger.warn("Read '%s' missing sequence", name) + return None + + base_qualities = read.query_qualities + + if base_qualities is None: + logger.warn("Read '%s' missing base qualities", name) + return None + + # + # Documentation for pysam.AlignedSegment.get_reference_positions: + # ------------------------------------------------------------------ + # By default, this method only returns positions in the reference + # that are within the alignment. If full_length is set, None values + # will be included for any soft-clipped or unaligned positions + # within the read. The returned list will thus be of the same length + # as the read. + # + # Source: + # http://pysam.readthedocs.org/en/latest/ + # api.html#pysam.AlignedSegment.get_reference_positions + # + # We want a None value for every read position that does not have a + # corresponding reference position. + reference_positions = read.get_reference_positions( + full_length=True) + + # pysam uses base-0 positions everywhere except region strings + # Source: + # http://pysam.readthedocs.org/en/latest/faq.html#pysam-coordinates-are-wrong + if base0_position_before_variant not in reference_positions: + logger.debug( + "Skipping read '%s' because first position %d not mapped", name, base0_position_before_variant) - return None - else: - base0_read_position_before_variant = reference_positions.index( - base0_position_before_variant) - - if base0_position_after_variant not in reference_positions: - logger.debug( - "Skipping read '%s' because last position %d not mapped", + return None + else: + base0_read_position_before_variant = reference_positions.index( + base0_position_before_variant) + + if base0_position_after_variant not in reference_positions: + logger.debug( + "Skipping read '%s' because last position %d not mapped", name, base0_position_after_variant) - return None - else: - base0_read_position_after_variant = reference_positions.index( - base0_position_after_variant) - - if isinstance(sequence, bytes): - sequence = sequence.decode('ascii') - return LocusRead( - name=name, - sequence=sequence, - reference_positions=reference_positions, - quality_scores=base_qualities, - base0_read_position_before_variant=base0_read_position_before_variant, - base0_read_position_after_variant=base0_read_position_after_variant) + return None + else: + base0_read_position_after_variant = reference_positions.index( + base0_position_after_variant) + + if isinstance(sequence, bytes): + sequence = sequence.decode('ascii') + + return cls( + name=name, + sequence=sequence, + reference_positions=reference_positions, + quality_scores=base_qualities, + base0_read_position_before_variant=base0_read_position_before_variant, + base0_read_position_after_variant=base0_read_position_after_variant) def pileup_reads_at_position(samfile, chromosome, base0_position): """ @@ -250,9 +250,9 @@ def locus_read_generator( logger.debug( "Gathering reads at locus %s: %d-%d", - chromosome, - base1_position_before_variant, - base1_position_after_variant) + chromosome, + base1_position_before_variant, + base1_position_after_variant) base0_position_before_variant = base1_position_before_variant - 1 base0_position_after_variant = base1_position_after_variant - 1 @@ -267,7 +267,7 @@ def locus_read_generator( samfile=samfile, chromosome=chromosome, base0_position=base0_position_before_variant): - read = create_locus_read_from_pysam_pileup_element( + read = LocusRead.from_pysam_pileup_element( pileup_element, base0_position_before_variant=base0_position_before_variant, base0_position_after_variant=base0_position_after_variant, @@ -281,10 +281,10 @@ def locus_read_generator( logger.info( "Found %d reads overlapping locus %s: %d-%d", - count, - chromosome, - base1_position_before_variant, - base1_position_after_variant) + count, + chromosome, + base1_position_before_variant, + base1_position_after_variant) def locus_reads_dataframe(*args, **kwargs): """ diff --git a/isovar/protein_sequences.py b/isovar/protein_sequences.py index 34429c2..0934310 100644 --- a/isovar/protein_sequences.py +++ b/isovar/protein_sequences.py @@ -135,7 +135,9 @@ def summarize_translations(translations): def protein_sequence_sort_key(protein_sequence): """ - Sort protein sequences lexicographically by three criteria: + Sort protein sequences lexicographically by six criteria: + - TODO: min number of reads covering each nucleotide of + the protein sequence >= 2 - number of unique supporting reads - minimum mismatch versus a supporting reference transcript - number of supporting reference transcripts diff --git a/isovar/reference_context.py b/isovar/reference_context.py index 2e8dbd3..a023984 100644 --- a/isovar/reference_context.py +++ b/isovar/reference_context.py @@ -16,66 +16,14 @@ from collections import namedtuple, OrderedDict, defaultdict import logging -from skbio import DNA - from .effect_prediction import reference_transcripts_for_variant -from .variant_helpers import interbase_range_affected_by_variant_on_transcript from .dataframe_builder import DataFrameBuilder - +from .reference_sequence_key_with_reading_frame import ( + ReferenceSequenceKeyWithReadingFrame,) logger = logging.getLogger(__name__) -########################## -# -# SequenceKey -# ----------- -# -# Used to identify and group the distinct sequences occurring on a set of -# transcripts overlapping a variant locus -# -########################## - -SequenceKey = namedtuple( - "SequenceKey", [ - "strand", - "sequence_before_variant_locus", - "sequence_at_variant_locus", - "sequence_after_variant_locus" - ] -) - - -########################## -# -# SequenceKeyWithReadingFrame -# --------------------------- -# -# Includes all the fields of a SequenceKey, but also includes a reading frame -# at the start of the reference sequence. -# -# -########################## - -SequenceKeyWithReadingFrame = namedtuple( - "ReferenceContext", SequenceKey._fields + ( - # if the reference context includes the 5' UTR then - # this is the offset to the start codon, otherwise it's the - # offset needed to get the first base of a codon - "offset_to_first_complete_codon", - # does this context overlap a start codon? - "overlaps_start_codon", - # does this context contain the whole trinucleotide start codon? - "contains_start_codon", - # does this context contain any UTR bases? - "contains_five_prime_utr", - # translation of complete codons in the reference context - # before the variant - "amino_acids_before_variant", - ) -) - - ########################## # # ReferenceContext @@ -89,250 +37,11 @@ ReferenceContext = namedtuple( "ReferenceContext", - SequenceKeyWithReadingFrame._fields + ( + ReferenceSequenceKeyWithReadingFrame._fields + ( "variant", "transcripts") ) -def variant_matches_reference_sequence(variant, ref_seq_on_transcript, strand): - """ - Make sure that reference nucleotides we expect to see on the reference - transcript from a variant are the same ones we encounter. - """ - if strand == "-": - ref_seq_on_transcript = str( - DNA(ref_seq_on_transcript).reverse_complement()) - - return ref_seq_on_transcript == variant.ref - -def sequence_key_for_variant_on_transcript(variant, transcript, context_size): - """ - Extracts the reference sequence around a variant locus on a particular - transcript. - - Parameters - ---------- - variant : varcode.Variant - - transcript : pyensembl.Transcript - - context_size : int - - Returns SequenceKey object with the following fields: - - strand - - sequence_before_variant_locus - - sequence_at_variant_locus - - sequence_after_variant_locus - - Can also return None if Transcript lacks sufficiently long sequence - """ - - full_sequence = transcript.sequence - - if full_sequence is None: - logger.warn( - "Expected transcript %s (overlapping %s) to have sequence", - transcript, - variant) - return None - - if len(full_sequence) < 6: - # need at least 6 nucleotides for a start and stop codon - logger.warn( - "Sequence of %s (overlapping %s) too short: %d", - transcript, - variant, - len(full_sequence)) - return None - - # get the interbase range of offsets which capture all reference - # bases modified by the variant - variant_start_offset, variant_end_offset = \ - interbase_range_affected_by_variant_on_transcript( - variant=variant, - transcript=transcript) - - logger.info("Interbase offset range on %s for variant %s = %d:%d", - transcript, - variant, - variant_start_offset, - variant_end_offset) - - prefix = full_sequence[ - max(0, variant_start_offset - context_size): - variant_start_offset] - - suffix = full_sequence[ - variant_end_offset: - variant_end_offset + context_size] - - ref_nucleotides_at_variant = full_sequence[ - variant_start_offset:variant_end_offset] - if not variant_matches_reference_sequence( - variant=variant, - strand=transcript.strand, - ref_seq_on_transcript=ref_nucleotides_at_variant): - # TODO: once we're more confident about other logic in isovar, - # change this to a warning and return None to allow for modest - # differences between reference sequence patches, since - # GRCh38.p1 may differ at some positions from GRCh38.p5 - raise ValueError( - "Wrong reference sequence for variant %s on transcript %s" % ( - variant, - transcript)) - return SequenceKey( - strand=transcript.strand, - sequence_before_variant_locus=prefix, - sequence_at_variant_locus=ref_nucleotides_at_variant, - sequence_after_variant_locus=suffix) - - -def reading_frame_to_offset(reading_frame_at_start_of_sequence): - """ - Given a reading frame (how many nucleotides into a codon) at the - start of a cDNA sequence, return the number of nucleotides which need - to be trimmed to start on a complete codon. - - Parameters - ---------- - - reading_frame_at_start_of_sequence : int - - Returns an int - """ - if reading_frame_at_start_of_sequence < 0: - raise ValueError("Reading frame can't be negative: %d" % ( - reading_frame_at_start_of_sequence,)) - elif reading_frame_at_start_of_sequence > 2: - raise ValueError("Reading frame must be within 0 and 2, not %d" % ( - reading_frame_at_start_of_sequence,)) - # If we're 1 nucleotide into the codon then we need to shift - # over two more to restore the ORF. Likewise, if we're 2 nucleotides in - # then we have to shift over one more. - return (3 - reading_frame_at_start_of_sequence) % 3 - - -def sequence_key_with_reading_frame_for_variant_on_transcript( - variant, - transcript, - context_size): - """ - Extracts the reference sequence around a variant locus on a particular - transcript and determines the reading frame at the start of that - sequence context. - - Parameters - ---------- - variant : varcode.Variant - - transcript : pyensembl.Transcript - - context_size : int - - Returns SequenceKeyWithReadingFrame object or None if Transcript lacks - coding sequence, protein sequence or annotated start/stop codons. - """ - if not transcript.contains_start_codon: - logger.info( - "Expected transcript %s for variant %s to have start codon", - transcript, - variant) - return None - - if not transcript.contains_stop_codon: - logger.info( - "Expected transcript %s for variant %s to have stop codon", - transcript, - variant) - return None - - if not transcript.protein_sequence: - logger.info( - "Expected transript %s for variant %s to have protein sequence", - transcript, - variant) - return None - - sequence_key = sequence_key_for_variant_on_transcript( - variant=variant, - transcript=transcript, - context_size=context_size) - - if sequence_key is None: - logger.info("No sequence key for variant %s on transcript %s", - variant, - transcript) - return None - - # get the interbase range of offsets which capture all reference - # bases modified by the variant - variant_start_offset, variant_end_offset = \ - interbase_range_affected_by_variant_on_transcript( - variant=variant, - transcript=transcript) - - start_codon_idx = min(transcript.start_codon_spliced_offsets) - - # skip any variants which occur in the 5' UTR or overlap the start codon - # since - # (1) UTR variants have unclear coding effects and - # (2) start-loss variants may result in a new start codon / reading - # frame but we're not sure which one! - if variant_start_offset < start_codon_idx + 3: - logger.info( - "Skipping transcript %s for variant %s, must be after start codon", - transcript, - variant) - return None - - stop_codon_idx = min(transcript.stop_codon_spliced_offsets) - - # skip variants which affect the 3' UTR of the transcript since - # they don't have obvious coding effects on the protein sequence - if variant_start_offset >= stop_codon_idx + 3: - logger.info( - "Skipping transcript %s for variant %s, occurs in 3' UTR", - transcript, - variant) - return None - - n_prefix = len(sequence_key.sequence_before_variant_locus) - prefix_start_idx = variant_start_offset - n_prefix - n_bases_between_start_and_variant = variant_start_offset - start_codon_idx - n_full_codons_before_variant = n_bases_between_start_and_variant // 3 - - # if the sequence before the variant contains more bases than the - # distance to the start codon, then by definition it must contain - # some untranslated bases - contains_five_prime_utr = (n_prefix > n_bases_between_start_and_variant) - # allows for the possibility that the first base in the sequence might - # be the first nucleotide of the start codon - contains_start_codon = (n_prefix >= n_bases_between_start_and_variant) - # the sequence context might only include the 2nd or 3rd bases of - # the start codon - overlaps_start_codon = (n_prefix > n_bases_between_start_and_variant - 3) - - if contains_start_codon: - offset_to_first_complete_codon = start_codon_idx - prefix_start_idx - amino_acids_before_variant = transcript.protein_sequence[:n_full_codons_before_variant] - else: - reading_frame = (prefix_start_idx - start_codon_idx) % 3 - offset_to_first_complete_codon = reading_frame_to_offset(reading_frame) - n_codons_in_prefix = (n_prefix - offset_to_first_complete_codon) // 3 - amino_acids_before_variant = transcript.protein_sequence[ - n_full_codons_before_variant - n_codons_in_prefix: - n_full_codons_before_variant] - return SequenceKeyWithReadingFrame( - strand=sequence_key.strand, - sequence_before_variant_locus=sequence_key.sequence_before_variant_locus, - sequence_at_variant_locus=sequence_key.sequence_at_variant_locus, - sequence_after_variant_locus=sequence_key.sequence_after_variant_locus, - offset_to_first_complete_codon=offset_to_first_complete_codon, - contains_start_codon=contains_start_codon, - overlaps_start_codon=overlaps_start_codon, - contains_five_prime_utr=contains_five_prime_utr, - amino_acids_before_variant=amino_acids_before_variant) - def sort_key_decreasing_max_length_transcript_cds(reference_context): """ Used to sort a sequence of ReferenceContext objects by the longest CDS @@ -367,7 +76,7 @@ def reference_contexts_for_variant( for transcript in overlapping_transcripts: sequence_key_with_reading_frame = \ - sequence_key_with_reading_frame_for_variant_on_transcript( + ReferenceSequenceKeyWithReadingFrame.from_variant_and_transcript( variant=variant, transcript=transcript, context_size=context_size) diff --git a/isovar/reference_sequence_key.py b/isovar/reference_sequence_key.py new file mode 100644 index 0000000..5ade65d --- /dev/null +++ b/isovar/reference_sequence_key.py @@ -0,0 +1,132 @@ +# Copyright (c) 2016. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division, absolute_import +import logging +from collections import namedtuple + +from skbio import DNA + +from .variant_helpers import interbase_range_affected_by_variant_on_transcript + +logger = logging.getLogger(__name__) + +########################## +# +# ReferenceSequenceKey +# ----------- +# +# Used to identify and group the distinct sequences occurring on a set of +# transcripts overlapping a variant locus +# +########################## + +class ReferenceSequenceKey(namedtuple("ReferenceSequenceKey", [ + "strand", + "sequence_before_variant_locus", + "sequence_at_variant_locus", + "sequence_after_variant_locus"])): + + @classmethod + def from_variant_and_transcript(cls, variant, transcript, context_size): + """ + Extracts the reference sequence around a variant locus on a particular + transcript. + + Parameters + ---------- + variant : varcode.Variant + + transcript : pyensembl.Transcript + + context_size : int + + Returns SequenceKey object with the following fields: + - strand + - sequence_before_variant_locus + - sequence_at_variant_locus + - sequence_after_variant_locus + + Can also return None if Transcript lacks sufficiently long sequence + """ + full_transcript_sequence = transcript.sequence + + if full_transcript_sequence is None: + logger.warn( + "Expected transcript %s (overlapping %s) to have sequence", + transcript, + variant) + return None + + if len(full_transcript_sequence) < 6: + # need at least 6 nucleotides for a start and stop codon + logger.warn( + "Sequence of %s (overlapping %s) too short: %d", + transcript, + variant, + len(full_transcript_sequence)) + return None + + # get the interbase range of offsets which capture all reference + # bases modified by the variant + variant_start_offset, variant_end_offset = \ + interbase_range_affected_by_variant_on_transcript( + variant=variant, + transcript=transcript) + + logger.info( + "Interbase offset range on %s for variant %s = %d:%d", + transcript, + variant, + variant_start_offset, + variant_end_offset) + + prefix = full_transcript_sequence[ + max(0, variant_start_offset - context_size): + variant_start_offset] + + suffix = full_transcript_sequence[ + variant_end_offset: + variant_end_offset + context_size] + + ref_nucleotides_at_variant = full_transcript_sequence[ + variant_start_offset:variant_end_offset] + if not variant_matches_reference_sequence( + variant=variant, + strand=transcript.strand, + ref_seq_on_transcript=ref_nucleotides_at_variant): + # TODO: once we're more confident about other logic in isovar, + # change this to a warning and return None to allow for modest + # differences between reference sequence patches, since + # GRCh38.p1 may differ at some positions from GRCh38.p5 + raise ValueError( + "Wrong reference sequence for variant %s on transcript %s" % ( + variant, + transcript)) + return cls( + strand=transcript.strand, + sequence_before_variant_locus=prefix, + sequence_at_variant_locus=ref_nucleotides_at_variant, + sequence_after_variant_locus=suffix) + + +def variant_matches_reference_sequence(variant, ref_seq_on_transcript, strand): + """ + Make sure that reference nucleotides we expect to see on the reference + transcript from a variant are the same ones we encounter. + """ + if strand == "-": + ref_seq_on_transcript = str( + DNA(ref_seq_on_transcript).reverse_complement()) + return ref_seq_on_transcript == variant.ref diff --git a/isovar/reference_sequence_key_with_reading_frame.py b/isovar/reference_sequence_key_with_reading_frame.py new file mode 100644 index 0000000..d79d2a5 --- /dev/null +++ b/isovar/reference_sequence_key_with_reading_frame.py @@ -0,0 +1,201 @@ +# Copyright (c) 2016. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division, absolute_import +import logging +from collections import namedtuple + +from .variant_helpers import interbase_range_affected_by_variant_on_transcript +from .reference_sequence_key import ReferenceSequenceKey + +logger = logging.getLogger(__name__) + + +########################## +# +# ReferenceSequenceKeyWithReadingFrame +# --------------------------- +# +# Includes all the fields of a ReferenceSequenceKey, but also includes a reading frame +# at the start of the reference sequence. +# +# +########################## + +class SequenceKeyWithReadingFrame( + namedtuple( + "ReferenceSequenceKeyWithReadingFrame", + ReferenceSequenceKey._fields + ( + # if the reference context includes the 5' UTR then + # this is the offset to the start codon, otherwise it's the + # offset needed to get the first base of a codon + "offset_to_first_complete_codon", + # does this context overlap a start codon? + "overlaps_start_codon", + # does this context contain the whole trinucleotide start codon? + "contains_start_codon", + # does this context contain any UTR bases? + "contains_five_prime_utr", + # translation of complete codons in the reference context + # before the variant + "amino_acids_before_variant"))): + + @classmethod + def from_variant_and_transcript( + cls, + variant, + transcript, + context_size): + """ + Extracts the reference sequence around a variant locus on a particular + transcript and determines the reading frame at the start of that + sequence context. + + Parameters + ---------- + variant : varcode.Variant + + transcript : pyensembl.Transcript + + context_size : int + + Returns SequenceKeyWithReadingFrame object or None if Transcript lacks + coding sequence, protein sequence or annotated start/stop codons. + """ + if not transcript.contains_start_codon: + logger.info( + "Expected transcript %s for variant %s to have start codon", + transcript, + variant) + return None + + if not transcript.contains_stop_codon: + logger.info( + "Expected transcript %s for variant %s to have stop codon", + transcript, + variant) + return None + + if not transcript.protein_sequence: + logger.info( + "Expected transript %s for variant %s to have protein sequence", + transcript, + variant) + return None + + sequence_key = ReferenceSequenceKey.from_variant_and_transcript( + variant=variant, + transcript=transcript, + context_size=context_size) + + if sequence_key is None: + logger.info( + "No sequence key for variant %s on transcript %s", + variant, + transcript) + return None + + # get the interbase range of offsets which capture all reference + # bases modified by the variant + variant_start_offset, variant_end_offset = \ + interbase_range_affected_by_variant_on_transcript( + variant=variant, + transcript=transcript) + + start_codon_idx = min(transcript.start_codon_spliced_offsets) + + # skip any variants which occur in the 5' UTR or overlap the start codon + # since + # (1) UTR variants have unclear coding effects and + # (2) start-loss variants may result in a new start codon / reading + # frame but we're not sure which one! + if variant_start_offset < start_codon_idx + 3: + logger.info( + "Skipping transcript %s for variant %s, must be after start codon", + transcript, + variant) + return None + + stop_codon_idx = min(transcript.stop_codon_spliced_offsets) + + # skip variants which affect the 3' UTR of the transcript since + # they don't have obvious coding effects on the protein sequence + if variant_start_offset >= stop_codon_idx + 3: + logger.info( + "Skipping transcript %s for variant %s, occurs in 3' UTR", + transcript, + variant) + return None + + n_prefix = len(sequence_key.sequence_before_variant_locus) + prefix_start_idx = variant_start_offset - n_prefix + n_bases_between_start_and_variant = variant_start_offset - start_codon_idx + n_full_codons_before_variant = n_bases_between_start_and_variant // 3 + + # if the sequence before the variant contains more bases than the + # distance to the start codon, then by definition it must contain + # some untranslated bases + contains_five_prime_utr = (n_prefix > n_bases_between_start_and_variant) + # allows for the possibility that the first base in the sequence might + # be the first nucleotide of the start codon + contains_start_codon = (n_prefix >= n_bases_between_start_and_variant) + # the sequence context might only include the 2nd or 3rd bases of + # the start codon + overlaps_start_codon = (n_prefix > n_bases_between_start_and_variant - 3) + + if contains_start_codon: + offset_to_first_complete_codon = start_codon_idx - prefix_start_idx + amino_acids_before_variant = transcript.protein_sequence[:n_full_codons_before_variant] + else: + reading_frame = (prefix_start_idx - start_codon_idx) % 3 + offset_to_first_complete_codon = reading_frame_to_offset(reading_frame) + n_codons_in_prefix = (n_prefix - offset_to_first_complete_codon) // 3 + amino_acids_before_variant = transcript.protein_sequence[ + n_full_codons_before_variant - n_codons_in_prefix: + n_full_codons_before_variant] + return SequenceKeyWithReadingFrame( + strand=sequence_key.strand, + sequence_before_variant_locus=sequence_key.sequence_before_variant_locus, + sequence_at_variant_locus=sequence_key.sequence_at_variant_locus, + sequence_after_variant_locus=sequence_key.sequence_after_variant_locus, + offset_to_first_complete_codon=offset_to_first_complete_codon, + contains_start_codon=contains_start_codon, + overlaps_start_codon=overlaps_start_codon, + contains_five_prime_utr=contains_five_prime_utr, + amino_acids_before_variant=amino_acids_before_variant) + + +def reading_frame_to_offset(reading_frame_at_start_of_sequence): + """ + Given a reading frame (how many nucleotides into a codon) at the + start of a cDNA sequence, return the number of nucleotides which need + to be trimmed to start on a complete codon. + + Parameters + ---------- + + reading_frame_at_start_of_sequence : int + + Returns an int + """ + if reading_frame_at_start_of_sequence < 0: + raise ValueError("Reading frame can't be negative: %d" % ( + reading_frame_at_start_of_sequence,)) + elif reading_frame_at_start_of_sequence > 2: + raise ValueError("Reading frame must be within 0 and 2, not %d" % ( + reading_frame_at_start_of_sequence,)) + # If we're 1 nucleotide into the codon then we need to shift + # over two more to restore the ORF. Likewise, if we're 2 nucleotides in + # then we have to shift over one more. + return (3 - reading_frame_at_start_of_sequence) % 3 diff --git a/test/test_reference_sequence_key.py b/test/test_reference_sequence_key.py index fa40b94..766b7d0 100644 --- a/test/test_reference_sequence_key.py +++ b/test/test_reference_sequence_key.py @@ -14,10 +14,7 @@ from __future__ import print_function, division, absolute_import -from isovar.reference_context import ( - sequence_key_for_variant_on_transcript, - SequenceKey, -) +from isovar.reference_sequence_key import ReferenceSequenceKey from varcode import Variant from pyensembl import ensembl_grch38 from nose.tools import eq_ @@ -36,11 +33,11 @@ def test_sequence_key_for_variant_on_transcript_substitution(): eq_(brca2_ref_seq, "GGGCTTGTGGCGCGAGCTTCTGAAACTAGGCGGCAGAGGCGGAGCCGCTG") print(brca2_ref_seq) # get the 5 nucleotides before the variant and 10 nucleotides after - sequence_key = sequence_key_for_variant_on_transcript( + sequence_key = ReferenceSequenceKey.from_variant_and_transcript( variant=brca2_variant_rs769125639, transcript=brca2_001, context_size=10) - expected_sequence_key = SequenceKey( + expected_sequence_key = ReferenceSequenceKey( strand="+", sequence_before_variant_locus=brca2_ref_seq[:5], sequence_at_variant_locus="T", @@ -59,11 +56,11 @@ def test_sequence_key_for_variant_on_transcript_deletion(): eq_(brca2_ref_seq, "GGGCTTGTGGCGCGAGCTTCTGAAACTAGGCGGCAGAGGCGGAGCCGCTG") print(brca2_ref_seq) # get the 5 nucleotides before the variant and 10 nucleotides after - sequence_key = sequence_key_for_variant_on_transcript( + sequence_key = ReferenceSequenceKey.from_variant_and_transcript( variant=brca2_variant_deletion, transcript=brca2_001, context_size=10) - expected_sequence_key = SequenceKey( + expected_sequence_key = ReferenceSequenceKey( strand="+", sequence_before_variant_locus=brca2_ref_seq[:5], sequence_at_variant_locus="T", @@ -81,14 +78,14 @@ def test_sequence_key_for_variant_on_transcript_insertion(): eq_(brca2_ref_seq, "GGGCTTGTGGCGCGAGCTTCTGAAACTAGGCGGCAGAGGCGGAGCCGCTG") print(brca2_ref_seq) # get the 5 nucleotides before the variant and 10 nucleotides after - sequence_key = sequence_key_for_variant_on_transcript( + sequence_key = ReferenceSequenceKey.from_variant_and_transcript( variant=brca2_variant_insertion, transcript=brca2_001, context_size=10) # expecting nothing at the variant locus since we're inserting between # two reference nucleotides - expected_sequence_key = SequenceKey( + expected_sequence_key = ReferenceSequenceKey( strand="+", sequence_before_variant_locus=brca2_ref_seq[:6], sequence_at_variant_locus="", @@ -108,12 +105,12 @@ def test_sequence_key_for_variant_on_transcript_substitution_reverse_strand(): eq_(tp53_001.sequence[190 - 10:190 + 13], "GGTCACTGCCATGGAGGAGCCGC") # get the 5 nucleotides before the variant and 10 nucleotides after - sequence_key = sequence_key_for_variant_on_transcript( + sequence_key = ReferenceSequenceKey.from_variant_and_transcript( variant=tp53_substitution, transcript=tp53_001, context_size=10) - expected_sequence_key = SequenceKey( + expected_sequence_key = ReferenceSequenceKey( strand="-", sequence_before_variant_locus="GGTCACTGCC", sequence_at_variant_locus="ATG", @@ -132,12 +129,12 @@ def test_sequence_key_for_variant_on_transcript_deletion_reverse_strand(): eq_(tp53_001.sequence[190 - 10:190 + 13], "GGTCACTGCCATGGAGGAGCCGC") # get the 5 nucleotides before the variant and 10 nucleotides after - sequence_key = sequence_key_for_variant_on_transcript( + sequence_key = ReferenceSequenceKey.from_variant_and_transcript( variant=tp53_deletion, transcript=tp53_001, context_size=10) - expected_sequence_key = SequenceKey( + expected_sequence_key = ReferenceSequenceKey( strand="-", sequence_before_variant_locus="GGTCACTGCC", sequence_at_variant_locus="ATG", @@ -160,12 +157,12 @@ def test_sequence_key_for_variant_on_transcript_insertion_reverse_strand(): # GCGGCTCCTC_CAT_GGCAGTGACC # get the 5 nucleotides before the variant and 10 nucleotides after - sequence_key = sequence_key_for_variant_on_transcript( + sequence_key = ReferenceSequenceKey.from_variant_and_transcript( variant=tp53_insertion, transcript=tp53_001, context_size=10) - expected_sequence_key = SequenceKey( + expected_sequence_key = ReferenceSequenceKey( strand="-", sequence_before_variant_locus="CACTGCCATG", sequence_at_variant_locus="", From d48b88e1d7b0776a4833b3b33405c908b72f53af Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Fri, 4 Nov 2016 13:59:44 -0400 Subject: [PATCH 05/11] fixed unit tests with ReferenceSequenceKey and ReferenceCodingSequenceKey --- isovar/allele_reads.py | 18 +- isovar/reference_context.py | 7 +- isovar/reference_sequence_key.py | 13 +- ...ference_sequence_key_with_reading_frame.py | 201 ------------- test/test_allele_reads.py | 4 +- ...ference_sequence_key_with_reading_frame.py | 283 ------------------ 6 files changed, 21 insertions(+), 505 deletions(-) delete mode 100644 isovar/reference_sequence_key_with_reading_frame.py delete mode 100644 test/test_reference_sequence_key_with_reading_frame.py diff --git a/isovar/allele_reads.py b/isovar/allele_reads.py index 37d5a76..f6fb32e 100644 --- a/isovar/allele_reads.py +++ b/isovar/allele_reads.py @@ -36,12 +36,18 @@ # subclassing from namedtuple to get a lightweight object with built-in # hashing and comparison while also being able to add methods -class AlleleRead( - namedtuple("AlleleRead", "prefix allele suffix name sequence")): - - @property - def sequence(self): - return self.prefix + self.allele + self.suffix +AlleleReadBase = namedtuple( + "AlleleRead", + "prefix allele suffix name sequence") + +class AlleleRead(AlleleReadBase): + def __new__(cls, prefix, allele, suffix, name): + return AlleleReadBase( + prefix=prefix, + allele=allele, + suffix=suffix, + name=name, + sequence=prefix + allele + suffix) def __len__(self): return len(self.prefix) + len(self.allele) + len(self.suffix) diff --git a/isovar/reference_context.py b/isovar/reference_context.py index a023984..b90e3d2 100644 --- a/isovar/reference_context.py +++ b/isovar/reference_context.py @@ -18,8 +18,7 @@ from .effect_prediction import reference_transcripts_for_variant from .dataframe_builder import DataFrameBuilder -from .reference_sequence_key_with_reading_frame import ( - ReferenceSequenceKeyWithReadingFrame,) +from .reference_coding_sequence_key import ReferenceCodingSequenceKey logger = logging.getLogger(__name__) @@ -37,7 +36,7 @@ ReferenceContext = namedtuple( "ReferenceContext", - ReferenceSequenceKeyWithReadingFrame._fields + ( + ReferenceCodingSequenceKey._fields + ( "variant", "transcripts") ) @@ -76,7 +75,7 @@ def reference_contexts_for_variant( for transcript in overlapping_transcripts: sequence_key_with_reading_frame = \ - ReferenceSequenceKeyWithReadingFrame.from_variant_and_transcript( + ReferenceCodingSequenceKey.from_variant_and_transcript( variant=variant, transcript=transcript, context_size=context_size) diff --git a/isovar/reference_sequence_key.py b/isovar/reference_sequence_key.py index 5ade65d..d7f1548 100644 --- a/isovar/reference_sequence_key.py +++ b/isovar/reference_sequence_key.py @@ -22,21 +22,16 @@ logger = logging.getLogger(__name__) -########################## -# -# ReferenceSequenceKey -# ----------- -# -# Used to identify and group the distinct sequences occurring on a set of -# transcripts overlapping a variant locus -# -########################## class ReferenceSequenceKey(namedtuple("ReferenceSequenceKey", [ "strand", "sequence_before_variant_locus", "sequence_at_variant_locus", "sequence_after_variant_locus"])): + """ + Used to identify and group the distinct sequences occurring on a set of + transcripts overlapping a variant locus + """ @classmethod def from_variant_and_transcript(cls, variant, transcript, context_size): diff --git a/isovar/reference_sequence_key_with_reading_frame.py b/isovar/reference_sequence_key_with_reading_frame.py deleted file mode 100644 index d79d2a5..0000000 --- a/isovar/reference_sequence_key_with_reading_frame.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function, division, absolute_import -import logging -from collections import namedtuple - -from .variant_helpers import interbase_range_affected_by_variant_on_transcript -from .reference_sequence_key import ReferenceSequenceKey - -logger = logging.getLogger(__name__) - - -########################## -# -# ReferenceSequenceKeyWithReadingFrame -# --------------------------- -# -# Includes all the fields of a ReferenceSequenceKey, but also includes a reading frame -# at the start of the reference sequence. -# -# -########################## - -class SequenceKeyWithReadingFrame( - namedtuple( - "ReferenceSequenceKeyWithReadingFrame", - ReferenceSequenceKey._fields + ( - # if the reference context includes the 5' UTR then - # this is the offset to the start codon, otherwise it's the - # offset needed to get the first base of a codon - "offset_to_first_complete_codon", - # does this context overlap a start codon? - "overlaps_start_codon", - # does this context contain the whole trinucleotide start codon? - "contains_start_codon", - # does this context contain any UTR bases? - "contains_five_prime_utr", - # translation of complete codons in the reference context - # before the variant - "amino_acids_before_variant"))): - - @classmethod - def from_variant_and_transcript( - cls, - variant, - transcript, - context_size): - """ - Extracts the reference sequence around a variant locus on a particular - transcript and determines the reading frame at the start of that - sequence context. - - Parameters - ---------- - variant : varcode.Variant - - transcript : pyensembl.Transcript - - context_size : int - - Returns SequenceKeyWithReadingFrame object or None if Transcript lacks - coding sequence, protein sequence or annotated start/stop codons. - """ - if not transcript.contains_start_codon: - logger.info( - "Expected transcript %s for variant %s to have start codon", - transcript, - variant) - return None - - if not transcript.contains_stop_codon: - logger.info( - "Expected transcript %s for variant %s to have stop codon", - transcript, - variant) - return None - - if not transcript.protein_sequence: - logger.info( - "Expected transript %s for variant %s to have protein sequence", - transcript, - variant) - return None - - sequence_key = ReferenceSequenceKey.from_variant_and_transcript( - variant=variant, - transcript=transcript, - context_size=context_size) - - if sequence_key is None: - logger.info( - "No sequence key for variant %s on transcript %s", - variant, - transcript) - return None - - # get the interbase range of offsets which capture all reference - # bases modified by the variant - variant_start_offset, variant_end_offset = \ - interbase_range_affected_by_variant_on_transcript( - variant=variant, - transcript=transcript) - - start_codon_idx = min(transcript.start_codon_spliced_offsets) - - # skip any variants which occur in the 5' UTR or overlap the start codon - # since - # (1) UTR variants have unclear coding effects and - # (2) start-loss variants may result in a new start codon / reading - # frame but we're not sure which one! - if variant_start_offset < start_codon_idx + 3: - logger.info( - "Skipping transcript %s for variant %s, must be after start codon", - transcript, - variant) - return None - - stop_codon_idx = min(transcript.stop_codon_spliced_offsets) - - # skip variants which affect the 3' UTR of the transcript since - # they don't have obvious coding effects on the protein sequence - if variant_start_offset >= stop_codon_idx + 3: - logger.info( - "Skipping transcript %s for variant %s, occurs in 3' UTR", - transcript, - variant) - return None - - n_prefix = len(sequence_key.sequence_before_variant_locus) - prefix_start_idx = variant_start_offset - n_prefix - n_bases_between_start_and_variant = variant_start_offset - start_codon_idx - n_full_codons_before_variant = n_bases_between_start_and_variant // 3 - - # if the sequence before the variant contains more bases than the - # distance to the start codon, then by definition it must contain - # some untranslated bases - contains_five_prime_utr = (n_prefix > n_bases_between_start_and_variant) - # allows for the possibility that the first base in the sequence might - # be the first nucleotide of the start codon - contains_start_codon = (n_prefix >= n_bases_between_start_and_variant) - # the sequence context might only include the 2nd or 3rd bases of - # the start codon - overlaps_start_codon = (n_prefix > n_bases_between_start_and_variant - 3) - - if contains_start_codon: - offset_to_first_complete_codon = start_codon_idx - prefix_start_idx - amino_acids_before_variant = transcript.protein_sequence[:n_full_codons_before_variant] - else: - reading_frame = (prefix_start_idx - start_codon_idx) % 3 - offset_to_first_complete_codon = reading_frame_to_offset(reading_frame) - n_codons_in_prefix = (n_prefix - offset_to_first_complete_codon) // 3 - amino_acids_before_variant = transcript.protein_sequence[ - n_full_codons_before_variant - n_codons_in_prefix: - n_full_codons_before_variant] - return SequenceKeyWithReadingFrame( - strand=sequence_key.strand, - sequence_before_variant_locus=sequence_key.sequence_before_variant_locus, - sequence_at_variant_locus=sequence_key.sequence_at_variant_locus, - sequence_after_variant_locus=sequence_key.sequence_after_variant_locus, - offset_to_first_complete_codon=offset_to_first_complete_codon, - contains_start_codon=contains_start_codon, - overlaps_start_codon=overlaps_start_codon, - contains_five_prime_utr=contains_five_prime_utr, - amino_acids_before_variant=amino_acids_before_variant) - - -def reading_frame_to_offset(reading_frame_at_start_of_sequence): - """ - Given a reading frame (how many nucleotides into a codon) at the - start of a cDNA sequence, return the number of nucleotides which need - to be trimmed to start on a complete codon. - - Parameters - ---------- - - reading_frame_at_start_of_sequence : int - - Returns an int - """ - if reading_frame_at_start_of_sequence < 0: - raise ValueError("Reading frame can't be negative: %d" % ( - reading_frame_at_start_of_sequence,)) - elif reading_frame_at_start_of_sequence > 2: - raise ValueError("Reading frame must be within 0 and 2, not %d" % ( - reading_frame_at_start_of_sequence,)) - # If we're 1 nucleotide into the codon then we need to shift - # over two more to restore the ORF. Likewise, if we're 2 nucleotides in - # then we have to shift over one more. - return (3 - reading_frame_at_start_of_sequence) % 3 diff --git a/test/test_allele_reads.py b/test/test_allele_reads.py index b2864ce..1c4aab8 100644 --- a/test/test_allele_reads.py +++ b/test/test_allele_reads.py @@ -1,5 +1,5 @@ -from isovar.allele_reads import AlleleRead, allele_read_from_locus_read +from isovar.allele_reads import AlleleRead from isovar.locus_reads import LocusRead from nose.tools import eq_ @@ -16,7 +16,7 @@ def make_read_at_locus(prefix, alt, suffix, base_quality=30, name="dummy"): def test_allele_read_from_single_read_at_locus_trim_N_nucleotides(): read_at_locus = make_read_at_locus(prefix="NCCN", alt="A", suffix="TNNA") - allele_read = allele_read_from_locus_read(read_at_locus, n_ref=1) + allele_read = AlleleRead.from_locus_read(read_at_locus, n_ref=1) print(allele_read) expected = AlleleRead(prefix="", allele="A", suffix="T", name="dummy") eq_(allele_read, expected) diff --git a/test/test_reference_sequence_key_with_reading_frame.py b/test/test_reference_sequence_key_with_reading_frame.py deleted file mode 100644 index 79f9ed9..0000000 --- a/test/test_reference_sequence_key_with_reading_frame.py +++ /dev/null @@ -1,283 +0,0 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function, division, absolute_import - -from isovar.reference_context import ( - reading_frame_to_offset, - sequence_key_with_reading_frame_for_variant_on_transcript, - SequenceKeyWithReadingFrame, -) -from varcode import Variant -from pyensembl import ensembl_grch38 -from nose.tools import eq_ - -from testing_helpers import assert_equal_fields - -def test_reading_frame_to_offset(): - eq_(reading_frame_to_offset(0), 0) - eq_(reading_frame_to_offset(1), 2) - eq_(reading_frame_to_offset(2), 1) - - -def test_sequence_key_with_reading_frame_substitution_with_five_prime_utr(): - # Replace second codon of TP53-001 with 'CCC', the surrounding context - # includes nucleotides from the 5' UTR. Since TP53 is on the negative - # strand we have to take the reverse complement of the variant which turns - # it into CTC>GGG - tp53_substitution = Variant( - "17", 7676589, "CTC", "GGG", ensembl_grch38) - tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] - - # Sequence of TP53 around second codon with 10 context nucleotides: - # In [51]: t.sequence[193-10:193+13] - # Out[51]: 'CACTGCCATGGAGGAGCCGCAGT' - # Which can be split into the following parts: - # last 7 nt of 5' UTR: CACTGCC - # start codon: ATG (translates to M) - # 2nd codon: GAG <---- variant occurs here - # 3rd codon: GAG - # 4th codon: CCG - # 5th codon: CAG - # first nt of 6th codon: T - result = \ - sequence_key_with_reading_frame_for_variant_on_transcript( - variant=tp53_substitution, - transcript=tp53_001, - context_size=10) - expected = SequenceKeyWithReadingFrame( - strand="-", - sequence_before_variant_locus="CACTGCCATG", - sequence_at_variant_locus="GAG", - sequence_after_variant_locus="GAGCCGCAGT", - offset_to_first_complete_codon=7, - contains_start_codon=True, - overlaps_start_codon=True, - contains_five_prime_utr=True, - amino_acids_before_variant="M") - assert_equal_fields(result, expected) - -def test_sequence_key_with_reading_frame_deletion_with_five_prime_utr(): - # Delete second codon of TP53-001, the surrounding context - # includes nucleotides from the 5' UTR. Since TP53 is on the negative - # strand we have to take the reverse complement of the variant which turns - # it into 'CTC'>'' - tp53_deletion = Variant( - "17", 7676589, "CTC", "", ensembl_grch38) - tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] - - # Sequence of TP53 around second codon with 10 context nucleotides: - # In [51]: t.sequence[193-10:193+13] - # Out[51]: 'CACTGCCATGGAGGAGCCGCAGT' - # Which can be split into the following parts: - # last 7 nt of 5' UTR: CACTGCC - # start codon: ATG (translates to M) - # 2nd codon: GAG <---- variant occurs here - # 3rd codon: GAG - # 4th codon: CCG - # 5th codon: CAG - # first nt of 6th codon: T - - result = \ - sequence_key_with_reading_frame_for_variant_on_transcript( - variant=tp53_deletion, - transcript=tp53_001, - context_size=10) - expected = SequenceKeyWithReadingFrame( - strand="-", - sequence_before_variant_locus="CACTGCCATG", - sequence_at_variant_locus="GAG", - sequence_after_variant_locus="GAGCCGCAGT", - offset_to_first_complete_codon=7, - contains_start_codon=True, - overlaps_start_codon=True, - contains_five_prime_utr=True, - amino_acids_before_variant="M") - assert_equal_fields(result, expected) - - -def test_sequence_key_with_reading_frame_insertion(): - # Insert nucleotide "T" after second codon of TP53-001, the - # surrounding context includes nucleotides from the 5' UTR. Since TP53 is on - # the negative strand we have to take the reverse complement of the variant - # which turns it into 'CTC'>'CTCA' - tp53_insertion = Variant( - "17", 7676586, "CTC", "CTCA", ensembl_grch38) - - tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] - # Sequence of TP53 around boundary of 2nd/3rd codons - # with 10 context nucleotides: - # last 4 nt of 5' UTR: TGCC - # start codon: ATG (translates to M) - # 2nd codon: GAG (translates to E) - # <---- insertion variant occurs between these two codons - # 3rd codon: GAG - # 4th codon: CCG - # 5th codon: CAG - # first nt of 6th codon: T - - result = \ - sequence_key_with_reading_frame_for_variant_on_transcript( - variant=tp53_insertion, - transcript=tp53_001, - context_size=10) - - expected = SequenceKeyWithReadingFrame( - strand="-", - sequence_before_variant_locus="TGCCATGGAG", - sequence_at_variant_locus="", - sequence_after_variant_locus="GAGCCGCAGT", - offset_to_first_complete_codon=4, - contains_start_codon=True, - overlaps_start_codon=True, - contains_five_prime_utr=True, - amino_acids_before_variant="ME") - assert_equal_fields(result, expected) - -def test_sequence_key_with_reading_frame_insertion_inside_start_codon(): - # insert nucleotide "C" in the middle of the start codon of TP53-001, - # keeping only 1 nucleotide of context. In the reverse complement this - # becomes 'T'>'TG' - tp53_insertion = Variant( - "17", 7676592, "T", "TG", ensembl_grch38) - - tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] - - result = \ - sequence_key_with_reading_frame_for_variant_on_transcript( - variant=tp53_insertion, - transcript=tp53_001, - context_size=1) - assert result is None, "Expected result to be None when variant affects start codon" - -def test_sequence_key_with_reading_frame_insertion_before_start_codon(): - # insert nucleotide "T" before of the start codon of TP53-001, - tp53_insertion = Variant("17", 7676593, "C", "CT", ensembl_grch38) - - tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] - - result = \ - sequence_key_with_reading_frame_for_variant_on_transcript( - variant=tp53_insertion, - transcript=tp53_001, - context_size=1) - assert result is None, "Expected result to be None when variant before start codon" - - -def test_sequence_key_with_reading_frame_insertion_context_6nt_contains_start(): - # Insert nucleotide "T" after second codon of TP53-001, - # but in this test we're going to only keep enough context to see - # the start codon but none of the 5' UTR. In the reverse complement this - # variant becomes CTC>CTCA - tp53_insertion = Variant( - "17", 7676586, "CTC", "CTCA", ensembl_grch38) - - tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] - # Sequence of TP53 around boundary of 2nd/3rd codons - # with 6 context nucleotides: - # start codon: ATG (translates to M) - # 2nd codon: GAG (translates to E) - # <---- insertion variant occurs between these two codons - # 3rd codon: GAG - # 4th codon: CCG - - result = \ - sequence_key_with_reading_frame_for_variant_on_transcript( - variant=tp53_insertion, - transcript=tp53_001, - context_size=6) - - expected = SequenceKeyWithReadingFrame( - strand="-", - sequence_before_variant_locus="ATGGAG", - sequence_at_variant_locus="", - sequence_after_variant_locus="GAGCCG", - offset_to_first_complete_codon=0, - contains_start_codon=True, - overlaps_start_codon=True, - contains_five_prime_utr=False, - amino_acids_before_variant="ME") - assert_equal_fields(result, expected) - - -def test_sequence_key_with_reading_frame_insertion_context_5nt_overlaps_start(): - # Insert nucleotide "T" after second codon of TP53-001, - # but in this test we're going to only keep enough context to see - # a part of the start codon, thus the result shouldn't "contain" - # the start codon but does "overlap" it. In the reverse complement - # this variant becomes CTC>CTCA - tp53_insertion = Variant( - "17", 7676586, "CTC", "CTCA", ensembl_grch38) - - tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] - # Sequence of TP53 around boundary of 2nd/3rd codons - # with 6 context nucleotides: - # last two nt of start codon: TG - # 2nd codon: GAG (translates to E) - # <---- insertion variant occurs between these two codons - # 3rd codon: GAG - # first two nt of 4th codon: CC - - result = \ - sequence_key_with_reading_frame_for_variant_on_transcript( - variant=tp53_insertion, - transcript=tp53_001, - context_size=5) - - expected = SequenceKeyWithReadingFrame( - strand="-", - sequence_before_variant_locus="TGGAG", - sequence_at_variant_locus="", - sequence_after_variant_locus="GAGCC", - offset_to_first_complete_codon=2, - contains_start_codon=False, - overlaps_start_codon=True, - contains_five_prime_utr=False, - amino_acids_before_variant="E") - assert_equal_fields(result, expected) - - -def test_sequence_key_with_reading_frame_insertion_context_3nt_no_start(): - # Insert nucleotide "T" after second codon of TP53-001, - # but in this test we're going to only keep enough context to see - # the second codon (and no nucleotides from the start). In the reverse - # complement this variant becomes CTC>CTCA. - - tp53_insertion = Variant( - "17", 7676586, "CTC", "CTCA", ensembl_grch38) - - tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] - # Sequence of TP53 around boundary of 2nd/3rd codons - # with 6 context nucleotides: - # 2nd codon: GAG (translates to E) - # <---- insertion variant occurs between these two codons - # 3rd codon: GAG - - result = \ - sequence_key_with_reading_frame_for_variant_on_transcript( - variant=tp53_insertion, - transcript=tp53_001, - context_size=3) - - expected = SequenceKeyWithReadingFrame( - strand="-", - sequence_before_variant_locus="GAG", - sequence_at_variant_locus="", - sequence_after_variant_locus="GAG", - offset_to_first_complete_codon=0, - contains_start_codon=False, - overlaps_start_codon=False, - contains_five_prime_utr=False, - amino_acids_before_variant="E") - assert_equal_fields(result, expected) From 1c88072520b0003a346b8e1d49272e8029ca25e9 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Fri, 4 Nov 2016 15:07:00 -0400 Subject: [PATCH 06/11] added reference coding sequence module and tests --- isovar/reference_coding_sequence_key.py | 195 +++++++++++++++ test/test_reference_coding_sequence_key.py | 274 +++++++++++++++++++++ 2 files changed, 469 insertions(+) create mode 100644 isovar/reference_coding_sequence_key.py create mode 100644 test/test_reference_coding_sequence_key.py diff --git a/isovar/reference_coding_sequence_key.py b/isovar/reference_coding_sequence_key.py new file mode 100644 index 0000000..7a9029e --- /dev/null +++ b/isovar/reference_coding_sequence_key.py @@ -0,0 +1,195 @@ +# Copyright (c) 2016. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division, absolute_import +import logging +from collections import namedtuple + +from .variant_helpers import interbase_range_affected_by_variant_on_transcript +from .reference_sequence_key import ReferenceSequenceKey + +logger = logging.getLogger(__name__) + +class ReferenceCodingSequenceKey( + namedtuple( + "ReferenceCodingSequenceKey", + ReferenceSequenceKey._fields + ( + # if the reference context includes the 5' UTR then + # this is the offset to the start codon, otherwise it's the + # offset needed to get the first base of a codon + "offset_to_first_complete_codon", + # does this context overlap a start codon? + "overlaps_start_codon", + # does this context contain the whole trinucleotide start codon? + "contains_start_codon", + # does this context contain any UTR bases? + "contains_five_prime_utr", + # translation of complete codons in the reference context + # before the variant + "amino_acids_before_variant"))): + + """ + Includes all the fields of a ReferenceSequenceKey, and adds on top of those + a reading frame and information about where the start codon / 5' UTR are + relative to this sequence fragment. + """ + + @classmethod + def from_variant_and_transcript( + cls, + variant, + transcript, + context_size): + """ + Extracts the reference sequence around a variant locus on a particular + transcript and determines the reading frame at the start of that + sequence context. + + Parameters + ---------- + variant : varcode.Variant + + transcript : pyensembl.Transcript + + context_size : int + + Returns SequenceKeyWithReadingFrame object or None if Transcript lacks + coding sequence, protein sequence or annotated start/stop codons. + """ + if not transcript.contains_start_codon: + logger.info( + "Expected transcript %s for variant %s to have start codon", + transcript, + variant) + return None + + if not transcript.contains_stop_codon: + logger.info( + "Expected transcript %s for variant %s to have stop codon", + transcript, + variant) + return None + + if not transcript.protein_sequence: + logger.info( + "Expected transript %s for variant %s to have protein sequence", + transcript, + variant) + return None + + sequence_key = ReferenceSequenceKey.from_variant_and_transcript( + variant=variant, + transcript=transcript, + context_size=context_size) + + if sequence_key is None: + logger.info( + "No sequence key for variant %s on transcript %s", + variant, + transcript) + return None + + # get the interbase range of offsets which capture all reference + # bases modified by the variant + variant_start_offset, variant_end_offset = \ + interbase_range_affected_by_variant_on_transcript( + variant=variant, + transcript=transcript) + + start_codon_idx = min(transcript.start_codon_spliced_offsets) + + # skip any variants which occur in the 5' UTR or overlap the start codon + # since + # (1) UTR variants have unclear coding effects and + # (2) start-loss variants may result in a new start codon / reading + # frame but we're not sure which one! + if variant_start_offset < start_codon_idx + 3: + logger.info( + "Skipping transcript %s for variant %s, must be after start codon", + transcript, + variant) + return None + + stop_codon_idx = min(transcript.stop_codon_spliced_offsets) + + # skip variants which affect the 3' UTR of the transcript since + # they don't have obvious coding effects on the protein sequence + if variant_start_offset >= stop_codon_idx + 3: + logger.info( + "Skipping transcript %s for variant %s, occurs in 3' UTR", + transcript, + variant) + return None + + n_prefix = len(sequence_key.sequence_before_variant_locus) + prefix_start_idx = variant_start_offset - n_prefix + n_bases_between_start_and_variant = variant_start_offset - start_codon_idx + n_full_codons_before_variant = n_bases_between_start_and_variant // 3 + + # if the sequence before the variant contains more bases than the + # distance to the start codon, then by definition it must contain + # some untranslated bases + contains_five_prime_utr = (n_prefix > n_bases_between_start_and_variant) + # allows for the possibility that the first base in the sequence might + # be the first nucleotide of the start codon + contains_start_codon = (n_prefix >= n_bases_between_start_and_variant) + # the sequence context might only include the 2nd or 3rd bases of + # the start codon + overlaps_start_codon = (n_prefix > n_bases_between_start_and_variant - 3) + + if contains_start_codon: + offset_to_first_complete_codon = start_codon_idx - prefix_start_idx + amino_acids_before_variant = transcript.protein_sequence[:n_full_codons_before_variant] + else: + reading_frame = (prefix_start_idx - start_codon_idx) % 3 + offset_to_first_complete_codon = reading_frame_to_offset(reading_frame) + n_codons_in_prefix = (n_prefix - offset_to_first_complete_codon) // 3 + amino_acids_before_variant = transcript.protein_sequence[ + n_full_codons_before_variant - n_codons_in_prefix: + n_full_codons_before_variant] + return cls( + strand=sequence_key.strand, + sequence_before_variant_locus=sequence_key.sequence_before_variant_locus, + sequence_at_variant_locus=sequence_key.sequence_at_variant_locus, + sequence_after_variant_locus=sequence_key.sequence_after_variant_locus, + offset_to_first_complete_codon=offset_to_first_complete_codon, + contains_start_codon=contains_start_codon, + overlaps_start_codon=overlaps_start_codon, + contains_five_prime_utr=contains_five_prime_utr, + amino_acids_before_variant=amino_acids_before_variant) + + +def reading_frame_to_offset(reading_frame_at_start_of_sequence): + """ + Given a reading frame (how many nucleotides into a codon) at the + start of a cDNA sequence, return the number of nucleotides which need + to be trimmed to start on a complete codon. + + Parameters + ---------- + + reading_frame_at_start_of_sequence : int + + Returns an int + """ + if reading_frame_at_start_of_sequence < 0: + raise ValueError("Reading frame can't be negative: %d" % ( + reading_frame_at_start_of_sequence,)) + elif reading_frame_at_start_of_sequence > 2: + raise ValueError("Reading frame must be within 0 and 2, not %d" % ( + reading_frame_at_start_of_sequence,)) + # If we're 1 nucleotide into the codon then we need to shift + # over two more to restore the ORF. Likewise, if we're 2 nucleotides in + # then we have to shift over one more. + return (3 - reading_frame_at_start_of_sequence) % 3 diff --git a/test/test_reference_coding_sequence_key.py b/test/test_reference_coding_sequence_key.py new file mode 100644 index 0000000..f60dabd --- /dev/null +++ b/test/test_reference_coding_sequence_key.py @@ -0,0 +1,274 @@ +# Copyright (c) 2016. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division, absolute_import + +from isovar.reference_coding_sequence_key import ( + reading_frame_to_offset, + ReferenceCodingSequenceKey, +) +from varcode import Variant +from pyensembl import ensembl_grch38 +from nose.tools import eq_ + +from testing_helpers import assert_equal_fields + +def test_reading_frame_to_offset(): + eq_(reading_frame_to_offset(0), 0) + eq_(reading_frame_to_offset(1), 2) + eq_(reading_frame_to_offset(2), 1) + + +def test_sequence_key_with_reading_frame_substitution_with_five_prime_utr(): + # Replace second codon of TP53-001 with 'CCC', the surrounding context + # includes nucleotides from the 5' UTR. Since TP53 is on the negative + # strand we have to take the reverse complement of the variant which turns + # it into CTC>GGG + tp53_substitution = Variant( + "17", 7676589, "CTC", "GGG", ensembl_grch38) + tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] + + # Sequence of TP53 around second codon with 10 context nucleotides: + # In [51]: t.sequence[193-10:193+13] + # Out[51]: 'CACTGCCATGGAGGAGCCGCAGT' + # Which can be split into the following parts: + # last 7 nt of 5' UTR: CACTGCC + # start codon: ATG (translates to M) + # 2nd codon: GAG <---- variant occurs here + # 3rd codon: GAG + # 4th codon: CCG + # 5th codon: CAG + # first nt of 6th codon: T + result = ReferenceCodingSequenceKey.from_variant_and_transcript( + variant=tp53_substitution, + transcript=tp53_001, + context_size=10) + expected = ReferenceCodingSequenceKey( + strand="-", + sequence_before_variant_locus="CACTGCCATG", + sequence_at_variant_locus="GAG", + sequence_after_variant_locus="GAGCCGCAGT", + offset_to_first_complete_codon=7, + contains_start_codon=True, + overlaps_start_codon=True, + contains_five_prime_utr=True, + amino_acids_before_variant="M") + assert_equal_fields(result, expected) + +def test_sequence_key_with_reading_frame_deletion_with_five_prime_utr(): + # Delete second codon of TP53-001, the surrounding context + # includes nucleotides from the 5' UTR. Since TP53 is on the negative + # strand we have to take the reverse complement of the variant which turns + # it into 'CTC'>'' + tp53_deletion = Variant( + "17", 7676589, "CTC", "", ensembl_grch38) + tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] + + # Sequence of TP53 around second codon with 10 context nucleotides: + # In [51]: t.sequence[193-10:193+13] + # Out[51]: 'CACTGCCATGGAGGAGCCGCAGT' + # Which can be split into the following parts: + # last 7 nt of 5' UTR: CACTGCC + # start codon: ATG (translates to M) + # 2nd codon: GAG <---- variant occurs here + # 3rd codon: GAG + # 4th codon: CCG + # 5th codon: CAG + # first nt of 6th codon: T + + result = ReferenceCodingSequenceKey.from_variant_and_transcript( + variant=tp53_deletion, + transcript=tp53_001, + context_size=10) + expected = ReferenceCodingSequenceKey( + strand="-", + sequence_before_variant_locus="CACTGCCATG", + sequence_at_variant_locus="GAG", + sequence_after_variant_locus="GAGCCGCAGT", + offset_to_first_complete_codon=7, + contains_start_codon=True, + overlaps_start_codon=True, + contains_five_prime_utr=True, + amino_acids_before_variant="M") + assert_equal_fields(result, expected) + + +def test_sequence_key_with_reading_frame_insertion(): + # Insert nucleotide "T" after second codon of TP53-001, the + # surrounding context includes nucleotides from the 5' UTR. Since TP53 is on + # the negative strand we have to take the reverse complement of the variant + # which turns it into 'CTC'>'CTCA' + tp53_insertion = Variant( + "17", 7676586, "CTC", "CTCA", ensembl_grch38) + + tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] + # Sequence of TP53 around boundary of 2nd/3rd codons + # with 10 context nucleotides: + # last 4 nt of 5' UTR: TGCC + # start codon: ATG (translates to M) + # 2nd codon: GAG (translates to E) + # <---- insertion variant occurs between these two codons + # 3rd codon: GAG + # 4th codon: CCG + # 5th codon: CAG + # first nt of 6th codon: T + + result = ReferenceCodingSequenceKey.from_variant_and_transcript( + variant=tp53_insertion, + transcript=tp53_001, + context_size=10) + + expected = ReferenceCodingSequenceKey( + strand="-", + sequence_before_variant_locus="TGCCATGGAG", + sequence_at_variant_locus="", + sequence_after_variant_locus="GAGCCGCAGT", + offset_to_first_complete_codon=4, + contains_start_codon=True, + overlaps_start_codon=True, + contains_five_prime_utr=True, + amino_acids_before_variant="ME") + assert_equal_fields(result, expected) + +def test_reference_coding_sequence_key_insertion_inside_start_codon(): + # insert nucleotide "C" in the middle of the start codon of TP53-001, + # keeping only 1 nucleotide of context. In the reverse complement this + # becomes 'T'>'TG' + tp53_insertion = Variant( + "17", 7676592, "T", "TG", ensembl_grch38) + + tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] + + result = ReferenceCodingSequenceKey.from_variant_and_transcript( + variant=tp53_insertion, + transcript=tp53_001, + context_size=1) + assert result is None, "Expected result to be None when variant affects start codon" + +def test_sequence_key_with_reading_frame_insertion_before_start_codon(): + # insert nucleotide "T" before of the start codon of TP53-001, + tp53_insertion = Variant("17", 7676593, "C", "CT", ensembl_grch38) + + tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] + + result = ReferenceCodingSequenceKey.from_variant_and_transcript( + variant=tp53_insertion, + transcript=tp53_001, + context_size=1) + assert result is None, "Expected result to be None when variant before start codon" + + +def test_sequence_key_with_reading_frame_insertion_context_6nt_contains_start(): + # Insert nucleotide "T" after second codon of TP53-001, + # but in this test we're going to only keep enough context to see + # the start codon but none of the 5' UTR. In the reverse complement this + # variant becomes CTC>CTCA + tp53_insertion = Variant( + "17", 7676586, "CTC", "CTCA", ensembl_grch38) + + tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] + # Sequence of TP53 around boundary of 2nd/3rd codons + # with 6 context nucleotides: + # start codon: ATG (translates to M) + # 2nd codon: GAG (translates to E) + # <---- insertion variant occurs between these two codons + # 3rd codon: GAG + # 4th codon: CCG + + result = ReferenceCodingSequenceKey.from_variant_and_transcript( + variant=tp53_insertion, + transcript=tp53_001, + context_size=6) + + expected = ReferenceCodingSequenceKey( + strand="-", + sequence_before_variant_locus="ATGGAG", + sequence_at_variant_locus="", + sequence_after_variant_locus="GAGCCG", + offset_to_first_complete_codon=0, + contains_start_codon=True, + overlaps_start_codon=True, + contains_five_prime_utr=False, + amino_acids_before_variant="ME") + assert_equal_fields(result, expected) + + +def test_sequence_key_with_reading_frame_insertion_context_5nt_overlaps_start(): + # Insert nucleotide "T" after second codon of TP53-001, + # but in this test we're going to only keep enough context to see + # a part of the start codon, thus the result shouldn't "contain" + # the start codon but does "overlap" it. In the reverse complement + # this variant becomes CTC>CTCA + tp53_insertion = Variant( + "17", 7676586, "CTC", "CTCA", ensembl_grch38) + + tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] + # Sequence of TP53 around boundary of 2nd/3rd codons + # with 6 context nucleotides: + # last two nt of start codon: TG + # 2nd codon: GAG (translates to E) + # <---- insertion variant occurs between these two codons + # 3rd codon: GAG + # first two nt of 4th codon: CC + + result = ReferenceCodingSequenceKey.from_variant_and_transcript( + variant=tp53_insertion, + transcript=tp53_001, + context_size=5) + + expected = ReferenceCodingSequenceKey( + strand="-", + sequence_before_variant_locus="TGGAG", + sequence_at_variant_locus="", + sequence_after_variant_locus="GAGCC", + offset_to_first_complete_codon=2, + contains_start_codon=False, + overlaps_start_codon=True, + contains_five_prime_utr=False, + amino_acids_before_variant="E") + assert_equal_fields(result, expected) + + +def test_sequence_key_with_reading_frame_insertion_context_3nt_no_start(): + # Insert nucleotide "T" after second codon of TP53-001, + # but in this test we're going to only keep enough context to see + # the second codon (and no nucleotides from the start). In the reverse + # complement this variant becomes CTC>CTCA. + + tp53_insertion = Variant( + "17", 7676586, "CTC", "CTCA", ensembl_grch38) + + tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] + # Sequence of TP53 around boundary of 2nd/3rd codons + # with 6 context nucleotides: + # 2nd codon: GAG (translates to E) + # <---- insertion variant occurs between these two codons + # 3rd codon: GAG + + result = ReferenceCodingSequenceKey.from_variant_and_transcript( + variant=tp53_insertion, + transcript=tp53_001, + context_size=3) + + expected = ReferenceCodingSequenceKey( + strand="-", + sequence_before_variant_locus="GAG", + sequence_at_variant_locus="", + sequence_after_variant_locus="GAG", + offset_to_first_complete_codon=0, + contains_start_codon=False, + overlaps_start_codon=False, + contains_five_prime_utr=False, + amino_acids_before_variant="E") + assert_equal_fields(result, expected) From 78b41804a2b2b76b9e55411f0523824f8da39d6d Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Fri, 4 Nov 2016 16:20:41 -0400 Subject: [PATCH 07/11] renamed CLI arg for assembly --- isovar/cli/variant_sequences.py | 4 ++-- isovar/variant_sequences.py | 36 ++++++++++++++------------------- 2 files changed, 17 insertions(+), 23 deletions(-) diff --git a/isovar/cli/variant_sequences.py b/isovar/cli/variant_sequences.py index 14579a5..0f85713 100644 --- a/isovar/cli/variant_sequences.py +++ b/isovar/cli/variant_sequences.py @@ -39,7 +39,7 @@ def add_variant_sequence_args( help="Minimum number of reads supporting a variant sequence (default %(default)s)") rna_sequence_group.add_argument( - "--overlap-assembly", + "--variant-sequence-assembly", default=False, action="store_true") @@ -82,7 +82,7 @@ def variant_sequences_generator_from_args(args): allele_reads_generator, min_reads_supporting_cdna_sequence=args.min_reads_supporting_variant_sequence, preferred_sequence_length=args.cdna_sequence_length, - overlap_assembly=args.overlap_assembly) + variant_cdna_sequence_assembly=args.variant_sequence_assembly) def variant_sequences_dataframe_from_args(args): variant_sequences_generator = variant_sequences_generator_from_args(args) diff --git a/isovar/variant_sequences.py b/isovar/variant_sequences.py index 822607e..e882cbf 100644 --- a/isovar/variant_sequences.py +++ b/isovar/variant_sequences.py @@ -332,19 +332,22 @@ def filter_variant_sequences( variant_sequences=variant_sequences, preferred_sequence_length=preferred_sequence_length) -def supporting_reads_to_variant_sequences( - variant_reads, +def reads_to_variant_sequences( + variant, + reads, preferred_sequence_length, min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, variant_cdna_sequence_assembly=VARIANT_CDNA_SEQUENCE_ASSEMBLY): """ - Collapse variant-support RNA reads into consensus sequences of approximately - the preferred length (may differ at the ends of transcripts), filter - consensus sequences by length and number of supporting RNA reads. + Collapse variant-supporting RNA reads into consensus sequences of + approximately the preferred length (may differ at the ends of transcripts), + filter consensus sequences by length and number of supporting RNA reads. Parameters ---------- - variant_reads : list of AlleleRead objects + variant : varcode.Variant + + reads : list of AlleleRead objects Should all support the same variant allele nucleotides. preferred_sequence_length : int @@ -358,14 +361,14 @@ def supporting_reads_to_variant_sequences( Drop sequences which don't at least have this number of fully spanning reads. - overlap_assembly : bool + reads_to_variant_sequences : bool Construct variant sequences by merging overlapping reads. If False then variant sequences must be fully spanned by cDNA reads. Returns a collection of VariantSequence objects """ # just in case variant_reads is a generator, convert it to a list - variant_reads = list(variant_reads) + variant_reads = list(filter_non_alt_reads_for_variant(variant, reads)) if len(variant_reads) == 0: return [] @@ -406,6 +409,7 @@ def supporting_reads_to_variant_sequences( max(len(s) for s in variant_sequences)) else: logger.info("After overlap assembly: 0 variant sequences") + return [] variant_sequences = filter_variant_sequences( variant_sequences=variant_sequences, @@ -421,23 +425,13 @@ def supporting_reads_to_variant_sequences( max(len(s) for s in variant_sequences)) else: logger.info("After coverage & length filtering: 0 variant sequences") + return [] # sort VariantSequence objects by decreasing order of supporting read # counts variant_sequences.sort(key=sort_key_decreasing_read_count) return variant_sequences -def overlapping_reads_to_variant_sequences( - variant, - overlapping_reads, - min_reads_supporting_cdna_sequence, - preferred_sequence_length): - variant_reads = filter_non_alt_reads_for_variant(variant, overlapping_reads) - return supporting_reads_to_variant_sequences( - variant_reads, - preferred_sequence_length=preferred_sequence_length, - min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence) - def reads_generator_to_sequences_generator( variant_and_reads_generator, min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, @@ -468,9 +462,9 @@ def reads_generator_to_sequences_generator( - list of VariantSequence objects """ for variant, variant_reads in variant_and_reads_generator: - variant_sequences = overlapping_reads_to_variant_sequences( + variant_sequences = reads_to_variant_sequences( variant=variant, - overlapping_reads=variant_reads, + reads=variant_reads, min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence, preferred_sequence_length=preferred_sequence_length, variant_cdna_sequence_assembly=variant_cdna_sequence_assembly) From e14e01938ab6e9011c4acd93a7d5db6a124f8b0c Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Fri, 4 Nov 2016 16:34:50 -0400 Subject: [PATCH 08/11] added caching of pyensembl --- .travis.yml | 6 ++++++ isovar/variant_sequences.py | 15 +++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index 01c9b9c..d15bdef 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,6 +3,12 @@ language: python python: - "2.7" - "3.4" +cache: + pip: true + # cache directory used for Ensembl downloads of GTF and FASTA files + # along with the indexed db of intervals and ID mappings and pickles + # of sequence dictionaries + directories: /home/travis/.cache/pyensembl/ before_install: # Commands below copied from: http://conda.pydata.org/docs/travis.html # We do this conditionally because it saves us some downloading if the diff --git a/isovar/variant_sequences.py b/isovar/variant_sequences.py index e882cbf..586749e 100644 --- a/isovar/variant_sequences.py +++ b/isovar/variant_sequences.py @@ -35,7 +35,7 @@ # using this base class to define the core fields of a VariantSequence # but inheriting it from it to allow the addition of helper methods -VariantSequenceFields = namedtuple( +VariantSequenceBase = namedtuple( "VariantSequence", [ # nucleotides before a variant @@ -47,13 +47,12 @@ # since we often want to look at prefix+alt+suffix, let's cache it "sequence", # reads which were used to determine this sequences - "reads", - ]) + "reads"]) -class VariantSequence(VariantSequenceFields): +class VariantSequence(VariantSequenceBase): def __new__(cls, prefix, alt, suffix, reads): # construct sequence from prefix + alt + suffix - return VariantSequenceFields.__new__( + return VariantSequenceBase.__new__( cls, prefix=prefix, alt=alt, @@ -227,8 +226,8 @@ def trim_by_coverage(self, min_reads): suffix=self.suffix[:last_covered_index - variant_end_index + 1], reads=self.reads) -def sort_key_decreasing_read_count(variant_sequence): - return -len(variant_sequence.reads) + def sort_key_decreasing_read_count(self): + return -len(self.reads) def initial_variant_sequences_from_reads( variant_reads, @@ -429,7 +428,7 @@ def reads_to_variant_sequences( # sort VariantSequence objects by decreasing order of supporting read # counts - variant_sequences.sort(key=sort_key_decreasing_read_count) + variant_sequences.sort(key=VariantSequence.sort_key_decreasing_read_count) return variant_sequences def reads_generator_to_sequences_generator( From 881aa04810a9b8077da6521b58783577468b5010 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Fri, 4 Nov 2016 16:36:51 -0400 Subject: [PATCH 09/11] version bump due to large change in logic --- isovar/__init__.py | 2 +- isovar/variant_sequences.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/isovar/__init__.py b/isovar/__init__.py index 42419f9..f4cac02 100644 --- a/isovar/__init__.py +++ b/isovar/__init__.py @@ -14,4 +14,4 @@ from __future__ import print_function, division, absolute_import -__version__ = "0.2.4" +__version__ = "0.3.0" diff --git a/isovar/variant_sequences.py b/isovar/variant_sequences.py index 586749e..4fa52db 100644 --- a/isovar/variant_sequences.py +++ b/isovar/variant_sequences.py @@ -452,7 +452,7 @@ def reads_generator_to_sequences_generator( sequence_length : int Desired sequence length, including variant nucleotides - overlap_assembly : bool + variant_cdna_sequence_assembly : bool Construct variant sequences by merging overlapping reads. If False then variant sequences must be fully spanned by cDNA reads. From bb757cbe04d3c3d5d3e1063ec6fdd1c79fa39994 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Fri, 4 Nov 2016 16:48:52 -0400 Subject: [PATCH 10/11] assembly unit test with mock reads --- test/test_assembly.py | 49 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/test/test_assembly.py b/test/test_assembly.py index 1bfe127..3ea5d4c 100644 --- a/test/test_assembly.py +++ b/test/test_assembly.py @@ -59,11 +59,46 @@ def test_assemble_transcript_fragments_snv(): assert len(s) > max_read_length, \ "Expected assembled sequences to be longer than read length (%d)" % ( max_read_length,) -""" -def test_assemble_simple_sequences(): - AlleleRead - VariantSequence -""" -if __name__ == "__main__": - test_assemble_transcript_fragments_snv() +def test_assembly_of_simple_sequence_from_mock_reads(): + # + # Read sequences: + # AAAAA|CC|TTTTT + # AAAAA|CC|TTTTT + # GAAAAA|CC|TTTTTG + # AAAA|CC|TTTT + reads = [ + # two identical reads with sequence AAAAA|CC|TTTTT + AlleleRead(prefix="A" * 5, allele="CC", suffix="T" * 5, name="dup1"), + AlleleRead(prefix="A" * 5, allele="CC", suffix="T" * 5, name="dup2"), + # longer sequence GAAAAA|CC|TTTTTG + AlleleRead(prefix="G" + "A" * 5, allele="CC", suffix="T" * 5 + "G", name="longer"), + # shorter sequence AAAA|CC|TTTT + AlleleRead(prefix="A" * 4, allele="CC", suffix="T" * 4, name="shorter"), + ] + expected_variant_sequence = VariantSequence( + prefix="G" + "A" * 5, alt="CC", suffix="T" * 5 + "G", reads=reads) + initial_variant_sequences = initial_variant_sequences_from_reads(reads) + # expecting one fewer sequence than reads since two of the reads are + # duplicates + eq_(len(initial_variant_sequences), len(reads) - 1) + + assembled_variant_sequences = iterative_overlap_assembly( + initial_variant_sequences, + min_overlap_size=1) + + # since no reads contradict each other then we should get back a single + # assembled sequence + eq_(len(assembled_variant_sequences), + 1, + "Unexpected number of variant sequences: %s" % (assembled_variant_sequences,)) + assembled_variant_sequence = assembled_variant_sequences[0] + eq_(assembled_variant_sequence, expected_variant_sequence) + + eq_(len(assembled_variant_sequence.reads), len(reads)) + + eq_(assembled_variant_sequence.min_coverage(), 1) + # 2 bases with 1/4 reads, 2 bases with 3/4 reads, remaining 10 bases with + # all 4/4 reads + expected_mean_coverage = (2 * 1 + 2 * 3 + 10 * 4) / 14 + eq_(assembled_variant_sequence.mean_coverage(), expected_mean_coverage) From dcfd2ed6a0d21209b125851d5e68b03cad972e18 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Fri, 4 Nov 2016 18:03:04 -0400 Subject: [PATCH 11/11] fixed name of reads arg --- isovar/reference_coding_sequence_key.py | 6 +-- isovar/reference_context.py | 55 +++++++++++++------------ isovar/translation.py | 7 ++-- test/test_allele_counts.py | 2 +- test/test_variant_sequences.py | 7 ++-- 5 files changed, 41 insertions(+), 36 deletions(-) diff --git a/isovar/reference_coding_sequence_key.py b/isovar/reference_coding_sequence_key.py index 7a9029e..d37bc04 100644 --- a/isovar/reference_coding_sequence_key.py +++ b/isovar/reference_coding_sequence_key.py @@ -40,9 +40,9 @@ class ReferenceCodingSequenceKey( "amino_acids_before_variant"))): """ - Includes all the fields of a ReferenceSequenceKey, and adds on top of those - a reading frame and information about where the start codon / 5' UTR are - relative to this sequence fragment. + ReferenceCodingSequenceKey includes all the fields of a ReferenceSequenceKey, + and additionally tracks the reading frame and information about where the + start codon and 5' UTR are relative to this sequence fragment. """ @classmethod diff --git a/isovar/reference_context.py b/isovar/reference_context.py index b90e3d2..1b0d262 100644 --- a/isovar/reference_context.py +++ b/isovar/reference_context.py @@ -34,19 +34,31 @@ # ########################## -ReferenceContext = namedtuple( - "ReferenceContext", - ReferenceCodingSequenceKey._fields + ( - "variant", - "transcripts") -) - -def sort_key_decreasing_max_length_transcript_cds(reference_context): - """ - Used to sort a sequence of ReferenceContext objects by the longest CDS - in each context's list of transcripts. - """ - return -max(len(t.coding_sequence) for t in reference_context.transcripts) +class ReferenceContext(namedtuple( + "ReferenceContext", + ReferenceCodingSequenceKey._fields + ("variant", "transcripts"))): + + @classmethod + def from_reference_coding_sequence_key(cls, key, variant, transcripts): + return cls( + strand=key.strand, + sequence_before_variant_locus=key.sequence_before_variant_locus, + sequence_at_variant_locus=key.sequence_at_variant_locus, + sequence_after_variant_locus=key.sequence_after_variant_locus, + offset_to_first_complete_codon=key.offset_to_first_complete_codon, + contains_start_codon=key.contains_start_codon, + overlaps_start_codon=key.overlaps_start_codon, + contains_five_prime_utr=key.contains_five_prime_utr, + amino_acids_before_variant=key.amino_acids_before_variant, + variant=variant, + transcripts=transcripts) + + def sort_key_decreasing_max_length_transcript_cds(self): + """ + Used to sort a sequence of ReferenceContext objects by the longest CDS + in each context's list of transcripts. + """ + return -max(len(t.coding_sequence) for t in self.transcripts) def reference_contexts_for_variant( variant, @@ -83,21 +95,12 @@ def reference_contexts_for_variant( sequence_groups[sequence_key_with_reading_frame].append(transcript) reference_contexts = [ - ReferenceContext( - strand=key.strand, - sequence_before_variant_locus=key.sequence_before_variant_locus, - sequence_at_variant_locus=key.sequence_at_variant_locus, - sequence_after_variant_locus=key.sequence_after_variant_locus, - offset_to_first_complete_codon=key.offset_to_first_complete_codon, - contains_start_codon=key.contains_start_codon, - overlaps_start_codon=key.overlaps_start_codon, - contains_five_prime_utr=key.contains_five_prime_utr, - amino_acids_before_variant=key.amino_acids_before_variant, - variant=variant, - transcripts=matching_transcripts) + ReferenceContext.from_reference_coding_sequence_key( + key, variant, matching_transcripts) for (key, matching_transcripts) in sequence_groups.items() ] - reference_contexts.sort(key=sort_key_decreasing_max_length_transcript_cds) + reference_contexts.sort( + key=ReferenceContext.sort_key_decreasing_max_length_transcript_cds) return reference_contexts def reference_contexts_for_variants( diff --git a/isovar/translation.py b/isovar/translation.py index d1ec571..a246201 100644 --- a/isovar/translation.py +++ b/isovar/translation.py @@ -26,7 +26,7 @@ from skbio import DNA from .reference_context import reference_contexts_for_variant -from .variant_sequences import supporting_reads_to_variant_sequences +from .variant_sequences import reads_to_variant_sequences from .default_parameters import ( MIN_TRANSCRIPT_PREFIX_LENGTH, @@ -553,8 +553,9 @@ def translate_variant_reads( # need to clip nucleotides at the start/end of the sequence cdna_sequence_length = (protein_sequence_length + 1) * 3 - variant_sequences = supporting_reads_to_variant_sequences( - variant_reads=variant_reads, + variant_sequences = reads_to_variant_sequences( + variant=variant, + reads=variant_reads, preferred_sequence_length=cdna_sequence_length, min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence, variant_cdna_sequence_assembly=variant_cdna_sequence_assembly) diff --git a/test/test_allele_counts.py b/test/test_allele_counts.py index 4d76abe..777b286 100644 --- a/test/test_allele_counts.py +++ b/test/test_allele_counts.py @@ -12,7 +12,7 @@ def test_allele_count_dataframe(): ] df = allele_counts_dataframe([(variant, reads)]) assert len(df) == 1, "Wrong number of rows in DataFrame: %s" % (df,) - row = df.irow(0) + row = df.iloc[0] eq_(row.n_ref, 2) eq_(row.n_alt, 1) eq_(row.n_other, 0) diff --git a/test/test_variant_sequences.py b/test/test_variant_sequences.py index 2191bdf..42ea0a1 100644 --- a/test/test_variant_sequences.py +++ b/test/test_variant_sequences.py @@ -16,7 +16,7 @@ from nose.tools import eq_ from varcode import Variant -from isovar.variant_sequences import supporting_reads_to_variant_sequences +from isovar.variant_sequences import reads_to_variant_sequences from isovar.variant_reads import reads_supporting_variant from testing_helpers import load_bam @@ -34,8 +34,9 @@ def test_sequence_counts_snv(): chromosome=chromosome, variant=variant) - variant_sequences = supporting_reads_to_variant_sequences( - variant_reads, + variant_sequences = reads_to_variant_sequences( + variant=variant, + reads=variant_reads, preferred_sequence_length=61) assert len(variant_sequences) == 1 for variant_sequence in variant_sequences: