From 31e16473ff4eea8321647f839dfbfcce3a5c1cdd Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn
Date: Mon, 15 Jan 2024 17:59:48 -0500
Subject: [PATCH] added develop script

---
 .github/workflows/tests.yml |  11 ++-
 develop.sh                  |   3 +
 pyensembl/transcript.py     | 169 ++++++++++++++++++------------------
 3 files changed, 94 insertions(+), 89 deletions(-)
 create mode 100755 develop.sh

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index e9ae9bf..2897342 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -1,8 +1,12 @@
 # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
 # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+# TODO:
+# - cache this directory $HOME/.cache/pyensembl/
+# - update coveralls
+# - get a badge for tests passing
+# - download binary dependencies from conda
 
 name: Tests
-
 on: [push, pull_request]
 
 jobs:
@@ -14,7 +18,8 @@ jobs:
         python-version: ["3.9", "3.10", "3.11"]
 
     steps:
-    - uses: actions/checkout@v3
+    - name: Checkout repository
+      uses: actions/checkout@v3
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v3
       with:
@@ -23,7 +28,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        python -m pip install flake8 pytest
+        python -m pip install flake8 pytest coveralls
         pip install -r requirements.txt
         pip install .
     - name: Lint with flake8
diff --git a/develop.sh b/develop.sh
new file mode 100755
index 0000000..a7b89c6
--- /dev/null
+++ b/develop.sh
@@ -0,0 +1,3 @@
+set -e
+
+pip install -e .
diff --git a/pyensembl/transcript.py b/pyensembl/transcript.py
index 9d30c5c..535eddb 100644
--- a/pyensembl/transcript.py
+++ b/pyensembl/transcript.py
@@ -24,18 +24,20 @@ class Transcript(LocusWithGenome):
     and not using the sequence, avoid the memory/performance overhead of
     fetching and storing sequences from a FASTA file.
""" + def __init__( - self, - transcript_id, - transcript_name, - contig, - start, - end, - strand, - biotype, - gene_id, - genome, - support_level=None): + self, + transcript_id, + transcript_name, + contig, + start, + end, + strand, + biotype, + gene_id, + genome, + support_level=None, + ): LocusWithGenome.__init__( self, contig=contig, @@ -43,7 +45,8 @@ def __init__( end=end, strand=strand, biotype=biotype, - genome=genome) + genome=genome, + ) self.transcript_id = transcript_id self.transcript_name = transcript_name self.gene_id = gene_id @@ -71,16 +74,18 @@ def __str__(self): " biotype='%s'," " contig='%s'," " start=%d," - " end=%d, strand='%s', genome='%s')") % ( - self.transcript_id, - self.name, - self.gene_id, - self.biotype, - self.contig, - self.start, - self.end, - self.strand, - self.genome.reference_name) + " end=%d, strand='%s', genome='%s')" + ) % ( + self.transcript_id, + self.name, + self.gene_id, + self.biotype, + self.contig, + self.start, + self.end, + self.strand, + self.genome.reference_name, + ) def __len__(self): """ @@ -90,9 +95,10 @@ def __len__(self): def __eq__(self, other): return ( - other.__class__ is Transcript and - self.id == other.id and - self.genome == other.genome) + other.__class__ is Transcript + and self.id == other.id + and self.genome == other.genome + ) def __hash__(self): return hash(self.id) @@ -120,10 +126,8 @@ def exons(self): # in each transcript columns = ["exon_number", "exon_id"] exon_numbers_and_ids = self.db.query( - columns, - filter_column="transcript_id", - filter_value=self.id, - feature="exon") + columns, filter_column="transcript_id", filter_value=self.id, feature="exon" + ) # fill this list in its correct order (by exon_number) by using # the exon_number as a 1-based list offset @@ -133,15 +137,16 @@ def exons(self): exon = self.genome.exon_by_id(exon_id) if exon is None: raise ValueError( - "Missing exon %s for transcript %s" % ( - exon_number, self.id)) + "Missing exon %s for transcript %s" % (exon_number, self.id) + ) exon_number = int(exon_number) if exon_number < 1: raise ValueError("Invalid exon number: %s" % exon_number) elif exon_number > len(exons): raise ValueError( - "Invalid exon number: %s (max expected = %d)" % ( - exon_number, len(exons))) + "Invalid exon number: %s (max expected = %d)" + % (exon_number, len(exons)) + ) # exon_number is 1-based, convert to list index by subtracting 1 exon_idx = exon_number - 1 @@ -164,12 +169,13 @@ def _transcript_feature_position_ranges(self, feature, required=True): select_column_names=["start", "end"], filter_column="transcript_id", filter_value=self.id, - feature=feature) + feature=feature, + ) if required and len(results) == 0: raise ValueError( - "Transcript %s does not contain feature %s" % ( - self.id, feature)) + "Transcript %s does not contain feature %s" % (self.id, feature) + ) return results @memoize @@ -177,20 +183,20 @@ def _transcript_feature_positions(self, feature): """ Get unique positions for feature, raise an error if feature is absent. """ - ranges = self._transcript_feature_position_ranges( - feature, required=True) + ranges = self._transcript_feature_position_ranges(feature, required=True) results = [] # a feature (such as a stop codon), maybe be split over multiple # contiguous ranges. Collect all the nucleotide positions into a # single list. 
-        for (start, end) in ranges:
+        for start, end in ranges:
             # since ranges are [inclusive, inclusive] and
             # Python ranges are [inclusive, exclusive) we have to increment
             # the end position
             for position in range(start, end + 1):
                 if position in results:
                     raise ValueError(
-                        "Repeated position %d for %s" % (position, feature))
+                        "Repeated position %d for %s" % (position, feature)
+                    )
                 results.append(position)
         return results
 
@@ -207,10 +213,9 @@ def _codon_positions(self, feature):
         results = self._transcript_feature_positions(feature)
         if len(results) != 3:
             raise ValueError(
-                "Expected 3 positions for %s of %s but got %d" % (
-                    feature,
-                    self.id,
-                    len(results)))
+                "Expected 3 positions for %s of %s but got %d"
+                % (feature, self.id, len(results))
+            )
         return results
 
     @memoized_property
@@ -219,7 +224,8 @@ def contains_start_codon(self):
         Does this transcript have an annotated start_codon entry?
         """
         start_codons = self._transcript_feature_position_ranges(
-            "start_codon", required=False)
+            "start_codon", required=False
+        )
         return len(start_codons) > 0
 
     @memoized_property
@@ -228,16 +234,17 @@ def contains_stop_codon(self):
         Does this transcript have an annotated stop_codon entry?
         """
         stop_codons = self._transcript_feature_position_ranges(
-            "stop_codon", required=False)
+            "stop_codon", required=False
+        )
         return len(stop_codons) > 0
-    
+
     @memoized_property
     def start_codon_complete(self):
         """
         Does the start codon span 3 genomic positions?
         """
         try:
-            pos = self._codon_positions("start_codon")
+            self._codon_positions("start_codon")
         except ValueError:
             return False
         return True
@@ -266,9 +273,10 @@ def exon_intervals(self):
             select_column_names=["exon_number", "start", "end"],
             filter_column="transcript_id",
             filter_value=self.id,
-            feature="exon")
+            feature="exon",
+        )
         sorted_intervals = [None] * len(results)
-        for (exon_number, start, end) in results:
+        for exon_number, start, end in results:
             sorted_intervals[int(exon_number) - 1] = (start, end)
         return sorted_intervals
 
@@ -281,15 +289,15 @@ def spliced_offset(self, position):
         """
         if type(position) is not int:
             raise TypeError(
-                "Position argument must be an integer, got %s : %s" % (
-                    position, type(position)))
+                "Position argument must be an integer, got %s : %s"
+                % (position, type(position))
+            )
 
         if position < self.start or position > self.end:
             raise ValueError(
-                "Invalid position: %d (must be between %d and %d)" % (
-                    position,
-                    self.start,
-                    self.end))
+                "Invalid position: %d (must be between %d and %d)"
+                % (position, self.start, self.end)
+            )
 
         # offset from beginning of unspliced transcript (including introns)
         unspliced_offset = self.offset(position)
@@ -306,7 +314,8 @@ def spliced_offset(self, position):
         # Intron vs. Exon: ...iiiiiieeeeeeiiiiiiiiiiiiiiiieeeeeeiiiiiiiiiii...
         for exon in self.exons:
             exon_unspliced_start, exon_unspliced_end = self.offset_range(
-                exon.start, exon.end)
+                exon.start, exon.end
+            )
             # If the relative position is not within this exon, keep a running
             # total of the total exonic length-so-far.
             #
@@ -323,8 +332,8 @@ def spliced_offset(self, position):
                 exon_length = len(exon)  # exon_end_position - exon_start_position + 1
                 total_spliced_offset += exon_length
         raise ValueError(
-            "Couldn't find position %d on any exon of %s" % (
-                position, self.id))
+            "Couldn't find position %d on any exon of %s" % (position, self.id)
+        )
 
     @memoized_property
     def start_codon_unspliced_offsets(self):
@@ -332,11 +341,7 @@ def start_codon_unspliced_offsets(self):
         Offsets from start of unspliced pre-mRNA transcript
         of nucleotides in start codon.
""" - return [ - self.offset(position) - for position - in self.start_codon_positions - ] + return [self.offset(position) for position in self.start_codon_positions] @memoized_property def stop_codon_unspliced_offsets(self): @@ -344,11 +349,7 @@ def stop_codon_unspliced_offsets(self): Offsets from start of unspliced pre-mRNA transcript of nucleotides in stop codon. """ - return [ - self.offset(position) - for position - in self.stop_codon_positions - ] + return [self.offset(position) for position in self.stop_codon_positions] def _contiguous_offsets(self, offsets): """ @@ -358,8 +359,7 @@ def _contiguous_offsets(self, offsets): offsets.sort() for i in range(len(offsets) - 1): if offsets[i] + 1 != offsets[i + 1]: - raise ValueError( - "Offsets not contiguous: %s" % (offsets,)) + raise ValueError("Offsets not contiguous: %s" % (offsets,)) return offsets @memoized_property @@ -369,9 +369,7 @@ def start_codon_spliced_offsets(self): of nucleotides in start codon. """ offsets = [ - self.spliced_offset(position) - for position - in self.start_codon_positions + self.spliced_offset(position) for position in self.start_codon_positions ] return self._contiguous_offsets(offsets) @@ -382,9 +380,7 @@ def stop_codon_spliced_offsets(self): of nucleotides in stop codon. """ offsets = [ - self.spliced_offset(position) - for position - in self.stop_codon_positions + self.spliced_offset(position) for position in self.stop_codon_positions ] return self._contiguous_offsets(offsets) @@ -403,11 +399,11 @@ def complete(self): a coding sequence whose length is divisible by 3 """ return ( - self.contains_start_codon and - self.start_codon_complete and - self.contains_stop_codon and - self.coding_sequence is not None and - len(self.coding_sequence) % 3 == 0 + self.contains_start_codon + and self.start_codon_complete + and self.contains_stop_codon + and self.coding_sequence is not None + and len(self.coding_sequence) % 3 == 0 ) @memoized_property @@ -459,7 +455,7 @@ def coding_sequence(self): # pylint: disable=invalid-slice-index # TODO(tavi) Figure out pylint is not happy with this slice - return self.sequence[start:end + 1] + return self.sequence[start : end + 1] @memoized_property def five_prime_utr_sequence(self): @@ -469,7 +465,7 @@ def five_prime_utr_sequence(self): """ # pylint: disable=invalid-slice-index # TODO(tavi) Figure out pylint is not happy with this slice - return self.sequence[:self.first_start_codon_spliced_offset] + return self.sequence[: self.first_start_codon_spliced_offset] @memoized_property def three_prime_utr_sequence(self): @@ -477,7 +473,7 @@ def three_prime_utr_sequence(self): cDNA sequence of 3' UTR (untranslated region at the end of the transcript) """ - return self.sequence[self.last_stop_codon_spliced_offset + 1:] + return self.sequence[self.last_stop_codon_spliced_offset + 1 :] @memoized_property def protein_id(self): @@ -487,7 +483,8 @@ def protein_id(self): filter_value=self.id, feature="CDS", distinct=True, - required=False) + required=False, + ) if result_tuple: return result_tuple[0] else: