Skip to content

Commit 3bedf41

Browse files
committed
Merge pull request #38 from hammerlab/EnsemblRelease_and_README_cleanup
deleted location methods from EnsemblRelease, cleaned up...
2 parents 5e9f7db + 057711e commit 3bedf41

File tree

4 files changed

+151
-149
lines changed

4 files changed

+151
-149
lines changed

README.md

Lines changed: 39 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,31 @@ combinations of the annotation features *gene\_name*, *gene\_id*,
3939
*transcript\_name*, *transcript\_id*, *exon\_id* as well as the location of
4040
these genomic elements (contig, start position, end position, strand).
4141

42-
## Gene Names
42+
## Genes
4343

44-
`gene_names()`
45-
: returns all gene names in the annotation database
44+
`genes(contig=None, strand=None)`
45+
: returns list of Gene objects, optionally restricted to a particular contig
46+
or strand.
4647

47-
`gene_names_on_contig(contig)`
48-
: all gene names on a particular chromosome/contig
48+
`genes_at_locus(contig, position, end=None, strand=None)`
49+
: returns list of Gene objects overlapping a particular position on a contig,
50+
optionally extend into a range with the `end` parameter and restrict to
51+
forward or backward strand by passing `strand='+'` or `strand='-'`.
52+
53+
`gene_by_id(gene_id)`
54+
: return Gene object for given Ensembl gene ID (e.g. "ENSG00000068793")
55+
56+
`gene_names(contig=None, strand=None)`
57+
: returns all gene names in the annotation database, optionally restricted
58+
to a particular contig or strand.
59+
60+
`genes_by_name(gene_name)`
61+
: get all the unique genes with the given name (there might be multiple
62+
due to copies in the genome), return a list containing a Gene object for each
63+
distinct ID.
64+
65+
`gene_by_protein_id(protein_id)`
66+
: find Gene associated with the given Ensembl protein ID (e.g. "ENSP00000350283")
4967

5068
`gene_names_at_locus(contig, position, end=None, strand=None)`
5169
: names of genes overlapping with the given locus
@@ -63,23 +81,28 @@ these genomic elements (contig, start position, end position, strand).
6381
`gene_name_of_exon_id(exon_id)`
6482
: name of gene associated with given exon ID
6583

66-
67-
## Gene IDs
68-
6984
`gene_ids(contig=None, strand=None)`
7085
: all gene IDs in the annotation database
7186

72-
`gene_id_of_gene_name(gene_name)`
73-
: translate Ensembl gene ID to its corresponding name
87+
`gene_ids_of_gene_name(gene_name)`
88+
: all Ensembl gene IDs with the given name
7489

7590

76-
## Transcript Names
91+
## Transcripts
92+
93+
`transcripts(contig=None, strand=None)`
94+
: returns list of Transcript objects for all transcript entries in the
95+
Ensembl database, optionally restricted to a particular contig or strand.
96+
97+
`transcript_by_id(transcript_id)`
98+
: construct Transcript object for given Ensembl transcript ID (e.g. "ENST00000369985")
99+
100+
`transcripts_by_name(transcript_name)`
101+
: returns list of Transcript objects for every transcript matching the given name.
77102

78103
`transcript_names(contig=None, strand=None)`
79104
: all transcript names in the annotation database
80105

81-
## Transcript IDs
82-
83106
`transcript_ids(contig=None, strand=None)`
84107
: returns all transcript IDs in the annotation database
85108

@@ -89,14 +112,14 @@ these genomic elements (contig, start position, end position, strand).
89112
`transcript_ids_of_gene_name(gene_name)`
90113
: return IDs of all transcripts associated with given gene name
91114

92-
`transcript_id_of_transcript_name(transcript_name)`
93-
: translate transcript name to its ID
115+
`transcript_ids_of_transcript_name(transcript_name)`
116+
: find all Ensembl transcript IDs with the given name
94117

95118
`transcript_ids_of_exon_id(exon_id)`
96119
: return IDs of all transcripts associated with given exon ID
97120

98121

99-
## Exon IDs
122+
## Exons
100123

101124
`exon_ids(contig=None, strand=None)`
102125
: returns all exon IDs in the annotation database
@@ -109,32 +132,3 @@ these genomic elements (contig, start position, end position, strand).
109132

110133
`exon_ids_of_transcript_id(transcript_id)`
111134

112-
113-
## Locations
114-
115-
These functions currently assume that each gene maps to a single unique
116-
location, which is invalid both with heavily copied genes
117-
(e.g. [U1](http://en.wikipedia.org/wiki/U1_spliceosomal_RNA)) and with
118-
polymorphic regions (e.g. HLA genes).
119-
120-
`location_of_gene_name(gene_name)`
121-
122-
`location_of_gene_id(gene_id)`
123-
124-
`location_of_transcript_id(transcript_id)`
125-
126-
`location_of_exon_id(exon_id)`
127-
128-
129-
## Start Codons
130-
131-
`start_codon_of_transcript_id(transcript_id)`
132-
133-
`start_codon_of_transcript_name(transcript_id)`
134-
135-
136-
## Stop Codons
137-
138-
`stop_codon_of_transcript_id(transcript_id)`
139-
140-
`stop_codon_of_transcript_name(transcript_name)`

pyensembl/database.py

Lines changed: 65 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import sqlite3
55

66
from .common import CACHE_SUBDIR
7-
from .locus import normalize_chromosome, normalize_strand
7+
from .locus import normalize_chromosome, normalize_strand, Locus
88
from .type_checks import require_integer, require_string
99

1010
import datacache
@@ -159,7 +159,7 @@ def column_values_at_locus(
159159
SELECT %s%s
160160
FROM ensembl
161161
WHERE feature = ?
162-
AND seqname=?
162+
AND seqname= ?
163163
AND start <= ?
164164
AND end >= ?
165165
@@ -323,3 +323,66 @@ def query_distinct_on_contig(self, column_name, feature, contig):
323323
feature=feature,
324324
contig=contig,
325325
distinct=True)
326+
327+
def query_loci(self, filter_column, filter_value, feature):
    """
    Query for loci satisfying a given filter and feature type.

    Parameters
    ----------
    filter_column : str
        Name of column to filter results by.

    filter_value : str
        Only return loci which have this value in their filter_column.

    feature : str
        Feature names such as 'transcript', 'gene', and 'exon'

    Returns list of Locus objects
    """
    # Forward this method's own arguments to the query. The earlier code
    # passed `property_name` / `property_value`, which are not defined in
    # this scope and raised NameError on every call.
    # NOTE(review): `required=True` presumably makes self.query raise when
    # nothing matches -- confirm against the definition of `query`.
    result_tuples = self.query(
        select_column_names=["seqname", "start", "end", "strand"],
        filter_column=filter_column,
        filter_value=filter_value,
        feature=feature,
        distinct=True,
        required=True)
    # Each row is unpacked as (contig, start, end, strand), in the same
    # order as the selected columns above.
    return [
        Locus(contig, start, end, strand)
        for (contig, start, end, strand)
        in result_tuples
    ]
358+
359+
def query_locus(self, filter_column, filter_value, feature):
    """
    Query for unique locus, raises error if missing or more than
    one locus in the database.

    Parameters
    ----------
    filter_column : str
        Name of column to filter results by.

    filter_value : str
        Only return loci which have this value in their filter_column.

    feature : str
        Feature names such as 'transcript', 'gene', and 'exon'

    Returns single Locus object.

    Raises ValueError if zero loci or more than one locus match.
    """
    loci = self.query_loci(
        filter_column=filter_column,
        filter_value=filter_value,
        feature=feature)

    # Exactly one match is the only acceptable outcome; both "none" and
    # "ambiguous" are reported as ValueError with distinct messages.
    if not loci:
        raise ValueError("Couldn't find locus for %s with %s = %s" % (
            feature, filter_column, filter_value))
    elif len(loci) > 1:
        raise ValueError("Too many loci for %s with %s = %s: %s" % (
            feature, filter_column, filter_value, loci))
    return loci[0]

0 commit comments

Comments
 (0)