Skip to content

Commit

Permalink
Merge pull request #15 from krassowski/batching-improvements
Browse files Browse the repository at this point in the history
Document batching in README, parse snp batch results
  • Loading branch information
krassowski authored Jan 21, 2023
2 parents 6cd14fb + 2e238e5 commit b7d1f12
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 3 deletions.
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,20 @@ protein_hgvs.head()
> | rs1940853299 | NP_006437.3 | p.Lys201Thr |
> | rs1940852987 | NP_006437.3 | p.Asp198Glu |
#### Fetching more than 10 000 entries

Use the `in_batches_of` method to fetch more than 10k entries (e.g. `variant_ids`):

```python
snps_result = (
entrez.api
.in_batches_of(1_000)
.fetch(variant_ids, max_results=5_000, database='snp')
)
```

The result is a dictionary with keys being the identifiers used in each batch (because the Entrez API does not always return the identifiers back) and values representing the result. You can use `parse_dbsnp_variants` directly on this dictionary.

#### Find PubMed ID from DOI

When searching the GWAS catalog, a PMID is needed rather than a DOI. You can convert one to the other using:
Expand Down
23 changes: 21 additions & 2 deletions easy_entrez/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
from warnings import warn
from xml.dom import minidom
from xml.etree import ElementTree
from typing import Union, Dict

from .api import EntrezResponse, is_xml_response, is_response_for
from .queries import FetchQuery

try:
from pandas import DataFrame
from pandas import DataFrame, concat
except ImportError:
DataFrame = None

Expand Down Expand Up @@ -58,13 +59,31 @@ def parse_docsum(docsum: str) -> dict:
return result


def parse_dbsnp_variants(snps_result: EntrezResponse, verbose: bool = False) -> VariantSet:
def parse_dbsnp_variants(snps_result: Union[EntrezResponse, Dict[tuple, EntrezResponse]], verbose: bool = False) -> VariantSet:
    """Parse coordinates, frequencies and preferred IDs of dbSNP variants.

    Parameters:
        snps_result: result of fetch query in XML format, usually to `'snp'` database,
            or a dictionary mapping batch identifiers to such results
            (as produced by batched fetching with `in_batches_of`)
        verbose: whether to print out full problematic XML if SPDI cannot be parsed
    """
    if isinstance(snps_result, dict):
        # Batched result: parse each batch independently, then merge the
        # per-batch frames/mappings into a single VariantSet.
        coordinates = []
        alt_frequencies = []
        preferred_ids = {}
        summaries = []
        for result in snps_result.values():
            # propagate `verbose` so batched parsing honours the flag too
            parsed = parse_dbsnp_variants(result, verbose=verbose)
            coordinates.append(parsed.coordinates)
            alt_frequencies.append(parsed.alt_frequencies)
            preferred_ids.update(parsed.preferred_ids)
            # fix: collect the summary frame (previously alt_frequencies
            # was appended here a second time by mistake)
            summaries.append(parsed.summary)
        return VariantSet(
            coordinates=concat(coordinates),
            alt_frequencies=concat(alt_frequencies),
            preferred_ids=preferred_ids,
            summary=concat(summaries)
        )

if DataFrame is None:
raise ValueError('pandas is required for parser_dbsnp_variants')
if not is_xml_response(snps_result):
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def get_long_description(file_name):
package_data={'easy_entrez': ['data/*.tsv', 'py.typed']},
# required for mypy to work
zip_safe=False,
version='0.3.5',
version='0.3.6',
license='MIT',
description='Python REST API for Entrez E-Utilities: stateless, easy to use, reliable.',
long_description=get_long_description('README.md'),
Expand Down
11 changes: 11 additions & 0 deletions tests/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,17 @@ def test_parse_two_snps():
assert set(summary.columns) == {'HGVS', 'SEQ', 'LEN', 'GENE'}


@pytest.mark.optional
def test_parse_batch():
    """A dict of batched fetch results should parse into a single VariantSet."""
    response = DummyResponse(
        query=FetchQuery(ids=['rs6311', 'rs662138'], database='snp', max_results=10),
        content_type='xml',
        data=fromstring(TWO_SNPS)
    )
    variant_set = parse_dbsnp_variants({('rs6311', 'rs662138'): response})
    # isinstance is the idiomatic check; `type(x) == T` rejects subclasses
    assert isinstance(variant_set, VariantSet)


@pytest.mark.optional
def test_merged_variant_solving():
response = DummyResponse(
Expand Down

0 comments on commit b7d1f12

Please sign in to comment.