From d960d058324fcbbe1afce5827395ead7c6da9bc8 Mon Sep 17 00:00:00 2001 From: krassowski <5832902+krassowski@users.noreply.github.com> Date: Fri, 20 Jan 2023 17:56:25 +0000 Subject: [PATCH 1/2] Document batching in README, parse snp batch results --- README.md | 14 ++++++++++++++ easy_entrez/parsing.py | 23 +++++++++++++++++++++-- tests/test_parsing.py | 11 +++++++++++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9d458b2..d7ff48b 100644 --- a/README.md +++ b/README.md @@ -249,6 +249,20 @@ protein_hgvs.head() > | rs1940853299 | NP_006437.3 | p.Lys201Thr | > | rs1940852987 | NP_006437.3 | p.Asp198Glu | +#### Fetching more than 10 000 entries + +Use the `in_batches_of` method to fetch more than 10k entries (e.g. `variant_ids`): + +```python +snps_result = ( + entrez.api + .in_batches_of(1_000) + .fetch(variant_ids, max_results=5_000, database='snp') +) +``` + +The result is a dictionary whose keys are the identifiers used in each batch (because the Entrez API does not always return the identifiers back) and whose values are the corresponding results. You can use `parse_dbsnp_variants` directly on this dictionary. + #### Find PubMed ID from DOI When searching GWAS catalog PMID is needed over DOI. 
You can covert one to the other using: diff --git a/easy_entrez/parsing.py b/easy_entrez/parsing.py index e432363..dad41bb 100644 --- a/easy_entrez/parsing.py +++ b/easy_entrez/parsing.py @@ -4,12 +4,13 @@ from warnings import warn from xml.dom import minidom from xml.etree import ElementTree +from typing import Union, Dict from .api import EntrezResponse, is_xml_response, is_response_for from .queries import FetchQuery try: - from pandas import DataFrame + from pandas import DataFrame, concat except ImportError: DataFrame = None @@ -58,13 +59,31 @@ def parse_docsum(docsum: str) -> dict: return result -def parse_dbsnp_variants(snps_result: EntrezResponse, verbose: bool = False) -> VariantSet: +def parse_dbsnp_variants(snps_result: Union[EntrezResponse, Dict[tuple, EntrezResponse]], verbose: bool = False) -> VariantSet: """Parse coordinates, frequencies and preferred IDs of dbSNP variants. Parameters: snps_result: result of fetch query in XML format, usually to `'snp'` database verbose: whether to print out full problematic XML if SPDI cannot be parsed """ + if isinstance(snps_result, dict): + coordinates = [] + alt_frequencies = [] + preferred_ids = {} + summaries = [] + for result in snps_result.values(): + parsed = parse_dbsnp_variants(result, verbose=verbose) + coordinates.append(parsed.coordinates) + alt_frequencies.append(parsed.alt_frequencies) + preferred_ids.update(parsed.preferred_ids) + summaries.append(parsed.summary) + return VariantSet( + coordinates=concat(coordinates), + alt_frequencies=concat(alt_frequencies), + preferred_ids=preferred_ids, + summary=concat(summaries) + ) + if DataFrame is None: raise ValueError('pandas is required for parser_dbsnp_variants') if not is_xml_response(snps_result): diff --git a/tests/test_parsing.py b/tests/test_parsing.py index 7e3d605..bb0845c 100644 --- a/tests/test_parsing.py +++ b/tests/test_parsing.py @@ -75,6 +75,17 @@ def test_parse_two_snps(): assert set(summary.columns) == {'HGVS', 'SEQ', 'LEN', 'GENE'} 
+@pytest.mark.optional +def test_parse_batch(): + response = DummyResponse( + query=FetchQuery(ids=['rs6311', 'rs662138'], database='snp', max_results=10), + content_type='xml', + data=fromstring(TWO_SNPS) + ) + variant_set = parse_dbsnp_variants({('rs6311', 'rs662138'): response}) + assert isinstance(variant_set, VariantSet) + + @pytest.mark.optional def test_merged_variant_solving(): response = DummyResponse( From 2e238e56e090eaf2ad27b184782a3b33651d1365 Mon Sep 17 00:00:00 2001 From: krassowski <5832902+krassowski@users.noreply.github.com> Date: Sat, 21 Jan 2023 14:05:54 +0000 Subject: [PATCH 2/2] Bump version to 0.3.6 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ac60360..18a5192 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ def get_long_description(file_name): package_data={'easy_entrez': ['data/*.tsv', 'py.typed']}, # required for mypy to work zip_safe=False, - version='0.3.5', + version='0.3.6', license='MIT', description='Python REST API for Entrez E-Utilities: stateless, easy to use, reliable.', long_description=get_long_description('README.md'),