Skip to content

Commit

Permalink
Merge pull request #15 from krassowski/batching-improvements
Browse files Browse the repository at this point in the history
Document batching in README, parse snp batch results
  • Loading branch information
krassowski authored Jan 21, 2023
2 parents 6cd14fb + 2e238e5 commit b7d1f12
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 3 deletions.
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,20 @@ protein_hgvs.head()
> | rs1940853299 | NP_006437.3 | p.Lys201Thr |
> | rs1940852987 | NP_006437.3 | p.Asp198Glu |
#### Fetching more than 10 000 entries

Use the `in_batches_of` method to fetch more than 10k entries (e.g. `variant_ids`):

```python
snps_result = (
entrez.api
.in_batches_of(1_000)
.fetch(variant_ids, max_results=5_000, database='snp')
)
```

The result is a dictionary with keys being the identifiers used in each batch (because the Entrez API does not always return the identifiers back) and values representing the result. You can use `parse_dbsnp_variants` directly on this dictionary.

#### Find PubMed ID from DOI

When searching the GWAS catalog, a PMID is needed rather than a DOI. You can convert one to the other using:
Expand Down
23 changes: 21 additions & 2 deletions easy_entrez/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
from warnings import warn
from xml.dom import minidom
from xml.etree import ElementTree
from typing import Union, Dict

from .api import EntrezResponse, is_xml_response, is_response_for
from .queries import FetchQuery

try:
from pandas import DataFrame
from pandas import DataFrame, concat
except ImportError:
DataFrame = None

Expand Down Expand Up @@ -58,13 +59,31 @@ def parse_docsum(docsum: str) -> dict:
return result


def parse_dbsnp_variants(snps_result: EntrezResponse, verbose: bool = False) -> VariantSet:
def parse_dbsnp_variants(snps_result: Union[EntrezResponse, Dict[tuple, EntrezResponse]], verbose: bool = False) -> VariantSet:
    """Parse coordinates, frequencies and preferred IDs of dbSNP variants.

    Parameters:
        snps_result: result of fetch query in XML format, usually to `'snp'` database,
            or a dictionary mapping batch identifiers to such results
            (as produced by batched fetching with `in_batches_of`)
        verbose: whether to print out full problematic XML if SPDI cannot be parsed
    """
    if isinstance(snps_result, dict):
        # Batched result: parse each batch independently, then merge the
        # per-batch frames/mappings into a single VariantSet.
        coordinates = []
        alt_frequencies = []
        preferred_ids = {}
        summaries = []
        for result in snps_result.values():
            # propagate `verbose` so batched parsing honours the flag too
            parsed = parse_dbsnp_variants(result, verbose=verbose)
            coordinates.append(parsed.coordinates)
            alt_frequencies.append(parsed.alt_frequencies)
            preferred_ids.update(parsed.preferred_ids)
            # fix: collect the summary frame (previously alt_frequencies
            # was appended here a second time by mistake)
            summaries.append(parsed.summary)
        return VariantSet(
            coordinates=concat(coordinates),
            alt_frequencies=concat(alt_frequencies),
            preferred_ids=preferred_ids,
            summary=concat(summaries)
        )

if DataFrame is None:
raise ValueError('pandas is required for parser_dbsnp_variants')
if not is_xml_response(snps_result):
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def get_long_description(file_name):
package_data={'easy_entrez': ['data/*.tsv', 'py.typed']},
# required for mypy to work
zip_safe=False,
version='0.3.5',
version='0.3.6',
license='MIT',
description='Python REST API for Entrez E-Utilities: stateless, easy to use, reliable.',
long_description=get_long_description('README.md'),
Expand Down
11 changes: 11 additions & 0 deletions tests/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,17 @@ def test_parse_two_snps():
assert set(summary.columns) == {'HGVS', 'SEQ', 'LEN', 'GENE'}


@pytest.mark.optional
def test_parse_batch():
    """A dict of batched fetch results should parse into a single VariantSet."""
    response = DummyResponse(
        query=FetchQuery(ids=['rs6311', 'rs662138'], database='snp', max_results=10),
        content_type='xml',
        data=fromstring(TWO_SNPS)
    )
    variant_set = parse_dbsnp_variants({('rs6311', 'rs662138'): response})
    # isinstance is the idiomatic check; `type(x) == T` rejects subclasses
    assert isinstance(variant_set, VariantSet)


@pytest.mark.optional
def test_merged_variant_solving():
response = DummyResponse(
Expand Down

0 comments on commit b7d1f12

Please sign in to comment.