From d960d058324fcbbe1afce5827395ead7c6da9bc8 Mon Sep 17 00:00:00 2001
From: krassowski <5832902+krassowski@users.noreply.github.com>
Date: Fri, 20 Jan 2023 17:56:25 +0000
Subject: [PATCH 1/2] Document batching in README, parse snp batch results

---
 README.md              | 14 ++++++++++++++
 easy_entrez/parsing.py | 23 +++++++++++++++++++++--
 tests/test_parsing.py  | 11 +++++++++++
 3 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 9d458b2..d7ff48b 100644
--- a/README.md
+++ b/README.md
@@ -249,6 +249,20 @@ protein_hgvs.head()
 > | rs1940853299 | NP_006437.3 | p.Lys201Thr |
 > | rs1940852987 | NP_006437.3 | p.Asp198Glu |
 
+#### Fetching more than 10 000 entries
+
+Use the `in_batches_of` method to fetch more than 10k entries (e.g. `variant_ids`):
+
+```python
+snps_result = (
+    entrez.api
+    .in_batches_of(1_000)
+    .fetch(variant_ids, max_results=5_000, database='snp')
+)
+```
+
+The result is a dictionary whose keys are the identifiers used in each batch (because the Entrez API does not always return the identifiers back) and whose values are the corresponding results. You can use `parse_dbsnp_variants` directly on this dictionary.
+
 #### Find PubMed ID from DOI
 
 When searching GWAS catalog PMID is needed over DOI. You can covert one to the other using:
diff --git a/easy_entrez/parsing.py b/easy_entrez/parsing.py
index e432363..dad41bb 100644
--- a/easy_entrez/parsing.py
+++ b/easy_entrez/parsing.py
@@ -4,12 +4,13 @@
 from warnings import warn
 from xml.dom import minidom
 from xml.etree import ElementTree
+from typing import Union, Dict
 
 from .api import EntrezResponse, is_xml_response, is_response_for
 from .queries import FetchQuery
 
 try:
-    from pandas import DataFrame
+    from pandas import DataFrame, concat
 except ImportError:
     DataFrame = None
 
@@ -58,13 +59,31 @@ def parse_docsum(docsum: str) -> dict:
     return result
 
 
-def parse_dbsnp_variants(snps_result: EntrezResponse, verbose: bool = False) -> VariantSet:
+def parse_dbsnp_variants(snps_result: Union[EntrezResponse, Dict[tuple, EntrezResponse]], verbose: bool = False) -> VariantSet:
     """Parse coordinates, frequencies and preferred IDs of dbSNP variants.
 
     Parameters:
         snps_result: result of fetch query in XML format, usually to `'snp'` database
         verbose: whether to print out full problematic XML if SPDI cannot be parsed
     """
+    if isinstance(snps_result, dict):  # batched fetch: {ids_in_batch: response}
+        coordinates = []
+        alt_frequencies = []
+        preferred_ids = {}
+        summaries = []
+        for result in snps_result.values():
+            parsed = parse_dbsnp_variants(result, verbose=verbose)  # propagate caller's verbosity
+            coordinates.append(parsed.coordinates)
+            alt_frequencies.append(parsed.alt_frequencies)
+            preferred_ids.update(parsed.preferred_ids)
+            summaries.append(parsed.summary)  # per-batch summary table, not the frequencies
+        return VariantSet(
+            coordinates=concat(coordinates),
+            alt_frequencies=concat(alt_frequencies),
+            preferred_ids=preferred_ids,
+            summary=concat(summaries)
+        )
+
     if DataFrame is None:
         raise ValueError('pandas is required for parser_dbsnp_variants')
     if not is_xml_response(snps_result):
diff --git a/tests/test_parsing.py b/tests/test_parsing.py
index 7e3d605..bb0845c 100644
--- a/tests/test_parsing.py
+++ b/tests/test_parsing.py
@@ -75,6 +75,17 @@ def test_parse_two_snps():
     assert set(summary.columns) == {'HGVS', 'SEQ', 'LEN', 'GENE'}
 
 
+@pytest.mark.optional
+def test_parse_batch():
+    response = DummyResponse(
+        query=FetchQuery(ids=['rs6311', 'rs662138'], database='snp', max_results=10),
+        content_type='xml',
+        data=fromstring(TWO_SNPS)
+    )
+    variant_set = parse_dbsnp_variants({('rs6311', 'rs662138'): response})
+    assert isinstance(variant_set, VariantSet)
+
+
 @pytest.mark.optional
 def test_merged_variant_solving():
     response = DummyResponse(

From 2e238e56e090eaf2ad27b184782a3b33651d1365 Mon Sep 17 00:00:00 2001
From: krassowski <5832902+krassowski@users.noreply.github.com>
Date: Sat, 21 Jan 2023 14:05:54 +0000
Subject: [PATCH 2/2] Bump version to 0.3.6

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index ac60360..18a5192 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@ def get_long_description(file_name):
         package_data={'easy_entrez': ['data/*.tsv', 'py.typed']},
         # required for mypy to work
         zip_safe=False,
-        version='0.3.5',
+        version='0.3.6',
         license='MIT',
         description='Python REST API for Entrez E-Utilities: stateless, easy to use, reliable.',
         long_description=get_long_description('README.md'),