From 810b55f82f611159c5c45de9f14aa0132784375b Mon Sep 17 00:00:00 2001
From: Marvin Hofer <mh68feta@studserv.uni-leipzig.de>
Date: Wed, 13 Sep 2023 14:45:15 +0200
Subject: [PATCH] init download function

---
 python/databusclient/cli.py    |   8 ++-
 python/databusclient/client.py | 100 +++++++++++++++++++++++++++++++++
 python/pyproject.toml          |   3 +
 python/tests/test_download.py  |  20 +++++++
 4 files changed, 129 insertions(+), 2 deletions(-)
 create mode 100644 python/tests/test_download.py

diff --git a/python/databusclient/cli.py b/python/databusclient/cli.py
index 29ce698..6a45c58 100644
--- a/python/databusclient/cli.py
+++ b/python/databusclient/cli.py
@@ -35,5 +35,9 @@ def deploy(
 
 
 @app.command()
-def download(collection: str):
-    typer.echo(f"TODO")
+def download(
+    localDir: str = typer.Option(..., help="local databus folder"),
+    databus: str = typer.Option(..., help="databus URL"),
+    databusURIs: List[str] = typer.Argument(..., help="any kind of these: databus identifier, databus collection identifier, query file"),
+):  # thin CLI wrapper: "databus" is handed to the client as its endpoint argument
+    client.download(localDir, databus, databusURIs)
diff --git a/python/databusclient/client.py b/python/databusclient/client.py
index b096256..518c7d9 100644
--- a/python/databusclient/client.py
+++ b/python/databusclient/client.py
@@ -3,7 +3,10 @@
 import requests
 import hashlib
 import json
+from tqdm import tqdm
+from SPARQLWrapper import SPARQLWrapper, JSON
 from urllib.parse import urldefrag
+from hashlib import sha256
 
 __debug = False
 
@@ -374,3 +377,100 @@ def deploy(
     if debug or __debug:
         print("---------")
         print(resp.text)
+
+
+def __download_file__(url, filename):
+    """
+    Download a file from the internet with a progress bar using tqdm.
+
+    Parameters:
+    - url: the URL of the file to download
+    - filename: the local file path where the file should be saved
+    Raises requests.HTTPError if the server answers with a 4xx/5xx status.
+    """
+    import os  # function-scope import keeps this block self-contained
+    print("download "+url)
+    os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
+    with requests.get(url, stream=True) as response:  # closes the connection
+        response.raise_for_status()  # never store an error page as the dataset
+        total_size_in_bytes = int(response.headers.get('content-length', 0))
+        with tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) as progress_bar, \
+                open(filename, 'wb') as file:
+            for data in response.iter_content(1024):  # 1 KiB chunks
+                progress_bar.update(file.write(data))  # write() returns the byte count
+    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
+        print("ERROR, something went wrong")
+
+
+def __query_sparql__(endpoint_url, query)-> dict:
+    """
+    Send a SPARQL query to an endpoint and return the converted JSON answer.
+
+    Parameters:
+    - endpoint_url: the URL of the SPARQL endpoint
+    - query: the SPARQL query string
+
+    Returns:
+    - whatever SPARQLWrapper's convert() produces for the JSON return format
+    """
+    wrapper = SPARQLWrapper(endpoint_url)
+    # the request method is forced to POST before the query is attached
+    wrapper.method = 'POST'
+    wrapper.setReturnFormat(JSON)
+    wrapper.setQuery(query)
+    return wrapper.query().convert()
+
+
+def __handle__databus_file_query__(endpoint_url, query) -> List[str]:
+    """Run the query and yield (this is a generator) the value of each one-variable row."""
+    result_dict = __query_sparql__(endpoint_url,query)
+    for binding in result_dict['results']['bindings']:
+        if len(binding) > 1:
+            print("Error multiple bindings in query response")
+            break
+        # A row without any bound variable yields nothing instead of crashing.
+        yield from (cell['value'] for cell in binding.values())
+
+
+def wsha256(raw: str):  # hex SHA-256 digest of the UTF-8 encoded string
+    return hashlib.sha256(raw.encode('utf-8')).hexdigest()
+
+
+def __handle_databus_collection__(endpoint, uri: str)-> str:
+    """GET uri asking for "text/sparql" and return the body text; endpoint is unused, status unchecked."""
+    return requests.get(uri, headers={"Accept": "text/sparql"}).text
+
+
+def __download_list__(urls: List[str], localDir: str):
+    for link in urls:  # each file is stored under the SHA-256 hex digest of its URL
+        __download_file__(link, f"{localDir}/{wsha256(link)}")
+
+
+def download(
+    localDir: str,
+    endpoint: str,
+    databusURIs: List[str]
+) -> None:
+    """
+    Download datasets to local storage from databus registry
+    ------
+    localDir: the local directory
+    endpoint: URL of the SPARQL endpoint the file queries are sent to
+    databusURIs: identifiers to access databus registered datasets
+    """
+    for databusURI in databusURIs:
+        if databusURI.startswith(("http://", "https://")):
+            if "/collections/" not in databusURI:
+                print("dataId not supported yet")
+                continue
+            # a databus collection is fetched as the SPARQL query it stands for
+            query = __handle_databus_collection__(endpoint, databusURI)
+        elif databusURI.startswith("file://"):
+            print("query in file not supported yet")
+            continue
+        else:
+            # anything else is treated as a SPARQL query passed as argument
+            print("QUERY {}".format(databusURI.replace("\n", " ")))
+            query = databusURI
+        # Both supported kinds end in a query; every value it returns is downloaded.
+        __download_list__(__handle__databus_file_query__(endpoint, query), localDir)
\ No newline at end of file
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 069d196..71271cf 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -10,6 +10,9 @@ readme = "README.md"
 python = "^3.9"
 typer = "^0.6.1"
 requests = "^2.28.1"
+tqdm = "^4.66.1"  # client.py calls tqdm(total=..., unit=..., unit_scale=...) as documented for 4.x
+SPARQLWrapper = "^2.0.0"
+
 
 [tool.poetry.dev-dependencies]
 black = "^22.6.0"
diff --git a/python/tests/test_download.py b/python/tests/test_download.py
new file mode 100644
index 0000000..35a6c6f
--- /dev/null
+++ b/python/tests/test_download.py
@@ -0,0 +1,20 @@
+"""Download Tests"""
+import pytest
+import databusclient.client as cl
+
+DEFAULT_ENDPOINT = "https://databus.dbpedia.org/sparql"  # endpoint passed to both tests
+TEST_QUERY = """
+PREFIX dcat: <http://www.w3.org/ns/dcat#>
+SELECT ?x WHERE {
+  ?sub dcat:downloadURL ?x .
+} LIMIT 10
+"""
+TEST_COLLECTION = "https://databus.dbpedia.org/dbpedia/collections/dbpedia-snapshot-2022-12"  # collection URI
+
+def test_with_query():
+    # a SPARQL query string is passed directly as the only identifier
+    identifiers = [TEST_QUERY]
+    cl.download("target", DEFAULT_ENDPOINT, identifiers)
+  
+def test_with_collection():  # passes a databus collection URI instead of a query
+    cl.download(localDir="target", endpoint=DEFAULT_ENDPOINT, databusURIs=[TEST_COLLECTION])
\ No newline at end of file