
Commit 41f02bd

change handling of background jobs; switch from catalogue/jobs to catalogue/results; update and extend tests
1 parent 5403ecf commit 41f02bd

Note: this is a large commit, so some file diffs are hidden by default; only a subset of the changed files is shown below.

48 files changed: +87694 / -42953 lines

Changelog.md

Lines changed: 5 additions & 0 deletions
@@ -1,5 +1,10 @@
 # Changelog

+## 0.5.4
+
+- Change handling of background jobs: Switch from catalogue/jobs to catalogue/results.
+- Reactivate tests for Regionalstatistik.
+
 ## 0.5.3

 - Support for a new parameter in `Table.get_data(..., compress: bool = True)` that can be `True` or `False`. When set to `True`, it will suppress empty rows and columns in the table before downloading it, thus reducing the table size.
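For context, the new flag plugs into the usual `Table` workflow roughly like this (a minimal sketch; the table code is a placeholder and the `name=` / `.data` usage is assumed from the existing API, not part of this commit):

    from pystatis import Table

    # "81000-0001" is a placeholder; use any valid table code here
    table = Table(name="81000-0001")
    table.get_data(compress=True)  # suppress empty rows/columns before download
    print(table.data.head())       # assumes the downloaded table ends up in `table.data`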

README.md

Lines changed: 1 addition & 1 deletion
@@ -183,7 +183,7 @@ To contribute to this project, please follow these steps:
 2. Create a new branch by running `git checkout -b <new-branch>` or `git switch -c <new-branch>`. If possible, add an issue number to the branch name.
 3. Make your changes.
 4. Delete the cassettes folder under tests to make sure that the tests are loading the latest data from the API.
-5. Run `uv run pytest tests -sx -vv --vcr-record=new_episodes` to see if all existing tests still run through.
+5. Run `uv run pytest tests -sx -vv --vcr-record=none` to see if all existing tests still run through.
 6. Add new tests depending on your changes.
 7. Run `uv run pytest tests -sx -vv --vcr-record=new_episodes` again to make sure your tests are also passed.
 8. Commit your changes. This will trigger all pre-commit hooks as defined in `.pre-commit-config.yaml`. If any of these pre-hooks fails, your commit is declined, and you have to resolve the issues first.

pyproject.toml

Lines changed: 4 additions & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "pystatis"
-version = "0.5.3"
+version = "0.5.4"
 description = "Python wrapper for GENESIS web service interface (API) of the Federal Statistical Office."
 authors = [
     { name = "Michael Aydinbas", email = "[email protected]" },
@@ -73,3 +73,6 @@ log_cli = false
 [tool.safety]
 full_report = true
 ignore = "67599,70612"
+
+[tool.ruff]
+line-length = 100

src/pystatis/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
 from pystatis.helloworld import logincheck, whoami
 from pystatis.table import Table

-__version__ = "0.5.3"
+__version__ = "0.5.4"

 __all__ = [
     "clear_cache",

src/pystatis/config.py

Lines changed: 1 addition & 3 deletions
@@ -34,9 +34,7 @@
 REGEX_DB = {
     "genesis": re.compile(r"^((\d{5}-\d{4})|([0-9A-Z]{10}))$"),
     "zensus": re.compile(r"^\d{4}[A-Z]-\d{4}$"),
-    "regio": re.compile(
-        r"^((\d{5}-.{1,2}($|-.*$))|(A.*$)|([0-9A-Z]{10}$)|(\d{5}\w-Z-\d{1,2}))"
-    ),
+    "regio": re.compile(r"^((\d{5}-.{1,2}($|-.*$))|(A.*$)|([0-9A-Z]{10}$)|(\d{5}\w-Z-\d{1,2}))"),
 }

 ARS_OR_AGS_MAPPING = {
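These patterns decide which database a table code belongs to (roughly what `identify_db_matches` in `db.py` does with them). A self-contained sketch with made-up table codes:

    import re

    REGEX_DB = {
        "genesis": re.compile(r"^((\d{5}-\d{4})|([0-9A-Z]{10}))$"),
        "zensus": re.compile(r"^\d{4}[A-Z]-\d{4}$"),
        "regio": re.compile(r"^((\d{5}-.{1,2}($|-.*$))|(A.*$)|([0-9A-Z]{10}$)|(\d{5}\w-Z-\d{1,2}))"),
    }

    def db_matches(table_name: str) -> list[str]:
        # every database whose pattern matches the given table code
        return [db for db, pattern in REGEX_DB.items() if pattern.match(table_name)]

    print(db_matches("81000-0001"))  # -> ['genesis'] (made-up GENESIS-style code)
    print(db_matches("1000A-0001"))  # -> ['zensus']  (made-up Zensus-style code)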

src/pystatis/db.py

Lines changed: 2 additions & 6 deletions
@@ -3,10 +3,8 @@
 import json
 import logging

-from pystatis import config
-from pystatis import cache
+from pystatis import cache, config, http_helper
 from pystatis.exception import PystatisConfigError
-from pystatis import http_helper

 logger = logging.getLogger(__name__)

@@ -34,9 +32,7 @@ def identify_db_matches(table_name: str) -> list[str]:
     if db_matches:
         return db_matches
     else:
-        raise ValueError(
-            f"Could not determine the database for the table '{table_name}'."
-        )
+        raise ValueError(f"Could not determine the database for the table '{table_name}'.")


 def select_db_by_credentials(db_matches: list[str]) -> str:

src/pystatis/helloworld.py

Lines changed: 2 additions & 2 deletions
@@ -20,7 +20,7 @@ def whoami(db_name: str) -> str:
     url = f"{db.get_host(db_name)}" + "helloworld/whoami"

     try:
-        response = requests.get(url, timeout=(1, 15))
+        response = requests.get(url, timeout=(30, 15))
     except requests.exceptions.Timeout:
         raise TimeoutError("Login request timed out after 15 minutes")

@@ -56,7 +56,7 @@ def logincheck(db_name: str) -> str:
         "language": "de",
     }

-    response = requests.post(url, headers=headers, data=params, timeout=(1, 15))
+    response = requests.post(url, headers=headers, data=params, timeout=(30, 15))

     # NOTE: Cannot use get_data_from_endpoint due to colliding
     # and misleading usage of "Status" key in API response
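The `timeout` argument of `requests` is a `(connect, read)` tuple, so both changes only relax the connection timeout from 1 s to 30 s and leave the 15 s read timeout untouched. A minimal sketch (placeholder URL):

    import requests

    # up to 30 s to establish the connection, then up to 15 s for the response data
    response = requests.get("https://example.invalid/helloworld/whoami", timeout=(30, 15))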

src/pystatis/http_helper.py

Lines changed: 69 additions & 66 deletions
@@ -7,8 +7,7 @@

 import requests

-from pystatis import config, db
-from pystatis import cache
+from pystatis import cache, config, db
 from pystatis.exception import DestatisStatusError, NoNewerDataError, TableNotFoundError
 from pystatis.types import ParamDict

@@ -51,9 +50,7 @@ def load_data(
         logger.info("Data was loaded from cache.")
     else:
         response = get_data_from_endpoint(endpoint, method, params, db_name)
-        content_type = response.headers.get("Content-Type", "text/csv").split("/")[
-            -1
-        ]
+        content_type = response.headers.get("Content-Type", "text/csv").split("/")[-1]
         data = response.content

         # status code 98 means that the table is too big
@@ -62,7 +59,7 @@
         try:
             # test for job-relevant status code
             response_status_code = response.json().get("Status").get("Code")
-        except json.decoder.JSONDecodeError:
+        except (json.decoder.JSONDecodeError, requests.exceptions.JSONDecodeError):
             pass

         if response_status_code == 98:
@@ -72,14 +69,9 @@
                 "Verarbeitung im Hintergrund erfolgreich gestartet. Job-ID: %s.",
                 job_id,
             )
-            # in rare cases it seems that asking catalogue/jobs endpoint for the state of the newly created job fails because no job could be found
-            # so we add 5 seconds here to make sure that the job was created in the meantime
-            time.sleep(5)
-            response = get_data_from_resultfile(job_id, db_name)
+            response = get_data_from_resultfile(job_id, params, db_name)
             assert isinstance(response.content, bytes)  # nosec assert_used
-            content_type = response.headers.get("Content-Type", "text/csv").split(
-                "/"
-            )[-1]
+            content_type = response.headers.get("Content-Type", "text/csv").split("/")[-1]
             data = response.content

         cache.cache_data(cache_dir, name, params, data, content_type)
@@ -122,7 +114,7 @@ def get_response(db_name: str, params: ParamDict) -> requests.Response:
             "password": db_pw,
         }

-        return requests.post(url, headers=headers, data=params, timeout=(5, 300))
+        return requests.post(url, headers=headers, data=params, timeout=(30, 300))

     # Determine database by matching regex to item code
     if db_name is None:
@@ -162,18 +154,19 @@ def start_job(endpoint: str, method: str, params: ParamDict) -> requests.Response:
     Args:
         endpoint (str): Destatis endpoint (eg. data, catalogue, ..)
         method (str): Destatis method (eg. tablefile, ...)
-        params (dict): dictionary of query parameters
+        params (dict): Dictionary of query parameters.

     Returns:
         requests.Response: the response object holding the response from calling the Destatis endpoint.
     """
     logger.warning(
         "Die Tabelle ist zu groß, um direkt abgerufen zu werden. Es wird eine Verarbeitung im Hintergrund gestartet."
     )
-    params["job"] = "true"
+    job_params = params.copy()
+    job_params["job"] = "true"

     # starting a job
-    response = get_data_from_endpoint(endpoint=endpoint, method=method, params=params)
+    response = get_data_from_endpoint(endpoint=endpoint, method=method, params=job_params)

     return response

@@ -201,58 +194,57 @@ def get_job_id_from_response(response: requests.Response) -> str:


 def get_data_from_resultfile(
-    job_id: str, db_name: str | None = None
+    job_id: str, params: ParamDict, db_name: str | None = None
 ) -> requests.Response:
     """Get data from a job once it is finished or when the timeout is reached.

     Args:
         job_id (str): Job ID generated by Destatis API.
+        params (dict): Dictionary of query parameters.
         db_name (str, optional): The database to use for this data request.
             One of "genesis", "zensus", "regio". Defaults to None.

     Returns:
         requests.Response: the response object holding the response from calling the Destatis endpoint.
     """
-    params = {
-        "selection": "*" + job_id,
-        "searchcriterion": "code",
-        "sortcriterion": "code",
-        "type": "all",
+    job_params = {
+        "selection": job_id,
+        "area": "user",
+        "pagelength": "1",
     }

     time_ = time.perf_counter()

     while (time.perf_counter() - time_) < JOB_TIMEOUT:
-        response = get_data_from_endpoint(
-            endpoint="catalogue", method="jobs", params=params, db_name=db_name
-        )
-
-        jobs = response.json().get("List")
-        if len(jobs) > 0 and jobs[0].get("State") == "Fertig":
-            logger.info(
-                (
-                    "Verarbeitung im Hintergrund abgeschlossen. "
-                    "Ergebnis kann jetzt abgerufen werden über "
-                    "/data/resultfile und Job-ID: %s."
-                ),
-                job_id,
+        try:
+            response = get_data_from_endpoint(
+                endpoint="catalogue", method="results", params=job_params, db_name=db_name
             )
-            break
-        else:
+
+            jobs = response.json().get("List")
+            if len(jobs) > 0:
+                logger.info(
+                    (
+                        "Verarbeitung im Hintergrund abgeschlossen. "
+                        "Ergebnis kann jetzt abgerufen werden über "
+                        "/data/resultfile und Job-ID: %s."
+                    ),
+                    job_id,
+                )
+                break
+        except DestatisStatusError:
             logger.info("Verarbeitung im Hintergrund läuft noch...")

         time.sleep(5)
     else:
-        print("Time out exceeded! Aborting...")
+        logger.error(
+            "Verarbeitungsfenster von %s Minuten überschritten. Job-Datei konnte nicht heruntergeladen werden.",
+            JOB_TIMEOUT // 60,
+        )
         return bytes()

-    time.sleep(5)
-    params = {
-        "name": job_id,
-        "area": "all",
-        "compress": "false",
-        "format": "ffcsv",
-    }
+    params = params.copy()
+    params["name"] = job_id
     response = get_data_from_endpoint(
         endpoint="data", method="resultfile", params=params, db_name=db_name
     )
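Putting the changed pieces together, the background-job path now runs roughly as sketched below. `fetch_large_table` is a hypothetical wrapper for illustration only; the real flow lives in `load_data` and the functions shown above:

    def fetch_large_table(endpoint, method, params, db_name=None):
        # Status code 98 means the table is too big, so re-send the request with job="true".
        response = start_job(endpoint, method, params)
        # The status message of that response carries the Job-ID of the background job.
        job_id = get_job_id_from_response(response)
        # Poll catalogue/results (selection=<job_id>, area="user") every 5 seconds until the
        # job shows up, then download it via data/resultfile using the original query
        # parameters plus name=<job_id>.
        return get_data_from_resultfile(job_id, params, db_name)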
@@ -325,36 +317,47 @@ def _check_destatis_status(destatis_status: dict) -> None:  # type: ignore
     Raises:
         DestatisStatusError: If the status code or type displays an error (caused by the user inputs)
     """
-    # -1 status code for unexpected errors and if no status code is given (faulty response)
-    destatis_status_code = destatis_status.get("Code", -1)
-    destatis_status_type = destatis_status.get("Type", "Information")
+    # Status codes
+    STATUS_CODE_PARAM_ADJUSTED = 22
+    STATUS_CODE_NO_NEW_DATA = 50
+    STATUS_CODE_TABLE_NOT_FOUND = 90
+    STATUS_CODE_LARGE_TABLE = 98
+    STATUS_CODE_NO_MATCHING_OBJECT = 104
+    STATUS_CODE_ERROR = -1  # For unexpected errors and if no status code is given
+
+    # Status types
+    STATUS_TYPE_INFORMATION = "Information"
+    ERROR_TYPES = ["Error", "Fehler"]
+    WARNING_TYPES = ["Warning", "Warnung"]
+
+    destatis_status_code = destatis_status.get("Code", STATUS_CODE_ERROR)
+    destatis_status_type = destatis_status.get("Type", STATUS_TYPE_INFORMATION)
     destatis_status_content = destatis_status.get("Content")

-    # define status types
-    error_en_de = ["Error", "Fehler"]
-    warning_en_de = ["Warning", "Warnung"]
-
-    # check for generic/ system error
-    if destatis_status_code == -1:
+    # Check for generic/system error
+    if destatis_status_code == STATUS_CODE_ERROR:
         raise DestatisStatusError(destatis_status_content)

-    # check for destatis/ query errors
-    elif (destatis_status_code in [104, 50, 90]) or (
-        destatis_status_type in error_en_de
-    ):
-        if destatis_status_code == 98:
+    # Check for destatis/query errors
+    elif (
+        destatis_status_code
+        in [STATUS_CODE_NO_MATCHING_OBJECT, STATUS_CODE_NO_NEW_DATA, STATUS_CODE_TABLE_NOT_FOUND]
+    ) or (destatis_status_type in ERROR_TYPES):
+        if destatis_status_code == STATUS_CODE_LARGE_TABLE:
             pass
-        elif destatis_status_code == 50:
+        elif destatis_status_code == STATUS_CODE_NO_NEW_DATA:
             raise NoNewerDataError(destatis_status_content)
-        elif destatis_status_code == 90:
+        elif destatis_status_code == STATUS_CODE_TABLE_NOT_FOUND:
             raise TableNotFoundError(destatis_status_content)
         else:
             raise DestatisStatusError(destatis_status_content)

-    # output warnings to user
-    elif (destatis_status_code == 22) or (destatis_status_type in warning_en_de):
+    # Output warnings to user
+    elif (destatis_status_code == STATUS_CODE_PARAM_ADJUSTED) or (
+        destatis_status_type in WARNING_TYPES
+    ):
         logger.warning(destatis_status_content)

-    # output information to user
-    elif destatis_status_type.lower() == "information":
+    # Output information to user
+    elif destatis_status_type.lower() == STATUS_TYPE_INFORMATION.lower():
         logger.info("Code %d: %s", destatis_status_code, destatis_status_content)

src/pystatis/profile.py

Lines changed: 1 addition & 3 deletions
@@ -26,9 +26,7 @@ def change_password(db_name: str, new_password: str) -> str:
     }

     # change remote password
-    response_text = load_data(
-        endpoint="profile", method="password", params=params, db_name=db_name
-    )
+    response_text = load_data(endpoint="profile", method="password", params=params, db_name=db_name)
     # change local password
     db.set_pw(db_name, new_password)


src/pystatis/results.py

Lines changed: 1 addition & 3 deletions
@@ -92,9 +92,7 @@ def show_metadata(self, row_numbers: list[int]) -> None:
                 structure_dict["Head"]["Content"],
                 f"{'-' * 20}",
                 "Columns:",
-                "\n".join(
-                    [col["Content"] for col in structure_dict["Columns"]]
-                ),
+                "\n".join([col["Content"] for col in structure_dict["Columns"]]),
                 f"{'-' * 20}",
                 "Rows:",
                 "\n".join([row["Content"] for row in structure_dict["Rows"]]),
