From 56196a7642e03b7df9f7a9289751d27d0a5785ca Mon Sep 17 00:00:00 2001 From: haasad Date: Fri, 25 Aug 2023 23:07:09 +0200 Subject: [PATCH 1/4] Switch from cookie-based session login to SSO tokens --- eidl/core.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/eidl/core.py b/eidl/core.py index 33f998e..8612666 100644 --- a/eidl/core.py +++ b/eidl/core.py @@ -3,6 +3,7 @@ import tempfile import getpass import subprocess +import json import requests import bs4 @@ -20,6 +21,8 @@ def __init__(self, username=None, password=None, version=None, self.version = version self.system_model = system_model self.outdir = outdir + self.access_token = None + self.refresh_token = None def run(self): if self.check_stored(): @@ -58,20 +61,23 @@ def get_credentials(self): return un, pw def login(self): - self.session = requests.Session() - logon_url = 'https://v33.ecoquery.ecoinvent.org/Account/LogOn' - post_data = {'UserName': self.username, - 'Password': self.password, - 'IsEncrypted': 'false', - 'ReturnUrl': '/'} + sso_url='https://sso.ecoinvent.org/realms/ecoinvent/protocol/openid-connect/token' + post_data = {'username': self.username, + 'password': self.password, + 'client_id': 'apollo-ui', + 'grant_type': 'password'} try: - self.session.post(logon_url, post_data, timeout=20) + response = requests.post(sso_url, post_data, timeout=20) except (requests.ConnectTimeout, requests.ReadTimeout, requests.ConnectionError) as e: self.handle_connection_timeout() raise e - success = bool(self.session.cookies) - self.login_success(success) + if response.ok: + tokens = json.loads(response.text) + self.access_token = tokens['access_token'] + self.refresh_token = tokens['refresh_token'] + + self.login_success(response.ok) def login_success(self, success): if not success: From db4cfc2328a7c84844afc81701c5d9d031bd551a Mon Sep 17 00:00:00 2001 From: haasad Date: Sat, 26 Aug 2023 00:16:02 +0200 Subject: [PATCH 2/4] Query available files from API Instead of 
parsing the HTML, we now gather the necessary information from the API's json response. Still uses the old method of parsing and filtering the filenames. Maybe there's a cleaner way with the /files endpoint, but it's difficult to figure out without documentation. --- eidl/core.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/eidl/core.py b/eidl/core.py index 8612666..cd3c795 100644 --- a/eidl/core.py +++ b/eidl/core.py @@ -96,18 +96,21 @@ def handle_connection_timeout(self): ) def get_available_files(self): - files_url = 'https://v33.ecoquery.ecoinvent.org/File/Files' + files_url = 'https://api.ecoquery.ecoinvent.org/files' + auth_header = {'Authorization': f'Bearer {self.access_token}'} try: - files_res = self.session.get(files_url, timeout=20) + files_res = requests.get(files_url, headers=auth_header, timeout=20) except (requests.ConnectTimeout, requests.ReadTimeout, requests.ConnectionError) as e: self.handle_connection_timeout() raise e - soup = bs4.BeautifulSoup(files_res.text, 'html.parser') - all_files = [l for l in soup.find_all('a', href=True) if - l['href'].startswith('/File/File?')] - not_allowed = soup.find_all('a', class_='fileDownloadNotAllowed') - available_files = set(all_files).difference(set(not_allowed)) - link_dict = {f.contents[0]: f['href'] for f in available_files} + + files_raw = json.loads(files_res.text) + link_dict = dict() + for version in files_raw: + for release in version['releases']: + for rf in release['release_files']: + link_dict[rf['name']] = rf['uuid'] + link_dict = { k.replace('-', ''):v for k, v in link_dict.items() if k.startswith('ecoinvent ') and k.endswith('ecoSpold02.7z') and not 'lc' in k.lower() From bbe29619aa701b3e8c877769378100e5a69df5c3 Mon Sep 17 00:00:00 2001 From: haasad Date: Sat, 26 Aug 2023 00:42:14 +0200 Subject: [PATCH 3/4] Request aws s3 link from API and download from there --- eidl/core.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/eidl/core.py b/eidl/core.py index cd3c795..5de7ecc 100644 --- a/eidl/core.py +++ b/eidl/core.py @@ -149,10 +149,12 @@ def choose_db(self): return dbkey def download(self): - url = 'https://v33.ecoquery.ecoinvent.org' db_key = (self.version, self.system_model) + url = f'https://api.ecoquery.ecoinvent.org/files/r/{self.db_dict[db_key]}' + auth_header = {'Authorization': f'Bearer {self.access_token}'} try: - file_content = self.session.get(url + self.db_dict[db_key], timeout=60).content + s3_link = json.loads(requests.get(url, headers=auth_header, timeout=20).text) + file_content = requests.get(s3_link['download_url'], timeout=60).content except (requests.ConnectTimeout, requests.ReadTimeout, requests.ConnectionError) as e: self.handle_connection_timeout() raise e From 8ee5dd1f9f5fce1dd0b08787a4516a6e01f6d9ed Mon Sep 17 00:00:00 2001 From: haasad Date: Sat, 26 Aug 2023 01:11:51 +0200 Subject: [PATCH 4/4] Make sure access token is refreshed before it's used Access token expires after 5 mins, refresh token after 30 mins. 
--- eidl/core.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/eidl/core.py b/eidl/core.py index 5de7ecc..95451f7 100644 --- a/eidl/core.py +++ b/eidl/core.py @@ -79,6 +79,23 @@ def login(self): self.login_success(response.ok) + def refresh_tokens(self): + if self.refresh_token is None: + return + + sso_url='https://sso.ecoinvent.org/realms/ecoinvent/protocol/openid-connect/token' + post_data = {'client_id': 'apollo-ui', + 'grant_type': 'refresh_token', + 'refresh_token': self.refresh_token} + response = requests.post(sso_url, post_data, timeout=20) + + if response.ok: + tokens = json.loads(response.text) + self.access_token = tokens['access_token'] + self.refresh_token = tokens['refresh_token'] + else: + self.login() + def login_success(self, success): if not success: print('Login failed') @@ -97,6 +114,7 @@ def handle_connection_timeout(self): def get_available_files(self): files_url = 'https://api.ecoquery.ecoinvent.org/files' + self.refresh_tokens() auth_header = {'Authorization': f'Bearer {self.access_token}'} try: files_res = requests.get(files_url, headers=auth_header, timeout=20) @@ -151,6 +169,7 @@ def choose_db(self): def download(self): db_key = (self.version, self.system_model) url = f'https://api.ecoquery.ecoinvent.org/files/r/{self.db_dict[db_key]}' + self.refresh_tokens() auth_header = {'Authorization': f'Bearer {self.access_token}'} try: s3_link = json.loads(requests.get(url, headers=auth_header, timeout=20).text)