Skip to content

Commit 1202f1c

Browse files
orangetin and justusc
authored
Fix r2 path (#144)
* Fix r2 path * Add debugging --------- Co-authored-by: Justus Calvin <[email protected]>
1 parent 848a867 commit 1202f1c

File tree

1 file changed

+15
-8
lines changed

1 file changed

+15
-8
lines changed

data/prepare_data.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ def clone_git_repo(data_source, destination_dir):
109109

110110
# Download all files from an S3-compatible storage service.
111111
def download_from_s3(url, destination_dir, access_key_id = None,
112-
secret_access_key = None, session_token = None):
112+
secret_access_key = None, session_token = None, debug = False):
113113
# Get the access key ID and secret access key from the environment variables
114114
if access_key_id is None:
115115
access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
@@ -118,16 +118,21 @@ def download_from_s3(url, destination_dir, access_key_id = None,
118118
if session_token is None:
119119
session_token = os.environ.get('AWS_SESSION_TOKEN')
120120

121-
print(f"access_key_id={access_key_id}")
122-
print(f"secret_access_key={secret_access_key}")
123-
124121
# Create an S3 client
125122
parsed_url = url.split('/')
126123
endpoint_url = f"{parsed_url[0]}//{parsed_url[2]}"
127124
bucket_name = parsed_url[3]
128125
key_prefix = "/".join(parsed_url[4:-1])
129126
base_file = parsed_url[-1] if not url.endswith('/') else ""
127+
130128
print(f"endpoint_url={endpoint_url} ...")
129+
if debug:
130+
print(f"access_key_id={access_key_id}")
131+
print(f"secret_access_key={secret_access_key}")
132+
print(f"bucket_name={bucket_name}")
133+
print(f"key_prefix={key_prefix}")
134+
print(f"base_file={base_file}")
135+
131136
s3 = boto3.resource('s3',
132137
endpoint_url = endpoint_url,
133138
aws_access_key_id = access_key_id,
@@ -158,7 +163,7 @@ def download_from_s3(url, destination_dir, access_key_id = None,
158163
destination_file = os.path.join(destination_dir, base_file)
159164
if not os.path.exists(destination_file):
160165
print(f"Downloading {base_file} ...")
161-
bucket.download_file(f'{key_prefix}/{base_file}', destination_file)
166+
bucket.download_file(f'/{key_prefix}/{base_file}', destination_file)
162167
else:
163168
print(f"File already exists, skipping {base_file}")
164169

@@ -210,7 +215,7 @@ def download_from_url(url, destination_dir):
210215

211216
# Prepare data will clone the git repository given by data_source into the
212217
# destination_dir.
213-
def prepare_data(data_source, destination_dir, access_key_id=None, secret_access_key=None):
218+
def prepare_data(data_source, destination_dir, access_key_id=None, secret_access_key=None, debug=False):
214219

215220
# Check that destination_dir is a directory. If it does not exist, then
216221
# create it.
@@ -229,7 +234,8 @@ def prepare_data(data_source, destination_dir, access_key_id=None, secret_access
229234
clone_git_repo(data_source, destination_dir)
230235
elif is_s3_url(data_source):
231236
# Handle the case where the data source is an S3 URL
232-
download_from_s3(data_source, destination_dir, access_key_id, secret_access_key)
237+
download_from_s3(url=data_source, destination_dir=destination_dir, access_key_id=access_key_id,
238+
secret_access_key=secret_access_key, debug=debug)
233239
elif data_source.startswith('http://') or data_source.startswith('https://'):
234240
# Handle the case where the data source is a URL
235241
download_from_url(data_source, destination_dir)
@@ -250,9 +256,10 @@ def main():
250256
parser.add_argument("-d", "--dest", required=True, help="Destination directory to clone the repository and extract files")
251257
parser.add_argument("-a", "--access-key-id", required=False, help="AWS access key ID")
252258
parser.add_argument("-k", "--secret-access-key", required=False, help="AWS secret access key")
259+
parser.add_argument("--debug", action='store_true', help="Enable debug mode")
253260

254261
args = parser.parse_args()
255-
prepare_data(args.data_source, args.dest, args.access_key_id, args.secret_access_key)
262+
prepare_data(args.data_source, args.dest, args.access_key_id, args.secret_access_key, args.debug)
256263

257264

258265
if __name__ == "__main__":

0 commit comments

Comments
 (0)