
Commit c153c00

Authored Oct 25, 2024

Updates related to Dryad upload (#28)

* WIP
* Needed updates for Dryad data
* Updates from initial run before running in Nautilus
* Updates in preparation for Nautilus
* Updates related to Kubernetes upload
* Clean up CSVs
* Add DELTA_LAKE_PATH back to compose

1 parent 12d5368 · commit c153c00

21 files changed: +1591 -119 lines
 

‎.dockerignore

+2-1
@@ -1 +1,2 @@
-jupyter
+jupyter
+data

‎.gitignore

+2
@@ -167,3 +167,5 @@ cython_debug/
 data
 .DS_Store
 .env.*
+
+brave-sonar-390402-7644b983ce44.json

DiveDB/server/metadata/migrations/0015_animals_birth_year_animals_domain_ids_animals_lab_id_and_more.py

+110

@@ -0,0 +1,110 @@
# Generated by Django 5.1.2 on 2024-10-13 22:57

from django.db import migrations, models


class Migration(migrations.Migration):
    dependencies = [
        ("metadata", "0014_remove_files_file_path"),
    ]

    operations = [
        migrations.AddField(
            model_name="animals",
            name="birth_year",
            field=models.IntegerField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="animals",
            name="domain_ids",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="animals",
            name="lab_id",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="animals",
            name="sex",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="animal_age",
            field=models.IntegerField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="animal_age_class",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="arrival_datetime",
            field=models.DateTimeField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="departure_datetime",
            field=models.DateTimeField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="deployment_latitude",
            field=models.FloatField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="deployment_location",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="deployment_longitude",
            field=models.FloatField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="deployment_type",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="domain_deployment_id",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="recovery_latitude",
            field=models.FloatField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="recovery_location",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="recovery_longitude",
            field=models.FloatField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="loggers",
            name="manufacturer_name",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="loggers",
            name="ptt",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AlterField(
            model_name="files",
            name="file",
            field=models.FileField(
                storage="DiveDB.services.utils.storage.OpenStackStorage",
                upload_to="divedb-media/",
            ),
        ),
    ]

DiveDB/server/metadata/migrations/ (second new migration)

+35

@@ -0,0 +1,35 @@
# Generated by Django 5.1.2 on 2024-10-13 23:04

from django.db import migrations, models


class Migration(migrations.Migration):
    dependencies = [
        (
            "metadata",
            "0015_animals_birth_year_animals_domain_ids_animals_lab_id_and_more",
        ),
    ]

    operations = [
        migrations.AddField(
            model_name="recordings",
            name="attachment_location",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="recordings",
            name="attachment_type",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="recordings",
            name="quality",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="recordings",
            name="timezone",
            field=models.CharField(blank=True, max_length=32, null=True),
        ),
    ]

‎DiveDB/server/metadata/models.py

+22
@@ -38,6 +38,8 @@ class Loggers(models.Model):
     icon_url = models.URLField(max_length=1000, null=True, blank=True)
     serial_no = models.CharField(null=True, blank=True)
     manufacturer = models.CharField(null=True, blank=True)
+    manufacturer_name = models.CharField(null=True, blank=True)
+    ptt = models.CharField(null=True, blank=True)
     type = models.CharField(null=True, blank=True)
     type_name = models.CharField(null=True, blank=True)
     notes = models.TextField(null=True, blank=True)
@@ -59,6 +61,10 @@ class Animals(models.Model):
     project_id = models.CharField()
     common_name = models.CharField()
     scientific_name = models.CharField()
+    lab_id = models.CharField(null=True, blank=True)
+    birth_year = models.IntegerField(null=True, blank=True)
+    sex = models.CharField(null=True, blank=True)
+    domain_ids = models.CharField(null=True, blank=True)
 
     class Meta:
         db_table = "Animals"
@@ -81,8 +87,20 @@ class Deployments(models.Model):
     ]
 
     id = models.CharField(primary_key=True)
+    domain_deployment_id = models.CharField(null=True, blank=True)
+    animal_age_class = models.CharField(null=True, blank=True)
+    animal_age = models.IntegerField(null=True, blank=True)
+    deployment_type = models.CharField(null=True, blank=True)
     deployment_name = models.CharField()
     rec_date = models.DateField()
+    deployment_latitude = models.FloatField(null=True, blank=True)
+    deployment_longitude = models.FloatField(null=True, blank=True)
+    deployment_location = models.CharField(null=True, blank=True)
+    departure_datetime = models.DateTimeField(null=True, blank=True)
+    recovery_latitude = models.FloatField(null=True, blank=True)
+    recovery_longitude = models.FloatField(null=True, blank=True)
+    recovery_location = models.CharField(null=True, blank=True)
+    arrival_datetime = models.DateTimeField(null=True, blank=True)
     animal = models.CharField()
     start_time = models.DateTimeField(null=True, blank=True)
     start_time_precision = models.TextField(null=True, blank=True)
@@ -130,6 +148,10 @@ class Recordings(models.Model):
     start_time_precision = models.CharField(
         null=True, blank=True, choices=PRECISION_CHOICES
     )
+    timezone = models.CharField(max_length=32, null=True, blank=True)
+    quality = models.CharField(null=True, blank=True)
+    attachment_location = models.CharField(null=True, blank=True)
+    attachment_type = models.CharField(null=True, blank=True)
 
     def __str__(self):
         return f"{self.name} ({self.id})"
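
The new columns above are all nullable, so they behave like any other optional Django field once the two migrations are applied. A minimal query sketch, assuming a Django shell with the metadata app loaded; the filter values are placeholders, not data from this commit:

    # Hypothetical queries against the new metadata columns (placeholder values).
    from DiveDB.server.metadata.models import Animals, Deployments

    # Animals that have a lab ID and a recorded birth year
    tagged = Animals.objects.filter(lab_id__isnull=False, birth_year__isnull=False)

    # Deployments with a recovery position on record
    recovered = Deployments.objects.exclude(recovery_latitude__isnull=True).values(
        "id", "recovery_latitude", "recovery_longitude", "recovery_location"
    )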

‎DiveDB/services/data_uploader.py

+119-16
@@ -19,16 +19,21 @@
 from datetime import datetime, timezone
 from DiveDB.services.utils.openstack import SwiftClient
 
-duckpond = DuckPond()
-swift_client = SwiftClient()
 
 django_prefix = os.environ.get("DJANGO_PREFIX", "DiveDB")
 os.environ.setdefault(
     "DJANGO_SETTINGS_MODULE", f"{django_prefix}.server.django_app.settings"
 )
 django.setup()
 
-from DiveDB.server.metadata.models import Files, Recordings  # noqa: E402
+from DiveDB.server.metadata.models import (  # noqa: E402
+    Files,
+    Recordings,
+    Deployments,
+    Animals,
+    Loggers,
+    AnimalDeployments,
+)
 
 
 @dataclass
@@ -55,6 +60,11 @@ class NetCDFValidationError(Exception):
 class DataUploader:
     """Data Uploader"""
 
+    def __init__(self, duckpond: DuckPond = None, swift_client: SwiftClient = None):
+        """Initialize DataUploader with optional DuckPond and SwiftClient instances."""
+        self.duckpond = duckpond or DuckPond()
+        self.swift_client = swift_client or SwiftClient()
+
     def _read_edf_signal(self, edf: edfio.Edf, label: str):
         """Function to read a single signal from an EDF file."""
         signal = edf.get_signal(label)
@@ -119,15 +129,15 @@ def _create_value_structs(self, values):
             lambda x: (float(x) if isinstance(x, (int, float)) else np.nan)
         )(values)
 
-        # Determine if any value has a decimal place
-        if np.any(numeric_values % 1 != 0):
-            float_values = np.where(np.isfinite(numeric_values), numeric_values, None)
-            int_values = None
-        else:
+        # Check if the data type is integer
+        if np.issubdtype(numeric_values.dtype, np.integer):
             float_values = None
             int_values = np.where(
                 np.isfinite(numeric_values), numeric_values.astype(int), None
             )
+        else:
+            float_values = np.where(np.isfinite(numeric_values), numeric_values, None)
+            int_values = None
 
         string_values = np.where(
             ~np.isin(values, [True, False])
@@ -186,7 +196,9 @@ def _write_data_to_duckpond(
                     [metadata["animal"]] * len(values), type=pa.string()
                 ),
                 "deployment": pa.array(
-                    [metadata["deployment"]] * len(values), type=pa.string()
+                    # This fix isn't working, make it a string
+                    [str(metadata["deployment"])] * len(values),
+                    type=pa.string(),
                 ),
                 "recording": pa.array(
                     [metadata["recording"]] * len(values), type=pa.string()
@@ -199,7 +211,7 @@ def _write_data_to_duckpond(
             },
             schema=LAKE_CONFIGS["DATA"]["schema"],
         )
-        duckpond.write_to_delta(
+        self.duckpond.write_to_delta(
             data=batch_table,
             lake="DATA",
             mode="append",
@@ -241,7 +253,7 @@ def _write_event_to_duckpond(
                     [metadata["animal"]] * len(event_keys), type=pa.string()
                 ),
                 "deployment": pa.array(
-                    [metadata["deployment"]] * len(event_keys), type=pa.string()
+                    [str(metadata["deployment"])] * len(event_keys), type=pa.string()
                 ),
                 "recording": pa.array(
                     [metadata["recording"]] * len(event_keys), type=pa.string()
@@ -256,7 +268,7 @@ def _write_event_to_duckpond(
             },
             schema=LAKE_CONFIGS["STATE_EVENTS"]["schema"],
         )
-        duckpond.write_to_delta(
+        self.duckpond.write_to_delta(
             data=batch_table,
             lake="STATE_EVENTS",
             mode="append",
@@ -266,7 +278,6 @@ def _write_event_to_duckpond(
         )
         del batch_table
         gc.collect()
-
     def _validate_netcdf(self, ds: xr.Dataset):
         """
         Validates netCDF file before upload.
@@ -356,8 +367,84 @@ def _validate_netcdf(self, ds: xr.Dataset):
                 )
         return True
 
+    def get_or_create_logger(self, logger_data):
+        logger, created = Loggers.objects.get_or_create(
+            id=logger_data["logger_id"],
+            defaults={
+                "manufacturer": logger_data.get("manufacturer"),
+                "manufacturer_name": logger_data.get("manufacturer_name"),
+                "serial_no": logger_data.get("serial_no"),
+                "ptt": logger_data.get("ptt"),
+                "type": logger_data.get("type"),
+                "notes": logger_data.get("notes"),
+            },
+        )
+        return logger, created
+
+    def get_or_create_recording(self, recording_data):
+        animal_deployment, _ = AnimalDeployments.objects.get_or_create(
+            animal=recording_data["animal"], deployment=recording_data["deployment"]
+        )
+        recording, created = Recordings.objects.get_or_create(
+            id=recording_data["recording_id"],
+            defaults={
+                "name": recording_data.get("name"),
+                "animal_deployment": animal_deployment,
+                "logger": recording_data.get("logger"),
+                "start_time": recording_data.get("start_time"),
+                "end_time": recording_data.get("end_time"),
+                "timezone": recording_data.get("timezone"),
+                "quality": recording_data.get("quality"),
+                "attachment_location": recording_data.get("attachment_location"),
+                "attachment_type": recording_data.get("attachment_type"),
+            },
+        )
+        return recording, created
+
+    def get_or_create_deployment(self, deployment_data):
+        deployment, created = Deployments.objects.get_or_create(
+            id=deployment_data["deployment_id"],
+            defaults={
+                "domain_deployment_id": deployment_data.get("domain_deployment_id"),
+                "animal_age_class": deployment_data.get("animal_age_class"),
+                "animal_age": deployment_data.get("animal_age"),
+                "deployment_type": deployment_data.get("deployment_type"),
+                "deployment_name": deployment_data.get("deployment_name"),
+                "rec_date": deployment_data.get("rec_date"),
+                "deployment_latitude": deployment_data.get("deployment_latitude"),
+                "deployment_longitude": deployment_data.get("deployment_longitude"),
+                "deployment_location": deployment_data.get("deployment_location"),
+                "departure_datetime": deployment_data.get("departure_datetime"),
+                "recovery_latitude": deployment_data.get("recovery_latitude"),
+                "recovery_longitude": deployment_data.get("recovery_longitude"),
+                "recovery_location": deployment_data.get("recovery_location"),
+                "arrival_datetime": deployment_data.get("arrival_datetime"),
+                "notes": deployment_data.get("notes"),
+            },
+        )
+        return deployment, created
+
+    def get_or_create_animal(self, animal_data):
+        animal, created = Animals.objects.get_or_create(
+            id=animal_data["animal_id"],
+            defaults={
+                "project_id": animal_data.get("project_id"),
+                "common_name": animal_data.get("common_name"),
+                "scientific_name": animal_data.get("scientific_name"),
+                "lab_id": animal_data.get("lab_id"),
+                "birth_year": animal_data.get("birth_year"),
+                "sex": animal_data.get("sex"),
+                "domain_ids": animal_data.get("domain_ids"),
+            },
+        )
+        return animal, created
+
     def upload_netcdf(
-        self, netcdf_file_path: str, metadata: dict, batch_size: int = 1000000
+        self,
+        netcdf_file_path: str,
+        metadata: dict,
+        batch_size: int = 1000000,
+        rename_map: dict = None,
     ):
         """
         Uploads a netCDF file to the database and DuckPond.
@@ -370,9 +457,22 @@ def upload_netcdf(
             - deployment: Deployment Name (str)
             - recording: Recording Name (str)
             batch_size (int, optional): Size of data batches for processing. Defaults to 1 million
+            rename_map (dict, optional): A dictionary mapping original variable names to new names.
         """
+
         ds = xr.open_dataset(netcdf_file_path)
 
+        # Apply renaming if rename_map is provided
+        if rename_map:
+            # Convert all data variable names to lowercase
+            lower_case_rename_map = {k.lower(): v for k, v in rename_map.items()}
+            ds = ds.rename(
+                {
+                    var: lower_case_rename_map.get(var.lower(), var)
+                    for var in ds.data_vars
+                }
+            )
+
         print(
             f"Creating file record for {os.path.basename(netcdf_file_path)} and uploading to OpenStack..."
         )
@@ -431,7 +531,7 @@ def __init__(self, name):
                         group=coord.replace("_samples", ""),
                         event_keys=event_keys,
                         event_data=event_data,
-                        file_name=file.file["name"],
+                        file_name=file.file.name,
                     )
                 for var_name, var_data in ds[variables_with_coord].items():
                     if (
@@ -453,7 +553,9 @@ def __init__(self, name):
 
                             group = var_data.attrs.get("group", "ungrouped")
                             class_name = var_name
-                            label = sub_var_name
+                            label = rename_map.get(
+                                sub_var_name.lower(), sub_var_name
+                            )
 
                             values = var_data.values[start:end, var_index]
                             self._write_data_to_duckpond(
@@ -487,6 +589,7 @@ def __init__(self, name):
                         if "variable" in var_data.attrs
                         else var_name
                     )
+                    label = rename_map.get(label.lower(), label)
 
                     values = var_data.values[start:end]
                     self._write_data_to_duckpond(
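
Taken together, the new constructor, the get_or_create_* helpers, and the rename_map argument let a caller wire up an upload in a few lines. A condensed sketch of that flow, assuming an already-converted file at ./data/processed_0.nc; every ID, name, and the rename mapping here is a placeholder, and the Dryad import scripts further down use this same pattern with real values:

    from DiveDB.services.data_uploader import DataUploader
    from DiveDB.services.duck_pond import DuckPond

    # Share one DuckPond across uploads instead of creating one per call.
    uploader = DataUploader(duckpond=DuckPond())

    # Placeholder metadata dicts; real runs pass the full set of model fields.
    animal, _ = uploader.get_or_create_animal(
        {"animal_id": "example-seal", "project_id": "example-project",
         "common_name": "Elephant seal", "scientific_name": "Mirounga angustirostris"}
    )
    deployment, _ = uploader.get_or_create_deployment(
        {"deployment_id": "2017001", "deployment_name": "2017001", "rec_date": "2017-02-21"}
    )
    logger, _ = uploader.get_or_create_logger({"logger_id": "TDR10_0001"})
    recording, _ = uploader.get_or_create_recording(
        {"recording_id": "2017001_example-seal_TDR10_0001", "name": "Recording 0",
         "animal": animal, "deployment": deployment, "logger": logger}
    )

    uploader.upload_netcdf(
        "./data/processed_0.nc",  # placeholder path to an already-converted file
        metadata={"animal": animal.id, "deployment": deployment.id, "recording": recording.id},
        rename_map={"depth": "sensor_data_pressure"},  # keys are matched case-insensitively
    )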

‎DiveDB/services/duck_pond.py

+3-12
@@ -14,8 +14,6 @@
 
 # flake8: noqa
 
-
-os.environ["CONTAINER_DELTA_LAKE_PATH"] = "s3://divedb-delta-lakes"
 os.environ["AWS_S3_ALLOW_UNSAFE_RENAME"] = "true"
 
 LAKES = [
@@ -105,9 +103,7 @@ def __init__(self, delta_path: str | None = None, connect_to_postgres: bool = Tr
         self.conn.execute("LOAD httpfs;")
 
         # Set S3 configurations
-        self.conn.execute(
-            "SET s3_url_style='path';"
-        )  # Important for OpenStack Swift
+        self.conn.execute("SET s3_url_style='path';")
         self.conn.execute("SET s3_use_ssl=true;")
         self.conn.execute(
             """
@@ -122,11 +118,11 @@ def __init__(self, delta_path: str | None = None, connect_to_postgres: bool = Tr
                 os.getenv("AWS_REGION"),
                 os.getenv("AWS_ACCESS_KEY_ID"),
                 os.getenv("AWS_SECRET_ACCESS_KEY"),
-                os.getenv("AWS_ENDPOINT_URL"),
+                os.getenv("AWS_ENDPOINT_URL").replace("https://", ""),
             )
         )
 
-        # self._create_lake_views()
+        self._create_lake_views()
 
         if connect_to_postgres:
             logging.info("Connecting to PostgreSQL")
@@ -220,11 +216,6 @@ def get_delta_data(
         - If frequency is not None, returns a pd.DataFrame.
         - If frequency is None, returns a DuckDBPyRelation object with pivoted data.
         """
-        has_predicates = False
-
-        def get_predicate_preface():
-            nonlocal has_predicates
-            return " AND" if has_predicates else " WHERE"
 
         def get_predicate_string(predicate: str, values: List[str]):
             if not values:
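
The .replace("https://", "") on AWS_ENDPOINT_URL reflects that DuckDB's s3_endpoint setting expects a bare host (optionally host:port) rather than a full URL. A minimal sketch of the same idea written with urllib.parse, shown as an illustrative alternative rather than code from this commit; the endpoint host is a placeholder:

    from urllib.parse import urlparse

    def s3_endpoint_host(endpoint_url: str) -> str:
        """Return the host[:port] form that DuckDB's SET s3_endpoint expects."""
        parsed = urlparse(endpoint_url)
        # A bare host such as "object.example.org" has no netloc, so fall back to the input.
        return parsed.netloc or endpoint_url

    print(s3_endpoint_host("https://object.example.org"))  # object.example.org (placeholder host)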

‎DiveDB/services/utils/netcdf_conversions.py

+43-2
@@ -2,6 +2,7 @@
 import xarray as xr
 import pandas as pd
 import numpy as np
+from datetime import datetime
 
 
 def matlab_datenum_to_datetime_vectorized(
@@ -29,6 +30,39 @@ def matlab_datenum_to_datetime_vectorized(
     return converted_dates
 
 
+possible_formats = [
+    "%H:%M:%S %d-%b-%Y",  # e.g., "00:11:12 21-Feb-2017"
+    "%Y-%m-%d %H:%M:%S",  # e.g., "2017-02-21 00:11:12"
+    "%d/%m/%Y %H:%M",  # e.g., "21/02/2017 00:11"
+    "%m/%d/%Y %I:%M:%S %p",  # e.g., "02/21/2017 12:11:12 AM"
+]
+
+
+def infer_date_format(date_str, possible_formats=possible_formats):
+    """
+    Infers the date format of a date string from a list of possible formats.
+
+    Parameters:
+    - date_str (str): The date string to parse.
+    - possible_formats (list): A list of date format strings.
+
+    Returns:
+    - str: The matching date format string.
+
+    Raises:
+    - ValueError: If no matching format is found.
+    """
+    if date_str is None:
+        return None
+
+    for fmt in possible_formats:
+        try:
+            datetime.strptime(date_str, fmt)
+            return fmt
+        except ValueError:
+            continue
+
+
 def convert_to_formatted_dataset(
     input_file_path: str,
     output_file_path: str = None,
@@ -55,9 +89,16 @@ def convert_to_formatted_dataset(
                 ds["DATE"].values
             )
         else:
-            datetime_coord = np.array(pd.to_datetime(ds["DATE"].values)).astype(
-                "datetime64[ns]"
+            first_date_str = (
+                ds["DATE"].values[0] if len(ds["DATE"].values) > 0 else None
             )
+            date_format = infer_date_format(first_date_str, possible_formats)
+            datetime_coord = np.array(
+                pd.to_datetime(
+                    ds["DATE"].values,
+                    format=date_format if date_format else "mixed",
+                )
+            ).astype("datetime64[ns]")
 
         datetime_coord = datetime_coord[
             ~np.isnat(datetime_coord) & (datetime_coord != np.datetime64(""))
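
infer_date_format simply tries each candidate format in order and returns the first one strptime accepts; if date_str is None or no format matches it falls through and returns None (the ValueError mentioned in the docstring is not actually raised), which is why the caller falls back to format="mixed". A quick usage sketch using the example string from the comments above:

    from DiveDB.services.utils.netcdf_conversions import infer_date_format, possible_formats
    import pandas as pd

    fmt = infer_date_format("00:11:12 21-Feb-2017", possible_formats)
    print(fmt)  # "%H:%M:%S %d-%b-%Y"

    # convert_to_formatted_dataset then hands the inferred format to pandas:
    timestamps = pd.to_datetime(["00:11:12 21-Feb-2017"], format=fmt)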

‎Makefile

+8-2
@@ -1,7 +1,7 @@
 .PHONY: up down build migrate createsuperuser shell bash test
 
 up:
-	docker compose -f docker-compose.development.yaml --env-file .env up --build
+	docker compose -f docker-compose.development.yaml --env-file .env up
 
 down:
 	docker compose -f docker-compose.development.yaml down
@@ -28,4 +28,10 @@ test:
 	docker compose -f docker-compose.development.yaml exec web pytest
 
 importmetadata:
-	docker compose -f docker-compose.development.yaml exec web python scripts/import_from_notion.py
+	docker compose -f docker-compose.development.yaml exec web python scripts/import_from_notion.py
+
+build-uploader:
+	docker buildx build --platform linux/amd64 -t ghcr.io/ecophysviz-lab/uploader-job:latest --push -f upload.Dockerfile .
+
+build-django:
+	docker buildx build --platform linux/amd64 -t ghcr.io/ecophysviz-lab/divedb-django:latest --push -f Dockerfile .

‎data_querying_documentation.ipynb

+652-5
Large diffs are not rendered by default.

‎deployment/secrets.sh

+11-3
@@ -1,8 +1,8 @@
 source .env
 
-kubectl delete secret divedb-postgres-credentials
+kubectl delete secret divedb-credentials
 
-kubectl create secret generic divedb-postgres-credentials \
+kubectl create secret generic divedb-credentials \
   --from-literal=POSTGRES_DB=$POSTGRES_DB \
   --from-literal=POSTGRES_USER=$POSTGRES_USER \
   --from-literal=POSTGRES_PASSWORD=$POSTGRES_PASSWORD \
@@ -11,4 +11,12 @@ kubectl create secret generic divedb-postgres-credentials \
   --from-literal=DJANGO_SECRET_KEY=$DJANGO_SECRET_KEY \
   --from-literal=OPENSTACK_AUTH_URL=$OPENSTACK_AUTH_URL \
   --from-literal=OPENSTACK_APPLICATION_CREDENTIAL_ID=$OPENSTACK_APPLICATION_CREDENTIAL_ID \
-  --from-literal=OPENSTACK_APPLICATION_CREDENTIAL_SECRET=$OPENSTACK_APPLICATION_CREDENTIAL_SECRET
+  --from-literal=OPENSTACK_APPLICATION_CREDENTIAL_SECRET=$OPENSTACK_APPLICATION_CREDENTIAL_SECRET \
+  --from-literal=OPENSTACK_FILE_STORAGE_CONTAINER_NAME=$OPENSTACK_FILE_STORAGE_CONTAINER_NAME \
+  --from-literal=OPENSTACK_USER_ID=$OPENSTACK_USER_ID \
+  --from-literal=OPENSTACK_PROJECT_NAME=$OPENSTACK_PROJECT_NAME \
+  --from-literal=OPENSTACK_PROJECT_ID=$OPENSTACK_PROJECT_ID \
+  --from-literal=AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
+  --from-literal=AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
+  --from-literal=AWS_REGION=$AWS_REGION \
+  --from-literal=AWS_ENDPOINT_URL=$AWS_ENDPOINT_URL

‎deployment/upload-job/build.sh

+5
@@ -0,0 +1,5 @@
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "Script directory: $SCRIPT_DIR"

kubectl delete jobs divedb-uploader-job
kubectl apply -f $SCRIPT_DIR/job.yaml

‎deployment/upload-job/job.yaml

+111
@@ -0,0 +1,111 @@
# job.yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: divedb-uploader-job
  labels:
    app: divedb-uploader-job
spec:
  template:
    spec:
      imagePullSecrets:
        - name: ghcr-login-ecophysviz
      containers:
        - name: divedb-uploader-job
          image: ghcr.io/ecophysviz-lab/uploader-job:latest
          imagePullPolicy: Always
          resources:
            requests:
              memory: "256Gi"
              cpu: "64"
            limits:
              memory: "256Gi"
              cpu: "64"
          volumeMounts:
            - name: divedb-uploader-data-storage
              mountPath: /data
          env:
            - name: DJANGO_SECRET_KEY
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: DJANGO_SECRET_KEY
            - name: POSTGRES_HOST
              value: divedb-pg-service # Updated to match the service name
            - name: POSTGRES_PORT
              value: "5432"
            - name: POSTGRES_DB
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: POSTGRES_DB
            - name: POSTGRES_USER
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: POSTGRES_USER
            - name: POSTGRES_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: POSTGRES_PASSWORD
            - name: OPENSTACK_AUTH_URL
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: OPENSTACK_AUTH_URL
            - name: OPENSTACK_APPLICATION_CREDENTIAL_ID
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: OPENSTACK_APPLICATION_CREDENTIAL_ID
            - name: OPENSTACK_APPLICATION_CREDENTIAL_SECRET
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: OPENSTACK_APPLICATION_CREDENTIAL_SECRET
            - name: OPENSTACK_FILE_STORAGE_CONTAINER_NAME
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: OPENSTACK_FILE_STORAGE_CONTAINER_NAME
            - name: OPENSTACK_USER_ID
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: OPENSTACK_USER_ID
            - name: OPENSTACK_PROJECT_NAME
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: OPENSTACK_PROJECT_NAME
            - name: OPENSTACK_PROJECT_ID
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: OPENSTACK_PROJECT_ID
            - name: AWS_ACCESS_KEY_ID
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: AWS_ACCESS_KEY_ID
            - name: AWS_SECRET_ACCESS_KEY
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: AWS_SECRET_ACCESS_KEY
            - name: AWS_REGION
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: AWS_REGION
            - name: AWS_ENDPOINT_URL
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: AWS_ENDPOINT_URL
      restartPolicy: Never
      volumes:
        - name: divedb-uploader-data-storage
          persistentVolumeClaim:
            claimName: divedb-uploader-data-storage

‎deployment/upload-job/pvc.yaml

+14
@@ -0,0 +1,14 @@
# pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: divedb-uploader-data-storage
  labels:
    app: divedb
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 500Gi
  storageClassName: rook-ceph-block

‎docker-compose.development.yaml

+2
@@ -47,5 +47,7 @@ services:
       - ./data:/app/data
       - ./pyologger:/app/pyologger
       - ./scripts:/app/scripts
+      - ./pyproject.toml:/app/pyproject.toml
+      - ./setup.py:/app/setup.py
     depends_on:
       - metadata_database

‎pyproject.toml

+3
@@ -8,15 +8,18 @@ version = "0.1.0"
 dependencies = [
     "black",
     "bs4",
+    "dask",
     "deltalake",
     "django",
     "django-storages",
     "duckdb",
     "edfio",
     "flake8",
+    "google-cloud-storage",
     "importlib-metadata==4.8.3",
     "load_dotenv",
     "mne",
+    "netcdf4",
     "notion_client",
     "pandas",
     "pre-commit",

‎scripts/import_from_dryad.py

-78
This file was deleted.

‎scripts/import_from_dryad_dash.py

+200
@@ -0,0 +1,200 @@
import os
import uuid
import xarray as xr
from google.cloud import storage
import pandas as pd
from DiveDB.services.data_uploader import DataUploader
from DiveDB.services.utils.netcdf_conversions import convert_to_formatted_dataset
from DiveDB.services.duck_pond import DuckPond
import datetime
from dask import delayed, compute

# Initialize the Google Cloud Storage client
client = storage.Client()

# Define the bucket
bucket_name = "female_elephant_seal_netcdfs"

# Get the bucket
bucket = client.get_bucket(bucket_name)

# List all blobs in the specified bucket
blobs = list(bucket.list_blobs())

# Load the CSV file
metadata_df = pd.read_csv("scripts/metadata/11_Restimates_ALL_SealsUsed.csv")

os.environ["SKIP_OPENSTACK_UPLOAD"] = "true"


def process_file(blob, idx):
    # Skip directories
    if blob.name.endswith("/"):
        return None
    file_name = blob.name.split("/")[-1]
    converted_file_path = f"./data/processed_{idx}.nc"

    # Check if the converted file already exists
    if os.path.exists(converted_file_path):
        print(f"Skipping conversion for {file_name}, already processed.")
    else:
        print(f"Converting file: {file_name}")

        # Download the blob to a local file
        temp_dir = "data/temp"
        os.makedirs(temp_dir, exist_ok=True)
        temp_file_path = os.path.join(temp_dir, file_name)
        blob.download_to_filename(temp_file_path)
        try:
            convert_to_formatted_dataset(
                temp_file_path, output_file_path=converted_file_path
            )
        except Exception as e:
            print(f"Error converting file {file_name}: {e}")
            os.remove(temp_file_path)
            return None
        finally:
            os.remove(temp_file_path)

    # Now upload the converted file
    duckpond = DuckPond()
    data_uploader = DataUploader(
        duckpond=duckpond
    )  # Instantiate per task to ensure thread safety
    try:
        with xr.open_dataset(converted_file_path) as ds:
            # Extract necessary data from the dataset
            deployment_id = int(ds.attrs["Deployment_ID"])
            logger_id = ds.attrs["Tags_TDR1_Model"] + "_" + ds.attrs["Tags_TDR1_ID"]
            filtered_df = metadata_df[metadata_df["TOPPID"] == deployment_id]
            seal_id = (
                filtered_df.iloc[0]["SEALID"]
                if not filtered_df.empty
                else str(uuid.uuid4())
            )

            # Convert date strings to the correct format
            arrival_datetime_str = ds.attrs.get("Deployment_Arrival_Datetime")
            departure_datetime_str = ds.attrs.get("Deployment_Departure_Datetime")

            # Assuming the original format is "MM/DD/YYYY HH:MM"
            arrival_datetime = datetime.datetime.strptime(
                arrival_datetime_str, "%m/%d/%Y %H:%M"
            )
            departure_datetime = datetime.datetime.strptime(
                departure_datetime_str, "%m/%d/%Y %H:%M"
            )

            # Format to "YYYY-MM-DD HH:MM"
            formatted_arrival_datetime = arrival_datetime.strftime("%Y-%m-%d %H:%M")
            formatted_departure_datetime = departure_datetime.strftime("%Y-%m-%d %H:%M")

            # Prepare data for each model
            animal_data = {
                "animal_id": seal_id,
                "project_id": ds.attrs.get("Animal_ID"),
                "scientific_name": ds.attrs.get("Animal_Species"),
                "common_name": ds.attrs.get("Animal_Species_CommonName"),
                "lab_id": ds.attrs.get("Animal_ID"),
                "birth_year": (
                    ds.attrs.get("Animal_BirthYear")
                    if not pd.isna(ds.attrs.get("Animal_BirthYear"))
                    else 0
                ),
                "sex": ds.attrs.get("Animal_Sex"),
                "domain_ids": str(ds.attrs.get("Animal_OtherDeployments")),
            }

            deployment_data = {
                "deployment_id": ds.attrs.get("Deployment_ID"),
                "domain_deployment_id": ds.attrs.get("Deployment_ID"),
                "animal_age_class": ds.attrs.get("Animal_AgeClass"),
                "animal_age": (
                    ds.attrs.get("Deployment_Year") - ds.attrs.get("Animal_BirthYear")
                    if not pd.isna(ds.attrs.get("Animal_BirthYear"))
                    else 0
                ),
                "deployment_type": ds.attrs.get("Deployment_Trip"),
                "deployment_name": ds.attrs.get("Deployment_ID"),
                "rec_date": departure_datetime.strftime("%Y-%m-%d"),
                "deployment_latitude": ds.attrs.get("Deployment_Departure_Lat"),
                "deployment_longitude": ds.attrs.get("Deployment_Departure_Lon"),
                "deployment_location": ds.attrs.get("Deployment_Departure_Location"),
                "departure_datetime": formatted_departure_datetime,
                "recovery_latitude": ds.attrs.get("Deployment_Arrival_Lat"),
                "recovery_longitude": ds.attrs.get("Deployment_Arrival_Lon"),
                "recovery_location": ds.attrs.get("Deployment_Arrival_Location"),
                "arrival_datetime": formatted_arrival_datetime,
                "notes": ds.attrs.get("Notes"),
            }

            logger_data = {
                "logger_id": logger_id,
                "manufacturer": ds.attrs.get("Tags_TDR1_Manufacturer"),
                "manufacturer_name": ds.attrs.get("Tags_TDR1_Model"),
                "serial_no": ds.attrs.get("Tags_TDR1_ID"),
                "ptt": ds.attrs.get("Tags_PTT"),
                "type": ds.attrs.get("TDR"),
                "notes": ds.attrs.get("Tags_TDR1_Comments"),
            }

            # Create or get records
            animal, _ = data_uploader.get_or_create_animal(animal_data)
            logger, _ = data_uploader.get_or_create_logger(logger_data)
            deployment, _ = data_uploader.get_or_create_deployment(deployment_data)

            recording_data = {
                "recording_id": f"{deployment_id}_{seal_id}_{logger_id}",
                "name": f"Recording {idx}",
                "animal": animal,
                "deployment": deployment,
                "logger": logger,
                "start_time": formatted_arrival_datetime,
                "end_time": formatted_departure_datetime,
                "timezone": ds.attrs.get("Time_Zone"),
                "quality": ds.attrs.get("Quality"),
                "attachment_location": ds.attrs.get("Attachment_Location"),
                "attachment_type": ds.attrs.get("Attachment_Type"),
            }

            recording, _ = data_uploader.get_or_create_recording(recording_data)

            metadata = {
                "animal": animal.id,
                "deployment": deployment.id,
                "recording": recording.id,
            }

            data_uploader.upload_netcdf(
                converted_file_path,
                metadata,
                rename_map={
                    "depth": "sensor_data_pressure",
                    "corr_depth": "derived_data_depth",
                    "lat": "derived_data_latitude",
                    "lon": "derived_data_longitude",
                    "loc_class": "derived_data_location_class",
                    "light": "sensor_data_light",
                    "exernal_temp": "sensor_data_exernal_temp",
                },
            )

            print(f"Uploaded {converted_file_path}")
    except Exception as e:
        print(f"Error uploading file {converted_file_path}: {e}")
        return None


# Build delayed tasks
tasks = []
for idx, blob in enumerate(blobs):
    if blob.name.endswith("/"):
        continue
    task = delayed(process_file)(blob, idx)
    tasks.append(task)

# Compute tasks in parallel
# You can specify the number of workers; Dask defaults to the number of cores
compute(
    *tasks, scheduler="threads"
)  # You can also use 'processes' or specify a Dask scheduler
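
The closing compute(*tasks, scheduler="threads") call is where Dask actually executes the delayed process_file tasks. As the comment notes, the worker count can be pinned rather than defaulting to one thread per core; a small variant (not part of this commit), assuming Dask's local threaded scheduler:

    from dask import compute

    # Cap concurrency at 8 threads so downloads and uploads don't saturate the node.
    results = compute(*tasks, scheduler="threads", num_workers=8)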

‎scripts/import_from_dryad_sync.py

+212
@@ -0,0 +1,212 @@
import os
import uuid
import xarray as xr
from google.cloud import storage
import pandas as pd
from DiveDB.services.data_uploader import DataUploader
from DiveDB.services.utils.netcdf_conversions import convert_to_formatted_dataset
from DiveDB.services.duck_pond import DuckPond
import datetime

# Initialize the Google Cloud Storage client
client = storage.Client()

# Define the bucket and prefix
bucket_name = "female_elephant_seal_netcdfs"
# prefix = "female_elephant_seal_raw/"

# Get the bucket
bucket = client.get_bucket(bucket_name)

# List all blobs in the specified bucket with the given prefix
blobs = bucket.list_blobs()
blobs_list = [blob for blob in blobs if "20170" in blob.name]

print(f"Found {len(blobs_list)} files")

# Load the CSV file
metadata_df = pd.read_csv("scripts/metadata/11_Restimates_ALL_SealsUsed.csv")

duckpond = DuckPond()
data_uploader = DataUploader(duckpond=duckpond)

os.environ["SKIP_OPENSTACK_UPLOAD"] = "true"


def convert_file(file_path, idx):
    # Convert the file if it doesn't already exist
    converted_file_path = f"./data/processed_{idx}.nc"
    if not os.path.exists(converted_file_path):
        convert_to_formatted_dataset(file_path, output_file_path=converted_file_path)
    return converted_file_path


def upload_file(converted_file_path, idx):
    with xr.open_dataset(converted_file_path) as ds:
        # Extract necessary data from the dataset
        deployment_id = int(ds.attrs["Deployment_ID"])
        logger_id = ds.attrs["Tags_TDR1_Model"] + "_" + ds.attrs["Tags_TDR1_ID"]
        filtered_df = metadata_df[metadata_df["TOPPID"] == deployment_id]
        seal_id = (
            filtered_df.iloc[0]["SEALID"]
            if not filtered_df.empty
            else str(uuid.uuid4())
        )

        # Convert date strings to the correct format
        arrival_datetime_str = ds.attrs.get("Deployment_Arrival_Datetime")
        departure_datetime_str = ds.attrs.get("Deployment_Departure_Datetime")

        # Assuming the original format is "MM/DD/YYYY HH:MM"
        arrival_datetime = datetime.datetime.strptime(
            arrival_datetime_str, "%m/%d/%Y %H:%M"
        )
        departure_datetime = datetime.datetime.strptime(
            departure_datetime_str, "%m/%d/%Y %H:%M"
        )

        # Format to "YYYY-MM-DD HH:MM"
        formatted_arrival_datetime = arrival_datetime.strftime("%Y-%m-%d %H:%M")
        formatted_departure_datetime = departure_datetime.strftime("%Y-%m-%d %H:%M")

        # Prepare data for each model
        animal_data = {
            "animal_id": seal_id,
            "project_id": ds.attrs.get("Animal_ID"),
            "scientific_name": ds.attrs.get("Animal_Species"),
            "common_name": ds.attrs.get("Animal_Species_CommonName"),
            "lab_id": ds.attrs.get("Animal_ID"),
            "birth_year": (
                ds.attrs.get("Animal_BirthYear")
                if not pd.isna(ds.attrs.get("Animal_BirthYear"))
                else 0
            ),
            "sex": ds.attrs.get("Animal_Sex"),
            "domain_ids": str(ds.attrs.get("Animal_OtherDeployments")),
        }

        deployment_data = {
            "deployment_id": ds.attrs.get("Deployment_ID"),
            "domain_deployment_id": ds.attrs.get("Deployment_ID"),
            "animal_age_class": ds.attrs.get("Animal_AgeClass"),
            "animal_age": (
                ds.attrs.get("Deployment_Year") - ds.attrs.get("Animal_BirthYear")
                if not pd.isna(ds.attrs.get("Animal_BirthYear"))
                else 0
            ),
            "deployment_type": ds.attrs.get("Deployment_Trip"),
            "deployment_name": ds.attrs.get("Deployment_ID"),
            "rec_date": departure_datetime.strftime("%Y-%m-%d"),
            "deployment_latitude": ds.attrs.get("Deployment_Departure_Lat"),
            "deployment_longitude": ds.attrs.get("Deployment_Departure_Lon"),
            "deployment_location": ds.attrs.get("Deployment_Departure_Location"),
            "departure_datetime": formatted_departure_datetime,
            "recovery_latitude": ds.attrs.get("Deployment_Arrival_Lat"),
            "recovery_longitude": ds.attrs.get("Deployment_Arrival_Lon"),
            "recovery_location": ds.attrs.get("Deployment_Arrival_Location"),
            "arrival_datetime": formatted_arrival_datetime,
            "notes": ds.attrs.get("Notes"),
        }

        logger_data = {
            "logger_id": logger_id,
            "manufacturer": ds.attrs.get("Tags_TDR1_Manufacturer"),
            "manufacturer_name": ds.attrs.get("Tags_TDR1_Model"),
            "serial_no": ds.attrs.get("Tags_TDR1_ID"),
            "ptt": ds.attrs.get("Tags_PTT"),
            "type": ds.attrs.get("TDR"),
            "notes": ds.attrs.get("Tags_TDR1_Comments"),
        }

        # Create or get records
        animal, _ = data_uploader.get_or_create_animal(animal_data)
        logger, _ = data_uploader.get_or_create_logger(logger_data)
        deployment, _ = data_uploader.get_or_create_deployment(deployment_data)

        recording_data = {
            "recording_id": f"{deployment_id}_{seal_id}_{logger_id}",
            "name": f"Recording {idx}",
            "animal": animal,
            "deployment": deployment,
            "logger": logger,
            "start_time": formatted_arrival_datetime,
            "end_time": formatted_departure_datetime,
            "timezone": ds.attrs.get("Time_Zone"),
            "quality": ds.attrs.get("Quality"),
            "attachment_location": ds.attrs.get("Attachment_Location"),
            "attachment_type": ds.attrs.get("Attachment_Type"),
        }

        recording, _ = data_uploader.get_or_create_recording(recording_data)

        metadata = {
            "animal": animal.id,
            "deployment": deployment.id,
            "recording": recording.id,
        }

        data_uploader.upload_netcdf(
            converted_file_path,
            metadata,
            rename_map={
                "depth": "sensor_data_pressure",
                "corr_depth": "derived_data_depth",
                "lat": "derived_data_latitude",
                "lon": "derived_data_longitude",
                "loc_class": "derived_data_location_class",
                "light": "sensor_data_light",
                "exernal_temp": "sensor_data_exernal_temp",
            },
        )

        print(f"Uploaded {converted_file_path}")


files_with_errors = []
converted_files = []

# First, convert all files
for idx, blob in enumerate(blobs_list):
    # Skip directories
    if blob.name.endswith("/"):
        continue

    file_name = blob.name.split("/")[-1]
    converted_file_path = f"./data/processed_{idx}.nc"

    # Check if the converted file already exists
    if os.path.exists(converted_file_path):
        print(f"Skipping conversion for {file_name}, already processed.")
        converted_files.append((converted_file_path, idx))
        continue

    print(f"Converting file: {file_name}")

    # Download the blob to a local file
    temp_dir = "data/temp"
    os.makedirs(temp_dir, exist_ok=True)
    temp_file_path = os.path.join(temp_dir, file_name)
    blob.download_to_filename(temp_file_path)
    try:
        converted_file_path = convert_file(temp_file_path, idx)
        converted_files.append((converted_file_path, idx))
    except Exception as e:
        print(f"Error converting file: {e}")
        files_with_errors.append(file_name)
    os.remove(temp_file_path)

# Then, upload all converted files
for converted_file_path, idx in converted_files:
    try:
        upload_file(converted_file_path, idx)
    except Exception as e:
        print(f"Error uploading file: {e}")
        files_with_errors.append(converted_file_path)

print(files_with_errors)

# data_directory = "data/files"
# for idx, file_name in enumerate(os.listdir(data_directory)):
#     file_path = os.path.join(data_directory, file_name)

#     upload_file(file_path, idx)

‎setup.py

+5
@@ -8,14 +8,18 @@
     include_package_data=True,
     install_requires=[
         "black",
+        "bs4",
         "deltalake",
         "django",
+        "django-storages",
         "duckdb",
         "edfio",
         "flake8",
+        "google-cloud-storage",
         "importlib-metadata==4.8.3",
         "load_dotenv",
         "mne",
+        "netcdf4",
         "notion_client",
         "pandas",
         "pre-commit",
@@ -30,6 +34,7 @@
         "python-swiftclient",
         "setuptools",
         "tqdm",
+        "xarray",
     ],
     url="https://github.com/ecophysviz-lab/DiveDB",
 )

‎upload.Dockerfile

+32
@@ -0,0 +1,32 @@
# Use the official Python image from the Docker Hub
FROM python:3.12

# Set the working directory in the container
WORKDIR /app

# Copy the rest of the application code into the container
COPY . /app/

# Create the data directory
RUN mkdir -p data

# Install the dependencies
RUN pip install .

# Set environment variables
ENV PYTHONUNBUFFERED=1

# Expose the port the app runs on
EXPOSE 8000

# Set the DJANGO_PREFIX environment variable
ENV DJANGO_PREFIX=DiveDB

# Set the GOOGLE_APPLICATION_CREDENTIALS environment variable
ENV GOOGLE_APPLICATION_CREDENTIALS=brave-sonar-390402-7644b983ce44.json

# Set the CONTAINER_DELTA_LAKE_PATH environment variable
ENV CONTAINER_DELTA_LAKE_PATH=s3://divedb-delta-lakes-dryad-10-21

# Command to run your script
CMD ["python", "scripts/import_from_dryad.py"]
