
Commit c153c00

Authored Oct 25, 2024

Updates related to Dryad upload (#28)

* WIP
* Needed updates for Dryad data
* Updates from initial run before running in Nautilus
* Updates in preparation for Nautilus
* Updates related to Kubernetes upload
* Clean up CSVs
* Add DELTA_LAKE_PATH back to compose

1 parent 12d5368 · commit c153c00

21 files changed: +1591 -119 lines
 

‎.dockerignore

+2-1
@@ -1 +1,2 @@
-jupyter
+jupyter
+data

‎.gitignore

+2
@@ -167,3 +167,5 @@ cython_debug/
 data
 .DS_Store
 .env.*
+
+brave-sonar-390402-7644b983ce44.json

DiveDB/server/metadata/migrations/0015_animals_birth_year_animals_domain_ids_animals_lab_id_and_more.py

+110

@@ -0,0 +1,110 @@
# Generated by Django 5.1.2 on 2024-10-13 22:57

from django.db import migrations, models


class Migration(migrations.Migration):
    dependencies = [
        ("metadata", "0014_remove_files_file_path"),
    ]

    operations = [
        migrations.AddField(
            model_name="animals",
            name="birth_year",
            field=models.IntegerField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="animals",
            name="domain_ids",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="animals",
            name="lab_id",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="animals",
            name="sex",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="animal_age",
            field=models.IntegerField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="animal_age_class",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="arrival_datetime",
            field=models.DateTimeField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="departure_datetime",
            field=models.DateTimeField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="deployment_latitude",
            field=models.FloatField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="deployment_location",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="deployment_longitude",
            field=models.FloatField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="deployment_type",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="domain_deployment_id",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="recovery_latitude",
            field=models.FloatField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="recovery_location",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="deployments",
            name="recovery_longitude",
            field=models.FloatField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="loggers",
            name="manufacturer_name",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="loggers",
            name="ptt",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AlterField(
            model_name="files",
            name="file",
            field=models.FileField(
                storage="DiveDB.services.utils.storage.OpenStackStorage",
                upload_to="divedb-media/",
            ),
        ),
    ]

DiveDB/server/metadata/migrations/ (second new migration)

+35

@@ -0,0 +1,35 @@
# Generated by Django 5.1.2 on 2024-10-13 23:04

from django.db import migrations, models


class Migration(migrations.Migration):
    dependencies = [
        (
            "metadata",
            "0015_animals_birth_year_animals_domain_ids_animals_lab_id_and_more",
        ),
    ]

    operations = [
        migrations.AddField(
            model_name="recordings",
            name="attachment_location",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="recordings",
            name="attachment_type",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="recordings",
            name="quality",
            field=models.CharField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="recordings",
            name="timezone",
            field=models.CharField(blank=True, max_length=32, null=True),
        ),
    ]

‎DiveDB/server/metadata/models.py

+22
@@ -38,6 +38,8 @@ class Loggers(models.Model):
     icon_url = models.URLField(max_length=1000, null=True, blank=True)
     serial_no = models.CharField(null=True, blank=True)
     manufacturer = models.CharField(null=True, blank=True)
+    manufacturer_name = models.CharField(null=True, blank=True)
+    ptt = models.CharField(null=True, blank=True)
     type = models.CharField(null=True, blank=True)
     type_name = models.CharField(null=True, blank=True)
     notes = models.TextField(null=True, blank=True)
@@ -59,6 +61,10 @@ class Animals(models.Model):
     project_id = models.CharField()
     common_name = models.CharField()
     scientific_name = models.CharField()
+    lab_id = models.CharField(null=True, blank=True)
+    birth_year = models.IntegerField(null=True, blank=True)
+    sex = models.CharField(null=True, blank=True)
+    domain_ids = models.CharField(null=True, blank=True)
 
     class Meta:
         db_table = "Animals"
@@ -81,8 +87,20 @@ class Deployments(models.Model):
     ]
 
     id = models.CharField(primary_key=True)
+    domain_deployment_id = models.CharField(null=True, blank=True)
+    animal_age_class = models.CharField(null=True, blank=True)
+    animal_age = models.IntegerField(null=True, blank=True)
+    deployment_type = models.CharField(null=True, blank=True)
     deployment_name = models.CharField()
     rec_date = models.DateField()
+    deployment_latitude = models.FloatField(null=True, blank=True)
+    deployment_longitude = models.FloatField(null=True, blank=True)
+    deployment_location = models.CharField(null=True, blank=True)
+    departure_datetime = models.DateTimeField(null=True, blank=True)
+    recovery_latitude = models.FloatField(null=True, blank=True)
+    recovery_longitude = models.FloatField(null=True, blank=True)
+    recovery_location = models.CharField(null=True, blank=True)
+    arrival_datetime = models.DateTimeField(null=True, blank=True)
     animal = models.CharField()
     start_time = models.DateTimeField(null=True, blank=True)
     start_time_precision = models.TextField(null=True, blank=True)
@@ -130,6 +148,10 @@ class Recordings(models.Model):
     start_time_precision = models.CharField(
         null=True, blank=True, choices=PRECISION_CHOICES
     )
+    timezone = models.CharField(max_length=32, null=True, blank=True)
+    quality = models.CharField(null=True, blank=True)
+    attachment_location = models.CharField(null=True, blank=True)
+    attachment_type = models.CharField(null=True, blank=True)
 
     def __str__(self):
         return f"{self.name} ({self.id})"
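
The new columns above are all nullable, so they behave like any other optional Django field once the two migrations are applied. A minimal query sketch, assuming a Django shell with the metadata app loaded; the filter values are placeholders, not data from this commit:

    # Hypothetical queries against the new metadata columns (placeholder values).
    from DiveDB.server.metadata.models import Animals, Deployments

    # Animals that have a lab ID and a recorded birth year
    tagged = Animals.objects.filter(lab_id__isnull=False, birth_year__isnull=False)

    # Deployments with a recovery position on record
    recovered = Deployments.objects.exclude(recovery_latitude__isnull=True).values(
        "id", "recovery_latitude", "recovery_longitude", "recovery_location"
    )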

‎DiveDB/services/data_uploader.py

+119-16
@@ -19,16 +19,21 @@
 from datetime import datetime, timezone
 from DiveDB.services.utils.openstack import SwiftClient
 
-duckpond = DuckPond()
-swift_client = SwiftClient()
 
 django_prefix = os.environ.get("DJANGO_PREFIX", "DiveDB")
 os.environ.setdefault(
     "DJANGO_SETTINGS_MODULE", f"{django_prefix}.server.django_app.settings"
 )
 django.setup()
 
-from DiveDB.server.metadata.models import Files, Recordings  # noqa: E402
+from DiveDB.server.metadata.models import (  # noqa: E402
+    Files,
+    Recordings,
+    Deployments,
+    Animals,
+    Loggers,
+    AnimalDeployments,
+)
 
 
 @dataclass
@@ -55,6 +60,11 @@ class NetCDFValidationError(Exception):
 class DataUploader:
     """Data Uploader"""
 
+    def __init__(self, duckpond: DuckPond = None, swift_client: SwiftClient = None):
+        """Initialize DataUploader with optional DuckPond and SwiftClient instances."""
+        self.duckpond = duckpond or DuckPond()
+        self.swift_client = swift_client or SwiftClient()
+
     def _read_edf_signal(self, edf: edfio.Edf, label: str):
         """Function to read a single signal from an EDF file."""
         signal = edf.get_signal(label)
@@ -119,15 +129,15 @@ def _create_value_structs(self, values):
             lambda x: (float(x) if isinstance(x, (int, float)) else np.nan)
         )(values)
 
-        # Determine if any value has a decimal place
-        if np.any(numeric_values % 1 != 0):
-            float_values = np.where(np.isfinite(numeric_values), numeric_values, None)
-            int_values = None
-        else:
+        # Check if the data type is integer
+        if np.issubdtype(numeric_values.dtype, np.integer):
             float_values = None
             int_values = np.where(
                 np.isfinite(numeric_values), numeric_values.astype(int), None
             )
+        else:
+            float_values = np.where(np.isfinite(numeric_values), numeric_values, None)
+            int_values = None
 
         string_values = np.where(
             ~np.isin(values, [True, False])
@@ -186,7 +196,9 @@ def _write_data_to_duckpond(
                     [metadata["animal"]] * len(values), type=pa.string()
                 ),
                 "deployment": pa.array(
-                    [metadata["deployment"]] * len(values), type=pa.string()
+                    # This fix isn't working, make it a string
+                    [str(metadata["deployment"])] * len(values),
+                    type=pa.string(),
                 ),
                 "recording": pa.array(
                     [metadata["recording"]] * len(values), type=pa.string()
@@ -199,7 +211,7 @@ def _write_data_to_duckpond(
             },
             schema=LAKE_CONFIGS["DATA"]["schema"],
         )
-        duckpond.write_to_delta(
+        self.duckpond.write_to_delta(
             data=batch_table,
             lake="DATA",
             mode="append",
@@ -241,7 +253,7 @@ def _write_event_to_duckpond(
                     [metadata["animal"]] * len(event_keys), type=pa.string()
                 ),
                 "deployment": pa.array(
-                    [metadata["deployment"]] * len(event_keys), type=pa.string()
+                    [str(metadata["deployment"])] * len(event_keys), type=pa.string()
                 ),
                 "recording": pa.array(
                     [metadata["recording"]] * len(event_keys), type=pa.string()
@@ -256,7 +268,7 @@ def _write_event_to_duckpond(
             },
             schema=LAKE_CONFIGS["STATE_EVENTS"]["schema"],
         )
-        duckpond.write_to_delta(
+        self.duckpond.write_to_delta(
             data=batch_table,
             lake="STATE_EVENTS",
             mode="append",
@@ -266,7 +278,6 @@ def _write_event_to_duckpond(
         )
         del batch_table
         gc.collect()
-
     def _validate_netcdf(self, ds: xr.Dataset):
         """
         Validates netCDF file before upload.
@@ -356,8 +367,84 @@ def _validate_netcdf(self, ds: xr.Dataset):
                 )
         return True
 
+    def get_or_create_logger(self, logger_data):
+        logger, created = Loggers.objects.get_or_create(
+            id=logger_data["logger_id"],
+            defaults={
+                "manufacturer": logger_data.get("manufacturer"),
+                "manufacturer_name": logger_data.get("manufacturer_name"),
+                "serial_no": logger_data.get("serial_no"),
+                "ptt": logger_data.get("ptt"),
+                "type": logger_data.get("type"),
+                "notes": logger_data.get("notes"),
+            },
+        )
+        return logger, created
+
+    def get_or_create_recording(self, recording_data):
+        animal_deployment, _ = AnimalDeployments.objects.get_or_create(
+            animal=recording_data["animal"], deployment=recording_data["deployment"]
+        )
+        recording, created = Recordings.objects.get_or_create(
+            id=recording_data["recording_id"],
+            defaults={
+                "name": recording_data.get("name"),
+                "animal_deployment": animal_deployment,
+                "logger": recording_data.get("logger"),
+                "start_time": recording_data.get("start_time"),
+                "end_time": recording_data.get("end_time"),
+                "timezone": recording_data.get("timezone"),
+                "quality": recording_data.get("quality"),
+                "attachment_location": recording_data.get("attachment_location"),
+                "attachment_type": recording_data.get("attachment_type"),
+            },
+        )
+        return recording, created
+
+    def get_or_create_deployment(self, deployment_data):
+        deployment, created = Deployments.objects.get_or_create(
+            id=deployment_data["deployment_id"],
+            defaults={
+                "domain_deployment_id": deployment_data.get("domain_deployment_id"),
+                "animal_age_class": deployment_data.get("animal_age_class"),
+                "animal_age": deployment_data.get("animal_age"),
+                "deployment_type": deployment_data.get("deployment_type"),
+                "deployment_name": deployment_data.get("deployment_name"),
+                "rec_date": deployment_data.get("rec_date"),
+                "deployment_latitude": deployment_data.get("deployment_latitude"),
+                "deployment_longitude": deployment_data.get("deployment_longitude"),
+                "deployment_location": deployment_data.get("deployment_location"),
+                "departure_datetime": deployment_data.get("departure_datetime"),
+                "recovery_latitude": deployment_data.get("recovery_latitude"),
+                "recovery_longitude": deployment_data.get("recovery_longitude"),
+                "recovery_location": deployment_data.get("recovery_location"),
+                "arrival_datetime": deployment_data.get("arrival_datetime"),
+                "notes": deployment_data.get("notes"),
+            },
+        )
+        return deployment, created
+
+    def get_or_create_animal(self, animal_data):
+        animal, created = Animals.objects.get_or_create(
+            id=animal_data["animal_id"],
+            defaults={
+                "project_id": animal_data.get("project_id"),
+                "common_name": animal_data.get("common_name"),
+                "scientific_name": animal_data.get("scientific_name"),
+                "lab_id": animal_data.get("lab_id"),
+                "birth_year": animal_data.get("birth_year"),
+                "sex": animal_data.get("sex"),
+                "domain_ids": animal_data.get("domain_ids"),
+            },
+        )
+        return animal, created
+
     def upload_netcdf(
-        self, netcdf_file_path: str, metadata: dict, batch_size: int = 1000000
+        self,
+        netcdf_file_path: str,
+        metadata: dict,
+        batch_size: int = 1000000,
+        rename_map: dict = None,
     ):
         """
         Uploads a netCDF file to the database and DuckPond.
@@ -370,9 +457,22 @@ def upload_netcdf(
             - deployment: Deployment Name (str)
             - recording: Recording Name (str)
             batch_size (int, optional): Size of data batches for processing. Defaults to 1 million
+            rename_map (dict, optional): A dictionary mapping original variable names to new names.
         """
+
         ds = xr.open_dataset(netcdf_file_path)
 
+        # Apply renaming if rename_map is provided
+        if rename_map:
+            # Convert all data variable names to lowercase
+            lower_case_rename_map = {k.lower(): v for k, v in rename_map.items()}
+            ds = ds.rename(
+                {
+                    var: lower_case_rename_map.get(var.lower(), var)
+                    for var in ds.data_vars
+                }
+            )
+
         print(
             f"Creating file record for {os.path.basename(netcdf_file_path)} and uploading to OpenStack..."
         )
@@ -431,7 +531,7 @@ def __init__(self, name):
                         group=coord.replace("_samples", ""),
                         event_keys=event_keys,
                         event_data=event_data,
-                        file_name=file.file["name"],
+                        file_name=file.file.name,
                     )
                 for var_name, var_data in ds[variables_with_coord].items():
                     if (
@@ -453,7 +553,9 @@ def __init__(self, name):
 
                             group = var_data.attrs.get("group", "ungrouped")
                             class_name = var_name
-                            label = sub_var_name
+                            label = rename_map.get(
+                                sub_var_name.lower(), sub_var_name
+                            )
 
                             values = var_data.values[start:end, var_index]
                             self._write_data_to_duckpond(
@@ -487,6 +589,7 @@ def __init__(self, name):
                         if "variable" in var_data.attrs
                         else var_name
                     )
+                    label = rename_map.get(label.lower(), label)
 
                     values = var_data.values[start:end]
                     self._write_data_to_duckpond(
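
Taken together, the new constructor, the get_or_create_* helpers, and the rename_map argument let a caller wire up an upload in a few lines. A condensed sketch of that flow, assuming an already-converted file at ./data/processed_0.nc; every ID, name, and the rename mapping here is a placeholder, and the Dryad import scripts further down use this same pattern with real values:

    from DiveDB.services.data_uploader import DataUploader
    from DiveDB.services.duck_pond import DuckPond

    # Share one DuckPond across uploads instead of creating one per call.
    uploader = DataUploader(duckpond=DuckPond())

    # Placeholder metadata dicts; real runs pass the full set of model fields.
    animal, _ = uploader.get_or_create_animal(
        {"animal_id": "example-seal", "project_id": "example-project",
         "common_name": "Elephant seal", "scientific_name": "Mirounga angustirostris"}
    )
    deployment, _ = uploader.get_or_create_deployment(
        {"deployment_id": "2017001", "deployment_name": "2017001", "rec_date": "2017-02-21"}
    )
    logger, _ = uploader.get_or_create_logger({"logger_id": "TDR10_0001"})
    recording, _ = uploader.get_or_create_recording(
        {"recording_id": "2017001_example-seal_TDR10_0001", "name": "Recording 0",
         "animal": animal, "deployment": deployment, "logger": logger}
    )

    uploader.upload_netcdf(
        "./data/processed_0.nc",  # placeholder path to an already-converted file
        metadata={"animal": animal.id, "deployment": deployment.id, "recording": recording.id},
        rename_map={"depth": "sensor_data_pressure"},  # keys are matched case-insensitively
    )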

‎DiveDB/services/duck_pond.py

+3-12
@@ -14,8 +14,6 @@
 
 # flake8: noqa
 
-
-os.environ["CONTAINER_DELTA_LAKE_PATH"] = "s3://divedb-delta-lakes"
 os.environ["AWS_S3_ALLOW_UNSAFE_RENAME"] = "true"
 
 LAKES = [
@@ -105,9 +103,7 @@ def __init__(self, delta_path: str | None = None, connect_to_postgres: bool = Tr
         self.conn.execute("LOAD httpfs;")
 
         # Set S3 configurations
-        self.conn.execute(
-            "SET s3_url_style='path';"
-        )  # Important for OpenStack Swift
+        self.conn.execute("SET s3_url_style='path';")
         self.conn.execute("SET s3_use_ssl=true;")
         self.conn.execute(
             """
@@ -122,11 +118,11 @@ def __init__(self, delta_path: str | None = None, connect_to_postgres: bool = Tr
                 os.getenv("AWS_REGION"),
                 os.getenv("AWS_ACCESS_KEY_ID"),
                 os.getenv("AWS_SECRET_ACCESS_KEY"),
-                os.getenv("AWS_ENDPOINT_URL"),
+                os.getenv("AWS_ENDPOINT_URL").replace("https://", ""),
             )
         )
 
-        # self._create_lake_views()
+        self._create_lake_views()
 
         if connect_to_postgres:
             logging.info("Connecting to PostgreSQL")
@@ -220,11 +216,6 @@ def get_delta_data(
         - If frequency is not None, returns a pd.DataFrame.
         - If frequency is None, returns a DuckDBPyRelation object with pivoted data.
         """
-        has_predicates = False
-
-        def get_predicate_preface():
-            nonlocal has_predicates
-            return " AND" if has_predicates else " WHERE"
 
         def get_predicate_string(predicate: str, values: List[str]):
             if not values:
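
The .replace("https://", "") on AWS_ENDPOINT_URL reflects that DuckDB's s3_endpoint setting expects a bare host (optionally host:port) rather than a full URL. A minimal sketch of the same idea written with urllib.parse, shown as an illustrative alternative rather than code from this commit; the endpoint host is a placeholder:

    from urllib.parse import urlparse

    def s3_endpoint_host(endpoint_url: str) -> str:
        """Return the host[:port] form that DuckDB's SET s3_endpoint expects."""
        parsed = urlparse(endpoint_url)
        # A bare host such as "object.example.org" has no netloc, so fall back to the input.
        return parsed.netloc or endpoint_url

    print(s3_endpoint_host("https://object.example.org"))  # object.example.org (placeholder host)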

‎DiveDB/services/utils/netcdf_conversions.py

+43-2
@@ -2,6 +2,7 @@
 import xarray as xr
 import pandas as pd
 import numpy as np
+from datetime import datetime
 
 
 def matlab_datenum_to_datetime_vectorized(
@@ -29,6 +30,39 @@ def matlab_datenum_to_datetime_vectorized(
     return converted_dates
 
 
+possible_formats = [
+    "%H:%M:%S %d-%b-%Y",  # e.g., "00:11:12 21-Feb-2017"
+    "%Y-%m-%d %H:%M:%S",  # e.g., "2017-02-21 00:11:12"
+    "%d/%m/%Y %H:%M",  # e.g., "21/02/2017 00:11"
+    "%m/%d/%Y %I:%M:%S %p",  # e.g., "02/21/2017 12:11:12 AM"
+]
+
+
+def infer_date_format(date_str, possible_formats=possible_formats):
+    """
+    Infers the date format of a date string from a list of possible formats.
+
+    Parameters:
+    - date_str (str): The date string to parse.
+    - possible_formats (list): A list of date format strings.
+
+    Returns:
+    - str: The matching date format string.
+
+    Raises:
+    - ValueError: If no matching format is found.
+    """
+    if date_str is None:
+        return None
+
+    for fmt in possible_formats:
+        try:
+            datetime.strptime(date_str, fmt)
+            return fmt
+        except ValueError:
+            continue
+
+
 def convert_to_formatted_dataset(
     input_file_path: str,
     output_file_path: str = None,
@@ -55,9 +89,16 @@ def convert_to_formatted_dataset(
                 ds["DATE"].values
             )
         else:
-            datetime_coord = np.array(pd.to_datetime(ds["DATE"].values)).astype(
-                "datetime64[ns]"
+            first_date_str = (
+                ds["DATE"].values[0] if len(ds["DATE"].values) > 0 else None
             )
+            date_format = infer_date_format(first_date_str, possible_formats)
+            datetime_coord = np.array(
+                pd.to_datetime(
+                    ds["DATE"].values,
+                    format=date_format if date_format else "mixed",
+                )
+            ).astype("datetime64[ns]")
 
         datetime_coord = datetime_coord[
             ~np.isnat(datetime_coord) & (datetime_coord != np.datetime64(""))
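
infer_date_format simply tries each candidate format in order and returns the first one strptime accepts; if date_str is None or no format matches it falls through and returns None (the ValueError mentioned in the docstring is not actually raised), which is why the caller falls back to format="mixed". A quick usage sketch using the example string from the comments above:

    from DiveDB.services.utils.netcdf_conversions import infer_date_format, possible_formats
    import pandas as pd

    fmt = infer_date_format("00:11:12 21-Feb-2017", possible_formats)
    print(fmt)  # "%H:%M:%S %d-%b-%Y"

    # convert_to_formatted_dataset then hands the inferred format to pandas:
    timestamps = pd.to_datetime(["00:11:12 21-Feb-2017"], format=fmt)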

‎Makefile

+8-2
@@ -1,7 +1,7 @@
 .PHONY: up down build migrate createsuperuser shell bash test
 
 up:
-	docker compose -f docker-compose.development.yaml --env-file .env up --build
+	docker compose -f docker-compose.development.yaml --env-file .env up
 
 down:
 	docker compose -f docker-compose.development.yaml down
@@ -28,4 +28,10 @@ test:
 	docker compose -f docker-compose.development.yaml exec web pytest
 
 importmetadata:
-	docker compose -f docker-compose.development.yaml exec web python scripts/import_from_notion.py
+	docker compose -f docker-compose.development.yaml exec web python scripts/import_from_notion.py
+
+build-uploader:
+	docker buildx build --platform linux/amd64 -t ghcr.io/ecophysviz-lab/uploader-job:latest --push -f upload.Dockerfile .
+
+build-django:
+	docker buildx build --platform linux/amd64 -t ghcr.io/ecophysviz-lab/divedb-django:latest --push -f Dockerfile .

‎data_querying_documentation.ipynb

+652-5
Large diffs are not rendered by default.

‎deployment/secrets.sh

+11-3
@@ -1,8 +1,8 @@
 source .env
 
-kubectl delete secret divedb-postgres-credentials
+kubectl delete secret divedb-credentials
 
-kubectl create secret generic divedb-postgres-credentials \
+kubectl create secret generic divedb-credentials \
   --from-literal=POSTGRES_DB=$POSTGRES_DB \
   --from-literal=POSTGRES_USER=$POSTGRES_USER \
   --from-literal=POSTGRES_PASSWORD=$POSTGRES_PASSWORD \
@@ -11,4 +11,12 @@ kubectl create secret generic divedb-postgres-credentials \
   --from-literal=DJANGO_SECRET_KEY=$DJANGO_SECRET_KEY \
   --from-literal=OPENSTACK_AUTH_URL=$OPENSTACK_AUTH_URL \
   --from-literal=OPENSTACK_APPLICATION_CREDENTIAL_ID=$OPENSTACK_APPLICATION_CREDENTIAL_ID \
-  --from-literal=OPENSTACK_APPLICATION_CREDENTIAL_SECRET=$OPENSTACK_APPLICATION_CREDENTIAL_SECRET
+  --from-literal=OPENSTACK_APPLICATION_CREDENTIAL_SECRET=$OPENSTACK_APPLICATION_CREDENTIAL_SECRET \
+  --from-literal=OPENSTACK_FILE_STORAGE_CONTAINER_NAME=$OPENSTACK_FILE_STORAGE_CONTAINER_NAME \
+  --from-literal=OPENSTACK_USER_ID=$OPENSTACK_USER_ID \
+  --from-literal=OPENSTACK_PROJECT_NAME=$OPENSTACK_PROJECT_NAME \
+  --from-literal=OPENSTACK_PROJECT_ID=$OPENSTACK_PROJECT_ID \
+  --from-literal=AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
+  --from-literal=AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
+  --from-literal=AWS_REGION=$AWS_REGION \
+  --from-literal=AWS_ENDPOINT_URL=$AWS_ENDPOINT_URL

‎deployment/upload-job/build.sh

+5
@@ -0,0 +1,5 @@
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "Script directory: $SCRIPT_DIR"

kubectl delete jobs divedb-uploader-job
kubectl apply -f $SCRIPT_DIR/job.yaml

‎deployment/upload-job/job.yaml

+111
@@ -0,0 +1,111 @@
# job.yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: divedb-uploader-job
  labels:
    app: divedb-uploader-job
spec:
  template:
    spec:
      imagePullSecrets:
        - name: ghcr-login-ecophysviz
      containers:
        - name: divedb-uploader-job
          image: ghcr.io/ecophysviz-lab/uploader-job:latest
          imagePullPolicy: Always
          resources:
            requests:
              memory: "256Gi"
              cpu: "64"
            limits:
              memory: "256Gi"
              cpu: "64"
          volumeMounts:
            - name: divedb-uploader-data-storage
              mountPath: /data
          env:
            - name: DJANGO_SECRET_KEY
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: DJANGO_SECRET_KEY
            - name: POSTGRES_HOST
              value: divedb-pg-service # Updated to match the service name
            - name: POSTGRES_PORT
              value: "5432"
            - name: POSTGRES_DB
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: POSTGRES_DB
            - name: POSTGRES_USER
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: POSTGRES_USER
            - name: POSTGRES_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: POSTGRES_PASSWORD
            - name: OPENSTACK_AUTH_URL
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: OPENSTACK_AUTH_URL
            - name: OPENSTACK_APPLICATION_CREDENTIAL_ID
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: OPENSTACK_APPLICATION_CREDENTIAL_ID
            - name: OPENSTACK_APPLICATION_CREDENTIAL_SECRET
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: OPENSTACK_APPLICATION_CREDENTIAL_SECRET
            - name: OPENSTACK_FILE_STORAGE_CONTAINER_NAME
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: OPENSTACK_FILE_STORAGE_CONTAINER_NAME
            - name: OPENSTACK_USER_ID
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: OPENSTACK_USER_ID
            - name: OPENSTACK_PROJECT_NAME
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: OPENSTACK_PROJECT_NAME
            - name: OPENSTACK_PROJECT_ID
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: OPENSTACK_PROJECT_ID
            - name: AWS_ACCESS_KEY_ID
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: AWS_ACCESS_KEY_ID
            - name: AWS_SECRET_ACCESS_KEY
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: AWS_SECRET_ACCESS_KEY
            - name: AWS_REGION
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: AWS_REGION
            - name: AWS_ENDPOINT_URL
              valueFrom:
                secretKeyRef:
                  name: divedb-credentials
                  key: AWS_ENDPOINT_URL
      restartPolicy: Never
      volumes:
        - name: divedb-uploader-data-storage
          persistentVolumeClaim:
            claimName: divedb-uploader-data-storage

‎deployment/upload-job/pvc.yaml

+14
@@ -0,0 +1,14 @@
# pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: divedb-uploader-data-storage
  labels:
    app: divedb
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 500Gi
  storageClassName: rook-ceph-block

‎docker-compose.development.yaml

+2
@@ -47,5 +47,7 @@ services:
       - ./data:/app/data
       - ./pyologger:/app/pyologger
       - ./scripts:/app/scripts
+      - ./pyproject.toml:/app/pyproject.toml
+      - ./setup.py:/app/setup.py
     depends_on:
       - metadata_database

‎pyproject.toml

+3
@@ -8,15 +8,18 @@ version = "0.1.0"
 dependencies = [
     "black",
     "bs4",
+    "dask",
     "deltalake",
     "django",
     "django-storages",
     "duckdb",
     "edfio",
     "flake8",
+    "google-cloud-storage",
     "importlib-metadata==4.8.3",
     "load_dotenv",
     "mne",
+    "netcdf4",
     "notion_client",
     "pandas",
     "pre-commit",

‎scripts/import_from_dryad.py

-78
This file was deleted.

‎scripts/import_from_dryad_dash.py

+200
@@ -0,0 +1,200 @@
import os
import uuid
import xarray as xr
from google.cloud import storage
import pandas as pd
from DiveDB.services.data_uploader import DataUploader
from DiveDB.services.utils.netcdf_conversions import convert_to_formatted_dataset
from DiveDB.services.duck_pond import DuckPond
import datetime
from dask import delayed, compute

# Initialize the Google Cloud Storage client
client = storage.Client()

# Define the bucket
bucket_name = "female_elephant_seal_netcdfs"

# Get the bucket
bucket = client.get_bucket(bucket_name)

# List all blobs in the specified bucket
blobs = list(bucket.list_blobs())

# Load the CSV file
metadata_df = pd.read_csv("scripts/metadata/11_Restimates_ALL_SealsUsed.csv")

os.environ["SKIP_OPENSTACK_UPLOAD"] = "true"


def process_file(blob, idx):
    # Skip directories
    if blob.name.endswith("/"):
        return None
    file_name = blob.name.split("/")[-1]
    converted_file_path = f"./data/processed_{idx}.nc"

    # Check if the converted file already exists
    if os.path.exists(converted_file_path):
        print(f"Skipping conversion for {file_name}, already processed.")
    else:
        print(f"Converting file: {file_name}")

        # Download the blob to a local file
        temp_dir = "data/temp"
        os.makedirs(temp_dir, exist_ok=True)
        temp_file_path = os.path.join(temp_dir, file_name)
        blob.download_to_filename(temp_file_path)
        try:
            convert_to_formatted_dataset(
                temp_file_path, output_file_path=converted_file_path
            )
        except Exception as e:
            print(f"Error converting file {file_name}: {e}")
            os.remove(temp_file_path)
            return None
        finally:
            os.remove(temp_file_path)

    # Now upload the converted file
    duckpond = DuckPond()
    data_uploader = DataUploader(
        duckpond=duckpond
    )  # Instantiate per task to ensure thread safety
    try:
        with xr.open_dataset(converted_file_path) as ds:
            # Extract necessary data from the dataset
            deployment_id = int(ds.attrs["Deployment_ID"])
            logger_id = ds.attrs["Tags_TDR1_Model"] + "_" + ds.attrs["Tags_TDR1_ID"]
            filtered_df = metadata_df[metadata_df["TOPPID"] == deployment_id]
            seal_id = (
                filtered_df.iloc[0]["SEALID"]
                if not filtered_df.empty
                else str(uuid.uuid4())
            )

            # Convert date strings to the correct format
            arrival_datetime_str = ds.attrs.get("Deployment_Arrival_Datetime")
            departure_datetime_str = ds.attrs.get("Deployment_Departure_Datetime")

            # Assuming the original format is "MM/DD/YYYY HH:MM"
            arrival_datetime = datetime.datetime.strptime(
                arrival_datetime_str, "%m/%d/%Y %H:%M"
            )
            departure_datetime = datetime.datetime.strptime(
                departure_datetime_str, "%m/%d/%Y %H:%M"
            )

            # Format to "YYYY-MM-DD HH:MM"
            formatted_arrival_datetime = arrival_datetime.strftime("%Y-%m-%d %H:%M")
            formatted_departure_datetime = departure_datetime.strftime("%Y-%m-%d %H:%M")

            # Prepare data for each model
            animal_data = {
                "animal_id": seal_id,
                "project_id": ds.attrs.get("Animal_ID"),
                "scientific_name": ds.attrs.get("Animal_Species"),
                "common_name": ds.attrs.get("Animal_Species_CommonName"),
                "lab_id": ds.attrs.get("Animal_ID"),
                "birth_year": (
                    ds.attrs.get("Animal_BirthYear")
                    if not pd.isna(ds.attrs.get("Animal_BirthYear"))
                    else 0
                ),
                "sex": ds.attrs.get("Animal_Sex"),
                "domain_ids": str(ds.attrs.get("Animal_OtherDeployments")),
            }

            deployment_data = {
                "deployment_id": ds.attrs.get("Deployment_ID"),
                "domain_deployment_id": ds.attrs.get("Deployment_ID"),
                "animal_age_class": ds.attrs.get("Animal_AgeClass"),
                "animal_age": (
                    ds.attrs.get("Deployment_Year") - ds.attrs.get("Animal_BirthYear")
                    if not pd.isna(ds.attrs.get("Animal_BirthYear"))
                    else 0
                ),
                "deployment_type": ds.attrs.get("Deployment_Trip"),
                "deployment_name": ds.attrs.get("Deployment_ID"),
                "rec_date": departure_datetime.strftime("%Y-%m-%d"),
                "deployment_latitude": ds.attrs.get("Deployment_Departure_Lat"),
                "deployment_longitude": ds.attrs.get("Deployment_Departure_Lon"),
                "deployment_location": ds.attrs.get("Deployment_Departure_Location"),
                "departure_datetime": formatted_departure_datetime,
                "recovery_latitude": ds.attrs.get("Deployment_Arrival_Lat"),
                "recovery_longitude": ds.attrs.get("Deployment_Arrival_Lon"),
                "recovery_location": ds.attrs.get("Deployment_Arrival_Location"),
                "arrival_datetime": formatted_arrival_datetime,
                "notes": ds.attrs.get("Notes"),
            }

            logger_data = {
                "logger_id": logger_id,
                "manufacturer": ds.attrs.get("Tags_TDR1_Manufacturer"),
                "manufacturer_name": ds.attrs.get("Tags_TDR1_Model"),
                "serial_no": ds.attrs.get("Tags_TDR1_ID"),
                "ptt": ds.attrs.get("Tags_PTT"),
                "type": ds.attrs.get("TDR"),
                "notes": ds.attrs.get("Tags_TDR1_Comments"),
            }

            # Create or get records
            animal, _ = data_uploader.get_or_create_animal(animal_data)
            logger, _ = data_uploader.get_or_create_logger(logger_data)
            deployment, _ = data_uploader.get_or_create_deployment(deployment_data)

            recording_data = {
                "recording_id": f"{deployment_id}_{seal_id}_{logger_id}",
                "name": f"Recording {idx}",
                "animal": animal,
                "deployment": deployment,
                "logger": logger,
                "start_time": formatted_arrival_datetime,
                "end_time": formatted_departure_datetime,
                "timezone": ds.attrs.get("Time_Zone"),
                "quality": ds.attrs.get("Quality"),
                "attachment_location": ds.attrs.get("Attachment_Location"),
                "attachment_type": ds.attrs.get("Attachment_Type"),
            }

            recording, _ = data_uploader.get_or_create_recording(recording_data)

            metadata = {
                "animal": animal.id,
                "deployment": deployment.id,
                "recording": recording.id,
            }

            data_uploader.upload_netcdf(
                converted_file_path,
                metadata,
                rename_map={
                    "depth": "sensor_data_pressure",
                    "corr_depth": "derived_data_depth",
                    "lat": "derived_data_latitude",
                    "lon": "derived_data_longitude",
                    "loc_class": "derived_data_location_class",
                    "light": "sensor_data_light",
                    "exernal_temp": "sensor_data_exernal_temp",
                },
            )

            print(f"Uploaded {converted_file_path}")
    except Exception as e:
        print(f"Error uploading file {converted_file_path}: {e}")
        return None


# Build delayed tasks
tasks = []
for idx, blob in enumerate(blobs):
    if blob.name.endswith("/"):
        continue
    task = delayed(process_file)(blob, idx)
    tasks.append(task)

# Compute tasks in parallel
# You can specify the number of workers; Dask defaults to the number of cores
compute(
    *tasks, scheduler="threads"
)  # You can also use 'processes' or specify a Dask scheduler
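
The closing compute(*tasks, scheduler="threads") call is where Dask actually executes the delayed process_file tasks. As the comment notes, the worker count can be pinned rather than defaulting to one thread per core; a small variant (not part of this commit), assuming Dask's local threaded scheduler:

    from dask import compute

    # Cap concurrency at 8 threads so downloads and uploads don't saturate the node.
    results = compute(*tasks, scheduler="threads", num_workers=8)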

‎scripts/import_from_dryad_sync.py

+212
@@ -0,0 +1,212 @@
import os
import uuid
import xarray as xr
from google.cloud import storage
import pandas as pd
from DiveDB.services.data_uploader import DataUploader
from DiveDB.services.utils.netcdf_conversions import convert_to_formatted_dataset
from DiveDB.services.duck_pond import DuckPond
import datetime

# Initialize the Google Cloud Storage client
client = storage.Client()

# Define the bucket and prefix
bucket_name = "female_elephant_seal_netcdfs"
# prefix = "female_elephant_seal_raw/"

# Get the bucket
bucket = client.get_bucket(bucket_name)

# List all blobs in the specified bucket with the given prefix
blobs = bucket.list_blobs()
blobs_list = [blob for blob in blobs if "20170" in blob.name]

print(f"Found {len(blobs_list)} files")

# Load the CSV file
metadata_df = pd.read_csv("scripts/metadata/11_Restimates_ALL_SealsUsed.csv")

duckpond = DuckPond()
data_uploader = DataUploader(duckpond=duckpond)

os.environ["SKIP_OPENSTACK_UPLOAD"] = "true"


def convert_file(file_path, idx):
    # Convert the file if it doesn't already exist
    converted_file_path = f"./data/processed_{idx}.nc"
    if not os.path.exists(converted_file_path):
        convert_to_formatted_dataset(file_path, output_file_path=converted_file_path)
    return converted_file_path


def upload_file(converted_file_path, idx):
    with xr.open_dataset(converted_file_path) as ds:
        # Extract necessary data from the dataset
        deployment_id = int(ds.attrs["Deployment_ID"])
        logger_id = ds.attrs["Tags_TDR1_Model"] + "_" + ds.attrs["Tags_TDR1_ID"]
        filtered_df = metadata_df[metadata_df["TOPPID"] == deployment_id]
        seal_id = (
            filtered_df.iloc[0]["SEALID"]
            if not filtered_df.empty
            else str(uuid.uuid4())
        )

        # Convert date strings to the correct format
        arrival_datetime_str = ds.attrs.get("Deployment_Arrival_Datetime")
        departure_datetime_str = ds.attrs.get("Deployment_Departure_Datetime")

        # Assuming the original format is "MM/DD/YYYY HH:MM"
        arrival_datetime = datetime.datetime.strptime(
            arrival_datetime_str, "%m/%d/%Y %H:%M"
        )
        departure_datetime = datetime.datetime.strptime(
            departure_datetime_str, "%m/%d/%Y %H:%M"
        )

        # Format to "YYYY-MM-DD HH:MM"
        formatted_arrival_datetime = arrival_datetime.strftime("%Y-%m-%d %H:%M")
        formatted_departure_datetime = departure_datetime.strftime("%Y-%m-%d %H:%M")

        # Prepare data for each model
        animal_data = {
            "animal_id": seal_id,
            "project_id": ds.attrs.get("Animal_ID"),
            "scientific_name": ds.attrs.get("Animal_Species"),
            "common_name": ds.attrs.get("Animal_Species_CommonName"),
            "lab_id": ds.attrs.get("Animal_ID"),
            "birth_year": (
                ds.attrs.get("Animal_BirthYear")
                if not pd.isna(ds.attrs.get("Animal_BirthYear"))
                else 0
            ),
            "sex": ds.attrs.get("Animal_Sex"),
            "domain_ids": str(ds.attrs.get("Animal_OtherDeployments")),
        }

        deployment_data = {
            "deployment_id": ds.attrs.get("Deployment_ID"),
            "domain_deployment_id": ds.attrs.get("Deployment_ID"),
            "animal_age_class": ds.attrs.get("Animal_AgeClass"),
            "animal_age": (
                ds.attrs.get("Deployment_Year") - ds.attrs.get("Animal_BirthYear")
                if not pd.isna(ds.attrs.get("Animal_BirthYear"))
                else 0
            ),
            "deployment_type": ds.attrs.get("Deployment_Trip"),
            "deployment_name": ds.attrs.get("Deployment_ID"),
            "rec_date": departure_datetime.strftime("%Y-%m-%d"),
            "deployment_latitude": ds.attrs.get("Deployment_Departure_Lat"),
            "deployment_longitude": ds.attrs.get("Deployment_Departure_Lon"),
            "deployment_location": ds.attrs.get("Deployment_Departure_Location"),
            "departure_datetime": formatted_departure_datetime,
            "recovery_latitude": ds.attrs.get("Deployment_Arrival_Lat"),
            "recovery_longitude": ds.attrs.get("Deployment_Arrival_Lon"),
            "recovery_location": ds.attrs.get("Deployment_Arrival_Location"),
            "arrival_datetime": formatted_arrival_datetime,
            "notes": ds.attrs.get("Notes"),
        }

        logger_data = {
            "logger_id": logger_id,
            "manufacturer": ds.attrs.get("Tags_TDR1_Manufacturer"),
            "manufacturer_name": ds.attrs.get("Tags_TDR1_Model"),
            "serial_no": ds.attrs.get("Tags_TDR1_ID"),
            "ptt": ds.attrs.get("Tags_PTT"),
            "type": ds.attrs.get("TDR"),
            "notes": ds.attrs.get("Tags_TDR1_Comments"),
        }

        # Create or get records
        animal, _ = data_uploader.get_or_create_animal(animal_data)
        logger, _ = data_uploader.get_or_create_logger(logger_data)
        deployment, _ = data_uploader.get_or_create_deployment(deployment_data)

        recording_data = {
            "recording_id": f"{deployment_id}_{seal_id}_{logger_id}",
            "name": f"Recording {idx}",
            "animal": animal,
            "deployment": deployment,
            "logger": logger,
            "start_time": formatted_arrival_datetime,
            "end_time": formatted_departure_datetime,
            "timezone": ds.attrs.get("Time_Zone"),
            "quality": ds.attrs.get("Quality"),
            "attachment_location": ds.attrs.get("Attachment_Location"),
            "attachment_type": ds.attrs.get("Attachment_Type"),
        }

        recording, _ = data_uploader.get_or_create_recording(recording_data)

        metadata = {
            "animal": animal.id,
            "deployment": deployment.id,
            "recording": recording.id,
        }

        data_uploader.upload_netcdf(
            converted_file_path,
            metadata,
            rename_map={
                "depth": "sensor_data_pressure",
                "corr_depth": "derived_data_depth",
                "lat": "derived_data_latitude",
                "lon": "derived_data_longitude",
                "loc_class": "derived_data_location_class",
                "light": "sensor_data_light",
                "exernal_temp": "sensor_data_exernal_temp",
            },
        )

        print(f"Uploaded {converted_file_path}")


files_with_errors = []
converted_files = []

# First, convert all files
for idx, blob in enumerate(blobs_list):
    # Skip directories
    if blob.name.endswith("/"):
        continue

    file_name = blob.name.split("/")[-1]
    converted_file_path = f"./data/processed_{idx}.nc"

    # Check if the converted file already exists
    if os.path.exists(converted_file_path):
        print(f"Skipping conversion for {file_name}, already processed.")
        converted_files.append((converted_file_path, idx))
        continue

    print(f"Converting file: {file_name}")

    # Download the blob to a local file
    temp_dir = "data/temp"
    os.makedirs(temp_dir, exist_ok=True)
    temp_file_path = os.path.join(temp_dir, file_name)
    blob.download_to_filename(temp_file_path)
    try:
        converted_file_path = convert_file(temp_file_path, idx)
        converted_files.append((converted_file_path, idx))
    except Exception as e:
        print(f"Error converting file: {e}")
        files_with_errors.append(file_name)
    os.remove(temp_file_path)

# Then, upload all converted files
for converted_file_path, idx in converted_files:
    try:
        upload_file(converted_file_path, idx)
    except Exception as e:
        print(f"Error uploading file: {e}")
        files_with_errors.append(converted_file_path)

print(files_with_errors)

# data_directory = "data/files"
# for idx, file_name in enumerate(os.listdir(data_directory)):
#     file_path = os.path.join(data_directory, file_name)

#     upload_file(file_path, idx)

‎setup.py

+5
@@ -8,14 +8,18 @@
     include_package_data=True,
     install_requires=[
         "black",
+        "bs4",
         "deltalake",
         "django",
+        "django-storages",
         "duckdb",
         "edfio",
         "flake8",
+        "google-cloud-storage",
         "importlib-metadata==4.8.3",
         "load_dotenv",
         "mne",
+        "netcdf4",
         "notion_client",
         "pandas",
         "pre-commit",
@@ -30,6 +34,7 @@
         "python-swiftclient",
         "setuptools",
         "tqdm",
+        "xarray",
     ],
     url="https://github.com/ecophysviz-lab/DiveDB",
 )

‎upload.Dockerfile

+32
@@ -0,0 +1,32 @@
# Use the official Python image from the Docker Hub
FROM python:3.12

# Set the working directory in the container
WORKDIR /app

# Copy the rest of the application code into the container
COPY . /app/

# Create the data directory
RUN mkdir -p data

# Install the dependencies
RUN pip install .

# Set environment variables
ENV PYTHONUNBUFFERED=1

# Expose the port the app runs on
EXPOSE 8000

# Set the DJANGO_PREFIX environment variable
ENV DJANGO_PREFIX=DiveDB

# Set the GOOGLE_APPLICATION_CREDENTIALS environment variable
ENV GOOGLE_APPLICATION_CREDENTIALS=brave-sonar-390402-7644b983ce44.json

# Set the CONTAINER_DELTA_LAKE_PATH environment variable
ENV CONTAINER_DELTA_LAKE_PATH=s3://divedb-delta-lakes-dryad-10-21

# Command to run your script
CMD ["python", "scripts/import_from_dryad.py"]
