This repository has been archived by the owner on Jun 11, 2024. It is now read-only.

formated code
zakwatts committed Jul 19, 2023
1 parent 22f4def commit 7480c15
Showing 5 changed files with 121 additions and 62 deletions.
21 changes: 10 additions & 11 deletions nwp/excarta/merge_excarta.py
@@ -9,14 +9,19 @@
import zarr
import ocf_blosc2


def merge_zarr_files(zarr_path, merged_zarr_path):
    # Collect paths of Zarr files in the specified directory
    zarr_files = [os.path.join(zarr_path, file) for file in os.listdir(zarr_path) if file.endswith('.zarr')]
    zarr_files = [
        os.path.join(zarr_path, file)
        for file in os.listdir(zarr_path)
        if file.endswith(".zarr")
    ]

    print("1")
    # Open the first Zarr file to create the initial dataset
    merged_ds = xr.open_zarr(zarr_files[0])

    print("2")

    # Define the specific range of x and y coordinates
@@ -30,25 +35,20 @@ def merge_zarr_files(zarr_path, merged_zarr_path):

        # ds_filt = ds.sel(x=slice(*x_range), y=slice(*y_range))
        merged_ds = merged_ds.combine_first(ds_filt)

    print("3")

    # Rechunk the merged dataset
    merged_ds = merged_ds.chunk(chunks={"init_time": 10, "x": 100, "y": 100})

    print("4")


    print("4")


    print(merged_ds)

    # Save the merged dataset as a new Zarr file
    merged_ds.to_zarr(merged_zarr_path)

    print("5")




# Specify the path where the independent Zarr files are located
@@ -59,4 +59,3 @@ def merge_zarr_files(zarr_path, merged_zarr_path):

# Merge the Zarr files
merge_zarr_files(zarr_path, merged_zarr_path)
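
For context on what this file's loop does: merge_zarr_files folds each Zarr file into a running dataset with xarray's combine_first, which keeps values already present and fills the gaps from the other dataset, then rechunks the result. A minimal sketch of that behaviour, on toy data rather than the Excarta grids, and assuming dask is installed so that .chunk works:

import numpy as np
import xarray as xr

# Two toy datasets with partially overlapping x coordinates.
a = xr.Dataset({"value": ("x", np.array([1.0, 2.0]))}, coords={"x": [0, 1]})
b = xr.Dataset({"value": ("x", np.array([20.0, 30.0]))}, coords={"x": [1, 2]})

# combine_first keeps a's values wherever both datasets define them and
# fills the remaining coordinates from b: value becomes [1.0, 2.0, 30.0].
merged = a.combine_first(b)

# Rechunk as merge_excarta.py does; chunk sizes are capped at the
# dimension length, so {"x": 2} yields chunks of (2, 1) here.
merged = merged.chunk({"x": 2})
print(merged["value"].values)  # [ 1.  2. 30.]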

54 changes: 35 additions & 19 deletions nwp/excarta/parse_excarta_monthly.py
@@ -1,11 +1,12 @@
#Low memory script
# Low memory script
import os
from datetime import datetime
import pandas as pd
import xarray as xr
import argparse
import pathlib


def _parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("output", type=pathlib.Path, help="Output zarr file")
@@ -14,28 +15,43 @@ def _parse_args():
    return parser.parse_args()



def data_loader(folder_path, month_to_process):
    """
    Loads and transforms data from CSV files in the given folder_path and directly convert each DataFrame into an xarray Dataset.
    Only process files for the month 'YYYYMM' given by month_to_process
    """
    month_to_process = datetime.strptime(month_to_process, "%Y%m")
    column_names = ['DateTimeUTC', 'LocationId', 'Latitude', 'Longitude', 'dni', 'dhi', 'ghi']
    column_names = [
        "DateTimeUTC",
        "LocationId",
        "Latitude",
        "Longitude",
        "dni",
        "dhi",
        "ghi",
    ]
    files = os.listdir(folder_path)
    datasets = []

    for filename in files:
        if filename.endswith(".csv") and not filename.startswith("._"):
            file_datetime = datetime.strptime(filename[:-4], "%Y%m%d%H")

            if (file_datetime.year == month_to_process.year) and (file_datetime.month == month_to_process.month):

            if (file_datetime.year == month_to_process.year) and (
                file_datetime.month == month_to_process.month
            ):
                file_path = os.path.join(folder_path, filename)
                df = pd.read_csv(file_path, header=None, names=column_names, parse_dates=['DateTimeUTC'])

                df['step'] = (df['DateTimeUTC'] - file_datetime).dt.total_seconds() / 3600 # convert timedelta to hours
                df['init_time'] = file_datetime
                df = pd.read_csv(
                    file_path,
                    header=None,
                    names=column_names,
                    parse_dates=["DateTimeUTC"],
                )

                df["step"] = (
                    df["DateTimeUTC"] - file_datetime
                ).dt.total_seconds() / 3600  # convert timedelta to hours
                df["init_time"] = file_datetime

                # Convert the dataframe to an xarray Dataset and append to the list
                ds = xr.Dataset.from_dataframe(df)
@@ -62,26 +78,26 @@ def pdtocdf(datasets):
    """
    Processes the xarray Datasets and merges them.
    """

    datasets = [ds.set_index(index=['init_time', 'step', 'Latitude', 'Longitude']) for ds in datasets]

    ds = xr.concat(datasets, dim='index')
    datasets = [
        ds.set_index(index=["init_time", "step", "Latitude", "Longitude"])
        for ds in datasets
    ]

    ds = xr.concat(datasets, dim="index")

    # # Define the specific range of x and y coordinates to filter the data on
    # x_range = (-10, 2) # Example x coordinate range
    # y_range = (49, 59) # Example y coordinate range

    ds = ds.rename({"Latitude": "y", "Longitude": "x"})



    var_names = ds.data_vars
    d2 = xr.concat([ds[v] for v in var_names], dim="variable")
    d2 = d2.assign_coords(variable=("variable", var_names))
    ds = xr.Dataset(dict(value=d2))
    ds = ds.sortby('step')
    ds = ds.sortby('init_time')

    ds = ds.sortby("step")
    ds = ds.sortby("init_time")

    return ds

@@ -103,7 +119,7 @@ def main():
    # ds = ds.sel(x=slice(float(-10), float(2)), y=slice(float(49), float(59)))

    print(ds)
    ds = ds.unstack('index')
    ds = ds.unstack("index")

    ds_filt = ds.sel(x=slice(float(13), float(15)), y=slice(float(35), float(37)))

@@ -118,4 +134,4 @@ def main():

# Check if script is being run directly
if __name__ == "__main__":
    main()
    main()
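
The parse scripts in this commit all derive step as forecast lead time in fractional hours: the DateTimeUTC column minus the init time recovered from the file name. A standalone check of that arithmetic, using a made-up YYYYMMDDHH file name purely for illustration:

from datetime import datetime

import pandas as pd

# Hypothetical Excarta-style file name: "2022010106.csv" -> init time.
init_time = datetime.strptime("2022010106", "%Y%m%d%H")

df = pd.DataFrame(
    {"DateTimeUTC": pd.to_datetime(["2022-01-01 06:00", "2022-01-01 09:30"])}
)

# Same conversion as the scripts: timedelta -> float hours since init.
df["step"] = (df["DateTimeUTC"] - init_time).dt.total_seconds() / 3600
df["init_time"] = init_time
print(df["step"].tolist())  # [0.0, 3.5]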
31 changes: 22 additions & 9 deletions nwp/excarta/parse_excarta_to_output.py
@@ -18,20 +18,32 @@ def data_loader(folder_path):
    """
    Loads and transforms data from CSV files in the given folder_path.
    """
    column_names = ['DateTimeUTC', 'LocationId', 'Latitude', 'Longitude', 'dni', 'dhi', 'ghi']
    column_names = [
        "DateTimeUTC",
        "LocationId",
        "Latitude",
        "Longitude",
        "dni",
        "dhi",
        "ghi",
    ]
    files = os.listdir(folder_path)
    dfs = []

    for filename in files:
        if filename.endswith(".csv") and not filename.startswith("._"):
            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path, header=None, names=column_names, parse_dates=['DateTimeUTC'])
            df = pd.read_csv(
                file_path, header=None, names=column_names, parse_dates=["DateTimeUTC"]
            )

            datetime_str = filename[:-4]
            datetime_str = filename[:-4]
            datetime_obj = datetime.strptime(datetime_str, "%Y%m%d%H")

            df['step'] = (df['DateTimeUTC'] - datetime_obj).dt.total_seconds() / 3600 # convert timedelta to hours
            df['init_time'] = datetime_obj
            df["step"] = (
                df["DateTimeUTC"] - datetime_obj
            ).dt.total_seconds() / 3600  # convert timedelta to hours
            df["init_time"] = datetime_obj
            dfs.append(df)

    return dfs
@@ -43,7 +55,6 @@ def load_data_from_all_years(parent_folder_path):
    """
    all_dataframes = []


    # Actual date range is 2018 to 2022 (for in range use (2018,2023))
    for year in range(2018, 2019):
        folder_path = os.path.join(parent_folder_path, str(year))
@@ -60,15 +71,17 @@ def pdtocdf(dfs):
    merged_df = pd.concat(dfs, ignore_index=True)

    ds = xr.Dataset.from_dataframe(merged_df)
    ds = ds.set_index(index=['init_time', 'step','Latitude','Longitude']).unstack('index')
    ds = ds.set_index(index=["init_time", "step", "Latitude", "Longitude"]).unstack(
        "index"
    )
    ds = ds.drop_vars(["LocationId", "DateTimeUTC"])

    var_names = ds.data_vars
    d2 = xr.concat([ds[v] for v in var_names], dim="variable")
    d2 = d2.assign_coords(variable=("variable", var_names))
    ds = xr.Dataset(dict(value=d2))
    ds = ds.sortby('step')
    ds = ds.sortby('init_time')
    ds = ds.sortby("step")
    ds = ds.sortby("init_time")
    ds = ds.rename({"Latitude": "y", "Longitude": "x"})

    return ds
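
The pivot these output scripts rely on is xr.Dataset.from_dataframe followed by set_index and unstack: the flat "index" dimension gains a MultiIndex over init_time, step, Latitude and Longitude, and unstacking promotes each level to a real dimension. A toy run of that pattern, with invented values:

import pandas as pd
import xarray as xr

# A tiny flat table in the same shape data_loader produces.
df = pd.DataFrame(
    {
        "init_time": pd.to_datetime(["2022-01-01"] * 4),
        "step": [0.0, 0.0, 3.0, 3.0],
        "Latitude": [35.0, 36.0, 35.0, 36.0],
        "Longitude": [14.0, 14.0, 14.0, 14.0],
        "ghi": [100.0, 110.0, 120.0, 130.0],
    }
)

# from_dataframe yields a single "index" dimension; set_index + unstack
# pivots it into init_time / step / Latitude / Longitude dimensions.
ds = xr.Dataset.from_dataframe(df)
ds = ds.set_index(index=["init_time", "step", "Latitude", "Longitude"]).unstack("index")
print(ds.sizes)  # init_time: 1, step: 2, Latitude: 2, Longitude: 1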
44 changes: 29 additions & 15 deletions nwp/excarta/parse_excarta_to_output_low_mem.py
@@ -1,4 +1,4 @@
#Low memory script
# Low memory script
import os
from datetime import datetime
import pandas as pd
@@ -17,19 +17,31 @@ def data_loader(folder_path):
    """
    Loads and transforms data from CSV files in the given folder_path and directly convert each DataFrame into an xarray Dataset.
    """
    column_names = ['DateTimeUTC', 'LocationId', 'Latitude', 'Longitude', 'dni', 'dhi', 'ghi']
    column_names = [
        "DateTimeUTC",
        "LocationId",
        "Latitude",
        "Longitude",
        "dni",
        "dhi",
        "ghi",
    ]
    files = os.listdir(folder_path)
    datasets = []

    for filename in files:
        if filename.endswith(".csv") and not filename.startswith("._"):
            file_path = os.path.join(folder_path, filename)

            df = pd.read_csv(file_path, header=None, names=column_names, parse_dates=['DateTimeUTC'])
            df = pd.read_csv(
                file_path, header=None, names=column_names, parse_dates=["DateTimeUTC"]
            )
            datetime_str = filename[:-4]
            datetime_obj = datetime.strptime(datetime_str, "%Y%m%d%H")
            df['step'] = (df['DateTimeUTC'] - datetime_obj).dt.total_seconds() / 3600 # convert timedelta to hours
            df['init_time'] = datetime_obj
            df["step"] = (
                df["DateTimeUTC"] - datetime_obj
            ).dt.total_seconds() / 3600  # convert timedelta to hours
            df["init_time"] = datetime_obj

            # Convert the dataframe to an xarray Dataset and append to the list
            ds = xr.Dataset.from_dataframe(df)
@@ -55,26 +67,30 @@ def pdtocdf(datasets):
    Processes the xarray Datasets and merges them.
    """
    print(datasets)
    # ds = xr.merge(datasets)
    # ds = xr.merge(datasets)

    datasets = [ds.set_index(index=['init_time', 'step', 'Latitude', 'Longitude']) for ds in datasets]
    datasets = [
        ds.set_index(index=["init_time", "step", "Latitude", "Longitude"])
        for ds in datasets
    ]

    ds = xr.concat(datasets, dim='index')
    ds = xr.concat(datasets, dim="index")

    # Going to unstack and then combine in a different script
    # Get rid of the index dimension and just keep the desired ones
    # ds = ds.unstack('index')

    var_names = ds.data_vars
    d2 = xr.concat([ds[v] for v in var_names], dim="variable")
    d2 = d2.assign_coords(variable=("variable", var_names))
    ds = xr.Dataset(dict(value=d2))
    ds = ds.sortby('step')
    ds = ds.sortby('init_time')
    ds = ds.sortby("step")
    ds = ds.sortby("init_time")
    ds = ds.rename({"Latitude": "y", "Longitude": "x"})

    return ds


def main():
    args = _parse_args()

@@ -87,13 +103,11 @@ def main():

    print(ds)

    ds = ds.unstack('index')
    ds = ds.unstack("index")

    ds.to_zarr(args.output)




# Check if script is being run directly
if __name__ == "__main__":
    main()
    main()
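
The other recurring step in pdtocdf is collapsing the dni/dhi/ghi data variables into a single value array along a new "variable" dimension, so each output carries one variable labelled by name. A self-contained sketch of that concat pattern on dummy data:

import numpy as np
import xarray as xr

ds = xr.Dataset(
    {
        "dni": ("x", np.array([1.0, 2.0])),
        "dhi": ("x", np.array([3.0, 4.0])),
        "ghi": ("x", np.array([5.0, 6.0])),
    },
    coords={"x": [0, 1]},
)

# Concatenate every data variable along a new "variable" dimension and
# label that dimension with the variable names, as pdtocdf does.
var_names = list(ds.data_vars)
stacked = xr.concat([ds[v] for v in var_names], dim="variable")
stacked = stacked.assign_coords(variable=("variable", var_names))
out = xr.Dataset(dict(value=stacked))
print(out["value"].sel(variable="ghi").values)  # [5. 6.]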