This repository has been archived by the owner on Jun 11, 2024. It is now read-only.

formated code
zakwatts committed Jul 19, 2023
1 parent 22f4def commit 7480c15
Showing 5 changed files with 121 additions and 62 deletions.
21 changes: 10 additions & 11 deletions nwp/excarta/merge_excarta.py
@@ -9,14 +9,19 @@
import zarr
import ocf_blosc2


def merge_zarr_files(zarr_path, merged_zarr_path):
    # Collect paths of Zarr files in the specified directory
    zarr_files = [os.path.join(zarr_path, file) for file in os.listdir(zarr_path) if file.endswith('.zarr')]
    zarr_files = [
        os.path.join(zarr_path, file)
        for file in os.listdir(zarr_path)
        if file.endswith(".zarr")
    ]

    print("1")
    # Open the first Zarr file to create the initial dataset
    merged_ds = xr.open_zarr(zarr_files[0])

    print("2")

    # Define the specific range of x and y coordinates
@@ -30,25 +35,20 @@ def merge_zarr_files(zarr_path, merged_zarr_path):

        # ds_filt = ds.sel(x=slice(*x_range), y=slice(*y_range))
        merged_ds = merged_ds.combine_first(ds_filt)

    print("3")

    # Rechunk the merged dataset
    merged_ds = merged_ds.chunk(chunks={"init_time": 10, "x": 100, "y": 100})

    print("4")


    print("4")


    print(merged_ds)

    # Save the merged dataset as a new Zarr file
    merged_ds.to_zarr(merged_zarr_path)

    print("5")




# Specify the path where the independent Zarr files are located
@@ -59,4 +59,3 @@ def merge_zarr_files(zarr_path, merged_zarr_path):

# Merge the Zarr files
merge_zarr_files(zarr_path, merged_zarr_path)
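
For context on what this file's loop does: merge_zarr_files folds each Zarr file into a running dataset with xarray's combine_first, which keeps values already present and fills the gaps from the other dataset, then rechunks the result. A minimal sketch of that behaviour, on toy data rather than the Excarta grids, and assuming dask is installed so that .chunk works:

import numpy as np
import xarray as xr

# Two toy datasets with partially overlapping x coordinates.
a = xr.Dataset({"value": ("x", np.array([1.0, 2.0]))}, coords={"x": [0, 1]})
b = xr.Dataset({"value": ("x", np.array([20.0, 30.0]))}, coords={"x": [1, 2]})

# combine_first keeps a's values wherever both datasets define them and
# fills the remaining coordinates from b: value becomes [1.0, 2.0, 30.0].
merged = a.combine_first(b)

# Rechunk as merge_excarta.py does; chunk sizes are capped at the
# dimension length, so {"x": 2} yields chunks of (2, 1) here.
merged = merged.chunk({"x": 2})
print(merged["value"].values)  # [ 1.  2. 30.]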

54 changes: 35 additions & 19 deletions nwp/excarta/parse_excarta_monthly.py
@@ -1,11 +1,12 @@
#Low memory script
# Low memory script
import os
from datetime import datetime
import pandas as pd
import xarray as xr
import argparse
import pathlib


def _parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("output", type=pathlib.Path, help="Output zarr file")
@@ -14,28 +15,43 @@ def _parse_args():
    return parser.parse_args()



def data_loader(folder_path, month_to_process):
    """
    Loads and transforms data from CSV files in the given folder_path and directly convert each DataFrame into an xarray Dataset.
    Only process files for the month 'YYYYMM' given by month_to_process
    """
    month_to_process = datetime.strptime(month_to_process, "%Y%m")
    column_names = ['DateTimeUTC', 'LocationId', 'Latitude', 'Longitude', 'dni', 'dhi', 'ghi']
    column_names = [
        "DateTimeUTC",
        "LocationId",
        "Latitude",
        "Longitude",
        "dni",
        "dhi",
        "ghi",
    ]
    files = os.listdir(folder_path)
    datasets = []

    for filename in files:
        if filename.endswith(".csv") and not filename.startswith("._"):
            file_datetime = datetime.strptime(filename[:-4], "%Y%m%d%H")

            if (file_datetime.year == month_to_process.year) and (file_datetime.month == month_to_process.month):

            if (file_datetime.year == month_to_process.year) and (
                file_datetime.month == month_to_process.month
            ):
                file_path = os.path.join(folder_path, filename)
                df = pd.read_csv(file_path, header=None, names=column_names, parse_dates=['DateTimeUTC'])

                df['step'] = (df['DateTimeUTC'] - file_datetime).dt.total_seconds() / 3600 # convert timedelta to hours
                df['init_time'] = file_datetime
                df = pd.read_csv(
                    file_path,
                    header=None,
                    names=column_names,
                    parse_dates=["DateTimeUTC"],
                )

                df["step"] = (
                    df["DateTimeUTC"] - file_datetime
                ).dt.total_seconds() / 3600  # convert timedelta to hours
                df["init_time"] = file_datetime

                # Convert the dataframe to an xarray Dataset and append to the list
                ds = xr.Dataset.from_dataframe(df)
@@ -62,26 +78,26 @@ def pdtocdf(datasets):
    """
    Processes the xarray Datasets and merges them.
    """

    datasets = [ds.set_index(index=['init_time', 'step', 'Latitude', 'Longitude']) for ds in datasets]

    ds = xr.concat(datasets, dim='index')
    datasets = [
        ds.set_index(index=["init_time", "step", "Latitude", "Longitude"])
        for ds in datasets
    ]

    ds = xr.concat(datasets, dim="index")

    # # Define the specific range of x and y coordinates to filter the data on
    # x_range = (-10, 2) # Example x coordinate range
    # y_range = (49, 59) # Example y coordinate range

    ds = ds.rename({"Latitude": "y", "Longitude": "x"})



    var_names = ds.data_vars
    d2 = xr.concat([ds[v] for v in var_names], dim="variable")
    d2 = d2.assign_coords(variable=("variable", var_names))
    ds = xr.Dataset(dict(value=d2))
    ds = ds.sortby('step')
    ds = ds.sortby('init_time')

    ds = ds.sortby("step")
    ds = ds.sortby("init_time")

    return ds

@@ -103,7 +119,7 @@ def main():
    # ds = ds.sel(x=slice(float(-10), float(2)), y=slice(float(49), float(59)))

    print(ds)
    ds = ds.unstack('index')
    ds = ds.unstack("index")

    ds_filt = ds.sel(x=slice(float(13), float(15)), y=slice(float(35), float(37)))

@@ -118,4 +134,4 @@ def main():

# Check if script is being run directly
if __name__ == "__main__":
    main()
    main()
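
The parse scripts in this commit all derive step as forecast lead time in fractional hours: the DateTimeUTC column minus the init time recovered from the file name. A standalone check of that arithmetic, using a made-up YYYYMMDDHH file name purely for illustration:

from datetime import datetime

import pandas as pd

# Hypothetical Excarta-style file name: "2022010106.csv" -> init time.
init_time = datetime.strptime("2022010106", "%Y%m%d%H")

df = pd.DataFrame(
    {"DateTimeUTC": pd.to_datetime(["2022-01-01 06:00", "2022-01-01 09:30"])}
)

# Same conversion as the scripts: timedelta -> float hours since init.
df["step"] = (df["DateTimeUTC"] - init_time).dt.total_seconds() / 3600
df["init_time"] = init_time
print(df["step"].tolist())  # [0.0, 3.5]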
31 changes: 22 additions & 9 deletions nwp/excarta/parse_excarta_to_output.py
@@ -18,20 +18,32 @@ def data_loader(folder_path):
    """
    Loads and transforms data from CSV files in the given folder_path.
    """
    column_names = ['DateTimeUTC', 'LocationId', 'Latitude', 'Longitude', 'dni', 'dhi', 'ghi']
    column_names = [
        "DateTimeUTC",
        "LocationId",
        "Latitude",
        "Longitude",
        "dni",
        "dhi",
        "ghi",
    ]
    files = os.listdir(folder_path)
    dfs = []

    for filename in files:
        if filename.endswith(".csv") and not filename.startswith("._"):
            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path, header=None, names=column_names, parse_dates=['DateTimeUTC'])
            df = pd.read_csv(
                file_path, header=None, names=column_names, parse_dates=["DateTimeUTC"]
            )

            datetime_str = filename[:-4]
            datetime_str = filename[:-4]
            datetime_obj = datetime.strptime(datetime_str, "%Y%m%d%H")

            df['step'] = (df['DateTimeUTC'] - datetime_obj).dt.total_seconds() / 3600 # convert timedelta to hours
            df['init_time'] = datetime_obj
            df["step"] = (
                df["DateTimeUTC"] - datetime_obj
            ).dt.total_seconds() / 3600  # convert timedelta to hours
            df["init_time"] = datetime_obj
            dfs.append(df)

    return dfs
@@ -43,7 +55,6 @@ def load_data_from_all_years(parent_folder_path):
    """
    all_dataframes = []


    # Actual date range is 2018 to 2022 (for in range use (2018,2023))
    for year in range(2018, 2019):
        folder_path = os.path.join(parent_folder_path, str(year))
@@ -60,15 +71,17 @@ def pdtocdf(dfs):
    merged_df = pd.concat(dfs, ignore_index=True)

    ds = xr.Dataset.from_dataframe(merged_df)
    ds = ds.set_index(index=['init_time', 'step','Latitude','Longitude']).unstack('index')
    ds = ds.set_index(index=["init_time", "step", "Latitude", "Longitude"]).unstack(
        "index"
    )
    ds = ds.drop_vars(["LocationId", "DateTimeUTC"])

    var_names = ds.data_vars
    d2 = xr.concat([ds[v] for v in var_names], dim="variable")
    d2 = d2.assign_coords(variable=("variable", var_names))
    ds = xr.Dataset(dict(value=d2))
    ds = ds.sortby('step')
    ds = ds.sortby('init_time')
    ds = ds.sortby("step")
    ds = ds.sortby("init_time")
    ds = ds.rename({"Latitude": "y", "Longitude": "x"})

    return ds
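
The pivot these output scripts rely on is xr.Dataset.from_dataframe followed by set_index and unstack: the flat "index" dimension gains a MultiIndex over init_time, step, Latitude and Longitude, and unstacking promotes each level to a real dimension. A toy run of that pattern, with invented values:

import pandas as pd
import xarray as xr

# A tiny flat table in the same shape data_loader produces.
df = pd.DataFrame(
    {
        "init_time": pd.to_datetime(["2022-01-01"] * 4),
        "step": [0.0, 0.0, 3.0, 3.0],
        "Latitude": [35.0, 36.0, 35.0, 36.0],
        "Longitude": [14.0, 14.0, 14.0, 14.0],
        "ghi": [100.0, 110.0, 120.0, 130.0],
    }
)

# from_dataframe yields a single "index" dimension; set_index + unstack
# pivots it into init_time / step / Latitude / Longitude dimensions.
ds = xr.Dataset.from_dataframe(df)
ds = ds.set_index(index=["init_time", "step", "Latitude", "Longitude"]).unstack("index")
print(ds.sizes)  # init_time: 1, step: 2, Latitude: 2, Longitude: 1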
44 changes: 29 additions & 15 deletions nwp/excarta/parse_excarta_to_output_low_mem.py
@@ -1,4 +1,4 @@
#Low memory script
# Low memory script
import os
from datetime import datetime
import pandas as pd
@@ -17,19 +17,31 @@ def data_loader(folder_path):
    """
    Loads and transforms data from CSV files in the given folder_path and directly convert each DataFrame into an xarray Dataset.
    """
    column_names = ['DateTimeUTC', 'LocationId', 'Latitude', 'Longitude', 'dni', 'dhi', 'ghi']
    column_names = [
        "DateTimeUTC",
        "LocationId",
        "Latitude",
        "Longitude",
        "dni",
        "dhi",
        "ghi",
    ]
    files = os.listdir(folder_path)
    datasets = []

    for filename in files:
        if filename.endswith(".csv") and not filename.startswith("._"):
            file_path = os.path.join(folder_path, filename)

            df = pd.read_csv(file_path, header=None, names=column_names, parse_dates=['DateTimeUTC'])
            df = pd.read_csv(
                file_path, header=None, names=column_names, parse_dates=["DateTimeUTC"]
            )
            datetime_str = filename[:-4]
            datetime_obj = datetime.strptime(datetime_str, "%Y%m%d%H")
            df['step'] = (df['DateTimeUTC'] - datetime_obj).dt.total_seconds() / 3600 # convert timedelta to hours
            df['init_time'] = datetime_obj
            df["step"] = (
                df["DateTimeUTC"] - datetime_obj
            ).dt.total_seconds() / 3600  # convert timedelta to hours
            df["init_time"] = datetime_obj

            # Convert the dataframe to an xarray Dataset and append to the list
            ds = xr.Dataset.from_dataframe(df)
@@ -55,26 +67,30 @@ def pdtocdf(datasets):
    Processes the xarray Datasets and merges them.
    """
    print(datasets)
    # ds = xr.merge(datasets)
    # ds = xr.merge(datasets)

    datasets = [ds.set_index(index=['init_time', 'step', 'Latitude', 'Longitude']) for ds in datasets]
    datasets = [
        ds.set_index(index=["init_time", "step", "Latitude", "Longitude"])
        for ds in datasets
    ]

    ds = xr.concat(datasets, dim='index')
    ds = xr.concat(datasets, dim="index")

    # Going to unstack and then combine in a different script
    # Get rid of the index dimension and just keep the desired ones
    # ds = ds.unstack('index')

    var_names = ds.data_vars
    d2 = xr.concat([ds[v] for v in var_names], dim="variable")
    d2 = d2.assign_coords(variable=("variable", var_names))
    ds = xr.Dataset(dict(value=d2))
    ds = ds.sortby('step')
    ds = ds.sortby('init_time')
    ds = ds.sortby("step")
    ds = ds.sortby("init_time")
    ds = ds.rename({"Latitude": "y", "Longitude": "x"})

    return ds


def main():
    args = _parse_args()

@@ -87,13 +103,11 @@ def main():

    print(ds)

    ds = ds.unstack('index')
    ds = ds.unstack("index")

    ds.to_zarr(args.output)




# Check if script is being run directly
if __name__ == "__main__":
    main()
    main()
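
The other recurring step in pdtocdf is collapsing the dni/dhi/ghi data variables into a single value array along a new "variable" dimension, so each output carries one variable labelled by name. A self-contained sketch of that concat pattern on dummy data:

import numpy as np
import xarray as xr

ds = xr.Dataset(
    {
        "dni": ("x", np.array([1.0, 2.0])),
        "dhi": ("x", np.array([3.0, 4.0])),
        "ghi": ("x", np.array([5.0, 6.0])),
    },
    coords={"x": [0, 1]},
)

# Concatenate every data variable along a new "variable" dimension and
# label that dimension with the variable names, as pdtocdf does.
var_names = list(ds.data_vars)
stacked = xr.concat([ds[v] for v in var_names], dim="variable")
stacked = stacked.assign_coords(variable=("variable", var_names))
out = xr.Dataset(dict(value=stacked))
print(out["value"].sel(variable="ghi").values)  # [5. 6.]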