From afbebba2cdf60bfab1c48552687d7390b4ba4223 Mon Sep 17 00:00:00 2001 From: Tom Vo Date: Wed, 25 May 2022 12:21:49 -0700 Subject: [PATCH 1/3] Add notebook experimenting with masked weights --- validation/v1.0.0/temporal_average/masked_data_weights.ipynb | 1 + 1 file changed, 1 insertion(+) create mode 100644 validation/v1.0.0/temporal_average/masked_data_weights.ipynb diff --git a/validation/v1.0.0/temporal_average/masked_data_weights.ipynb b/validation/v1.0.0/temporal_average/masked_data_weights.ipynb new file mode 100644 index 0000000..ff5ebff --- /dev/null +++ b/validation/v1.0.0/temporal_average/masked_data_weights.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"markdown","metadata":{},"source":["# Experimenting with Masked Data and Weighting in xarray\n","\n","This notebook experiments with how weighting with masked data can affect the results of reduction operations such as simple averages and grouped averages. Specifically, this notebook looks into weighted temporal averaging."]},{"cell_type":"code","execution_count":1,"metadata":{},"outputs":[],"source":["# flake8: noqa f401\n","import numpy as np\n","import xarray as xr\n","\n","import xcdat"]},{"cell_type":"markdown","metadata":{},"source":["## Create a sample monthly time series dataset"]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[],"source":["ds = xr.Dataset(\n"," coords={\n"," \"lat\": [-90],\n"," \"lon\": [0],\n"," \"time\": xr.DataArray(\n"," data=np.array(\n"," [\n"," \"2000-01-01T00:00:00.000000000\",\n"," \"2000-02-01T00:00:00.000000000\",\n"," \"2000-03-01T00:00:00.000000000\",\n"," \"2000-04-01T00:00:00.000000000\",\n"," \"2001-01-01T00:00:00.000000000\",\n"," ],\n"," dtype=\"datetime64[ns]\",\n"," ),\n"," dims=[\"time\"],\n"," attrs={\n"," \"axis\": \"T\",\n"," \"long_name\": \"time\",\n"," \"standard_name\": \"time\",\n"," \"bounds\": \"time_bnds\",\n"," },\n"," ),\n"," }\n",")\n","ds[\"time_bnds\"] = xr.DataArray(\n"," name=\"time_bnds\",\n"," data=np.array(\n"," [\n"," [\"2000-01-01T00:00:00.000000000\", \"2000-02-01T00:00:00.000000000\"],\n"," [\"2000-02-01T00:00:00.000000000\", \"2000-03-01T00:00:00.000000000\"],\n"," [\"2000-03-01T00:00:00.000000000\", \"2000-04-01T00:00:00.000000000\"],\n"," [\"2000-04-01T00:00:00.000000000\", \"2000-05-01T00:00:00.000000000\"],\n"," [\"2000-12-01T00:00:00.000000000\", \"2001-01-01T00:00:00.000000000\"],\n"," ],\n"," dtype=\"datetime64[ns]\",\n"," ),\n"," coords={\"time\": ds.time},\n"," dims=[\"time\", \"bnds\"],\n"," attrs={\"is_generated\": \"True\"},\n",")\n","\n","ds[\"ts\"] = xr.DataArray(\n"," data=np.array([[[2]], [[np.nan]], [[1]], [[1]], [[2]]]),\n"," coords={\"lat\": ds.lat, \"lon\": ds.lon, \"time\": ds.time},\n"," dims=[\"time\", \"lat\", \"lon\"],\n",")"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.Dataset>\n","Dimensions:    (lat: 1, lon: 1, time: 5, bnds: 2)\n","Coordinates:\n","  * lat        (lat) int64 -90\n","  * lon        (lon) int64 0\n","  * time       (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","Dimensions without coordinates: bnds\n","Data variables:\n","    time_bnds  (time, bnds) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    ts         (time, lat, lon) float64 2.0 nan 1.0 1.0 2.0
"],"text/plain":["\n","Dimensions: (lat: 1, lon: 1, time: 5, bnds: 2)\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","Dimensions without coordinates: bnds\n","Data variables:\n"," time_bnds (time, bnds) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," ts (time, lat, lon) float64 2.0 nan 1.0 1.0 2.0"]},"execution_count":3,"metadata":{},"output_type":"execute_result"}],"source":["ds"]},{"cell_type":"markdown","metadata":{},"source":["## 1. Set the TemporalAccessor class attributes to call internal methods."]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[],"source":["ds.temporal._time_bounds = ds.time_bnds\n","ds.temporal._mode = \"average\"\n","ds.temporal._freq = ds.temporal._infer_freq()"]},{"cell_type":"markdown","metadata":{},"source":["## 2. Calculate the weights for monthly data (uses time bounds)"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray 'time_bnds' (time: 5)>\n","array([0.5, 1. , 1. , 1. , 0.5])\n","Coordinates:\n","  * time     (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    month    (time) int64 1 2 3 4 1
"],"text/plain":["\n","array([0.5, 1. , 1. , 1. , 0.5])\n","Coordinates:\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," month (time) int64 1 2 3 4 1"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["weights = ds.temporal._get_weights()\n","weights"]},{"cell_type":"markdown","metadata":{},"source":["## 3. Calculate weighted means with generated weights."]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (month: 4, lat: 1, lon: 1)>\n","array([[[2.]],\n","\n","       [[0.]],\n","\n","       [[1.]],\n","\n","       [[1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * month    (month) int64 1 2 3 4
"],"text/plain":["\n","array([[[2.]],\n","\n"," [[0.]],\n","\n"," [[1.]],\n","\n"," [[1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * month (month) int64 1 2 3 4"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["# Grouped weighted average (annual cycle)\n","dv_gb_avg1 = (ds.ts * weights).groupby(\"time.month\").sum()\n","dv_gb_avg1"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray 'ts' (lat: 1, lon: 1)>\n","array([[1.33333333]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0
"],"text/plain":["\n","array([[1.33333333]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0"]},"execution_count":7,"metadata":{},"output_type":"execute_result"}],"source":["# Simple average with monthly weights\n","dv_avg1 = ds.ts.weighted(weights).mean(dim=\"time\")\n","dv_avg1"]},{"cell_type":"markdown","metadata":{},"source":["## 4. Means with weights masked with `np.nan` for missing values"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[],"source":["masked_weights = weights.copy()\n","masked_weights.data[1] = np.nan"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (month: 4, lat: 1, lon: 1)>\n","array([[[2.]],\n","\n","       [[0.]],\n","\n","       [[1.]],\n","\n","       [[1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * month    (month) int64 1 2 3 4
"],"text/plain":["\n","array([[[2.]],\n","\n"," [[0.]],\n","\n"," [[1.]],\n","\n"," [[1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * month (month) int64 1 2 3 4"]},"execution_count":9,"metadata":{},"output_type":"execute_result"}],"source":["dv_gb_avg2 = (ds.ts * masked_weights).groupby(\"time.month\").sum()\n","dv_gb_avg2"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[],"source":["# ValueError: `weights` cannot contain missing values. Missing values can be replaced by `weights.fillna(0)`.\n","dv_avg2 = ds.ts.weighted(masked_weights).mean(dim=\"time\")"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["## 5. Means with weights masked with 0 for missing values\n","filled_weights = masked_weights.copy().fillna(0)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["dv_gb_avg3 = (ds.ts * filled_weights).groupby(\"time.month\").sum()\n","dv_gb_avg3"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["dv_avg3 = ds.ts.weighted(filled_weights).mean(dim=\"time\")\n","dv_avg3"]},{"cell_type":"markdown","metadata":{},"source":["## Compare results"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Generated weights vs. generated weights with np.nan for missing values\n","dv_gb_avg1.identical(dv_gb_avg2)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# dv_avg1.identical(dv_avg2) # Does not work since missing weight values must be filled"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Generated weights vs. generated weights with 0 for missing values\n","dv_gb_avg1.identical(dv_gb_avg3)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["dv_avg1.identical(dv_avg3)"]},{"cell_type":"markdown","metadata":{},"source":["# Key Takeaways"]},{"cell_type":"markdown","metadata":{},"source":["- `weights.fillna(0)` is required if weights contain `np.nan` and the `.weighted().mean()` API is used for calculating simple weighted averages (e.g., ds.spatial.average())\n"," - `ValueError: 'weights' cannot contain missing values. Missing values can be replaced by ';weights.fillna(0)'.`\n"," - Question: Do the weights generated by our spatial averaging methods include `np.nan`?\n","- `weights` generated from time coordinates do not contain `np.nan` for missing values, so `weights.fillna(0)` is not required for temporal averaging\n","\n","**In any case, multiplying any weight value (`np.nan`, 0, 1, etc.) with a missing value (represented in xarray as `np.nan`) results in `np.nan`. This has no affect on the final reduction operation. 
(Refer to below cells)**\n"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["ds.ts * weights"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["ds.ts * filled_weights"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["ds.ts * masked_weights"]},{"cell_type":"markdown","metadata":{},"source":[]}],"metadata":{"interpreter":{"hash":"937205ea97caa5d37934716e0a0c6b9e4ffcdaf6e0f20ca1ff82361fb532cef2"},"kernelspec":{"display_name":"Python 3.9.12 ('xcdat_dev')","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.12"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} From d26a94338399ab378effac8cd2e1c0e0558de82f Mon Sep 17 00:00:00 2001 From: Tom Vo Date: Wed, 25 May 2022 12:32:30 -0700 Subject: [PATCH 2/3] Update notebook --- validation/v1.0.0/temporal_average/masked_data_weights.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validation/v1.0.0/temporal_average/masked_data_weights.ipynb b/validation/v1.0.0/temporal_average/masked_data_weights.ipynb index ff5ebff..1586483 100644 --- a/validation/v1.0.0/temporal_average/masked_data_weights.ipynb +++ b/validation/v1.0.0/temporal_average/masked_data_weights.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"markdown","metadata":{},"source":["# Experimenting with Masked Data and Weighting in xarray\n","\n","This notebook experiments with how weighting with masked data can affect the results of reduction operations such as simple averages and grouped averages. Specifically, this notebook looks into weighted temporal averaging."]},{"cell_type":"code","execution_count":1,"metadata":{},"outputs":[],"source":["# flake8: noqa f401\n","import numpy as np\n","import xarray as xr\n","\n","import xcdat"]},{"cell_type":"markdown","metadata":{},"source":["## Create a sample monthly time series dataset"]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[],"source":["ds = xr.Dataset(\n"," coords={\n"," \"lat\": [-90],\n"," \"lon\": [0],\n"," \"time\": xr.DataArray(\n"," data=np.array(\n"," [\n"," \"2000-01-01T00:00:00.000000000\",\n"," \"2000-02-01T00:00:00.000000000\",\n"," \"2000-03-01T00:00:00.000000000\",\n"," \"2000-04-01T00:00:00.000000000\",\n"," \"2001-01-01T00:00:00.000000000\",\n"," ],\n"," dtype=\"datetime64[ns]\",\n"," ),\n"," dims=[\"time\"],\n"," attrs={\n"," \"axis\": \"T\",\n"," \"long_name\": \"time\",\n"," \"standard_name\": \"time\",\n"," \"bounds\": \"time_bnds\",\n"," },\n"," ),\n"," }\n",")\n","ds[\"time_bnds\"] = xr.DataArray(\n"," name=\"time_bnds\",\n"," data=np.array(\n"," [\n"," [\"2000-01-01T00:00:00.000000000\", \"2000-02-01T00:00:00.000000000\"],\n"," [\"2000-02-01T00:00:00.000000000\", \"2000-03-01T00:00:00.000000000\"],\n"," [\"2000-03-01T00:00:00.000000000\", \"2000-04-01T00:00:00.000000000\"],\n"," [\"2000-04-01T00:00:00.000000000\", \"2000-05-01T00:00:00.000000000\"],\n"," [\"2000-12-01T00:00:00.000000000\", \"2001-01-01T00:00:00.000000000\"],\n"," ],\n"," dtype=\"datetime64[ns]\",\n"," ),\n"," coords={\"time\": ds.time},\n"," dims=[\"time\", \"bnds\"],\n"," attrs={\"is_generated\": \"True\"},\n",")\n","\n","ds[\"ts\"] = xr.DataArray(\n"," data=np.array([[[2]], [[np.nan]], [[1]], [[1]], [[2]]]),\n"," coords={\"lat\": ds.lat, \"lon\": ds.lon, \"time\": ds.time},\n"," dims=[\"time\", \"lat\", 
\"lon\"],\n",")"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.Dataset>\n","Dimensions:    (lat: 1, lon: 1, time: 5, bnds: 2)\n","Coordinates:\n","  * lat        (lat) int64 -90\n","  * lon        (lon) int64 0\n","  * time       (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","Dimensions without coordinates: bnds\n","Data variables:\n","    time_bnds  (time, bnds) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    ts         (time, lat, lon) float64 2.0 nan 1.0 1.0 2.0
"],"text/plain":["\n","Dimensions: (lat: 1, lon: 1, time: 5, bnds: 2)\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","Dimensions without coordinates: bnds\n","Data variables:\n"," time_bnds (time, bnds) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," ts (time, lat, lon) float64 2.0 nan 1.0 1.0 2.0"]},"execution_count":3,"metadata":{},"output_type":"execute_result"}],"source":["ds"]},{"cell_type":"markdown","metadata":{},"source":["## 1. Set the TemporalAccessor class attributes to call internal methods."]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[],"source":["ds.temporal._time_bounds = ds.time_bnds\n","ds.temporal._mode = \"average\"\n","ds.temporal._freq = ds.temporal._infer_freq()"]},{"cell_type":"markdown","metadata":{},"source":["## 2. Calculate the weights for monthly data (uses time bounds)"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray 'time_bnds' (time: 5)>\n","array([0.5, 1. , 1. , 1. , 0.5])\n","Coordinates:\n","  * time     (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    month    (time) int64 1 2 3 4 1
"],"text/plain":["\n","array([0.5, 1. , 1. , 1. , 0.5])\n","Coordinates:\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," month (time) int64 1 2 3 4 1"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["weights = ds.temporal._get_weights()\n","weights"]},{"cell_type":"markdown","metadata":{},"source":["## 3. Calculate weighted means with generated weights."]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (month: 4, lat: 1, lon: 1)>\n","array([[[2.]],\n","\n","       [[0.]],\n","\n","       [[1.]],\n","\n","       [[1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * month    (month) int64 1 2 3 4
"],"text/plain":["\n","array([[[2.]],\n","\n"," [[0.]],\n","\n"," [[1.]],\n","\n"," [[1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * month (month) int64 1 2 3 4"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["# Grouped weighted average (annual cycle)\n","dv_gb_avg1 = (ds.ts * weights).groupby(\"time.month\").sum()\n","dv_gb_avg1"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray 'ts' (lat: 1, lon: 1)>\n","array([[1.33333333]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0
"],"text/plain":["\n","array([[1.33333333]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0"]},"execution_count":7,"metadata":{},"output_type":"execute_result"}],"source":["# Simple average with monthly weights\n","dv_avg1 = ds.ts.weighted(weights).mean(dim=\"time\")\n","dv_avg1"]},{"cell_type":"markdown","metadata":{},"source":["## 4. Means with weights masked with `np.nan` for missing values"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[],"source":["masked_weights = weights.copy()\n","masked_weights.data[1] = np.nan"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (month: 4, lat: 1, lon: 1)>\n","array([[[2.]],\n","\n","       [[0.]],\n","\n","       [[1.]],\n","\n","       [[1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * month    (month) int64 1 2 3 4
"],"text/plain":["\n","array([[[2.]],\n","\n"," [[0.]],\n","\n"," [[1.]],\n","\n"," [[1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * month (month) int64 1 2 3 4"]},"execution_count":9,"metadata":{},"output_type":"execute_result"}],"source":["dv_gb_avg2 = (ds.ts * masked_weights).groupby(\"time.month\").sum()\n","dv_gb_avg2"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[],"source":["# ValueError: `weights` cannot contain missing values. Missing values can be replaced by `weights.fillna(0)`.\n","dv_avg2 = ds.ts.weighted(masked_weights).mean(dim=\"time\")"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["## 5. Means with weights masked with 0 for missing values\n","filled_weights = masked_weights.copy().fillna(0)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["dv_gb_avg3 = (ds.ts * filled_weights).groupby(\"time.month\").sum()\n","dv_gb_avg3"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["dv_avg3 = ds.ts.weighted(filled_weights).mean(dim=\"time\")\n","dv_avg3"]},{"cell_type":"markdown","metadata":{},"source":["## Compare results"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Generated weights vs. generated weights with np.nan for missing values\n","dv_gb_avg1.identical(dv_gb_avg2)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# dv_avg1.identical(dv_avg2) # Does not work since missing weight values must be filled"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Generated weights vs. generated weights with 0 for missing values\n","dv_gb_avg1.identical(dv_gb_avg3)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["dv_avg1.identical(dv_avg3)"]},{"cell_type":"markdown","metadata":{},"source":["# Key Takeaways"]},{"cell_type":"markdown","metadata":{},"source":["- `weights.fillna(0)` is required if weights contain `np.nan` and the `.weighted().mean()` API is used for calculating simple weighted averages (e.g., ds.spatial.average())\n"," - `ValueError: 'weights' cannot contain missing values. Missing values can be replaced by ';weights.fillna(0)'.`\n"," - Question: Do the weights generated by our spatial averaging methods include `np.nan`?\n","- `weights` generated from time coordinates do not contain `np.nan` for missing values, so `weights.fillna(0)` is not required for temporal averaging\n","\n","**In any case, multiplying any weight value (`np.nan`, 0, 1, etc.) with a missing value (represented in xarray as `np.nan`) results in `np.nan`. This has no affect on the final reduction operation. 
(Refer to below cells)**\n"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["ds.ts * weights"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["ds.ts * filled_weights"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["ds.ts * masked_weights"]},{"cell_type":"markdown","metadata":{},"source":[]}],"metadata":{"interpreter":{"hash":"937205ea97caa5d37934716e0a0c6b9e4ffcdaf6e0f20ca1ff82361fb532cef2"},"kernelspec":{"display_name":"Python 3.9.12 ('xcdat_dev')","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.12"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} +{"cells":[{"cell_type":"markdown","metadata":{},"source":["# Experimenting with Masked Data and Weighting in xarray\n","\n","This notebook experiments with how weighting with masked data can affect the results of reduction operations such as simple averages and grouped averages. Specifically, this notebook looks into weighted temporal averaging."]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[],"source":["# flake8: noqa f401\n","import numpy as np\n","import xarray as xr\n","\n","import xcdat"]},{"cell_type":"markdown","metadata":{},"source":["## Create a sample monthly time series dataset"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[],"source":["ds = xr.Dataset(\n"," coords={\n"," \"lat\": [-90],\n"," \"lon\": [0],\n"," \"time\": xr.DataArray(\n"," data=np.array(\n"," [\n"," \"2000-01-01T00:00:00.000000000\",\n"," \"2000-02-01T00:00:00.000000000\",\n"," \"2000-03-01T00:00:00.000000000\",\n"," \"2000-04-01T00:00:00.000000000\",\n"," \"2001-01-01T00:00:00.000000000\",\n"," ],\n"," dtype=\"datetime64[ns]\",\n"," ),\n"," dims=[\"time\"],\n"," attrs={\n"," \"axis\": \"T\",\n"," \"long_name\": \"time\",\n"," \"standard_name\": \"time\",\n"," \"bounds\": \"time_bnds\",\n"," },\n"," ),\n"," }\n",")\n","ds[\"time_bnds\"] = xr.DataArray(\n"," name=\"time_bnds\",\n"," data=np.array(\n"," [\n"," [\"2000-01-01T00:00:00.000000000\", \"2000-02-01T00:00:00.000000000\"],\n"," [\"2000-02-01T00:00:00.000000000\", \"2000-03-01T00:00:00.000000000\"],\n"," [\"2000-03-01T00:00:00.000000000\", \"2000-04-01T00:00:00.000000000\"],\n"," [\"2000-04-01T00:00:00.000000000\", \"2000-05-01T00:00:00.000000000\"],\n"," [\"2000-12-01T00:00:00.000000000\", \"2001-01-01T00:00:00.000000000\"],\n"," ],\n"," dtype=\"datetime64[ns]\",\n"," ),\n"," coords={\"time\": ds.time},\n"," dims=[\"time\", \"bnds\"],\n"," attrs={\"is_generated\": \"True\"},\n",")\n","\n","ds[\"ts\"] = xr.DataArray(\n"," data=np.array([[[2]], [[np.nan]], [[1]], [[1]], [[2]]]),\n"," coords={\"lat\": ds.lat, \"lon\": ds.lon, \"time\": ds.time},\n"," dims=[\"time\", \"lat\", \"lon\"],\n",")"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.Dataset>\n","Dimensions:    (lat: 1, lon: 1, time: 5, bnds: 2)\n","Coordinates:\n","  * lat        (lat) int64 -90\n","  * lon        (lon) int64 0\n","  * time       (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","Dimensions without coordinates: bnds\n","Data variables:\n","    time_bnds  (time, bnds) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    ts         (time, lat, lon) float64 2.0 nan 1.0 1.0 2.0
"],"text/plain":["\n","Dimensions: (lat: 1, lon: 1, time: 5, bnds: 2)\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","Dimensions without coordinates: bnds\n","Data variables:\n"," time_bnds (time, bnds) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," ts (time, lat, lon) float64 2.0 nan 1.0 1.0 2.0"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["ds"]},{"cell_type":"markdown","metadata":{},"source":["## 1. Set the TemporalAccessor class attributes to call internal methods."]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[],"source":["ds.temporal._time_bounds = ds.time_bnds\n","ds.temporal._mode = \"average\"\n","ds.temporal._freq = ds.temporal._infer_freq()"]},{"cell_type":"markdown","metadata":{},"source":["## 2. Calculate the weights for monthly data (uses time bounds)"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray 'time_bnds' (time: 5)>\n","array([0.5, 1. , 1. , 1. , 0.5])\n","Coordinates:\n","  * time     (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    month    (time) int64 1 2 3 4 1
"],"text/plain":["\n","array([0.5, 1. , 1. , 1. , 0.5])\n","Coordinates:\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," month (time) int64 1 2 3 4 1"]},"execution_count":16,"metadata":{},"output_type":"execute_result"}],"source":["weights = ds.temporal._get_weights()\n","weights"]},{"cell_type":"markdown","metadata":{},"source":["## 3. Means with weights (no masking)"]},{"cell_type":"code","execution_count":17,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (month: 4, lat: 1, lon: 1)>\n","array([[[2.]],\n","\n","       [[0.]],\n","\n","       [[1.]],\n","\n","       [[1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * month    (month) int64 1 2 3 4
"],"text/plain":["\n","array([[[2.]],\n","\n"," [[0.]],\n","\n"," [[1.]],\n","\n"," [[1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * month (month) int64 1 2 3 4"]},"execution_count":17,"metadata":{},"output_type":"execute_result"}],"source":["# Grouped weighted average (annual cycle)\n","dv_gb_avg1 = (ds.ts * weights).groupby(\"time.month\").sum()\n","dv_gb_avg1"]},{"cell_type":"code","execution_count":18,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray 'ts' (lat: 1, lon: 1)>\n","array([[1.33333333]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0
"],"text/plain":["\n","array([[1.33333333]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0"]},"execution_count":18,"metadata":{},"output_type":"execute_result"}],"source":["# Simple average with monthly weights\n","dv_avg1 = ds.ts.weighted(weights).mean(dim=\"time\")\n","dv_avg1"]},{"cell_type":"markdown","metadata":{},"source":["## 4. Means with weights (masked with `np.nan` for missing values)"]},{"cell_type":"code","execution_count":19,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray 'time_bnds' (time: 5)>\n","array([0.5, nan, 1. , 1. , 0.5])\n","Coordinates:\n","  * time     (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    month    (time) int64 1 2 3 4 1
"],"text/plain":["\n","array([0.5, nan, 1. , 1. , 0.5])\n","Coordinates:\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," month (time) int64 1 2 3 4 1"]},"execution_count":19,"metadata":{},"output_type":"execute_result"}],"source":["masked_weights = weights.copy()\n","masked_weights.data[1] = np.nan\n","masked_weights"]},{"cell_type":"code","execution_count":20,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (month: 4, lat: 1, lon: 1)>\n","array([[[2.]],\n","\n","       [[0.]],\n","\n","       [[1.]],\n","\n","       [[1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * month    (month) int64 1 2 3 4
"],"text/plain":["\n","array([[[2.]],\n","\n"," [[0.]],\n","\n"," [[1.]],\n","\n"," [[1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * month (month) int64 1 2 3 4"]},"execution_count":20,"metadata":{},"output_type":"execute_result"}],"source":["dv_gb_avg2 = (ds.ts * masked_weights).groupby(\"time.month\").sum()\n","dv_gb_avg2"]},{"cell_type":"code","execution_count":21,"metadata":{},"outputs":[{"ename":"ValueError","evalue":"`weights` cannot contain missing values. Missing values can be replaced by `weights.fillna(0)`.","output_type":"error","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)","\u001b[1;32m/home/vo13/XCDAT/xcdat_test/validation/v1.0.0/temporal_average/masked_data_weights.ipynb Cell 16'\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39m# ValueError: `weights` cannot contain missing values. Missing values can be replaced by `weights.fillna(0)`.\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m dv_avg2 \u001b[39m=\u001b[39m ds\u001b[39m.\u001b[39;49mts\u001b[39m.\u001b[39;49mweighted(masked_weights)\u001b[39m.\u001b[39mmean(dim\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mtime\u001b[39m\u001b[39m\"\u001b[39m)\n","File \u001b[0;32m~/miniconda3/envs/xcdat_test_dev/lib/python3.9/site-packages/xarray/core/common.py:822\u001b[0m, in \u001b[0;36mDataWithCoords.weighted\u001b[0;34m(self, weights)\u001b[0m\n\u001b[1;32m 805\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mweighted\u001b[39m(\u001b[39mself\u001b[39m: T_DataWithCoords, weights: \u001b[39m\"\u001b[39m\u001b[39mDataArray\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Weighted[T_Xarray]:\n\u001b[1;32m 806\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 807\u001b[0m \u001b[39m Weighted operations.\u001b[39;00m\n\u001b[1;32m 808\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 819\u001b[0m \u001b[39m Missing values can be replaced by ``weights.fillna(0)``.\u001b[39;00m\n\u001b[1;32m 820\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 822\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_weighted_cls(\u001b[39mself\u001b[39;49m, weights)\n","File \u001b[0;32m~/miniconda3/envs/xcdat_test_dev/lib/python3.9/site-packages/xarray/core/weighted.py:118\u001b[0m, in \u001b[0;36mWeighted.__init__\u001b[0;34m(self, obj, weights)\u001b[0m\n\u001b[1;32m 112\u001b[0m weights \u001b[39m=\u001b[39m weights\u001b[39m.\u001b[39mcopy(\n\u001b[1;32m 113\u001b[0m data\u001b[39m=\u001b[39mweights\u001b[39m.\u001b[39mdata\u001b[39m.\u001b[39mmap_blocks(_weight_check, dtype\u001b[39m=\u001b[39mweights\u001b[39m.\u001b[39mdtype),\n\u001b[1;32m 114\u001b[0m deep\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m,\n\u001b[1;32m 115\u001b[0m )\n\u001b[1;32m 117\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 118\u001b[0m _weight_check(weights\u001b[39m.\u001b[39;49mdata)\n\u001b[1;32m 120\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mobj: T_Xarray \u001b[39m=\u001b[39m obj\n\u001b[1;32m 121\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mweights: \u001b[39m\"\u001b[39m\u001b[39mDataArray\u001b[39m\u001b[39m\"\u001b[39m \u001b[39m=\u001b[39m weights\n","File \u001b[0;32m~/miniconda3/envs/xcdat_test_dev/lib/python3.9/site-packages/xarray/core/weighted.py:104\u001b[0m, in \u001b[0;36mWeighted.__init__.._weight_check\u001b[0;34m(w)\u001b[0m\n\u001b[1;32m 
101\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_weight_check\u001b[39m(w):\n\u001b[1;32m 102\u001b[0m \u001b[39m# Ref https://github.com/pydata/xarray/pull/4559/files#r515968670\u001b[39;00m\n\u001b[1;32m 103\u001b[0m \u001b[39mif\u001b[39;00m duck_array_ops\u001b[39m.\u001b[39misnull(w)\u001b[39m.\u001b[39many():\n\u001b[0;32m--> 104\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 105\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m`weights` cannot contain missing values. \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 106\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mMissing values can be replaced by `weights.fillna(0)`.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 107\u001b[0m )\n\u001b[1;32m 108\u001b[0m \u001b[39mreturn\u001b[39;00m w\n","\u001b[0;31mValueError\u001b[0m: `weights` cannot contain missing values. Missing values can be replaced by `weights.fillna(0)`."]}],"source":["# ValueError: `weights` cannot contain missing values. Missing values can be replaced by `weights.fillna(0)`.\n","dv_avg2 = ds.ts.weighted(masked_weights).mean(dim=\"time\")"]},{"cell_type":"markdown","metadata":{},"source":["## 5. Means with weights (masked with 0 for missing values)"]},{"cell_type":"code","execution_count":22,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray 'time_bnds' (time: 5)>\n","array([0.5, 0. , 1. , 1. , 0.5])\n","Coordinates:\n","  * time     (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    month    (time) int64 1 2 3 4 1
"],"text/plain":["\n","array([0.5, 0. , 1. , 1. , 0.5])\n","Coordinates:\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," month (time) int64 1 2 3 4 1"]},"execution_count":22,"metadata":{},"output_type":"execute_result"}],"source":["filled_weights = masked_weights.copy().fillna(0)\n","filled_weights"]},{"cell_type":"code","execution_count":23,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (month: 4, lat: 1, lon: 1)>\n","array([[[2.]],\n","\n","       [[0.]],\n","\n","       [[1.]],\n","\n","       [[1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * month    (month) int64 1 2 3 4
"],"text/plain":["\n","array([[[2.]],\n","\n"," [[0.]],\n","\n"," [[1.]],\n","\n"," [[1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * month (month) int64 1 2 3 4"]},"execution_count":23,"metadata":{},"output_type":"execute_result"}],"source":["dv_gb_avg3 = (ds.ts * filled_weights).groupby(\"time.month\").sum()\n","dv_gb_avg3"]},{"cell_type":"code","execution_count":24,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray 'ts' (lat: 1, lon: 1)>\n","array([[1.33333333]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0
"],"text/plain":["\n","array([[1.33333333]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0"]},"execution_count":24,"metadata":{},"output_type":"execute_result"}],"source":["dv_avg3 = ds.ts.weighted(filled_weights).mean(dim=\"time\")\n","dv_avg3"]},{"cell_type":"markdown","metadata":{},"source":["## Compare results"]},{"cell_type":"code","execution_count":25,"metadata":{},"outputs":[{"data":{"text/plain":["True"]},"execution_count":25,"metadata":{},"output_type":"execute_result"}],"source":["# Generated weights vs. generated weights with np.nan for missing values\n","dv_gb_avg1.identical(dv_gb_avg2)"]},{"cell_type":"code","execution_count":26,"metadata":{},"outputs":[],"source":["# dv_avg1.identical(dv_avg2) # Does not work since missing weight values must be filled"]},{"cell_type":"code","execution_count":27,"metadata":{},"outputs":[{"data":{"text/plain":["True"]},"execution_count":27,"metadata":{},"output_type":"execute_result"}],"source":["# Generated weights vs. generated weights with 0 for missing values\n","dv_gb_avg1.identical(dv_gb_avg3)"]},{"cell_type":"code","execution_count":28,"metadata":{},"outputs":[{"data":{"text/plain":["True"]},"execution_count":28,"metadata":{},"output_type":"execute_result"}],"source":["dv_avg1.identical(dv_avg3)"]},{"cell_type":"markdown","metadata":{},"source":["# Key Takeaways"]},{"cell_type":"markdown","metadata":{},"source":["- `weights.fillna(0)` is required if weights contain `np.nan` and the `.weighted().mean()` API is used for calculating simple weighted averages (e.g., `ds.spatial.average()`)\n"," - `ValueError: 'weights' cannot contain missing values. Missing values can be replaced by 'weights.fillna(0)'.`\n"," - **Question: Do the weights generated by our spatial averaging methods include `np.nan`?**\n","- `weights` generated from time coordinates do not contain `np.nan` for missing values, so `weights.fillna(0)` is not required for temporal averaging\n","\n","**In any case, multiplying any weight value (`np.nan`, 0, 1, etc.) with a missing value (represented in xarray as `np.nan`) results in `np.nan`.**\n","\n","**This has no affect on the final reduction operation. (Refer to below cells)**\n"]},{"cell_type":"code","execution_count":29,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (time: 5, lat: 1, lon: 1)>\n","array([[[ 1.]],\n","\n","       [[nan]],\n","\n","       [[ 1.]],\n","\n","       [[ 1.]],\n","\n","       [[ 1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * time     (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    month    (time) int64 1 2 3 4 1
"],"text/plain":["\n","array([[[ 1.]],\n","\n"," [[nan]],\n","\n"," [[ 1.]],\n","\n"," [[ 1.]],\n","\n"," [[ 1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," month (time) int64 1 2 3 4 1"]},"execution_count":29,"metadata":{},"output_type":"execute_result"}],"source":["ds.ts * weights"]},{"cell_type":"code","execution_count":30,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (time: 5, lat: 1, lon: 1)>\n","array([[[ 1.]],\n","\n","       [[nan]],\n","\n","       [[ 1.]],\n","\n","       [[ 1.]],\n","\n","       [[ 1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * time     (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    month    (time) int64 1 2 3 4 1
"],"text/plain":["\n","array([[[ 1.]],\n","\n"," [[nan]],\n","\n"," [[ 1.]],\n","\n"," [[ 1.]],\n","\n"," [[ 1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," month (time) int64 1 2 3 4 1"]},"execution_count":30,"metadata":{},"output_type":"execute_result"}],"source":["ds.ts * filled_weights"]},{"cell_type":"code","execution_count":31,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (time: 5, lat: 1, lon: 1)>\n","array([[[ 1.]],\n","\n","       [[nan]],\n","\n","       [[ 1.]],\n","\n","       [[ 1.]],\n","\n","       [[ 1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * time     (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    month    (time) int64 1 2 3 4 1
"],"text/plain":["\n","array([[[ 1.]],\n","\n"," [[nan]],\n","\n"," [[ 1.]],\n","\n"," [[ 1.]],\n","\n"," [[ 1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," month (time) int64 1 2 3 4 1"]},"execution_count":31,"metadata":{},"output_type":"execute_result"}],"source":["ds.ts * masked_weights"]},{"cell_type":"markdown","metadata":{},"source":[]}],"metadata":{"interpreter":{"hash":"937205ea97caa5d37934716e0a0c6b9e4ffcdaf6e0f20ca1ff82361fb532cef2"},"kernelspec":{"display_name":"Python 3.9.12 ('xcdat_dev')","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.12"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} From 3895c299abe05b890a178c592736247a62e481e7 Mon Sep 17 00:00:00 2001 From: Tom Vo Date: Tue, 28 Jun 2022 12:03:01 -0700 Subject: [PATCH 3/3] Move files to v0.3.0 - Add `output_dtype_comparison.ipynb` --- .../regrid_cdat_xarray.ipynb | 0 .../regrid_cdat_xarray_xcdat.ipynb | 0 .../regrid_cdat_xcdat.ipynb | 0 .../output_dtype_comparison.ipynb | 784 ++++++++++++++++++ .../floating_point_comparison.ipynb | 0 .../masked_data_weights.ipynb | 2 +- 6 files changed, 785 insertions(+), 1 deletion(-) rename validation/{v1.0.0 => v0.3.0}/horizontal_regridding/regrid_cdat_xarray.ipynb (100%) rename validation/{v1.0.0 => v0.3.0}/horizontal_regridding/regrid_cdat_xarray_xcdat.ipynb (100%) rename validation/{v1.0.0 => v0.3.0}/horizontal_regridding/regrid_cdat_xcdat.ipynb (100%) create mode 100644 validation/v0.3.0/spatial_average/output_dtype_comparison.ipynb rename validation/{v1.0.0 => v0.3.0}/temporal_average/floating_point_comparison.ipynb (100%) rename validation/{v1.0.0 => v0.3.0}/temporal_average/masked_data_weights.ipynb (99%) diff --git a/validation/v1.0.0/horizontal_regridding/regrid_cdat_xarray.ipynb b/validation/v0.3.0/horizontal_regridding/regrid_cdat_xarray.ipynb similarity index 100% rename from validation/v1.0.0/horizontal_regridding/regrid_cdat_xarray.ipynb rename to validation/v0.3.0/horizontal_regridding/regrid_cdat_xarray.ipynb diff --git a/validation/v1.0.0/horizontal_regridding/regrid_cdat_xarray_xcdat.ipynb b/validation/v0.3.0/horizontal_regridding/regrid_cdat_xarray_xcdat.ipynb similarity index 100% rename from validation/v1.0.0/horizontal_regridding/regrid_cdat_xarray_xcdat.ipynb rename to validation/v0.3.0/horizontal_regridding/regrid_cdat_xarray_xcdat.ipynb diff --git a/validation/v1.0.0/horizontal_regridding/regrid_cdat_xcdat.ipynb b/validation/v0.3.0/horizontal_regridding/regrid_cdat_xcdat.ipynb similarity index 100% rename from validation/v1.0.0/horizontal_regridding/regrid_cdat_xcdat.ipynb rename to validation/v0.3.0/horizontal_regridding/regrid_cdat_xcdat.ipynb diff --git a/validation/v0.3.0/spatial_average/output_dtype_comparison.ipynb b/validation/v0.3.0/spatial_average/output_dtype_comparison.ipynb new file mode 100644 index 0000000..b3b146d --- /dev/null +++ b/validation/v0.3.0/spatial_average/output_dtype_comparison.ipynb @@ -0,0 +1,784 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# xCDAT vs. 
CDAT Spatial Averaging Output `dtype`\n", + "\n", + "Objective:\n", + "\n", + "* Figure out the root cause for the large floating point differences (absolute and relative), which might be related to the `dtype`.\n", + "\n", + "Questions:\n", + "* Is xCDAT or CDAT doing something incorrectly?\n", + "* Does CDAT cast the type of the data from `float32` to `float64`?\n", + " * If it does, when does it cast the type?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Resources:\n", + "https://discourse.pangeo.io/t/variable-type-changing-when-using-xarray/1823/2\n", + "\n", + "* Same array, same bits, but different precision. In general, arbitrary smooth decimals can’t be represented exactly using float32 precision.\n", + "* The index type (Float64Index) actually comes from Pandas (pandas.Float64Index — pandas 1.3.3 documentation). In this case, promoting a float32 type to a float64 in the context of indexing should not affect selection operations, since float32 can safely be cast to float64\n", + "* The number 10.45 is interpreted differently if it is float32 vs float64.\n", + "* Builtin python floats are 64-bit, so IMO Xarray does the right thing by promoting everything to float64 when making comparisons with your lat values.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "import xcdat\n", + "import cdms2\n", + "import cdutil" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "fn = \"/p/user_pub/climate_work/pochedley1/surface/gistemp1200_GHCNv4_ERSSTv5.nc\"\n", + "\n", + "ds_xcdat = xcdat.open_dataset(fn)\n", + "ds_cdat = cdms2.open(fn)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Check dtype for the variable `\"tempanomaly\"`" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('float32')" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "ds_xcdat[\"tempanomaly\"].dtype" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('float32')" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "ds_cdat(\"tempanomaly\").dtype" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### The `dtype` of the variable is `float32` for both libraries, so they agree." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Check dtype for the spatial averaging output for `\"tempanomaly\"`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "xCDAT" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "ds_xcdat_avg = ds_xcdat.spatial.average(\"tempanomaly\", axis=[\"Y\"])\n", + "ta_xcdat_avg = ds_xcdat_avg[\"tempanomaly\"]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.DataArray 'tempanomaly' (time: 1706, lon: 180)>\n",
+       "array([[ 0.0355457 ,  0.02695628,  0.01610215, ..., -0.03079494,\n",
+       "        -0.00960354,  0.00881284],\n",
+       "       [ 0.06634343,  0.04435139,  0.01892282, ...,  0.02397297,\n",
+       "         0.03883512,  0.05697105],\n",
+       "       [ 0.14746004,  0.11239842,  0.07544763, ...,  0.04206945,\n",
+       "         0.07549783,  0.11411799],\n",
+       "       ...,\n",
+       "       [ 0.8559086 ,  0.81907433,  0.75153327, ...,  1.0020294 ,\n",
+       "         0.9598275 ,  0.90502965],\n",
+       "       [ 0.64544296,  0.6352519 ,  0.5955443 , ...,  0.8723912 ,\n",
+       "         0.82596165,  0.80084634],\n",
+       "       [ 0.8317095 ,  0.8324741 ,  0.809845  , ...,  0.9373833 ,\n",
+       "         0.9137498 ,  0.8913162 ]], dtype=float32)\n",
+       "Coordinates:\n",
+       "  * lon      (lon) float32 -179.0 -177.0 -175.0 -173.0 ... 175.0 177.0 179.0\n",
+       "  * time     (time) datetime64[ns] 1880-01-15 1880-02-15 ... 2022-02-15\n",
+       "Attributes:\n",
+       "    long_name:     Surface temperature anomaly\n",
+       "    units:         K\n",
+       "    cell_methods:  time: mean
" + ], + "text/plain": [ + "\n", + "array([[ 0.0355457 , 0.02695628, 0.01610215, ..., -0.03079494,\n", + " -0.00960354, 0.00881284],\n", + " [ 0.06634343, 0.04435139, 0.01892282, ..., 0.02397297,\n", + " 0.03883512, 0.05697105],\n", + " [ 0.14746004, 0.11239842, 0.07544763, ..., 0.04206945,\n", + " 0.07549783, 0.11411799],\n", + " ...,\n", + " [ 0.8559086 , 0.81907433, 0.75153327, ..., 1.0020294 ,\n", + " 0.9598275 , 0.90502965],\n", + " [ 0.64544296, 0.6352519 , 0.5955443 , ..., 0.8723912 ,\n", + " 0.82596165, 0.80084634],\n", + " [ 0.8317095 , 0.8324741 , 0.809845 , ..., 0.9373833 ,\n", + " 0.9137498 , 0.8913162 ]], dtype=float32)\n", + "Coordinates:\n", + " * lon (lon) float32 -179.0 -177.0 -175.0 -173.0 ... 175.0 177.0 179.0\n", + " * time (time) datetime64[ns] 1880-01-15 1880-02-15 ... 2022-02-15\n", + "Attributes:\n", + " long_name: Surface temperature anomaly\n", + " units: K\n", + " cell_methods: time: mean" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ta_xcdat_avg" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('float32')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ta_xcdat_avg.dtype" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "CDAT" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "ta_cdat_avg = cdutil.averager(ds_cdat(\"tempanomaly\"), axis=\"y\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "variable_6\n", + "masked_array(\n", + " data=[[0.035545653325030215, 0.026956274832029305,\n", + " 0.016102136600269726, ..., -0.030794956835496247,\n", + " -0.009603562061872723, 0.00881278244252939],\n", + " [0.06634341186248234, 0.04435135777310842, 0.01892279890685338,\n", + " ..., 0.02397299728132458, 0.03883514230588276,\n", + " 0.05697104274523798],\n", + " [0.14746000307247378, 0.11239838593655853, 0.07544755841048628,\n", + " ..., 0.04206947944889023, 0.0754978114092197,\n", + " 0.11411796429245147],\n", + " ...,\n", + " [0.8559087033405837, 0.8190742797330746, 0.7515332662789139, ...,\n", + " 1.0020296439006402, 0.9598274150062646, 0.9050297894921276],\n", + " [0.645442851180381, 0.6352518904793731, 0.5955442821497092, ...,\n", + " 0.8723914081140204, 0.8259617007564366, 0.8008464881940878],\n", + " [0.8317094150775682, 0.8324741985594918, 0.8098450089447665, ...,\n", + " 0.937383456065645, 0.9137497736748647, 0.8913162676287111]],\n", + " mask=[[False, False, False, ..., False, False, False],\n", + " [False, False, False, ..., False, False, False],\n", + " [False, False, False, ..., False, False, False],\n", + " ...,\n", + " [False, False, False, ..., False, False, False],\n", + " [False, False, False, ..., False, False, False],\n", + " [False, False, False, ..., False, False, False]],\n", + " fill_value=1e+20)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ta_cdat_avg" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('float64')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ta_cdat_avg.dtype" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 
xCDAT is returning `float32` (same as input dtype).\n", + "#### CDAT is returning `float64` (not the same as input type)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Why is CDAT returning `float64`?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you do a search for float in [genutil.averager](https://github.com/CDAT/genutil/blob/master/Lib/averager.py), you will find `numpy.float` referenced several times." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. What exactly is a `numpy.float`? It is `float64`." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_46796/699376334.py:1: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " np.float\n" + ] + }, + { + "data": { + "text/plain": [ + "float" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.float" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Key Takeaways\n", + "\n", + "* `ds.spatial.average()` maintains a `dtype` of `float32`, which is **CORRECT**.\n", + "* `cdutil.averager()`/`genutil.averager()` is typecasting the `dtype` to `float64`, which is **INCORRECT**.\n", + "* Floating point comparisons between CDAT and xCDAT results in large differences since the `dtype` of the outputs are different (`float32` vs. 
`float64`)\n", + "\n", + "**Based on these findings, we are confident that xarray/xCDAT is doing the right thing with floating points, while CDAT is not.**" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.4 ('xcdat_test_stable')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "c4e4f1671e4a459c6edb3b171f1296b089ced6d44722ffcf6c4b28381b927c72" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/validation/v1.0.0/temporal_average/floating_point_comparison.ipynb b/validation/v0.3.0/temporal_average/floating_point_comparison.ipynb similarity index 100% rename from validation/v1.0.0/temporal_average/floating_point_comparison.ipynb rename to validation/v0.3.0/temporal_average/floating_point_comparison.ipynb diff --git a/validation/v1.0.0/temporal_average/masked_data_weights.ipynb b/validation/v0.3.0/temporal_average/masked_data_weights.ipynb similarity index 99% rename from validation/v1.0.0/temporal_average/masked_data_weights.ipynb rename to validation/v0.3.0/temporal_average/masked_data_weights.ipynb index 1586483..4ff1313 100644 --- a/validation/v1.0.0/temporal_average/masked_data_weights.ipynb +++ b/validation/v0.3.0/temporal_average/masked_data_weights.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"markdown","metadata":{},"source":["# Experimenting with Masked Data and Weighting in xarray\n","\n","This notebook experiments with how weighting with masked data can affect the results of reduction operations such as simple averages and grouped averages. 
Specifically, this notebook looks into weighted temporal averaging."]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[],"source":["# flake8: noqa f401\n","import numpy as np\n","import xarray as xr\n","\n","import xcdat"]},{"cell_type":"markdown","metadata":{},"source":["## Create a sample monthly time series dataset"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[],"source":["ds = xr.Dataset(\n"," coords={\n"," \"lat\": [-90],\n"," \"lon\": [0],\n"," \"time\": xr.DataArray(\n"," data=np.array(\n"," [\n"," \"2000-01-01T00:00:00.000000000\",\n"," \"2000-02-01T00:00:00.000000000\",\n"," \"2000-03-01T00:00:00.000000000\",\n"," \"2000-04-01T00:00:00.000000000\",\n"," \"2001-01-01T00:00:00.000000000\",\n"," ],\n"," dtype=\"datetime64[ns]\",\n"," ),\n"," dims=[\"time\"],\n"," attrs={\n"," \"axis\": \"T\",\n"," \"long_name\": \"time\",\n"," \"standard_name\": \"time\",\n"," \"bounds\": \"time_bnds\",\n"," },\n"," ),\n"," }\n",")\n","ds[\"time_bnds\"] = xr.DataArray(\n"," name=\"time_bnds\",\n"," data=np.array(\n"," [\n"," [\"2000-01-01T00:00:00.000000000\", \"2000-02-01T00:00:00.000000000\"],\n"," [\"2000-02-01T00:00:00.000000000\", \"2000-03-01T00:00:00.000000000\"],\n"," [\"2000-03-01T00:00:00.000000000\", \"2000-04-01T00:00:00.000000000\"],\n"," [\"2000-04-01T00:00:00.000000000\", \"2000-05-01T00:00:00.000000000\"],\n"," [\"2000-12-01T00:00:00.000000000\", \"2001-01-01T00:00:00.000000000\"],\n"," ],\n"," dtype=\"datetime64[ns]\",\n"," ),\n"," coords={\"time\": ds.time},\n"," dims=[\"time\", \"bnds\"],\n"," attrs={\"is_generated\": \"True\"},\n",")\n","\n","ds[\"ts\"] = xr.DataArray(\n"," data=np.array([[[2]], [[np.nan]], [[1]], [[1]], [[2]]]),\n"," coords={\"lat\": ds.lat, \"lon\": ds.lon, \"time\": ds.time},\n"," dims=[\"time\", \"lat\", \"lon\"],\n",")"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.Dataset>\n","Dimensions:    (lat: 1, lon: 1, time: 5, bnds: 2)\n","Coordinates:\n","  * lat        (lat) int64 -90\n","  * lon        (lon) int64 0\n","  * time       (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","Dimensions without coordinates: bnds\n","Data variables:\n","    time_bnds  (time, bnds) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    ts         (time, lat, lon) float64 2.0 nan 1.0 1.0 2.0
"],"text/plain":["\n","Dimensions: (lat: 1, lon: 1, time: 5, bnds: 2)\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","Dimensions without coordinates: bnds\n","Data variables:\n"," time_bnds (time, bnds) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," ts (time, lat, lon) float64 2.0 nan 1.0 1.0 2.0"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["ds"]},{"cell_type":"markdown","metadata":{},"source":["## 1. Set the TemporalAccessor class attributes to call internal methods."]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[],"source":["ds.temporal._time_bounds = ds.time_bnds\n","ds.temporal._mode = \"average\"\n","ds.temporal._freq = ds.temporal._infer_freq()"]},{"cell_type":"markdown","metadata":{},"source":["## 2. Calculate the weights for monthly data (uses time bounds)"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray 'time_bnds' (time: 5)>\n","array([0.5, 1. , 1. , 1. , 0.5])\n","Coordinates:\n","  * time     (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    month    (time) int64 1 2 3 4 1
"],"text/plain":["\n","array([0.5, 1. , 1. , 1. , 0.5])\n","Coordinates:\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," month (time) int64 1 2 3 4 1"]},"execution_count":16,"metadata":{},"output_type":"execute_result"}],"source":["weights = ds.temporal._get_weights()\n","weights"]},{"cell_type":"markdown","metadata":{},"source":["## 3. Means with weights (no masking)"]},{"cell_type":"code","execution_count":17,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (month: 4, lat: 1, lon: 1)>\n","array([[[2.]],\n","\n","       [[0.]],\n","\n","       [[1.]],\n","\n","       [[1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * month    (month) int64 1 2 3 4
"],"text/plain":["\n","array([[[2.]],\n","\n"," [[0.]],\n","\n"," [[1.]],\n","\n"," [[1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * month (month) int64 1 2 3 4"]},"execution_count":17,"metadata":{},"output_type":"execute_result"}],"source":["# Grouped weighted average (annual cycle)\n","dv_gb_avg1 = (ds.ts * weights).groupby(\"time.month\").sum()\n","dv_gb_avg1"]},{"cell_type":"code","execution_count":18,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray 'ts' (lat: 1, lon: 1)>\n","array([[1.33333333]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0
"],"text/plain":["\n","array([[1.33333333]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0"]},"execution_count":18,"metadata":{},"output_type":"execute_result"}],"source":["# Simple average with monthly weights\n","dv_avg1 = ds.ts.weighted(weights).mean(dim=\"time\")\n","dv_avg1"]},{"cell_type":"markdown","metadata":{},"source":["## 4. Means with weights (masked with `np.nan` for missing values)"]},{"cell_type":"code","execution_count":19,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray 'time_bnds' (time: 5)>\n","array([0.5, nan, 1. , 1. , 0.5])\n","Coordinates:\n","  * time     (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    month    (time) int64 1 2 3 4 1
"],"text/plain":["\n","array([0.5, nan, 1. , 1. , 0.5])\n","Coordinates:\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," month (time) int64 1 2 3 4 1"]},"execution_count":19,"metadata":{},"output_type":"execute_result"}],"source":["masked_weights = weights.copy()\n","masked_weights.data[1] = np.nan\n","masked_weights"]},{"cell_type":"code","execution_count":20,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (month: 4, lat: 1, lon: 1)>\n","array([[[2.]],\n","\n","       [[0.]],\n","\n","       [[1.]],\n","\n","       [[1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * month    (month) int64 1 2 3 4
"],"text/plain":["\n","array([[[2.]],\n","\n"," [[0.]],\n","\n"," [[1.]],\n","\n"," [[1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * month (month) int64 1 2 3 4"]},"execution_count":20,"metadata":{},"output_type":"execute_result"}],"source":["dv_gb_avg2 = (ds.ts * masked_weights).groupby(\"time.month\").sum()\n","dv_gb_avg2"]},{"cell_type":"code","execution_count":21,"metadata":{},"outputs":[{"ename":"ValueError","evalue":"`weights` cannot contain missing values. Missing values can be replaced by `weights.fillna(0)`.","output_type":"error","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)","\u001b[1;32m/home/vo13/XCDAT/xcdat_test/validation/v1.0.0/temporal_average/masked_data_weights.ipynb Cell 16'\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39m# ValueError: `weights` cannot contain missing values. Missing values can be replaced by `weights.fillna(0)`.\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m dv_avg2 \u001b[39m=\u001b[39m ds\u001b[39m.\u001b[39;49mts\u001b[39m.\u001b[39;49mweighted(masked_weights)\u001b[39m.\u001b[39mmean(dim\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mtime\u001b[39m\u001b[39m\"\u001b[39m)\n","File \u001b[0;32m~/miniconda3/envs/xcdat_test_dev/lib/python3.9/site-packages/xarray/core/common.py:822\u001b[0m, in \u001b[0;36mDataWithCoords.weighted\u001b[0;34m(self, weights)\u001b[0m\n\u001b[1;32m 805\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mweighted\u001b[39m(\u001b[39mself\u001b[39m: T_DataWithCoords, weights: \u001b[39m\"\u001b[39m\u001b[39mDataArray\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Weighted[T_Xarray]:\n\u001b[1;32m 806\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 807\u001b[0m \u001b[39m Weighted operations.\u001b[39;00m\n\u001b[1;32m 808\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 819\u001b[0m \u001b[39m Missing values can be replaced by ``weights.fillna(0)``.\u001b[39;00m\n\u001b[1;32m 820\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 822\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_weighted_cls(\u001b[39mself\u001b[39;49m, weights)\n","File \u001b[0;32m~/miniconda3/envs/xcdat_test_dev/lib/python3.9/site-packages/xarray/core/weighted.py:118\u001b[0m, in \u001b[0;36mWeighted.__init__\u001b[0;34m(self, obj, weights)\u001b[0m\n\u001b[1;32m 112\u001b[0m weights \u001b[39m=\u001b[39m weights\u001b[39m.\u001b[39mcopy(\n\u001b[1;32m 113\u001b[0m data\u001b[39m=\u001b[39mweights\u001b[39m.\u001b[39mdata\u001b[39m.\u001b[39mmap_blocks(_weight_check, dtype\u001b[39m=\u001b[39mweights\u001b[39m.\u001b[39mdtype),\n\u001b[1;32m 114\u001b[0m deep\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m,\n\u001b[1;32m 115\u001b[0m )\n\u001b[1;32m 117\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 118\u001b[0m _weight_check(weights\u001b[39m.\u001b[39;49mdata)\n\u001b[1;32m 120\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mobj: T_Xarray \u001b[39m=\u001b[39m obj\n\u001b[1;32m 121\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mweights: \u001b[39m\"\u001b[39m\u001b[39mDataArray\u001b[39m\u001b[39m\"\u001b[39m \u001b[39m=\u001b[39m weights\n","File \u001b[0;32m~/miniconda3/envs/xcdat_test_dev/lib/python3.9/site-packages/xarray/core/weighted.py:104\u001b[0m, in \u001b[0;36mWeighted.__init__.._weight_check\u001b[0;34m(w)\u001b[0m\n\u001b[1;32m 
101\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_weight_check\u001b[39m(w):\n\u001b[1;32m 102\u001b[0m \u001b[39m# Ref https://github.com/pydata/xarray/pull/4559/files#r515968670\u001b[39;00m\n\u001b[1;32m 103\u001b[0m \u001b[39mif\u001b[39;00m duck_array_ops\u001b[39m.\u001b[39misnull(w)\u001b[39m.\u001b[39many():\n\u001b[0;32m--> 104\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 105\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m`weights` cannot contain missing values. \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 106\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mMissing values can be replaced by `weights.fillna(0)`.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 107\u001b[0m )\n\u001b[1;32m 108\u001b[0m \u001b[39mreturn\u001b[39;00m w\n","\u001b[0;31mValueError\u001b[0m: `weights` cannot contain missing values. Missing values can be replaced by `weights.fillna(0)`."]}],"source":["# ValueError: `weights` cannot contain missing values. Missing values can be replaced by `weights.fillna(0)`.\n","dv_avg2 = ds.ts.weighted(masked_weights).mean(dim=\"time\")"]},{"cell_type":"markdown","metadata":{},"source":["## 5. Means with weights (masked with 0 for missing values)"]},{"cell_type":"code","execution_count":22,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray 'time_bnds' (time: 5)>\n","array([0.5, 0. , 1. , 1. , 0.5])\n","Coordinates:\n","  * time     (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    month    (time) int64 1 2 3 4 1
"],"text/plain":["\n","array([0.5, 0. , 1. , 1. , 0.5])\n","Coordinates:\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," month (time) int64 1 2 3 4 1"]},"execution_count":22,"metadata":{},"output_type":"execute_result"}],"source":["filled_weights = masked_weights.copy().fillna(0)\n","filled_weights"]},{"cell_type":"code","execution_count":23,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (month: 4, lat: 1, lon: 1)>\n","array([[[2.]],\n","\n","       [[0.]],\n","\n","       [[1.]],\n","\n","       [[1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * month    (month) int64 1 2 3 4
"],"text/plain":["\n","array([[[2.]],\n","\n"," [[0.]],\n","\n"," [[1.]],\n","\n"," [[1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * month (month) int64 1 2 3 4"]},"execution_count":23,"metadata":{},"output_type":"execute_result"}],"source":["dv_gb_avg3 = (ds.ts * filled_weights).groupby(\"time.month\").sum()\n","dv_gb_avg3"]},{"cell_type":"code","execution_count":24,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray 'ts' (lat: 1, lon: 1)>\n","array([[1.33333333]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0
"],"text/plain":["\n","array([[1.33333333]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0"]},"execution_count":24,"metadata":{},"output_type":"execute_result"}],"source":["dv_avg3 = ds.ts.weighted(filled_weights).mean(dim=\"time\")\n","dv_avg3"]},{"cell_type":"markdown","metadata":{},"source":["## Compare results"]},{"cell_type":"code","execution_count":25,"metadata":{},"outputs":[{"data":{"text/plain":["True"]},"execution_count":25,"metadata":{},"output_type":"execute_result"}],"source":["# Generated weights vs. generated weights with np.nan for missing values\n","dv_gb_avg1.identical(dv_gb_avg2)"]},{"cell_type":"code","execution_count":26,"metadata":{},"outputs":[],"source":["# dv_avg1.identical(dv_avg2) # Does not work since missing weight values must be filled"]},{"cell_type":"code","execution_count":27,"metadata":{},"outputs":[{"data":{"text/plain":["True"]},"execution_count":27,"metadata":{},"output_type":"execute_result"}],"source":["# Generated weights vs. generated weights with 0 for missing values\n","dv_gb_avg1.identical(dv_gb_avg3)"]},{"cell_type":"code","execution_count":28,"metadata":{},"outputs":[{"data":{"text/plain":["True"]},"execution_count":28,"metadata":{},"output_type":"execute_result"}],"source":["dv_avg1.identical(dv_avg3)"]},{"cell_type":"markdown","metadata":{},"source":["# Key Takeaways"]},{"cell_type":"markdown","metadata":{},"source":["- `weights.fillna(0)` is required if weights contain `np.nan` and the `.weighted().mean()` API is used for calculating simple weighted averages (e.g., `ds.spatial.average()`)\n"," - `ValueError: 'weights' cannot contain missing values. Missing values can be replaced by 'weights.fillna(0)'.`\n"," - **Question: Do the weights generated by our spatial averaging methods include `np.nan`?**\n","- `weights` generated from time coordinates do not contain `np.nan` for missing values, so `weights.fillna(0)` is not required for temporal averaging\n","\n","**In any case, multiplying any weight value (`np.nan`, 0, 1, etc.) with a missing value (represented in xarray as `np.nan`) results in `np.nan`.**\n","\n","**This has no affect on the final reduction operation. (Refer to below cells)**\n"]},{"cell_type":"code","execution_count":29,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (time: 5, lat: 1, lon: 1)>\n","array([[[ 1.]],\n","\n","       [[nan]],\n","\n","       [[ 1.]],\n","\n","       [[ 1.]],\n","\n","       [[ 1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * time     (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    month    (time) int64 1 2 3 4 1
"],"text/plain":["\n","array([[[ 1.]],\n","\n"," [[nan]],\n","\n"," [[ 1.]],\n","\n"," [[ 1.]],\n","\n"," [[ 1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," month (time) int64 1 2 3 4 1"]},"execution_count":29,"metadata":{},"output_type":"execute_result"}],"source":["ds.ts * weights"]},{"cell_type":"code","execution_count":30,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (time: 5, lat: 1, lon: 1)>\n","array([[[ 1.]],\n","\n","       [[nan]],\n","\n","       [[ 1.]],\n","\n","       [[ 1.]],\n","\n","       [[ 1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * time     (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    month    (time) int64 1 2 3 4 1
"],"text/plain":["\n","array([[[ 1.]],\n","\n"," [[nan]],\n","\n"," [[ 1.]],\n","\n"," [[ 1.]],\n","\n"," [[ 1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," month (time) int64 1 2 3 4 1"]},"execution_count":30,"metadata":{},"output_type":"execute_result"}],"source":["ds.ts * filled_weights"]},{"cell_type":"code","execution_count":31,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (time: 5, lat: 1, lon: 1)>\n","array([[[ 1.]],\n","\n","       [[nan]],\n","\n","       [[ 1.]],\n","\n","       [[ 1.]],\n","\n","       [[ 1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * time     (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    month    (time) int64 1 2 3 4 1
"],"text/plain":["\n","array([[[ 1.]],\n","\n"," [[nan]],\n","\n"," [[ 1.]],\n","\n"," [[ 1.]],\n","\n"," [[ 1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," month (time) int64 1 2 3 4 1"]},"execution_count":31,"metadata":{},"output_type":"execute_result"}],"source":["ds.ts * masked_weights"]},{"cell_type":"markdown","metadata":{},"source":[]}],"metadata":{"interpreter":{"hash":"937205ea97caa5d37934716e0a0c6b9e4ffcdaf6e0f20ca1ff82361fb532cef2"},"kernelspec":{"display_name":"Python 3.9.12 ('xcdat_dev')","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.12"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} +{"cells":[{"cell_type":"markdown","metadata":{},"source":["# Experimenting with Masked Data and Weighting in xarray\n","\n","This notebook experiments with how weighting with masked data can affect the results of reduction operations such as simple averages and grouped averages. Specifically, this notebook looks into weighted temporal averaging."]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[],"source":["# flake8: noqa f401\n","import numpy as np\n","import xarray as xr\n","\n","import xcdat"]},{"cell_type":"markdown","metadata":{},"source":["## Create a sample monthly time series dataset"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[],"source":["ds = xr.Dataset(\n"," coords={\n"," \"lat\": [-90],\n"," \"lon\": [0],\n"," \"time\": xr.DataArray(\n"," data=np.array(\n"," [\n"," \"2000-01-01T00:00:00.000000000\",\n"," \"2000-02-01T00:00:00.000000000\",\n"," \"2000-03-01T00:00:00.000000000\",\n"," \"2000-04-01T00:00:00.000000000\",\n"," \"2001-01-01T00:00:00.000000000\",\n"," ],\n"," dtype=\"datetime64[ns]\",\n"," ),\n"," dims=[\"time\"],\n"," attrs={\n"," \"axis\": \"T\",\n"," \"long_name\": \"time\",\n"," \"standard_name\": \"time\",\n"," \"bounds\": \"time_bnds\",\n"," },\n"," ),\n"," }\n",")\n","ds[\"time_bnds\"] = xr.DataArray(\n"," name=\"time_bnds\",\n"," data=np.array(\n"," [\n"," [\"2000-01-01T00:00:00.000000000\", \"2000-02-01T00:00:00.000000000\"],\n"," [\"2000-02-01T00:00:00.000000000\", \"2000-03-01T00:00:00.000000000\"],\n"," [\"2000-03-01T00:00:00.000000000\", \"2000-04-01T00:00:00.000000000\"],\n"," [\"2000-04-01T00:00:00.000000000\", \"2000-05-01T00:00:00.000000000\"],\n"," [\"2000-12-01T00:00:00.000000000\", \"2001-01-01T00:00:00.000000000\"],\n"," ],\n"," dtype=\"datetime64[ns]\",\n"," ),\n"," coords={\"time\": ds.time},\n"," dims=[\"time\", \"bnds\"],\n"," attrs={\"is_generated\": \"True\"},\n",")\n","\n","ds[\"ts\"] = xr.DataArray(\n"," data=np.array([[[2]], [[np.nan]], [[1]], [[1]], [[2]]]),\n"," coords={\"lat\": ds.lat, \"lon\": ds.lon, \"time\": ds.time},\n"," dims=[\"time\", \"lat\", \"lon\"],\n",")"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.Dataset>\n","Dimensions:    (lat: 1, lon: 1, time: 5, bnds: 2)\n","Coordinates:\n","  * lat        (lat) int64 -90\n","  * lon        (lon) int64 0\n","  * time       (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","Dimensions without coordinates: bnds\n","Data variables:\n","    time_bnds  (time, bnds) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    ts         (time, lat, lon) float64 2.0 nan 1.0 1.0 2.0
"],"text/plain":["\n","Dimensions: (lat: 1, lon: 1, time: 5, bnds: 2)\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","Dimensions without coordinates: bnds\n","Data variables:\n"," time_bnds (time, bnds) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," ts (time, lat, lon) float64 2.0 nan 1.0 1.0 2.0"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["ds"]},{"cell_type":"markdown","metadata":{},"source":["## 1. Set the TemporalAccessor class attributes to call internal methods."]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[],"source":["ds.temporal._time_bounds = ds.time_bnds\n","ds.temporal._mode = \"average\"\n","ds.temporal._freq = ds.temporal._infer_freq()"]},{"cell_type":"markdown","metadata":{},"source":["## 2. Calculate the weights for monthly data (uses time bounds)"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray 'time_bnds' (time: 5)>\n","array([0.5, 1. , 1. , 1. , 0.5])\n","Coordinates:\n","  * time     (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    month    (time) int64 1 2 3 4 1
"],"text/plain":["\n","array([0.5, 1. , 1. , 1. , 0.5])\n","Coordinates:\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," month (time) int64 1 2 3 4 1"]},"execution_count":16,"metadata":{},"output_type":"execute_result"}],"source":["weights = ds.temporal._get_weights()\n","weights"]},{"cell_type":"markdown","metadata":{},"source":["## 3. Means with weights (no masking)"]},{"cell_type":"code","execution_count":17,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (month: 4, lat: 1, lon: 1)>\n","array([[[2.]],\n","\n","       [[0.]],\n","\n","       [[1.]],\n","\n","       [[1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * month    (month) int64 1 2 3 4
"],"text/plain":["\n","array([[[2.]],\n","\n"," [[0.]],\n","\n"," [[1.]],\n","\n"," [[1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * month (month) int64 1 2 3 4"]},"execution_count":17,"metadata":{},"output_type":"execute_result"}],"source":["# Grouped weighted average (annual cycle)\n","dv_gb_avg1 = (ds.ts * weights).groupby(\"time.month\").sum()\n","dv_gb_avg1"]},{"cell_type":"code","execution_count":18,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray 'ts' (lat: 1, lon: 1)>\n","array([[1.33333333]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0
"],"text/plain":["\n","array([[1.33333333]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0"]},"execution_count":18,"metadata":{},"output_type":"execute_result"}],"source":["# Simple average with monthly weights\n","dv_avg1 = ds.ts.weighted(weights).mean(dim=\"time\")\n","dv_avg1"]},{"cell_type":"markdown","metadata":{},"source":["## 4. Means with weights (masked with `np.nan` for missing values)"]},{"cell_type":"code","execution_count":19,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray 'time_bnds' (time: 5)>\n","array([0.5, nan, 1. , 1. , 0.5])\n","Coordinates:\n","  * time     (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    month    (time) int64 1 2 3 4 1
"],"text/plain":["\n","array([0.5, nan, 1. , 1. , 0.5])\n","Coordinates:\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," month (time) int64 1 2 3 4 1"]},"execution_count":19,"metadata":{},"output_type":"execute_result"}],"source":["masked_weights = weights.copy()\n","masked_weights.data[1] = np.nan\n","masked_weights"]},{"cell_type":"code","execution_count":20,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (month: 4, lat: 1, lon: 1)>\n","array([[[2.]],\n","\n","       [[0.]],\n","\n","       [[1.]],\n","\n","       [[1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * month    (month) int64 1 2 3 4
"],"text/plain":["\n","array([[[2.]],\n","\n"," [[0.]],\n","\n"," [[1.]],\n","\n"," [[1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * month (month) int64 1 2 3 4"]},"execution_count":20,"metadata":{},"output_type":"execute_result"}],"source":["dv_gb_avg2 = (ds.ts * masked_weights).groupby(\"time.month\").sum()\n","dv_gb_avg2"]},{"cell_type":"code","execution_count":21,"metadata":{},"outputs":[{"ename":"ValueError","evalue":"`weights` cannot contain missing values. Missing values can be replaced by `weights.fillna(0)`.","output_type":"error","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)","\u001b[1;32m/home/vo13/XCDAT/xcdat_test/validation/v1.0.0/temporal_average/masked_data_weights.ipynb Cell 16'\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39m# ValueError: `weights` cannot contain missing values. Missing values can be replaced by `weights.fillna(0)`.\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m dv_avg2 \u001b[39m=\u001b[39m ds\u001b[39m.\u001b[39;49mts\u001b[39m.\u001b[39;49mweighted(masked_weights)\u001b[39m.\u001b[39mmean(dim\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mtime\u001b[39m\u001b[39m\"\u001b[39m)\n","File \u001b[0;32m~/miniconda3/envs/xcdat_test_dev/lib/python3.9/site-packages/xarray/core/common.py:822\u001b[0m, in \u001b[0;36mDataWithCoords.weighted\u001b[0;34m(self, weights)\u001b[0m\n\u001b[1;32m 805\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mweighted\u001b[39m(\u001b[39mself\u001b[39m: T_DataWithCoords, weights: \u001b[39m\"\u001b[39m\u001b[39mDataArray\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Weighted[T_Xarray]:\n\u001b[1;32m 806\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 807\u001b[0m \u001b[39m Weighted operations.\u001b[39;00m\n\u001b[1;32m 808\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 819\u001b[0m \u001b[39m Missing values can be replaced by ``weights.fillna(0)``.\u001b[39;00m\n\u001b[1;32m 820\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 822\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_weighted_cls(\u001b[39mself\u001b[39;49m, weights)\n","File \u001b[0;32m~/miniconda3/envs/xcdat_test_dev/lib/python3.9/site-packages/xarray/core/weighted.py:118\u001b[0m, in \u001b[0;36mWeighted.__init__\u001b[0;34m(self, obj, weights)\u001b[0m\n\u001b[1;32m 112\u001b[0m weights \u001b[39m=\u001b[39m weights\u001b[39m.\u001b[39mcopy(\n\u001b[1;32m 113\u001b[0m data\u001b[39m=\u001b[39mweights\u001b[39m.\u001b[39mdata\u001b[39m.\u001b[39mmap_blocks(_weight_check, dtype\u001b[39m=\u001b[39mweights\u001b[39m.\u001b[39mdtype),\n\u001b[1;32m 114\u001b[0m deep\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m,\n\u001b[1;32m 115\u001b[0m )\n\u001b[1;32m 117\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 118\u001b[0m _weight_check(weights\u001b[39m.\u001b[39;49mdata)\n\u001b[1;32m 120\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mobj: T_Xarray \u001b[39m=\u001b[39m obj\n\u001b[1;32m 121\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mweights: \u001b[39m\"\u001b[39m\u001b[39mDataArray\u001b[39m\u001b[39m\"\u001b[39m \u001b[39m=\u001b[39m weights\n","File \u001b[0;32m~/miniconda3/envs/xcdat_test_dev/lib/python3.9/site-packages/xarray/core/weighted.py:104\u001b[0m, in \u001b[0;36mWeighted.__init__.._weight_check\u001b[0;34m(w)\u001b[0m\n\u001b[1;32m 
101\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_weight_check\u001b[39m(w):\n\u001b[1;32m 102\u001b[0m \u001b[39m# Ref https://github.com/pydata/xarray/pull/4559/files#r515968670\u001b[39;00m\n\u001b[1;32m 103\u001b[0m \u001b[39mif\u001b[39;00m duck_array_ops\u001b[39m.\u001b[39misnull(w)\u001b[39m.\u001b[39many():\n\u001b[0;32m--> 104\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 105\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m`weights` cannot contain missing values. \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 106\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mMissing values can be replaced by `weights.fillna(0)`.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 107\u001b[0m )\n\u001b[1;32m 108\u001b[0m \u001b[39mreturn\u001b[39;00m w\n","\u001b[0;31mValueError\u001b[0m: `weights` cannot contain missing values. Missing values can be replaced by `weights.fillna(0)`."]}],"source":["# ValueError: `weights` cannot contain missing values. Missing values can be replaced by `weights.fillna(0)`.\n","dv_avg2 = ds.ts.weighted(masked_weights).mean(dim=\"time\")"]},{"cell_type":"markdown","metadata":{},"source":["## 5. Means with weights (masked with 0 for missing values)"]},{"cell_type":"code","execution_count":22,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray 'time_bnds' (time: 5)>\n","array([0.5, 0. , 1. , 1. , 0.5])\n","Coordinates:\n","  * time     (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    month    (time) int64 1 2 3 4 1
"],"text/plain":["\n","array([0.5, 0. , 1. , 1. , 0.5])\n","Coordinates:\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," month (time) int64 1 2 3 4 1"]},"execution_count":22,"metadata":{},"output_type":"execute_result"}],"source":["filled_weights = masked_weights.copy().fillna(0)\n","filled_weights"]},{"cell_type":"code","execution_count":23,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (month: 4, lat: 1, lon: 1)>\n","array([[[2.]],\n","\n","       [[0.]],\n","\n","       [[1.]],\n","\n","       [[1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * month    (month) int64 1 2 3 4
"],"text/plain":["\n","array([[[2.]],\n","\n"," [[0.]],\n","\n"," [[1.]],\n","\n"," [[1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * month (month) int64 1 2 3 4"]},"execution_count":23,"metadata":{},"output_type":"execute_result"}],"source":["dv_gb_avg3 = (ds.ts * filled_weights).groupby(\"time.month\").sum()\n","dv_gb_avg3"]},{"cell_type":"code","execution_count":24,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray 'ts' (lat: 1, lon: 1)>\n","array([[1.33333333]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0
"],"text/plain":["\n","array([[1.33333333]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0"]},"execution_count":24,"metadata":{},"output_type":"execute_result"}],"source":["dv_avg3 = ds.ts.weighted(filled_weights).mean(dim=\"time\")\n","dv_avg3"]},{"cell_type":"markdown","metadata":{},"source":["## Compare results"]},{"cell_type":"code","execution_count":25,"metadata":{},"outputs":[{"data":{"text/plain":["True"]},"execution_count":25,"metadata":{},"output_type":"execute_result"}],"source":["# Generated weights vs. generated weights with np.nan for missing values\n","dv_gb_avg1.identical(dv_gb_avg2)"]},{"cell_type":"code","execution_count":26,"metadata":{},"outputs":[],"source":["# dv_avg1.identical(dv_avg2) # Does not work since missing weight values must be filled"]},{"cell_type":"code","execution_count":27,"metadata":{},"outputs":[{"data":{"text/plain":["True"]},"execution_count":27,"metadata":{},"output_type":"execute_result"}],"source":["# Generated weights vs. generated weights with 0 for missing values\n","dv_gb_avg1.identical(dv_gb_avg3)"]},{"cell_type":"code","execution_count":28,"metadata":{},"outputs":[{"data":{"text/plain":["True"]},"execution_count":28,"metadata":{},"output_type":"execute_result"}],"source":["dv_avg1.identical(dv_avg3)"]},{"cell_type":"markdown","metadata":{},"source":["# Key Takeaways"]},{"cell_type":"markdown","metadata":{},"source":["- `weights.fillna(0)` is required if weights contain `np.nan` and the `.weighted().mean()` API is used for calculating simple weighted averages (e.g., `ds.spatial.average()`)\n"," - `ValueError: 'weights' cannot contain missing values. Missing values can be replaced by 'weights.fillna(0)'.`\n"," - **Question: Do the weights generated by our spatial averaging methods include `np.nan`?**\n","- `weights` generated from time coordinates do not contain `np.nan` for missing values, so `weights.fillna(0)` is not required for temporal averaging\n","\n","**In any case, multiplying any weight value (`np.nan`, 0, 1, etc.) with a missing value (represented in xarray as `np.nan`) results in `np.nan`.**\n","\n","**This has no affect on the final reduction operation. (Refer to below cells)**\n"]},{"cell_type":"code","execution_count":29,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (time: 5, lat: 1, lon: 1)>\n","array([[[ 1.]],\n","\n","       [[nan]],\n","\n","       [[ 1.]],\n","\n","       [[ 1.]],\n","\n","       [[ 1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * time     (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    month    (time) int64 1 2 3 4 1
"],"text/plain":["\n","array([[[ 1.]],\n","\n"," [[nan]],\n","\n"," [[ 1.]],\n","\n"," [[ 1.]],\n","\n"," [[ 1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," month (time) int64 1 2 3 4 1"]},"execution_count":29,"metadata":{},"output_type":"execute_result"}],"source":["ds.ts * weights"]},{"cell_type":"code","execution_count":30,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (time: 5, lat: 1, lon: 1)>\n","array([[[ 1.]],\n","\n","       [[nan]],\n","\n","       [[ 1.]],\n","\n","       [[ 1.]],\n","\n","       [[ 1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * time     (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    month    (time) int64 1 2 3 4 1
"],"text/plain":["\n","array([[[ 1.]],\n","\n"," [[nan]],\n","\n"," [[ 1.]],\n","\n"," [[ 1.]],\n","\n"," [[ 1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," month (time) int64 1 2 3 4 1"]},"execution_count":30,"metadata":{},"output_type":"execute_result"}],"source":["ds.ts * filled_weights"]},{"cell_type":"code","execution_count":31,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","
<xarray.DataArray (time: 5, lat: 1, lon: 1)>\n","array([[[ 1.]],\n","\n","       [[nan]],\n","\n","       [[ 1.]],\n","\n","       [[ 1.]],\n","\n","       [[ 1.]]])\n","Coordinates:\n","  * lat      (lat) int64 -90\n","  * lon      (lon) int64 0\n","  * time     (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n","    month    (time) int64 1 2 3 4 1
"],"text/plain":["\n","array([[[ 1.]],\n","\n"," [[nan]],\n","\n"," [[ 1.]],\n","\n"," [[ 1.]],\n","\n"," [[ 1.]]])\n","Coordinates:\n"," * lat (lat) int64 -90\n"," * lon (lon) int64 0\n"," * time (time) datetime64[ns] 2000-01-01 2000-02-01 ... 2001-01-01\n"," month (time) int64 1 2 3 4 1"]},"execution_count":31,"metadata":{},"output_type":"execute_result"}],"source":["ds.ts * masked_weights"]},{"cell_type":"markdown","metadata":{},"source":[]}],"metadata":{"kernelspec":{"display_name":"Python 3.9.10 ('xcdat_test_dev')","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.10"},"orig_nbformat":4,"vscode":{"interpreter":{"hash":"c3f4adc33f1fe9042637935adbad7be3e61b57faba27a9893d01cc27a363d602"}}},"nbformat":4,"nbformat_minor":2}