Merge branch 'main' into ig/fix_equality_checl
ilan-gold authored May 17, 2024
2 parents 5262440 + 31111b3 commit 89acd11
Showing 29 changed files with 886 additions and 324 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/benchmarks.yml
@@ -28,8 +28,11 @@ jobs:
           environment-name: xarray-tests
           cache-environment: true
           cache-environment-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}-benchmark"
+          # add "build" because of https://github.com/airspeed-velocity/asv/issues/1385
           create-args: >-
             asv
+            build
+            mamba
       - name: Run benchmarks
@@ -47,9 +50,6 @@ jobs:
           asv machine --yes
           echo "Baseline: ${{ github.event.pull_request.base.sha }} (${{ github.event.pull_request.base.label }})"
           echo "Contender: ${GITHUB_SHA} (${{ github.event.pull_request.head.label }})"
-          # Use mamba for env creation
-          # export CONDA_EXE=$(which mamba)
-          export CONDA_EXE=$(which conda)
           # Run benchmarks for current commit against base
           ASV_OPTIONS="--split --show-stderr --factor $ASV_FACTOR"
           asv continuous $ASV_OPTIONS ${{ github.event.pull_request.base.sha }} ${GITHUB_SHA} \
14 changes: 10 additions & 4 deletions .github/workflows/ci-additional.yaml
@@ -6,6 +6,12 @@ on:
   pull_request:
     branches:
       - "main"
+    paths:
+      - 'ci/**'
+      - '.github/**'
+      - '/*' # covers files such as `pyproject.toml`
+      - 'properties/**'
+      - 'xarray/**'
   workflow_dispatch: # allows you to trigger manually
 
 concurrency:
@@ -127,7 +133,7 @@ jobs:
         python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report xarray/
       - name: Upload mypy coverage to Codecov
-        uses: codecov/codecov-action@v4.3.0
+        uses: codecov/codecov-action@v4.3.1
         with:
           file: mypy_report/cobertura.xml
           flags: mypy
@@ -181,7 +187,7 @@ jobs:
         python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report xarray/
       - name: Upload mypy coverage to Codecov
-        uses: codecov/codecov-action@v4.3.0
+        uses: codecov/codecov-action@v4.3.1
         with:
           file: mypy_report/cobertura.xml
           flags: mypy39
@@ -242,7 +248,7 @@ jobs:
         python -m pyright xarray/
       - name: Upload pyright coverage to Codecov
-        uses: codecov/codecov-action@v4.3.0
+        uses: codecov/codecov-action@v4.3.1
         with:
           file: pyright_report/cobertura.xml
           flags: pyright
@@ -301,7 +307,7 @@ jobs:
         python -m pyright xarray/
       - name: Upload pyright coverage to Codecov
-        uses: codecov/codecov-action@v4.3.0
+        uses: codecov/codecov-action@v4.3.1
         with:
           file: pyright_report/cobertura.xml
           flags: pyright39
8 changes: 7 additions & 1 deletion .github/workflows/ci.yaml
@@ -6,6 +6,12 @@ on:
   pull_request:
     branches:
       - "main"
+    paths:
+      - 'ci/**'
+      - '.github/**'
+      - '/*' # covers files such as `pyproject.toml`
+      - 'properties/**'
+      - 'xarray/**'
   workflow_dispatch: # allows you to trigger manually
 
 concurrency:
@@ -156,7 +162,7 @@ jobs:
           path: pytest.xml
 
       - name: Upload code coverage to Codecov
-        uses: codecov/codecov-action@v4.3.0
+        uses: codecov/codecov-action@v4.3.1
         with:
           file: ./coverage.xml
           flags: unittests
2 changes: 1 addition & 1 deletion .github/workflows/upstream-dev-ci.yaml
@@ -143,7 +143,7 @@ jobs:
       run: |
         python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report
       - name: Upload mypy coverage to Codecov
-        uses: codecov/codecov-action@v4.3.0
+        uses: codecov/codecov-action@v4.3.1
         with:
           file: mypy_report/cobertura.xml
           flags: mypy
5 changes: 3 additions & 2 deletions .gitignore
@@ -50,7 +50,8 @@ nosetests.xml
 dask-worker-space/
 
 # asv environments
-.asv
+asv_bench/.asv
+asv_bench/pkgs
 
 # Translations
 *.mo
@@ -68,7 +69,7 @@ dask-worker-space/
 
 # xarray specific
 doc/_build
-generated/
+doc/generated/
 xarray/tests/data/*.grib.*.idx
 
 # Sync tools
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
@@ -4,7 +4,7 @@ ci:
   exclude: 'xarray/datatree_.*'
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v4.6.0
     hooks:
       - id: trailing-whitespace
       - id: end-of-file-fixer
@@ -13,24 +13,24 @@ repos:
       - id: mixed-line-ending
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
-    rev: 'v0.3.4'
+    rev: 'v0.4.3'
    hooks:
       - id: ruff
         args: ["--fix", "--show-fixes"]
   # https://github.com/python/black#version-control-integration
   - repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 24.3.0
+    rev: 24.4.2
     hooks:
       - id: black-jupyter
   - repo: https://github.com/keewis/blackdoc
     rev: v0.3.9
     hooks:
       - id: blackdoc
         exclude: "generate_aggregations.py"
-        additional_dependencies: ["black==24.3.0"]
+        additional_dependencies: ["black==24.4.2"]
       - id: blackdoc-autoupdate-black
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.9.0
+    rev: v1.10.0
     hooks:
       - id: mypy
         # Copied from setup.cfg
12 changes: 8 additions & 4 deletions asv_bench/asv.conf.json
@@ -29,7 +29,7 @@
     // If missing or the empty string, the tool will be automatically
     // determined by looking for tools on the PATH environment
     // variable.
-    "environment_type": "conda",
+    "environment_type": "mamba",
     "conda_channels": ["conda-forge"],
 
     // timeout in seconds for installing any dependencies in environment
@@ -41,7 +41,7 @@
 
     // The Pythons you'd like to test against. If not provided, defaults
     // to the current version of Python used to run `asv`.
-    "pythons": ["3.10"],
+    "pythons": ["3.11"],
 
     // The matrix of dependencies to test. Each key is the name of a
     // package (in PyPI) and the values are version numbers. An empty
@@ -72,8 +72,12 @@
         "sparse": [""],
         "cftime": [""]
     },
-
-
+    // fix for bad builds
+    // https://github.com/airspeed-velocity/asv/issues/1389#issuecomment-2076131185
+    "build_command": [
+        "python -m build",
+        "python -mpip wheel --no-deps --no-build-isolation --no-index -w {build_cache_dir} {build_dir}"
+    ],
     // Combinations of libraries/python versions can be excluded/included
     // from the set to test. Each entry is a dictionary containing additional
     // key-value pairs to include/exclude.
17 changes: 9 additions & 8 deletions asv_bench/benchmarks/groupby.py
@@ -68,6 +68,7 @@ def setup(self, *args, **kwargs):
         self.ds2d_mean = self.ds2d.groupby("b").mean().compute()
 
 
+# TODO: These don't work now because we are calling `.compute` explicitly.
 class GroupByPandasDataFrame(GroupBy):
     """Run groupby tests using pandas DataFrame."""
 
@@ -111,11 +112,11 @@ def setup(self, *args, **kwargs):
             {
                 "b": ("time", np.arange(365.0 * 24)),
             },
-            coords={"time": pd.date_range("2001-01-01", freq="H", periods=365 * 24)},
+            coords={"time": pd.date_range("2001-01-01", freq="h", periods=365 * 24)},
         )
         self.ds2d = self.ds1d.expand_dims(z=10)
-        self.ds1d_mean = self.ds1d.resample(time="48H").mean()
-        self.ds2d_mean = self.ds2d.resample(time="48H").mean()
+        self.ds1d_mean = self.ds1d.resample(time="48h").mean()
+        self.ds2d_mean = self.ds2d.resample(time="48h").mean()
 
     @parameterized(["ndim"], [(1, 2)])
     def time_init(self, ndim):
@@ -127,15 +128,15 @@ def time_init(self, ndim):
     def time_agg_small_num_groups(self, method, ndim, use_flox):
         ds = getattr(self, f"ds{ndim}d")
         with xr.set_options(use_flox=use_flox):
-            getattr(ds.resample(time="3M"), method)().compute()
+            getattr(ds.resample(time="3ME"), method)().compute()
 
     @parameterized(
         ["method", "ndim", "use_flox"], [("sum", "mean"), (1, 2), (True, False)]
     )
     def time_agg_large_num_groups(self, method, ndim, use_flox):
         ds = getattr(self, f"ds{ndim}d")
         with xr.set_options(use_flox=use_flox):
-            getattr(ds.resample(time="48H"), method)().compute()
+            getattr(ds.resample(time="48h"), method)().compute()
 
 
 class ResampleDask(Resample):
@@ -154,13 +155,13 @@ def setup(self, *args, **kwargs):
             },
             coords={
                 "time": xr.date_range(
-                    "2001-01-01", freq="H", periods=365 * 24, calendar="noleap"
+                    "2001-01-01", freq="h", periods=365 * 24, calendar="noleap"
                 )
             },
         )
         self.ds2d = self.ds1d.expand_dims(z=10)
-        self.ds1d_mean = self.ds1d.resample(time="48H").mean()
-        self.ds2d_mean = self.ds2d.resample(time="48H").mean()
+        self.ds1d_mean = self.ds1d.resample(time="48h").mean()
+        self.ds2d_mean = self.ds2d.resample(time="48h").mean()
 
 
 @parameterized(["use_cftime", "use_flox"], [[True, False], [True, False]])
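The frequency-string churn above ("H" to "h", "48H" to "48h", "3M" to "3ME") tracks pandas 2.2, which deprecated the uppercase offset aliases. A minimal sketch of the new spellings (assuming pandas >= 2.2):

    import pandas as pd

    # Lowercase "h" replaces "H" for hourly; "ME" (month-end) replaces "M".
    # The old uppercase spellings still work but emit FutureWarning.
    hourly = pd.date_range("2001-01-01", freq="h", periods=24)
    month_end = pd.date_range("2001-01-01", freq="ME", periods=12)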
11 changes: 11 additions & 0 deletions asv_bench/benchmarks/indexing.py
@@ -12,12 +12,14 @@
 nt = 500
 
 basic_indexes = {
+    "1scalar": {"x": 0},
     "1slice": {"x": slice(0, 3)},
     "1slice-1scalar": {"x": 0, "y": slice(None, None, 3)},
     "2slicess-1scalar": {"x": slice(3, -3, 3), "y": 1, "t": slice(None, -3, 3)},
 }
 
 basic_assignment_values = {
+    "1scalar": 0,
     "1slice": xr.DataArray(randn((3, ny), frac_nan=0.1), dims=["x", "y"]),
     "1slice-1scalar": xr.DataArray(randn(int(ny / 3) + 1, frac_nan=0.1), dims=["y"]),
     "2slicess-1scalar": xr.DataArray(
@@ -74,6 +76,10 @@ def setup(self, key):
                 "x_coords": ("x", np.linspace(1.1, 2.1, nx)),
             },
         )
+        # Benchmark how indexing is slowed down by adding many scalar variables
+        # to the dataset
+        # https://github.com/pydata/xarray/pull/9003
+        self.ds_large = self.ds.merge({f"extra_var{i}": i for i in range(400)})
 
 
 class Indexing(Base):
@@ -89,6 +95,11 @@ def time_indexing_outer(self, key):
     def time_indexing_vectorized(self, key):
         self.ds.isel(**vectorized_indexes[key]).load()
 
+    @parameterized(["key"], [list(basic_indexes.keys())])
+    def time_indexing_basic_ds_large(self, key):
+        # https://github.com/pydata/xarray/pull/9003
+        self.ds_large.isel(**basic_indexes[key]).load()
+
 
 class Assignment(Base):
     @parameterized(["key"], [list(basic_indexes.keys())])
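The new ds_large benchmark targets a specific pathology: indexing a dataset padded with hundreds of scalar variables. A minimal sketch of that scenario, mirroring the benchmark above (array shape and names here are illustrative):

    import numpy as np
    import xarray as xr

    # A modest 2-D array plus 400 scalar variables; per-variable overhead
    # in isel() starts to dominate once the dataset is this wide.
    ds = xr.Dataset({"a": (("x", "y"), np.random.randn(100, 100))})
    ds_large = ds.merge({f"extra_var{i}": i for i in range(400)})
    subset = ds_large.isel(x=0, y=slice(0, 10))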
2 changes: 1 addition & 1 deletion ci/min_deps_check.py
@@ -133,7 +133,7 @@ def process_pkg(
     - publication date of version suggested by policy (YYYY-MM-DD)
     - status ("<", "=", "> (!)")
     """
-    print("Analyzing %s..." % pkg)
+    print(f"Analyzing {pkg}...")
     versions = query_conda(pkg)
 
     try:
4 changes: 2 additions & 2 deletions doc/user-guide/io.rst
@@ -874,7 +874,7 @@ and then calling ``to_zarr`` with ``compute=False`` to write only metadata
     # The values of this dask array are entirely irrelevant; only the dtype,
     # shape and chunks are used
     dummies = dask.array.zeros(30, chunks=10)
-    ds = xr.Dataset({"foo": ("x", dummies)})
+    ds = xr.Dataset({"foo": ("x", dummies)}, coords={"x": np.arange(30)})
     path = "path/to/directory.zarr"
     # Now we write the metadata without computing any array values
     ds.to_zarr(path, compute=False)
@@ -890,7 +890,7 @@ where the data should be written (in index space, not label space), e.g.,
     # For convenience, we'll slice a single dataset, but in the real use-case
     # we would create them separately possibly even from separate processes.
-    ds = xr.Dataset({"foo": ("x", np.arange(30))})
+    ds = xr.Dataset({"foo": ("x", np.arange(30))}, coords={"x": np.arange(30)})
     # Any of the following region specifications are valid
     ds.isel(x=slice(0, 10)).to_zarr(path, region="auto")
     ds.isel(x=slice(10, 20)).to_zarr(path, region={"x": "auto"})
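Stitched together, the two io.rst hunks document a metadata-first, region-write workflow; the change adds an explicit "x" coordinate so that region="auto" has labels to infer from. A minimal end-to-end sketch (assuming the zarr backend is installed and "directory.zarr" is a writable local store):

    import dask.array
    import numpy as np
    import xarray as xr

    path = "directory.zarr"  # hypothetical local store

    # Step 1: write metadata only. The dummy dask values are never computed;
    # only their dtype, shape, and chunks matter.
    dummies = dask.array.zeros(30, chunks=10)
    template = xr.Dataset({"foo": ("x", dummies)}, coords={"x": np.arange(30)})
    template.to_zarr(path, compute=False)

    # Step 2: fill regions independently, in index space. With the "x"
    # coordinate written in step 1, region="auto" can infer the slice.
    ds = xr.Dataset({"foo": ("x", np.arange(30))}, coords={"x": np.arange(30)})
    ds.isel(x=slice(0, 10)).to_zarr(path, region="auto")
    ds.isel(x=slice(10, 20)).to_zarr(path, region={"x": "auto"})
    ds.isel(x=slice(20, 30)).to_zarr(path, region={"x": slice(20, 30)})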