diff --git a/.github/workflows/lint-black.yaml b/.github/workflows/lint-black.yaml
index a1933b32..821edea9 100644
--- a/.github/workflows/lint-black.yaml
+++ b/.github/workflows/lint-black.yaml
@@ -1,13 +1,6 @@
 name: Lint-black
 
-on:
-  push:
-    branches:
-      - '*'
-  pull_request:
-    branches:
-      - '*'
-
+on: [push, pull_request]
 
 jobs:
   black:
diff --git a/.github/workflows/lint-darglint.yaml b/.github/workflows/lint-darglint.yaml
index 903b3ca5..0974e9bf 100644
--- a/.github/workflows/lint-darglint.yaml
+++ b/.github/workflows/lint-darglint.yaml
@@ -1,13 +1,6 @@
 name: Lint-darglint
 
-on:
-  push:
-    branches:
-      - '*'
-  pull_request:
-    branches:
-      - '*'
-
+on: [push, pull_request]
 
 jobs:
   darglint:
diff --git a/.github/workflows/lint-isort.yaml b/.github/workflows/lint-isort.yaml
index db09cb85..6737815d 100644
--- a/.github/workflows/lint-isort.yaml
+++ b/.github/workflows/lint-isort.yaml
@@ -1,13 +1,6 @@
 name: Lint-isort
 
-on:
-  push:
-    branches:
-      - '*'
-  pull_request:
-    branches:
-      - '*'
-
+on: [push, pull_request]
 
 jobs:
   isort:
diff --git a/.github/workflows/lint-pydocstyle.yaml b/.github/workflows/lint-pydocstyle.yaml
index d0037497..7e7e8e8f 100644
--- a/.github/workflows/lint-pydocstyle.yaml
+++ b/.github/workflows/lint-pydocstyle.yaml
@@ -1,12 +1,6 @@
 name: Lint-pydocstyle
 
-on:
-  push:
-    branches:
-      - '*'
-  pull_request:
-    branches:
-      - '*'
+on: [push, pull_request]
 
 jobs:
   pydocstyle:
diff --git a/.github/workflows/lint-flake8.yaml b/.github/workflows/lint-ruff.yaml
similarity index 68%
rename from .github/workflows/lint-flake8.yaml
rename to .github/workflows/lint-ruff.yaml
index 3b46b309..ba74f1ff 100644
--- a/.github/workflows/lint-flake8.yaml
+++ b/.github/workflows/lint-ruff.yaml
@@ -1,15 +1,9 @@
-name: Lint-flake8
+name: Lint-ruff
 
-on:
-  push:
-    branches:
-      - '*'
-  pull_request:
-    branches:
-      - '*'
+on: [push, pull_request]
 
 jobs:
-  flake8:
+  ruff:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v4
@@ -22,6 +16,6 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         make install-lint
-    - name: Run flake8
+    - name: Run ruff
       run: |
-        make flake8
+        make ruff 
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index c7a3288d..375d0379 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ src/
 .vscode/*
 .coverage
 .eggs
+**.DS_Store
diff --git a/curvlinops/_torch_base.py b/curvlinops/_torch_base.py
index 00e71975..33df807c 100644
--- a/curvlinops/_torch_base.py
+++ b/curvlinops/_torch_base.py
@@ -191,7 +191,7 @@ def __check_tensor_and_preprocess(
         Raises:
             ValueError: If the input tensor has an invalid shape.
         """
-        if X.ndim > 2 or X.shape[0] != self.shape[1]:
+        if X.ndim > 2 or X.shape[0] != self.shape[1]:  # noqa: PLR2004
             raise ValueError(
                 f"Input tensor must have shape ({self.shape[1]},) or "
                 + f"({self.shape[1]}, K), with K arbitrary. Got {X.shape}."
@@ -583,8 +583,8 @@ def _loop_over_data(
             # Assume everything is handled by the model
             # if `X` is a custom data format
             if isinstance(X, Tensor):
-                X = X.to(self._device)
-            y = y.to(self._device)
+                X = X.to(self._device)  # noqa: PLW2901
+            y = y.to(self._device)  # noqa: PLW2901
             yield (X, y)
 
     def _get_normalization_factor(
diff --git a/curvlinops/diagonal/hutchinson.py b/curvlinops/diagonal/hutchinson.py
index b7647a5a..4f3e99c8 100644
--- a/curvlinops/diagonal/hutchinson.py
+++ b/curvlinops/diagonal/hutchinson.py
@@ -62,7 +62,7 @@ def __init__(self, A: LinearOperator):
         Raises:
             ValueError: If the operator is not square.
         """
-        if len(A.shape) != 2 or A.shape[0] != A.shape[1]:
+        if len(A.shape) != 2 or A.shape[0] != A.shape[1]:  # noqa: PLR2004
             raise ValueError(f"A must be square. Got shape {A.shape}.")
         self._A = A
 
diff --git a/curvlinops/fisher.py b/curvlinops/fisher.py
index d4a1e5e5..d5bc9d9d 100644
--- a/curvlinops/fisher.py
+++ b/curvlinops/fisher.py
@@ -292,7 +292,7 @@ def sample_grad_output(self, output: Tensor, num_samples: int, y: Tensor) -> Ten
             NotImplementedError: If the prediction does not have two dimensions.
             NotImplementedError: If binary classification labels are not binary.
         """
-        if output.ndim != 2:
+        if output.ndim != 2:  # noqa: PLR2004
             raise NotImplementedError(f"Only 2d outputs supported. Got {output.shape}")
 
         C = output.shape[1]
diff --git a/curvlinops/inverse.py b/curvlinops/inverse.py
index 0a35f578..7eb4afff 100644
--- a/curvlinops/inverse.py
+++ b/curvlinops/inverse.py
@@ -455,7 +455,8 @@ def _compute_inverse_factors(
                     warn(
                         f"Failed to compute Cholesky decomposition in {aaT.dtype} "
                         f"precision with error {error}. "
-                        "Retrying in double precision..."
+                        "Retrying in double precision...",
+                        stacklevel=2
                     )
                     # Retry in double precision
                     original_type = aaT.dtype
@@ -476,7 +477,8 @@ def _compute_inverse_factors(
                     warn(
                         f"Failed to compute Cholesky decomposition in {ggT.dtype} "
                         f"precision with error {error}. "
-                        "Retrying in double precision..."
+                        "Retrying in double precision...",
+                        stacklevel=2
                     )
                     # Retry in double precision
                     original_dtype = ggT.dtype
diff --git a/curvlinops/kfac.py b/curvlinops/kfac.py
index 1f6e488e..b0435e33 100644
--- a/curvlinops/kfac.py
+++ b/curvlinops/kfac.py
@@ -548,7 +548,7 @@ def draw_label(self, output: Tensor) -> Tensor:
             ValueError: If the output is not 2d.
             NotImplementedError: If the loss function is not supported.
         """
-        if output.ndim != 2:
+        if output.ndim != 2:  # noqa: PLR2004
             raise ValueError("Only a 2d output is supported.")
 
         if isinstance(self._loss_func, MSELoss):
diff --git a/curvlinops/kfac_utils.py b/curvlinops/kfac_utils.py
index d7061739..679cbf27 100644
--- a/curvlinops/kfac_utils.py
+++ b/curvlinops/kfac_utils.py
@@ -100,7 +100,7 @@ def loss_hessian_matrix_sqrt(
         NotImplementedError: If the loss function is ``BCEWithLogitsLoss`` but the
             target is not binary.
     """
-    if output_one_datum.ndim != 2 or output_one_datum.shape[0] != 1:
+    if output_one_datum.ndim != 2 or output_one_datum.shape[0] != 1:  # noqa: PLR2004
         raise ValueError(
             f"Expected 'output_one_datum' to be 2d with shape [1, C], got "
             f"{output_one_datum.shape}"
diff --git a/curvlinops/trace/hutchinson.py b/curvlinops/trace/hutchinson.py
index a2d1506c..26f5e8a8 100644
--- a/curvlinops/trace/hutchinson.py
+++ b/curvlinops/trace/hutchinson.py
@@ -58,7 +58,7 @@ def __init__(self, A: LinearOperator):
         Raises:
             ValueError: If the operator is not square.
         """
-        if len(A.shape) != 2 or A.shape[0] != A.shape[1]:
+        if len(A.shape) != 2 or A.shape[0] != A.shape[1]:  # noqa: PLR2004
             raise ValueError(f"A must be square. Got shape {A.shape}.")
         self._A = A
 
diff --git a/curvlinops/trace/meyer2020hutch.py b/curvlinops/trace/meyer2020hutch.py
index e37f32e3..a0f5b409 100644
--- a/curvlinops/trace/meyer2020hutch.py
+++ b/curvlinops/trace/meyer2020hutch.py
@@ -86,7 +86,7 @@ def __init__(
             ``basis_dim = s1`` and draw ``s2`` samples from Hutch++ such that
             ``2 * s1 + s2 = s``.
         """
-        if len(A.shape) != 2 or A.shape[0] != A.shape[1]:
+        if len(A.shape) != 2 or A.shape[0] != A.shape[1]:  # noqa: PLR2004
             raise ValueError(f"A must be square. Got shape {A.shape}.")
         self._A = A
 
diff --git a/docs/examples/basic_usage/example_benchmark.py b/docs/examples/basic_usage/example_benchmark.py
index 925febfb..afd572cc 100644
--- a/docs/examples/basic_usage/example_benchmark.py
+++ b/docs/examples/basic_usage/example_benchmark.py
@@ -185,7 +185,7 @@ def setup_problem(
         for m in supported_layers:
             # ignore the last layer of GPT because it has 50k outputs, which
             # will yield an extremely large Kronecker factor
-            if all(d <= 50_000 for d in m.weight.shape):
+            if all(d <= 50_000 for d in m.weight.shape):  # noqa: PLR2004
                 params.extend([p for p in m.parameters() if p.requires_grad])
     else:
         params = [p for p in model.parameters() if p.requires_grad]
@@ -523,7 +523,7 @@ def visualize_time_benchmark(
     num_gradients = x_max / reference
     spacing = 1 / 4
     num_ticks = 1 + floor(num_gradients / spacing)
-    while num_ticks > 8:
+    while num_ticks > 8:  # noqa: PLR2004
         spacing *= 2
         num_ticks = 1 + floor(num_gradients / spacing)
 
@@ -701,7 +701,7 @@ def visualize_peakmem_benchmark(
     num_gradients = x_max / reference
     spacing = 1 / 4
     num_ticks = 1 + floor(num_gradients / spacing)
-    while num_ticks > 8:
+    while num_ticks > 8:  # noqa: PLR2004
         spacing *= 2
         num_ticks = 1 + floor(num_gradients / spacing)
 
diff --git a/docs/examples/basic_usage/example_eigenvalues.py b/docs/examples/basic_usage/example_eigenvalues.py
index 5fdbccdd..8916500e 100644
--- a/docs/examples/basic_usage/example_eigenvalues.py
+++ b/docs/examples/basic_usage/example_eigenvalues.py
@@ -169,11 +169,10 @@ def orthonormalize(v: numpy.ndarray, basis: List[numpy.ndarray]) -> numpy.ndarra
 
             if eigenvalue is None:
                 eigenvalue = tmp_eigenvalue
+            elif abs(eigenvalue - tmp_eigenvalue) / (abs(eigenvalue) + 1e-6) < tol:
+                break
             else:
-                if abs(eigenvalue - tmp_eigenvalue) / (abs(eigenvalue) + 1e-6) < tol:
-                    break
-                else:
-                    eigenvalue = tmp_eigenvalue
+                eigenvalue = tmp_eigenvalue
 
         eigenvalues.append(eigenvalue)
         eigenvectors.append(v)
diff --git a/docs/examples/basic_usage/example_inverses.py b/docs/examples/basic_usage/example_inverses.py
index 849babd3..b68840c0 100644
--- a/docs/examples/basic_usage/example_inverses.py
+++ b/docs/examples/basic_usage/example_inverses.py
@@ -249,7 +249,7 @@
 # of the matrix to be inverted:
 max_eigval = eigsh(damped_GGN, k=1, which="LM", return_eigenvectors=False)[0]
 # eigenvalues (scale * damped_GGN_mat) are in [0; 2)
-scale = 1.0 if max_eigval < 2.0 else 1.99 / max_eigval
+scale = 1.0 if max_eigval < 2.0 else 1.99 / max_eigval  # noqa: PLR2004
 
 # %%
 #
diff --git a/docs/examples/basic_usage/example_model_merging.py b/docs/examples/basic_usage/example_model_merging.py
index ed91e982..0295f307 100644
--- a/docs/examples/basic_usage/example_model_merging.py
+++ b/docs/examples/basic_usage/example_model_merging.py
@@ -115,7 +115,7 @@ def make_dataset() -> TensorDataset:
     for epoch in range(num_epochs):
         for batch_idx, (X, y) in enumerate(data_loader):
             optimizer.zero_grad()
-            X, y = X.to(DEVICE), y.to(DEVICE)
+            X, y = X.to(DEVICE), y.to(DEVICE)  # noqa: PLW2901
             loss = loss_function(model(X), y)
             loss.backward()
             optimizer.step()
diff --git a/docs/examples/basic_usage/memory_benchmark.py b/docs/examples/basic_usage/memory_benchmark.py
index 6e7f70cc..d16642eb 100644
--- a/docs/examples/basic_usage/memory_benchmark.py
+++ b/docs/examples/basic_usage/memory_benchmark.py
@@ -21,7 +21,7 @@
 from curvlinops import KFACInverseLinearOperator, KFACLinearOperator
 
 
-def run_peakmem_benchmark(  # noqa: C901
+def run_peakmem_benchmark(  # noqa: C901, PLR0915
     linop_str: str, problem_str: str, device_str: str, op_str: str
 ):
     """Execute the memory benchmark for a given linear operator class and save results.
diff --git a/makefile b/makefile
index 84341dca..2a1dba50 100644
--- a/makefile
+++ b/makefile
@@ -29,8 +29,8 @@ help:
 	@echo "        Run black on the project"
 	@echo "black-check"
 	@echo "        Check if black would change files"
-	@echo "flake8"
-	@echo "        Run flake8 on the project"
+	@echo "ruff"
+	@echo "        Run ruff on the project"
 	@echo "conda-env"
 	@echo "        Create conda environment 'curvlinops' with dev setup"
 	@echo "darglint-check"
@@ -97,10 +97,10 @@ black:
 black-check:
 	@black . --config=black.toml --check
 
-.PHONY: flake8
+.PHONY: ruff
 
-flake8:
-	@flake8 .
+ruff:
+	@ruff check .
 
 .PHONY: darglint-check
 
@@ -122,6 +122,6 @@ conda-env:
 lint:
 	make black-check
 	make isort-check
-	make flake8
+	make ruff
 	make darglint-check
 	make pydocstyle-check
diff --git a/pyproject.toml b/pyproject.toml
index dd43effc..39128dbe 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,14 +65,7 @@ test = [
 # Dependencies needed for linting.
 lint = [
     "black",
-    "flake8",
-    "mccabe",
-    "pycodestyle",
-    "pyflakes",
-    "pep8-naming",
-    "flake8-bugbear",
-    "flake8-comprehensions",
-    "flake8-tidy-imports",
+    "ruff",
     "darglint",
     "pydocstyle",
     "isort",
@@ -106,3 +99,38 @@ use_parentheses = true
 convention = "google"
 match = '.*\.py'
 match_dir = '^(?!(test|.git)).*'
+
+[tool.ruff]
+# Same as flake8's max-line-length
+line-length = 88
+
+[tool.ruff.lint]
+# Enable all rules from flake8 (E, F), plus additional ones
+select = ["E", "F", "B", "C", "W", "B9", "PLE", "PLW", "PLR"]
+ignore = [
+    # E501 max-line-length (replaced by B950 (max-line-length + 10%))
+    "E501",
+    # C408 use {} instead of dict() (ignored because pytorch uses dict)
+    "C408",
+    # E203 whitespace before :
+    "E203",
+    # E231 missing whitespace after ','
+    "E231",
+    # W291 trailing whitespace
+    "W291",
+    # E203 line break before binary operator (replaces W503)
+    "E203",
+    # Line break occurred after a binary operator (replaces W504)
+    "E226",
+    # B905 `zip()` without an explicit `strict=` parameter
+    "B905",
+    # Too many arguments in function definition (9 > 5)
+    "PLR0913",
+]
+
+[tool.ruff.lint.per-file-ignores]
+# Add any per-file ignores here if needed
+
+[tool.ruff.lint.flake8-bugbear]
+# Allow assert statements in tests
+extend-immutable-calls = ["pytest.raises", "pytest.warns", "pytest.mark.skip"]
diff --git a/test/papyan2020traces/test_spectrum.py b/test/papyan2020traces/test_spectrum.py
index e4b95623..a85a803f 100644
--- a/test/papyan2020traces/test_spectrum.py
+++ b/test/papyan2020traces/test_spectrum.py
@@ -47,7 +47,7 @@ def test_approximate_boundaries():
 
     for inputs, results in cases:
         output = approximate_boundaries(A, boundaries=inputs)
-        assert len(output) == 2
+        assert len(output) == 2  # noqa: PLR2004
         assert isinstance(output[0], float)
         assert isinstance(output[1], float)
         assert allclose(output, results)
@@ -69,7 +69,7 @@ def test_approximate_boundaries_abs():
 
     for inputs, results in cases:
         output = approximate_boundaries_abs(A, boundaries=inputs)
-        assert len(output) == 2
+        assert len(output) == 2  # noqa: PLR2004
         assert isinstance(output[0], float)
         assert isinstance(output[1], float)
         assert allclose(output, results)
diff --git a/test/test__torch_base.py b/test/test__torch_base.py
index 5e5205bb..6e2cdeb9 100644
--- a/test/test__torch_base.py
+++ b/test/test__torch_base.py
@@ -136,7 +136,7 @@ def __iter__(self) -> Iterator[Tuple[Union[Tensor, MutableMapping], Tensor]]:
                     if isinstance(value, Tensor):
                         X[key] = X[key][permutation]
             else:
-                X = X[permutation]
+                X = X[permutation]  # noqa: PLW2901
 
             yield X, y[permutation]
 
diff --git a/test/test_inverse.py b/test/test_inverse.py
index c9f3a429..6e726bdd 100644
--- a/test/test_inverse.py
+++ b/test/test_inverse.py
@@ -137,7 +137,7 @@ def test_Neumann_inverse_damped_GGN_matvec(inv_case, delta: float = 1e-2):
 
     # set scale such that Neumann series converges
     eval_max = eigh(damped_GGN_functorch)[0][-1]
-    scale = 1.0 if eval_max < 2 else 1.9 / eval_max
+    scale = 1.0 if eval_max < 2 else 1.9 / eval_max  # noqa: PLR2004
 
     # NOTE This may break when other cases are added because slow convergence
     inv_GGN = NeumannInverseLinearOperator(GGN + damping, num_terms=7_000, scale=scale)
@@ -303,7 +303,7 @@ def test_KFAC_inverse_damped_matmat(
 @mark.parametrize(
     "separate_weight_and_bias", [True, False], ids=["separate_bias", "joint_bias"]
 )
-def test_KFAC_inverse_heuristically_damped_matmat(  # noqa: C901
+def test_KFAC_inverse_heuristically_damped_matmat(  # noqa: C901, PLR0912, PLR0915
     case: Tuple[
         Module,
         Union[MSELoss, CrossEntropyLoss],
diff --git a/test/test_kfac.py b/test/test_kfac.py
index 59d455d3..37cafd14 100644
--- a/test/test_kfac.py
+++ b/test/test_kfac.py
@@ -560,7 +560,7 @@ def test_expand_setting_scaling(
             # MSE loss averages over number of output channels
             loss_term_factor *= output_random_variable_size
         for ggT in kfac_sum_torch._gradient_covariances.values():
-            ggT /= kfac_sum_torch._N_data * loss_term_factor
+            ggT.div_(kfac_sum_torch._N_data * loss_term_factor)
     kfac_simulated_mean_mat = kfac_sum @ eye(kfac_sum.shape[1])
 
     # KFAC with mean reduction
diff --git a/test/utils.py b/test/utils.py
index fdfeae3c..be0677e1 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -239,7 +239,7 @@ def forward(self, x: Tensor) -> Tensor:
         # Example: Transformer for translation: (batch, sequence_length, c)
         # (although second and third dimension would have to be transposed for
         # classification)
-        if x.ndim > 2 and self.loss == "CE":
+        if x.ndim > 2 and self.loss == "CE":  # noqa: PLR2004
             x = rearrange(x, "batch ... c -> batch c ...")
         return x