diff --git a/notebooks/generate_data_pes_4well.py b/notebooks/generate_data_pes_4well.py
new file mode 100755
index 00000000..58859f50
--- /dev/null
+++ b/notebooks/generate_data_pes_4well.py
@@ -0,0 +1,412 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import json
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.metrics.pairwise import pairwise_distances
+from selector import MaxMin, MaxSum, OptiSim, DISE, Medoid, GridPartition
+
+
+class FourWellPotential:
+    def __init__(self):
+        # papers taken from paper
+        self.v0 = 5.0
+        self.a0 = 0.6
+        self.a = [3.0, 1.5, 3.2, 2.0]
+        self.b_q1 = 0.1
+        self.b_q2 = 0.1
+        self.s_q1 = [0.3, 1.0, 0.4, 1.0]
+        self.s_q2 = [0.4, 1.0, 1.0, 0.1]
+        self.alpha = [1.3, -1.5, 1.4, -1.3]
+        self.beta = [-1.6, -1.7, 1.8, 1.23]
+        # evaluate potential at A, B, C, D minima (taken from paper)
+        points = np.array([[1.29, -1.65], [1.4, 1.78], [-1.29, -1.53], [-1.17, 1.56]])
+        self.minima = [points, self.evaluate(points)]
+
+    def evaluate(self, points):
+        if points.ndim != 2 and points.shape[1] != 2:
+            raise ValueError("Argument points must be 2D array with 2 columns.")
+
+        # TODO: This needs to be vectorized
+        values = []
+        for q1, q2 in points:
+            v = self.v0 + self.a0 * (np.exp(-((q1 - self.b_q1) ** 2) - (q2 - self.b_q2) ** 2))
+            for i in range(4):
+                v -= self.a[i] * np.exp(
+                    -self.s_q1[i] * (q1 - self.alpha[i]) ** 2
+                    - self.s_q2[i] * (q2 - self.beta[i]) ** 2
+                )
+            values.append(v)
+        return np.array(values)
+
+    def generate_points_uniform(
+        self, size_q1, size_q2, range_q1=(-3.0, 3.0), range_q2=(-3.0, 3.0), noise=0.0
+    ):
+        # sample q1 and q2 coordinates along each axes
+        q1 = np.linspace(range_q1[0], range_q1[1], size_q1)
+        q2 = np.linspace(range_q2[0], range_q2[1], size_q2)
+        # combine q1 and q2 coordinates into a grid
+        data = np.array(np.meshgrid(q1, q2)).T.reshape(-1, 2)
+        return data
+
+
+def select_monte_carlo(potential, min_potential, k, seed=42):
+    npoints = len(potential)
+    # compute Boltzman probability
+    prob = np.exp(-1.5 * k * (potential - min_potential))
+
+    rng = np.random.default_rng(seed=seed)
+    random = rng.random(size=npoints)
+    indices = np.where(prob > random)[0]
+    print("Monte Carlo selection with k:", k)
+    print("Minimum potential           :", min_potential)
+    print("Total number of points      :", npoints)
+    print("Number of selected points   :", len(indices))
+    return indices
+
+
+def generate_datasets(size_random, k_values, size_dataset, niter=10, seed=42):
+    # Create directory for storing plots and npz data files, if it does not exist
+    os.makedirs("data", exist_ok=True)
+
+    # Make an instance of the 4-well potential
+    # ----------------------------------------
+    pes = FourWellPotential()
+    print("Coordinates of the 4 minima :", pes.minima[0])
+    print("Potential at the 4 minima   :", pes.minima[1])
+
+    # Plot 2D potential using uniform grid
+    # ------------------------------------
+    points = pes.generate_points_uniform(size_q1=100, size_q2=100)
+    values = pes.evaluate(points)
+    print("Shape of PES points (for plotting):", points.shape)
+    print("Shape of PES values (for plotting): ", values.shape)
+    fname = "data/pes_4well.png"
+    plot_data_2d(points, values, highlight=pes.minima[0], title="Four Well Potential", fname=fname)
+    # 1D plot to see minimum D
+    # ------------------------
+    # q1_fixed = -1.17
+    # points_1d = pes.generate_points(size_q1=1, size_q2=50, range_q1=(q1_fixed, q1_fixed))
+    # values_1d = pes.evaluate(points_1d)
+    # plt.plot(points_1d[:, 1], values_1d)
+    # plt.xlabel("q2", fontsize=14)
+    # plt.ylabel("Potential (kcal/mol)", fontsize=14)
+    # plt.title(f"Intersection of PES at q1={q1_fixed}", fontsize=18)
+    # plt.show()
+
+    # Generate Dataset
+    # ----------------
+    # Select a set of random 2D points between -3.0 and 3.0. For large sizes (e.g., 10M),
+    # this gives a dense and uniform coverage of the 4-well potential.
+    # rng.random generates points in [0, 1] interval, so they are shifted ans scaled
+    # to cover -3.0 to 3.0 range
+    rng = np.random.default_rng(seed=seed)
+    points_random = (rng.random((int(size_random), 2)) - 0.5) * 6.0
+    values_random = pes.evaluate(points_random)
+    # plot_data_2d(points, values, highlight=points_random, title="Four Well Potential (kcal/mol)")
+
+    # Use Monte-Carlo (MC) method to select points for a given k, which results in a different
+    # number of points for each k. The idea is that as k increases, the dataset becomes
+    # more biased so we can better demonstrate the importance of diverse subset sampling.
+    # Note that k=0 gives back the entire set, because probability of all points is one.
+    # However, as k increases the number of points selected by MC becomes smaller. This causes
+    # an unfair comparison, when we select a percentage of the data (for each k) as a subset.
+    # E.g., 1% of 10M (k=0) is already going to be diverse, no matter how the selection is
+    # made while 1% of 10K (k=30) is not going to be diverse in general.
+    # So, we select a fixed number of data generated by MC (for all k values) for the rest
+    # of our analysis. Keep in mind that this selection will be random so that it has a
+    # similar distribution (i.e, the same statistical bias) as the original MC sample set.
+    # Knowing that k=20 for 10M points, selected ~15K points, we choose 10K for fixed size.
+    for k in k_values:
+        indices_mc = select_monte_carlo(values_random, np.min(pes.minima[1]), k=k, seed=seed)
+        # points_mc = points_random[indices_mc]
+        dataset = {}
+        # make subdirectory for each k to store data
+        folder = f"data/monte_carlo_k{k:02d}"
+        os.makedirs(folder, exist_ok=True)
+        os.makedirs(f"{folder}/plots", exist_ok=True)
+        # randomly select a fixed-size dataset from each k niter times
+        for index in range(niter):
+            indices_iter = np.random.choice(indices_mc, size_dataset, replace=False)
+            print(f" Select {len(indices_iter)} points iteration {index}")
+            # plot MC selected points and the fixed-size dataset
+            fname = f"dataset_{size_random:2.1e}_k{k:02d}_size{size_dataset:05d}_iter{index:02d}"
+            plot_data_2d(
+                points,
+                values,
+                highlight=(points_random[indices_mc], points_random[indices_iter]),
+                title=f"Monte Carlo (k={k}): {len(indices_iter)} out of {len(indices_mc)}",
+                fname=f"{folder}/plots/{fname}.png",
+            )
+            # store data for given k and iteration
+            # dataset[fname] = points_random[indices_iter]
+            dataset[fname] = {
+                "points": points_random[indices_iter].tolist(),
+                "values": values_random[indices_iter].tolist(),
+            }
+        # save data for given k
+        # np.savez_compressed(f"data/k{k:02.1f}/{fname.split('_iter')[0]}", **dataset)
+        print(f"Write {niter} iterations JSON data into data/{folder}")
+        with open(f"{folder}/{fname.split('_iter')[0]}.json", "w") as fp:
+            json.dump(dataset, fp, indent=4, sort_keys=True)
+
+
+def select_subsets(points, percentages, methods, seed=42):
+    rng = np.random.default_rng(seed=seed)
+    selected_indices = {}
+    # loop over methods first to avoid recomputing points_dist for MaxMin and MaxSum
+    for method in methods:
+        print(f" Selection Method: {method}")
+        if method not in selected_indices:
+            selected_indices[method] = {}
+        points_dist = None
+        if method in ["MaxMin", "MaxSum"]:
+            points_dist = pairwise_distances(points, metric="euclidean")
+        for p in percentages:
+            # make an entry for each percentage
+            if p not in selected_indices[method]:
+                # str(p) is used as key because JSON does not allow storing int keys
+                selected_indices[method][str(p)] = {}
+            size = int(p * len(points) / 100)
+            print(f"  Percentage={p: 2d}: Select {size} from {len(points)}")
+            if method == "Random":
+                indices = rng.choice(np.arange(len(points)), size, replace=False, shuffle=True)
+            elif method == "MaxMin":
+                indices = MaxMin(fun_dist=None).select(points_dist, size=size)
+            elif method == "MaxSum":
+                indices = MaxSum().select(points_dist, size=size)
+            elif method == "DISE":
+                indices = DISE().select(points, size=size)
+            elif method == "OptiSim":
+                indices = OptiSim().select(points, size=size)
+            elif method == "Medoid":
+                indices = Medoid().select(points, size=size)
+            elif method.startswith("GridPartition"):
+                grid_method = method.split("-")[1]
+                indices = GridPartition(nbins_axis=5, bin_method=grid_method).select(
+                    points, size=size
+                )
+            else:
+                raise ValueError(f"Unknown selection method: {method}")
+            # str(p) is used as key because JSON does not allow storing int keys
+            # indices are converted to int because JSON does not allow storing numpy.int64
+            selected_indices[method][str(p)] = [int(item) for item in indices]
+    return selected_indices
+
+
+def compute_diversity(points, selected_indices, measures):
+    diversity = {}
+    for measure in measures:
+        if measure == "LogDet":
+            diversity[measure] = logdet(points[selected_indices])
+        elif measure == "WDUD":
+            diversity[measure] = wdud(points[selected_indices])
+        elif measure == "MinDist":
+            points_dist = pairwise_distances(points[selected_indices], metric="euclidean")
+            upper_triangular = points_dist[np.triu_indices_from(points_dist, k=1)]
+            diversity[measure] = np.min(upper_triangular)
+        elif measure == "MeanDist":
+            points_dist = pairwise_distances(points[selected_indices], metric="euclidean")
+            upper_triangular = points_dist[np.triu_indices_from(points_dist, k=1)]
+            diversity[measure] = np.mean(upper_triangular)
+        else:
+            raise ValueError(f"Unknown diversity measure: {measure}")
+    return diversity
+
+
+def plot_data_2d(points, values, highlight=None, title="", fname=None):
+    """Plot 2D data with colorbar.
+
+    Parameters
+    ----------
+    points : np.ndarray
+        2D array with 2 columns representing coordinates.
+    values : np.ndarray
+        1D array with potential values for each point used for coloring.
+    highlight : np.ndarray
+        2D array with 2 columns representing coordinates of points to highlight
+        with a star marker.
+    title : str
+        Title of the plot.
+    """
+    if points.ndim != 2 and points.shape[1] != 2:
+        raise ValueError("Argument points must be 2D array with 2 columns")
+    plt.scatter(points[:, 0], points[:, 1], c=values, cmap="RdBu")
+    plt.colorbar(label="Potential (kcal/mol)")
+    if highlight is not None:
+        if isinstance(highlight, np.ndarray) and highlight.ndim == 2 and highlight.shape[1] == 2:
+            # raise ValueError("Argument highlight must be 2D array with 2 columns")
+            plt.scatter(highlight[:, 0], highlight[:, 1], c="black", marker="*", s=20)  # , s=100)
+        else:
+            for index, item in enumerate(highlight):
+                assert item.ndim == 2 and item.shape[1] == 2
+                if index == 0:
+                    plt.scatter(item[:, 0], item[:, 1], c="black", marker="*", s=50)
+                else:
+                    plt.scatter(
+                        item[:, 0],
+                        item[:, 1],
+                        marker="o",
+                        facecolors="none",
+                        edgecolors="cyan",
+                        linewidths=2,
+                        s=10,
+                    )
+
+    plt.title(title, fontsize=14)
+    plt.xlabel("q1", fontsize=14)
+    plt.ylabel("q2", fontsize=14)
+
+    if fname is not None:
+        plt.savefig(fname, dpi=300, bbox_inches="tight")
+    else:
+        plt.show()
+    plt.close()
+
+
+if __name__ == "__main__":
+    # get command line arguments
+    args = sys.argv[1:]
+
+    if args[0] == "generate_data":
+        # These parameter choices are discussed in generate_datasets function
+        size_random, size_dataset = 1.0e7, 10000
+        k_values = np.arange(0.0, 21.0, 1.0, dtype=int)
+        generate_datasets(size_random, k_values, size_dataset, niter=10, seed=42)
+
+    elif args[0] == "select_subsets":
+        fnames = sorted(args[1:])
+        # list of selector methods and percentages to select
+        selectors = [
+            "Random",
+            "MaxMin",
+            "MaxSum",
+            "OptiSim",
+            "DISE",
+            "Medoid",
+            "GridPartition-equisized_dependent",
+            "GridPartition-equisized_independent",
+            "GridPartition-equifrequent_dependent",
+            "GridPartition-equifrequent_independent",
+        ]
+        percentages = [2**i for i in range(7)]
+
+        # Make an instance of potential for visualization
+        # -----------------------------------------------
+        pes = FourWellPotential()
+        points_plot = pes.generate_points_uniform(size_q1=100, size_q2=100)
+        values_plot = pes.evaluate(points_plot)
+
+        # Loop over datasets, select subsets, plot and store selected indices
+        # -------------------------------------------------------------------
+        # example of fname: data/monte_carlo_k00/dataset_1.0e+06_k00_size01000.json
+        for fname in fnames:
+            # Load dataset from JSON file
+            # ---------------------------
+            print(f"LOAD {fname}")
+            with open(fname, "r") as fp:
+                data = json.load(fp)
+            folder, fname = os.path.split(fname)
+            os.makedirs(f"{folder}/plots", exist_ok=True)
+            # Loop over iterations of dataset
+            # -------------------------------
+            database_indices = {}
+            for fname_iter, values_iter in data.items():
+                print(f" SELECT from: {fname_iter}")
+                points_iter = np.array(values_iter["points"])
+                # values_iter = np.array(values_iter["values"])
+                # example of fname_points: dataset_1.0e+07_k10.0_size10000_iter00
+                # Select subsets
+                # --------------
+                # indices are returned as a nested dictionary {method: {percentage: indices}}
+                selected_indices = select_subsets(points_iter, percentages, selectors, seed=42)
+                database_indices[fname_iter] = selected_indices
+                print("DONE SELECTING")
+                # Plot Subsets
+                # ------------
+                for method, values in selected_indices.items():
+                    print("PLOTTING.....")
+                    os.makedirs(f"{folder}/plots/{method}", exist_ok=True)
+                    for p, indices in values.items():
+                        p = int(p)
+                        fname_plot = (
+                            f"{folder}/plots/{method}/subset_{method}_p{p:02d}_{fname_iter}.png"
+                        )
+                        print(f"PLOT {fname_plot}")
+                        plot_data_2d(
+                            points_plot,
+                            values_plot,
+                            highlight=(points_iter, points_iter[indices]),
+                            title=f"{fname_iter}\nMethod={method}, Percentage={p}, Size={len(indices)}",
+                            fname=fname_plot,
+                        )
+            # Save selected indices from fname database
+            # -----------------------------------------
+            for method in selectors:
+                temp = {fname: values[method] for fname, values in database_indices.items()}
+                fname_json = fname.replace("dataset", f"subset_{method}_dataset")
+                fname_json = f"{folder}/{fname_json}"
+                print(f"WRITE {fname_json}")
+                with open(fname_json, "w") as fp:
+                    json.dump(temp, fp, indent=4, sort_keys=True)
+
+    elif args[0] == "compute_diversity":
+        # make a dictionary for each dataset, so that diversity of iterations can be stored
+        # diversity_iterations = diversity.setdefault(fname.replace("data/", ""), {})
+        # measures = ["LogDet", "WDUD", "MinDist", "MeanDist"]
+        # # Compute diversity measures and plot subsets
+        # # -------------------------------------------
+        # for method, values in selected_indices.items():
+        #     if method not in diversity_iterations:
+        #         diversity_iterations[method] = {}
+        #     for p, indices in values.items():
+        #         if p not in diversity_iterations[method]:
+        #             diversity_iterations[method][p] = {}
+        #         # compute diversity returns a dictionary {measure: value}
+        #         print(f" Compute diversity for method={method} and p={p}")
+        #         d = compute_diversity(points_selected, indices, measures)
+        #         for measure, value in d.items():
+        #             items = diversity_iterations[method][p].setdefault(measure, [])
+        #             items.append(value)
+        #         # plot subsets
+        #         fname = f"data/{folder}/{method}_p{p:02d}_{fname_selected}.png"
+        #         plot_data_2d(
+        #             points_plot,
+        #             values_plot,
+        #             highlight=(points_selected, points_selected[indices]),
+        #             title=f"{fname_selected}\nMethod={method}, Percentage={p}, Size={len(indices)}",
+        #             fname=fname,
+        #         )
+
+        # # Save diversity data
+        # # -------------------
+        # # data is {fname: {method: {percentage: {measure: iteration_diversity_values}}}}
+        # print("Save diversity data to data/diversity.json")
+        # with open("data/diversity.json", "w") as fp:
+        #     json.dump(diversity, fp, indent=4, sort_keys=True)
+
+        # data is {fname: {method: {percentage: {measure: iteration_diversity_values}}}}
+        with open("data/diversity.json", "r") as fp:
+            data = json.load(fp)
+        print(len(data))
+        measures = ["LogDet", "WDUD"]
+        for measure in measures:
+            # for each dataset, plot diversity measures vs. percentage
+            for fname, diversity in data.items():
+                print(fname)
+                print(diversity)
+                print(type(diversity))
+                for method, values in diversity.items():
+                    x_values = sorted([int(p) for p in values.keys()])
+                    print("percentage values = ", x_values)
+                    y_values = [np.mean(values[str(p)][measure]) for p in x_values]
+                    plt.plot(x_values, y_values, label=method)
+                plt.xlabel("Percentage (%)")
+                plt.ylabel(measure)
+                plt.title(fname)
+                plt.legend()
+                plt.show()
+    else:
+        raise ValueError(f"Unknown task={args}")