From bed84a2aefd08c2eb141328a84d545b1a4ed07a6 Mon Sep 17 00:00:00 2001 From: SylviaWhittle Date: Sun, 13 Oct 2024 10:39:36 +0100 Subject: [PATCH 01/16] Add topostats file helper --- topostats/io.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/topostats/io.py b/topostats/io.py index 06b3a669e86..890663e6d7d 100644 --- a/topostats/io.py +++ b/topostats/io.py @@ -1284,3 +1284,10 @@ def dict_to_json(data: dict, output_dir: str | Path, filename: str | Path, inden output_file = output_dir / filename with output_file.open("w") as f: json.dump(data, f, indent=indent, cls=NumpyEncoder) + + +class TopoFileHelper: + def __init__(self, topofile: Path | str): + self.topofile: Path = Path(topofile) + with h5py.File(self.topofile, "r") as f: + self.data: dict = hdf5_to_dict(open_hdf5_file=f, group_path="/") From a327e2085275c673b58fbaf0162c1d14bdf988dc Mon Sep 17 00:00:00 2001 From: SylviaWhittle Date: Sun, 13 Oct 2024 10:42:34 +0100 Subject: [PATCH 02/16] Add find data function --- topostats/io.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/topostats/io.py b/topostats/io.py index 890663e6d7d..58aad4615cf 100644 --- a/topostats/io.py +++ b/topostats/io.py @@ -1291,3 +1291,45 @@ def __init__(self, topofile: Path | str): self.topofile: Path = Path(topofile) with h5py.File(self.topofile, "r") as f: self.data: dict = hdf5_to_dict(open_hdf5_file=f, group_path="/") + def find_data(self, search_keys: list) -> None: + """ + Find the data in the dictionary that matches the list of keys. + + Parameters + ---------- + search_keys : list + The list of keys to search for. + + Returns + ------- + None + """ + # Find the best match for the list of keys + # First check if there is a direct match + LOGGER.info(f"[ Searching for {search_keys} in {self.topofile} ]") + + try: + current_data = self.data + for key in search_keys: + current_data = current_data[key] + + LOGGER.info("| [search] Direct match found") + except KeyError: + LOGGER.info("| [search] No direct match found.") + + # If no direct match is found, try to find a partial match + LOGGER.info("| [search] Searching for partial matches.") + partial_matches = self.search_partial_matches(data=self.data, keys=search_keys) + if partial_matches: + LOGGER.info(f"| [search] !! [ {len(partial_matches)} Partial matches found] !!") + for index, match in enumerate(partial_matches): + match_str = "/".join(match) + if index == len(partial_matches) - 1: + prefix = "| [search] └" + else: + prefix = "| [search] ├" + LOGGER.info(f"{prefix} {match_str}") + else: + LOGGER.info("| [search] No partial matches found.") + LOGGER.info("└ [End of search]") + return From 3cd66ffce609af5d136f970aa82bdb3d9f37f536 Mon Sep 17 00:00:00 2001 From: SylviaWhittle Date: Sun, 13 Oct 2024 10:43:09 +0100 Subject: [PATCH 03/16] Add data info function --- topostats/io.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/topostats/io.py b/topostats/io.py index 58aad4615cf..7e52c8ce018 100644 --- a/topostats/io.py +++ b/topostats/io.py @@ -1333,3 +1333,44 @@ def find_data(self, search_keys: list) -> None: LOGGER.info("| [search] No partial matches found.") LOGGER.info("└ [End of search]") return + def data_info(self, location: str, verbose: bool = False) -> None: + """Get information about the data at a location. + + Parameters + ---------- + location : str + The location of the data in the dictionary, separated by '/'. + """ + # If there's a trailing '/', remove it + if location[-1] == "/": + location = location[:-1] + keys = location.split("/") + + try: + current_data = self.data + for key in keys: + current_data = current_data[key] + except KeyError as e: + LOGGER.error(f"[ Info ] Key not found: {e}, please check the location string.") + return + + if isinstance(current_data, dict): + key_types = {type(k) for k in current_data.keys()} + value_types = {type(v) for v in current_data.values()} + LOGGER.info( + f"[ Info ] Data at {location} is a dictionary with {len(current_data)} " + f"keys of types {key_types} and values " + f"of types {value_types}" + ) + if verbose: + for k, v in current_data.items(): + LOGGER.info(f" {k}: {type(v)}") + elif isinstance(current_data, np.ndarray): + LOGGER.info( + f"[ Info ] Data at {location} is a numpy array with shape: {current_data.shape}, " + f"dtype: {current_data.dtype}" + ) + else: + LOGGER.info(f"[ Info ] Data at {location} is {type(current_data)}") + + return From a9c3f8242d5f76929cb3b4ed7d00f7116f8584b1 Mon Sep 17 00:00:00 2001 From: SylviaWhittle Date: Sun, 13 Oct 2024 10:43:38 +0100 Subject: [PATCH 04/16] Add get data function --- topostats/io.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/topostats/io.py b/topostats/io.py index 7e52c8ce018..fae55004b73 100644 --- a/topostats/io.py +++ b/topostats/io.py @@ -1333,6 +1333,34 @@ def find_data(self, search_keys: list) -> None: LOGGER.info("| [search] No partial matches found.") LOGGER.info("└ [End of search]") return + def get_data(self, location: str) -> int | float | str | np.ndarray | dict | None: + """ + Retrieve data from the dictionary using a '/' separated string. + + Parameters + ---------- + location : str + The location of the data in the dictionary, separated by '/'. + + Returns + ------- + int | float | str | np.ndarray | dict + The data at the location. + """ + # If there's a trailing '/', remove it + if location[-1] == "/": + location = location[:-1] + keys = location.split("/") + + try: + current_data = self.data + for key in keys: + current_data = current_data[key] + LOGGER.info(f"[ Get data ] Data found at {location}, type: {type(current_data)}") + return current_data + except KeyError as e: + LOGGER.error(f"[ Get data ] Key not found: {e}, please check the location string.") + return None def data_info(self, location: str, verbose: bool = False) -> None: """Get information about the data at a location. From 0c806ee4a084c3030e3b69a40901926a838cf414 Mon Sep 17 00:00:00 2001 From: SylviaWhittle Date: Sun, 13 Oct 2024 10:44:00 +0100 Subject: [PATCH 05/16] Add pretty-print-structure function --- topostats/io.py | 54 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/topostats/io.py b/topostats/io.py index fae55004b73..9e7849f2580 100644 --- a/topostats/io.py +++ b/topostats/io.py @@ -1333,6 +1333,60 @@ def find_data(self, search_keys: list) -> None: LOGGER.info("| [search] No partial matches found.") LOGGER.info("└ [End of search]") return + + def pretty_print_structure(self) -> None: + """Print the structure of the data in the data dictionary. + + The structure is printed with the keys indented to show the hierarchy of the data. + """ + + def print_structure(data: dict, level=0, prefix=""): + """Recursive function to print the structure.""" + for i, (key, value) in enumerate(data.items()): + is_last_item = i == len(data) - 1 + current_prefix = prefix + ("└ " if is_last_item else "├ ") + LOGGER.info(current_prefix + key) + + if isinstance(value, dict): + # Check if all keys are able to be integers, they are strings but need to check if they can be + # converted to integers without error + all_keys_are_integers = True + for k in value.keys(): + try: + int(k) + except ValueError: + all_keys_are_integers = False + break + all_values_are_numpy_arrays = all(isinstance(v, np.ndarray) for v in value.values()) + # if dictionary has keys that are integers and values that are numpy arrays, print the number + # of keys and the shape of the numpy arrays + if all_keys_are_integers and all_values_are_numpy_arrays: + LOGGER.info( + prefix + + (" " if is_last_item else "│ ") + + "└ " + + f"{len(value)} keys with numpy arrays as values" + ) + else: + new_prefix = prefix + (" " if is_last_item else "│ ") + print_structure(value, level + 1, new_prefix) + + elif isinstance(value, np.ndarray): + # Don't print the array, just the shape + LOGGER.info( + prefix + + (" " if is_last_item else "│ ") + + "└ " + + f"Numpy array, shape: {str(value.shape)}, dtype: {value.dtype}" + ) + else: + LOGGER.info(f"{prefix + (' ' if is_last_item else '│ ') + '└ ' + str(value)}") + + LOGGER.info(f"[{self.topofile}]") + print_structure(self.data) + + return + def get_data(self, location: str) -> int | float | str | np.ndarray | dict | None: """ Retrieve data from the dictionary using a '/' separated string. From d90ef91c218e44d0801132a94ad6b9b28937e970 Mon Sep 17 00:00:00 2001 From: SylviaWhittle Date: Sun, 13 Oct 2024 10:44:30 +0100 Subject: [PATCH 06/16] Add partial search function --- topostats/io.py | 85 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/topostats/io.py b/topostats/io.py index 9e7849f2580..8a6f838891b 100644 --- a/topostats/io.py +++ b/topostats/io.py @@ -1291,6 +1291,91 @@ def __init__(self, topofile: Path | str): self.topofile: Path = Path(topofile) with h5py.File(self.topofile, "r") as f: self.data: dict = hdf5_to_dict(open_hdf5_file=f, group_path="/") + + def search_partial_matches(self, data: dict, keys: list, current_path: list | None = None): + """ + Find partial matches to the keys in the dictionary. + + Recursively search through nested dictionaries and keep only the paths that match the keys in the correct order, + allowing gaps between the keys. + + Parameters + ---------- + data : dict + The dictionary to search through. + keys : list + The list of keys to search for. + current_path : list, optional + The current path in the dictionary, by default []. + + Returns + ------- + list + A list of paths that match the keys in the correct order. + """ + if current_path is None: + # Need to initialise the empty list here and not as a default argument since it is mutable + current_path = [] + + partial_matches = [] + + def recursive_partial_search(data, keys, current_path): + """ + Recursively find partial matches to the keys in the dictionary. + + Recursive function to search through the dictionary and keep only the paths + that match the keys in the correct order, + allowing gaps between the keys. + + Parameters + ---------- + data : dict + The dictionary to search through. + keys : list + The list of keys to search for. + current_path : list + The current path in the dictionary. + + Returns + ------- + None + """ + # If have reached the end of the current dictionary, return + if not keys: + partial_matches.append(current_path) + return + + current_key = keys[0] + + if isinstance(data, dict): + for k, v in data.items(): + new_path = current_path + [k] + try: + # Check if the current key can be converted to an integer + current_key_int = int(current_key) + k_int = int(k) + # If the current key and the key in the dictionary can be converted to integers, + # check if they are equal + if current_key_int == k_int: + # If the current key is in the key list of the dictionary, continue searching + # but remove the current key from the list + remaining_keys = keys[1:] + recursive_partial_search(v, remaining_keys, new_path) + except ValueError: + # If the current key cannot be converted to an integer, allow for partial matches + if current_key in k: + # If the current key is in the key list of the dictionary, continue searching + # but remove the current key from the list + remaining_keys = keys[1:] + recursive_partial_search(v, remaining_keys, new_path) + else: + # If the current key is not in the key list of the dictionary, continue searching + # but don't remove the current key from the list as it might be deeper in the dictionary + recursive_partial_search(v, keys, new_path) + + recursive_partial_search(data, keys, current_path) + return partial_matches + def find_data(self, search_keys: list) -> None: """ Find the data in the dictionary that matches the list of keys. From 1677ecd95549f2ab1b9ebe9c82aea82c55b866d5 Mon Sep 17 00:00:00 2001 From: SylviaWhittle Date: Sun, 13 Oct 2024 10:54:53 +0100 Subject: [PATCH 07/16] Add class documentation with examples --- topostats/io.py | 143 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 126 insertions(+), 17 deletions(-) diff --git a/topostats/io.py b/topostats/io.py index 8a6f838891b..c71732f6764 100644 --- a/topostats/io.py +++ b/topostats/io.py @@ -1287,12 +1287,115 @@ def dict_to_json(data: dict, output_dir: str | Path, filename: str | Path, inden class TopoFileHelper: - def __init__(self, topofile: Path | str): + """ + Helper class for searching through the data in a .topostats (hdf5) file. + + Parameters + ---------- + topofile : Path + Path to the .topostats file. + + Examples + -------- + Creating a helper object. + ```python + from topostats.io import TopoFileHelper + + topofile = "path/to/topostats_file.topostats" + helper = TopoFileHelper(topofile) + ``` + + Print the structure of the data in the file. + ```python + from topostats.io import TopoFileHelper + + topofile = "path/to/topostats_file.topostats" + helper = TopoFileHelper(topofile) + helper.pretty_print_structure() + ``` + >>> [./tests/resources/file.topostats] + >>> ├ filename + >>> │ └ minicircle + >>> ├ grain_masks + >>> │ └ above + >>> │ └ Numpy array, shape: (1024, 1024), dtype: int64 + >>> ├ grain_trace_data + >>> │ └ above + >>> │ ├ cropped_images + >>> │ │ └ 21 keys with numpy arrays as values + >>> │ ├ ordered_trace_cumulative_distances + >>> │ │ └ 21 keys with numpy arrays as values + >>> │ ├ ordered_trace_heights + >>> │ │ └ 21 keys with numpy arrays as values + >>> │ ├ ordered_traces + >>> │ │ └ 21 keys with numpy arrays as values + >>> │ └ splined_traces + >>> │ └ 21 keys with numpy arrays as values + >>> ├ image + >>> │ └ Numpy array, shape: (1024, 1024), dtype: float64 + >>> ├ image_original + >>> │ └ Numpy array, shape: (1024, 1024), dtype: float64 + >>> ├ img_path + >>> │ └ /Users/sylvi/Documents/TopoStats/tests/resources/minicircle + >>> ├ pixel_to_nm_scaling + >>> │ └ 0.4940029296875 + >>> └ topostats_file_version + >>> └ 0.2 + + Finding data in a file. + ```python + from topostats.io import TopoFileHelper + + topofile = "path/to/topostats_file.topostats" + helper = TopoFileHelper(topofile) + helper.find_data(["ordered_trace_heights", "0"]) + ``` + + >>> [ Searching for ['ordered_trace_heights', '0'] in ./path/to/topostats_file.topostats ] + >>> | [search] No direct match found. + >>> | [search] Searching for partial matches. + >>> | [search] !! [ 1 Partial matches found] !! + >>> | [search] └ grain_trace_data/above/ordered_trace_heights/0 + >>> └ [End of search] + + Get data from a file. + ```python + from topostats.io import TopoFileHelper + + topofile = "path/to/topostats_file.topostats" + helper = TopoFileHelper(topofile) + + data = helper.get_data("ordered_trace_heights/0") + ``` + >>> [ Get data ] Data found at grain_trace_data/above/ordered_trace_heights/0, type: + + Get data information + ```python + from topostats.io import TopoFileHelper + + topofile = "path/to/topostats_file.topostats" + helper = TopoFileHelper(topofile) + + helper.data_info("grain_trace_data/above/ordered_trace_heights/0") + ``` + >>> [ Info ] Data at grain_trace_data/above/ordered_trace_heights/0 is a numpy array with shape: (95,), + >>> dtype: float64 + """ + + def __init__(self, topofile: Path | str) -> None: + """ + Initialise the TopoFileHelper object. + + Parameters + ---------- + topofile : Path | str + Path to the .topostats file. + """ self.topofile: Path = Path(topofile) with h5py.File(self.topofile, "r") as f: self.data: dict = hdf5_to_dict(open_hdf5_file=f, group_path="/") - def search_partial_matches(self, data: dict, keys: list, current_path: list | None = None): + def search_partial_matches(self, data: dict, keys: list, current_path: list | None = None) -> list: """ Find partial matches to the keys in the dictionary. @@ -1319,7 +1422,7 @@ def search_partial_matches(self, data: dict, keys: list, current_path: list | No partial_matches = [] - def recursive_partial_search(data, keys, current_path): + def recursive_partial_search(data, keys, current_path) -> None: """ Recursively find partial matches to the keys in the dictionary. @@ -1335,10 +1438,6 @@ def recursive_partial_search(data, keys, current_path): The list of keys to search for. current_path : list The current path in the dictionary. - - Returns - ------- - None """ # If have reached the end of the current dictionary, return if not keys: @@ -1384,10 +1483,6 @@ def find_data(self, search_keys: list) -> None: ---------- search_keys : list The list of keys to search for. - - Returns - ------- - None """ # Find the best match for the list of keys # First check if there is a direct match @@ -1417,16 +1512,27 @@ def find_data(self, search_keys: list) -> None: else: LOGGER.info("| [search] No partial matches found.") LOGGER.info("└ [End of search]") - return def pretty_print_structure(self) -> None: - """Print the structure of the data in the data dictionary. + """ + Print the structure of the data in the data dictionary. The structure is printed with the keys indented to show the hierarchy of the data. """ def print_structure(data: dict, level=0, prefix=""): - """Recursive function to print the structure.""" + """ + Recursive function to print the structure. + + Parameters + ---------- + data : dict + The dictionary to print the structure of. + level : int, optional + The current level of the dictionary, by default 0. + prefix : str, optional + The prefix to use when printing the dictionary, by default "". + """ for i, (key, value) in enumerate(data.items()): is_last_item = i == len(data) - 1 current_prefix = prefix + ("└ " if is_last_item else "├ ") @@ -1470,8 +1576,6 @@ def print_structure(data: dict, level=0, prefix=""): LOGGER.info(f"[{self.topofile}]") print_structure(self.data) - return - def get_data(self, location: str) -> int | float | str | np.ndarray | dict | None: """ Retrieve data from the dictionary using a '/' separated string. @@ -1500,13 +1604,18 @@ def get_data(self, location: str) -> int | float | str | np.ndarray | dict | Non except KeyError as e: LOGGER.error(f"[ Get data ] Key not found: {e}, please check the location string.") return None + def data_info(self, location: str, verbose: bool = False) -> None: - """Get information about the data at a location. + """ + Get information about the data at a location. Parameters ---------- location : str The location of the data in the dictionary, separated by '/'. + + verbose : bool, optional + Print more detailed information about the data, by default False. """ # If there's a trailing '/', remove it if location[-1] == "/": From 831f1c869b7d5369aea8322b86d803d09094b9e9 Mon Sep 17 00:00:00 2001 From: SylviaWhittle Date: Wed, 16 Oct 2024 16:02:58 +0100 Subject: [PATCH 08/16] Add example notebook for loading topostats file data --- notebooks/topostats_file_helper_example.ipynb | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 notebooks/topostats_file_helper_example.ipynb diff --git a/notebooks/topostats_file_helper_example.ipynb b/notebooks/topostats_file_helper_example.ipynb new file mode 100644 index 00000000000..11dd364be51 --- /dev/null +++ b/notebooks/topostats_file_helper_example.ipynb @@ -0,0 +1,75 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import needed libraries\n", + "import numpy as np\n", + "from topostats.io import TopoFileHelper\n", + "from IPython.display import clear_output\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load topostats file\n", + "file = \"./tests/resources/file.topostats\"\n", + "helper = TopoFileHelper(file)\n", + "# Clear logging output\n", + "clear_output(wait=False)\n", + "\n", + "# Print the structure of the file\n", + "helper.pretty_print_structure()\n", + "\n", + "# Find the name of the data we want, we know it contains \"ordered_trace_heights\" and we want grain 2, but don't know\n", + "# what keys precede it\n", + "helper.find_data([\"ordered_trace_heights\", \"2\"])\n", + "\n", + "# Get some data from the file\n", + "cropped_image = helper.get_data(\"grain_trace_data/above/cropped_images/2\")\n", + "ordered_trace_heights = helper.get_data(\"grain_trace_data/above/ordered_trace_heights/2\")\n", + "cumulative_distances = helper.get_data(\"grain_trace_data/above/ordered_trace_cumulative_distances/2\")\n", + "ordered_traces = helper.get_data(\"grain_trace_data/above/ordered_traces/2\")\n", + "\n", + "# Plot the image\n", + "plt.imshow(cropped_image)\n", + "# Create a basic colour scale for the moleucle trace\n", + "c = np.arange(0, len(ordered_traces))\n", + "# Plot the molecule trace\n", + "plt.scatter(ordered_traces[:, 1], ordered_traces[:, 0], c=c, s=10)\n", + "plt.show()\n", + "# Plot the height of the molecule trace against the cumulative distance in nanometres\n", + "plt.plot(cumulative_distances, ordered_trace_heights)\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "topo-unet", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 89b6475ac160833e854759048d1c48ad26984837 Mon Sep 17 00:00:00 2001 From: SylviaWhittle Date: Thu, 17 Oct 2024 10:40:59 +0100 Subject: [PATCH 09/16] Add tests for TopoFileHelper class (WIP) --- tests/test_io.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/test_io.py b/tests/test_io.py index 0384deeee0d..5ba6b48d259 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -15,6 +15,7 @@ from topostats.io import ( LoadScans, + TopoFileHelper, convert_basename_to_relative_paths, dict_to_hdf5, dict_to_json, @@ -1379,3 +1380,41 @@ def test_dict_to_json(dictionary: dict, target: dict, tmp_path: Path) -> None: with outfile.open("r", encoding="utf-8") as f: assert target == json.load(f) + + +class TestTopoFileHelper: + """Test the TopoFileHelper class.""" + + @pytest.mark.parametrize( + ("file_path_or_string"), + [ + pytest.param( + "tests/resources/file.topostats", + id="String file path", + ), + pytest.param( + Path("tests/resources/file.topostats"), + id="Path object path", + ), + ], + ) + def test_init(self, file_path_or_string: Path | str) -> None: + """Test the __init__ method of the TopoFileHelper class.""" + topo_file_helper = TopoFileHelper(file_path_or_string) + assert isinstance(topo_file_helper, TopoFileHelper) + assert isinstance(topo_file_helper.data, dict) + + def test_get_data(self) -> None: + """Test the get_data method of the TopoFileHelper class.""" + topo_file_helper = TopoFileHelper("tests/resources/file.topostats") + cropped_image = topo_file_helper.get_data("grain_trace_data/above/cropped_images/2") + assert isinstance(cropped_image, np.ndarray) + + +# This test only works when not part of the TestTopoFileHelper class +def test_pretty_print_structure(caplog) -> None: + """Test the pretty_print_structure method of the TopoFileHelper class.""" + topo_file_helper = TopoFileHelper("tests/resources/file.topostats") + topo_file_helper.pretty_print_structure() + assert "filename" in caplog.text + assert "keys with numpy arrays as values" in caplog.text From 56acb1ffa6445afc6b9a0daad6b358683553f292 Mon Sep 17 00:00:00 2001 From: SylviaWhittle Date: Tue, 3 Dec 2024 15:04:30 +0000 Subject: [PATCH 10/16] Fix | io.py failing tests | import futures.annotations to allow pipe --- tests/test_io.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_io.py b/tests/test_io.py index 5ba6b48d259..0e9bad443e7 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -1,5 +1,7 @@ """Tests of IO.""" +from __future__ import annotations + import argparse import json import logging From 986870a4c15b92454449dd4ba1e461c976547b88 Mon Sep 17 00:00:00 2001 From: SylviaWhittle Date: Tue, 3 Dec 2024 15:45:48 +0000 Subject: [PATCH 11/16] Add: markdown sections to helper notebook --- notebooks/topostats_file_helper_example.ipynb | 82 +++++++++++++++++-- 1 file changed, 74 insertions(+), 8 deletions(-) diff --git a/notebooks/topostats_file_helper_example.ipynb b/notebooks/topostats_file_helper_example.ipynb index 11dd364be51..9458162a237 100644 --- a/notebooks/topostats_file_helper_example.ipynb +++ b/notebooks/topostats_file_helper_example.ipynb @@ -13,6 +13,13 @@ "import matplotlib.pyplot as plt" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load topostats file" + ] + }, { "cell_type": "code", "execution_count": null, @@ -20,24 +27,81 @@ "outputs": [], "source": [ "# Load topostats file\n", - "file = \"./tests/resources/file.topostats\"\n", + "file = \"../tests/resources/file.topostats\"\n", "helper = TopoFileHelper(file)\n", "# Clear logging output\n", "clear_output(wait=False)\n", - "\n", + "print(\"File loaded\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Print the structure of the file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "# Print the structure of the file\n", - "helper.pretty_print_structure()\n", - "\n", + "helper.pretty_print_structure()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Find data within the file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "# Find the name of the data we want, we know it contains \"ordered_trace_heights\" and we want grain 2, but don't know\n", "# what keys precede it\n", - "helper.find_data([\"ordered_trace_heights\", \"2\"])\n", - "\n", + "helper.find_data([\"ordered_trace_heights\", \"2\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Retrieve data from the file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "# Get some data from the file\n", "cropped_image = helper.get_data(\"grain_trace_data/above/cropped_images/2\")\n", "ordered_trace_heights = helper.get_data(\"grain_trace_data/above/ordered_trace_heights/2\")\n", "cumulative_distances = helper.get_data(\"grain_trace_data/above/ordered_trace_cumulative_distances/2\")\n", - "ordered_traces = helper.get_data(\"grain_trace_data/above/ordered_traces/2\")\n", - "\n", + "ordered_traces = helper.get_data(\"grain_trace_data/above/ordered_traces/2\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use the retrieved data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "# Plot the image\n", "plt.imshow(cropped_image)\n", "# Create a basic colour scale for the moleucle trace\n", @@ -47,6 +111,8 @@ "plt.show()\n", "# Plot the height of the molecule trace against the cumulative distance in nanometres\n", "plt.plot(cumulative_distances, ordered_trace_heights)\n", + "plt.xlabel(\"Cumulative distance (nm)\")\n", + "plt.ylabel(\"Height (nm)\")\n", "plt.show()" ] } From ba9e394b738645027373671c984a192a3fcd0299 Mon Sep 17 00:00:00 2001 From: SylviaWhittle Date: Wed, 22 Jan 2025 10:51:44 +0000 Subject: [PATCH 12/16] Fix code blocks in documentation --- topostats/io.py | 127 +++++++++++++++++++++++++----------------------- 1 file changed, 65 insertions(+), 62 deletions(-) diff --git a/topostats/io.py b/topostats/io.py index c71732f6764..25512f0cf60 100644 --- a/topostats/io.py +++ b/topostats/io.py @@ -1298,88 +1298,91 @@ class TopoFileHelper: Examples -------- Creating a helper object. - ```python - from topostats.io import TopoFileHelper + .. code-block:: RST + from topostats.io import TopoFileHelper + + topofile = "path/to/topostats_file.topostats" + helper = TopoFileHelper(topofile) - topofile = "path/to/topostats_file.topostats" - helper = TopoFileHelper(topofile) - ``` Print the structure of the data in the file. - ```python + .. code-block:: RST from topostats.io import TopoFileHelper topofile = "path/to/topostats_file.topostats" helper = TopoFileHelper(topofile) helper.pretty_print_structure() - ``` - >>> [./tests/resources/file.topostats] - >>> ├ filename - >>> │ └ minicircle - >>> ├ grain_masks - >>> │ └ above - >>> │ └ Numpy array, shape: (1024, 1024), dtype: int64 - >>> ├ grain_trace_data - >>> │ └ above - >>> │ ├ cropped_images - >>> │ │ └ 21 keys with numpy arrays as values - >>> │ ├ ordered_trace_cumulative_distances - >>> │ │ └ 21 keys with numpy arrays as values - >>> │ ├ ordered_trace_heights - >>> │ │ └ 21 keys with numpy arrays as values - >>> │ ├ ordered_traces - >>> │ │ └ 21 keys with numpy arrays as values - >>> │ └ splined_traces - >>> │ └ 21 keys with numpy arrays as values - >>> ├ image - >>> │ └ Numpy array, shape: (1024, 1024), dtype: float64 - >>> ├ image_original - >>> │ └ Numpy array, shape: (1024, 1024), dtype: float64 - >>> ├ img_path - >>> │ └ /Users/sylvi/Documents/TopoStats/tests/resources/minicircle - >>> ├ pixel_to_nm_scaling - >>> │ └ 0.4940029296875 - >>> └ topostats_file_version - >>> └ 0.2 + + .. code-block:: RST + >>> [./tests/resources/file.topostats] + >>> ├ filename + >>> │ └ minicircle + >>> ├ grain_masks + >>> │ └ above + >>> │ └ Numpy array, shape: (1024, 1024), dtype: int64 + >>> ├ grain_trace_data + >>> │ └ above + >>> │ ├ cropped_images + >>> │ │ └ 21 keys with numpy arrays as values + >>> │ ├ ordered_trace_cumulative_distances + >>> │ │ └ 21 keys with numpy arrays as values + >>> │ ├ ordered_trace_heights + >>> │ │ └ 21 keys with numpy arrays as values + >>> │ ├ ordered_traces + >>> │ │ └ 21 keys with numpy arrays as values + >>> │ └ splined_traces + >>> │ └ 21 keys with numpy arrays as values + >>> ├ image + >>> │ └ Numpy array, shape: (1024, 1024), dtype: float64 + >>> ├ image_original + >>> │ └ Numpy array, shape: (1024, 1024), dtype: float64 + >>> ├ img_path + >>> │ └ /Users/sylvi/Documents/TopoStats/tests/resources/minicircle + >>> ├ pixel_to_nm_scaling + >>> │ └ 0.4940029296875 + >>> └ topostats_file_version + >>> └ 0.2 Finding data in a file. - ```python - from topostats.io import TopoFileHelper + .. code-block:: RST + from topostats.io import TopoFileHelper - topofile = "path/to/topostats_file.topostats" - helper = TopoFileHelper(topofile) - helper.find_data(["ordered_trace_heights", "0"]) - ``` + topofile = "path/to/topostats_file.topostats" + helper = TopoFileHelper(topofile) + helper.find_data(["ordered_trace_heights", "0"]) - >>> [ Searching for ['ordered_trace_heights', '0'] in ./path/to/topostats_file.topostats ] - >>> | [search] No direct match found. - >>> | [search] Searching for partial matches. - >>> | [search] !! [ 1 Partial matches found] !! - >>> | [search] └ grain_trace_data/above/ordered_trace_heights/0 - >>> └ [End of search] + .. code-block:: RST + >>> [ Searching for ['ordered_trace_heights', '0'] in ./path/to/topostats_file.topostats ] + >>> | [search] No direct match found. + >>> | [search] Searching for partial matches. + >>> | [search] !! [ 1 Partial matches found] !! + >>> | [search] └ grain_trace_data/above/ordered_trace_heights/0 + >>> └ [End of search] Get data from a file. - ```python - from topostats.io import TopoFileHelper + .. code-block:: RST + from topostats.io import TopoFileHelper - topofile = "path/to/topostats_file.topostats" - helper = TopoFileHelper(topofile) + topofile = "path/to/topostats_file.topostats" + helper = TopoFileHelper(topofile) + + data = helper.get_data("ordered_trace_heights/0") - data = helper.get_data("ordered_trace_heights/0") - ``` - >>> [ Get data ] Data found at grain_trace_data/above/ordered_trace_heights/0, type: + .. code-block:: RST + >>> [ Get data ] Data found at grain_trace_data/above/ordered_trace_heights/0, type: Get data information - ```python - from topostats.io import TopoFileHelper + .. code-block:: RST + from topostats.io import TopoFileHelper - topofile = "path/to/topostats_file.topostats" - helper = TopoFileHelper(topofile) + topofile = "path/to/topostats_file.topostats" + helper = TopoFileHelper(topofile) + + helper.data_info("grain_trace_data/above/ordered_trace_heights/0") - helper.data_info("grain_trace_data/above/ordered_trace_heights/0") - ``` - >>> [ Info ] Data at grain_trace_data/above/ordered_trace_heights/0 is a numpy array with shape: (95,), - >>> dtype: float64 + .. code-block:: RST + >>> [ Info ] Data at grain_trace_data/above/ordered_trace_heights/0 is a numpy array with shape: (95,), + >>> dtype: float64 """ def __init__(self, topofile: Path | str) -> None: From adb6eb8533c55e08f02773d996bc3f7a0884554b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 22 Jan 2025 10:52:37 +0000 Subject: [PATCH 13/16] [pre-commit.ci] Fixing issues with pre-commit --- notebooks/topostats_file_helper_example.ipynb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/notebooks/topostats_file_helper_example.ipynb b/notebooks/topostats_file_helper_example.ipynb index 9458162a237..19332464181 100644 --- a/notebooks/topostats_file_helper_example.ipynb +++ b/notebooks/topostats_file_helper_example.ipynb @@ -7,10 +7,11 @@ "outputs": [], "source": [ "# Import needed libraries\n", + "import matplotlib.pyplot as plt\n", "import numpy as np\n", - "from topostats.io import TopoFileHelper\n", "from IPython.display import clear_output\n", - "import matplotlib.pyplot as plt" + "\n", + "from topostats.io import TopoFileHelper" ] }, { From 87074d10089d1ba0e5a963e2728f57488518b2b9 Mon Sep 17 00:00:00 2001 From: SylviaWhittle Date: Wed, 22 Jan 2025 11:02:48 +0000 Subject: [PATCH 14/16] Remove superfluous functions and use H5Glance instead --- notebooks/topostats_file_helper_example.ipynb | 8 +- tests/test_io.py | 9 -- topostats/io.py | 115 +++--------------- 3 files changed, 24 insertions(+), 108 deletions(-) diff --git a/notebooks/topostats_file_helper_example.ipynb b/notebooks/topostats_file_helper_example.ipynb index 9458162a237..819a103fb6e 100644 --- a/notebooks/topostats_file_helper_example.ipynb +++ b/notebooks/topostats_file_helper_example.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -10,7 +10,9 @@ "import numpy as np\n", "from topostats.io import TopoFileHelper\n", "from IPython.display import clear_output\n", - "import matplotlib.pyplot as plt" + "import matplotlib.pyplot as plt\n", + "\n", + "from h5glance import H5Glance" ] }, { @@ -48,7 +50,7 @@ "outputs": [], "source": [ "# Print the structure of the file\n", - "helper.pretty_print_structure()" + "H5Glance(file)" ] }, { diff --git a/tests/test_io.py b/tests/test_io.py index 0e9bad443e7..1490efdc5a5 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -1411,12 +1411,3 @@ def test_get_data(self) -> None: topo_file_helper = TopoFileHelper("tests/resources/file.topostats") cropped_image = topo_file_helper.get_data("grain_trace_data/above/cropped_images/2") assert isinstance(cropped_image, np.ndarray) - - -# This test only works when not part of the TestTopoFileHelper class -def test_pretty_print_structure(caplog) -> None: - """Test the pretty_print_structure method of the TopoFileHelper class.""" - topo_file_helper = TopoFileHelper("tests/resources/file.topostats") - topo_file_helper.pretty_print_structure() - assert "filename" in caplog.text - assert "keys with numpy arrays as values" in caplog.text diff --git a/topostats/io.py b/topostats/io.py index 25512f0cf60..8a82a4df62a 100644 --- a/topostats/io.py +++ b/topostats/io.py @@ -1297,6 +1297,8 @@ class TopoFileHelper: Examples -------- + This class should be used in a Jupyter Notebook or an interactive Python session. + Creating a helper object. .. code-block:: RST from topostats.io import TopoFileHelper @@ -1305,43 +1307,27 @@ class TopoFileHelper: helper = TopoFileHelper(topofile) - Print the structure of the data in the file. + Find the data you're looking for using H5Glance (only works in Jupyter Notebooks). .. code-block:: RST - from topostats.io import TopoFileHelper + from H5Glance import H5Glance + + file = "path/to/topostats_file.topostats" + H5Glance(file) - topofile = "path/to/topostats_file.topostats" - helper = TopoFileHelper(topofile) - helper.pretty_print_structure() + Which prints an interactive explorer of the file structure, eg: .. code-block:: RST - >>> [./tests/resources/file.topostats] - >>> ├ filename - >>> │ └ minicircle - >>> ├ grain_masks - >>> │ └ above - >>> │ └ Numpy array, shape: (1024, 1024), dtype: int64 - >>> ├ grain_trace_data - >>> │ └ above - >>> │ ├ cropped_images - >>> │ │ └ 21 keys with numpy arrays as values - >>> │ ├ ordered_trace_cumulative_distances - >>> │ │ └ 21 keys with numpy arrays as values - >>> │ ├ ordered_trace_heights - >>> │ │ └ 21 keys with numpy arrays as values - >>> │ ├ ordered_traces - >>> │ │ └ 21 keys with numpy arrays as values - >>> │ └ splined_traces - >>> │ └ 21 keys with numpy arrays as values - >>> ├ image - >>> │ └ Numpy array, shape: (1024, 1024), dtype: float64 - >>> ├ image_original - >>> │ └ Numpy array, shape: (1024, 1024), dtype: float64 - >>> ├ img_path - >>> │ └ /Users/sylvi/Documents/TopoStats/tests/resources/minicircle - >>> ├ pixel_to_nm_scaling - >>> │ └ 0.4940029296875 - >>> └ topostats_file_version - >>> └ 0.2 + ../tests/resources/process_scan_topostats_file_regtest.topostats + grain_masks + grain_trace_data + height_profiles + filename [📋]: scalar entries, dtype: ASCII string + image [📋]: 64 x 64 entries, dtype: float64 + image_original [📋]: 64 x 64 entries, dtype: float64 + pixel_to_nm_scaling [📋]: scalar entries, dtype: float64 + topostats_file_version [📋]: scalar entries, dtype: float64 + + Where each entry can be clicked on for more information. Finding data in a file. .. code-block:: RST @@ -1516,69 +1502,6 @@ def find_data(self, search_keys: list) -> None: LOGGER.info("| [search] No partial matches found.") LOGGER.info("└ [End of search]") - def pretty_print_structure(self) -> None: - """ - Print the structure of the data in the data dictionary. - - The structure is printed with the keys indented to show the hierarchy of the data. - """ - - def print_structure(data: dict, level=0, prefix=""): - """ - Recursive function to print the structure. - - Parameters - ---------- - data : dict - The dictionary to print the structure of. - level : int, optional - The current level of the dictionary, by default 0. - prefix : str, optional - The prefix to use when printing the dictionary, by default "". - """ - for i, (key, value) in enumerate(data.items()): - is_last_item = i == len(data) - 1 - current_prefix = prefix + ("└ " if is_last_item else "├ ") - LOGGER.info(current_prefix + key) - - if isinstance(value, dict): - # Check if all keys are able to be integers, they are strings but need to check if they can be - # converted to integers without error - all_keys_are_integers = True - for k in value.keys(): - try: - int(k) - except ValueError: - all_keys_are_integers = False - break - all_values_are_numpy_arrays = all(isinstance(v, np.ndarray) for v in value.values()) - # if dictionary has keys that are integers and values that are numpy arrays, print the number - # of keys and the shape of the numpy arrays - if all_keys_are_integers and all_values_are_numpy_arrays: - LOGGER.info( - prefix - + (" " if is_last_item else "│ ") - + "└ " - + f"{len(value)} keys with numpy arrays as values" - ) - else: - new_prefix = prefix + (" " if is_last_item else "│ ") - print_structure(value, level + 1, new_prefix) - - elif isinstance(value, np.ndarray): - # Don't print the array, just the shape - LOGGER.info( - prefix - + (" " if is_last_item else "│ ") - + "└ " - + f"Numpy array, shape: {str(value.shape)}, dtype: {value.dtype}" - ) - else: - LOGGER.info(f"{prefix + (' ' if is_last_item else '│ ') + '└ ' + str(value)}") - - LOGGER.info(f"[{self.topofile}]") - print_structure(self.data) - def get_data(self, location: str) -> int | float | str | np.ndarray | dict | None: """ Retrieve data from the dictionary using a '/' separated string. From 279bc0040fdb1d45001199941e8c68091e372cce Mon Sep 17 00:00:00 2001 From: SylviaWhittle Date: Wed, 22 Jan 2025 11:19:12 +0000 Subject: [PATCH 15/16] Move topostats file helper to its own module --- tests/test_io.py | 29 --- topostats/io.py | 292 --------------------------- topostats/topostats_file_helper.py | 304 +++++++++++++++++++++++++++++ 3 files changed, 304 insertions(+), 321 deletions(-) create mode 100644 topostats/topostats_file_helper.py diff --git a/tests/test_io.py b/tests/test_io.py index 1490efdc5a5..e71483aae99 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -1382,32 +1382,3 @@ def test_dict_to_json(dictionary: dict, target: dict, tmp_path: Path) -> None: with outfile.open("r", encoding="utf-8") as f: assert target == json.load(f) - - -class TestTopoFileHelper: - """Test the TopoFileHelper class.""" - - @pytest.mark.parametrize( - ("file_path_or_string"), - [ - pytest.param( - "tests/resources/file.topostats", - id="String file path", - ), - pytest.param( - Path("tests/resources/file.topostats"), - id="Path object path", - ), - ], - ) - def test_init(self, file_path_or_string: Path | str) -> None: - """Test the __init__ method of the TopoFileHelper class.""" - topo_file_helper = TopoFileHelper(file_path_or_string) - assert isinstance(topo_file_helper, TopoFileHelper) - assert isinstance(topo_file_helper.data, dict) - - def test_get_data(self) -> None: - """Test the get_data method of the TopoFileHelper class.""" - topo_file_helper = TopoFileHelper("tests/resources/file.topostats") - cropped_image = topo_file_helper.get_data("grain_trace_data/above/cropped_images/2") - assert isinstance(cropped_image, np.ndarray) diff --git a/topostats/io.py b/topostats/io.py index 8a82a4df62a..06b3a669e86 100644 --- a/topostats/io.py +++ b/topostats/io.py @@ -1284,295 +1284,3 @@ def dict_to_json(data: dict, output_dir: str | Path, filename: str | Path, inden output_file = output_dir / filename with output_file.open("w") as f: json.dump(data, f, indent=indent, cls=NumpyEncoder) - - -class TopoFileHelper: - """ - Helper class for searching through the data in a .topostats (hdf5) file. - - Parameters - ---------- - topofile : Path - Path to the .topostats file. - - Examples - -------- - This class should be used in a Jupyter Notebook or an interactive Python session. - - Creating a helper object. - .. code-block:: RST - from topostats.io import TopoFileHelper - - topofile = "path/to/topostats_file.topostats" - helper = TopoFileHelper(topofile) - - - Find the data you're looking for using H5Glance (only works in Jupyter Notebooks). - .. code-block:: RST - from H5Glance import H5Glance - - file = "path/to/topostats_file.topostats" - H5Glance(file) - - Which prints an interactive explorer of the file structure, eg: - - .. code-block:: RST - ../tests/resources/process_scan_topostats_file_regtest.topostats - grain_masks - grain_trace_data - height_profiles - filename [📋]: scalar entries, dtype: ASCII string - image [📋]: 64 x 64 entries, dtype: float64 - image_original [📋]: 64 x 64 entries, dtype: float64 - pixel_to_nm_scaling [📋]: scalar entries, dtype: float64 - topostats_file_version [📋]: scalar entries, dtype: float64 - - Where each entry can be clicked on for more information. - - Finding data in a file. - .. code-block:: RST - from topostats.io import TopoFileHelper - - topofile = "path/to/topostats_file.topostats" - helper = TopoFileHelper(topofile) - helper.find_data(["ordered_trace_heights", "0"]) - - .. code-block:: RST - >>> [ Searching for ['ordered_trace_heights', '0'] in ./path/to/topostats_file.topostats ] - >>> | [search] No direct match found. - >>> | [search] Searching for partial matches. - >>> | [search] !! [ 1 Partial matches found] !! - >>> | [search] └ grain_trace_data/above/ordered_trace_heights/0 - >>> └ [End of search] - - Get data from a file. - .. code-block:: RST - from topostats.io import TopoFileHelper - - topofile = "path/to/topostats_file.topostats" - helper = TopoFileHelper(topofile) - - data = helper.get_data("ordered_trace_heights/0") - - .. code-block:: RST - >>> [ Get data ] Data found at grain_trace_data/above/ordered_trace_heights/0, type: - - Get data information - .. code-block:: RST - from topostats.io import TopoFileHelper - - topofile = "path/to/topostats_file.topostats" - helper = TopoFileHelper(topofile) - - helper.data_info("grain_trace_data/above/ordered_trace_heights/0") - - .. code-block:: RST - >>> [ Info ] Data at grain_trace_data/above/ordered_trace_heights/0 is a numpy array with shape: (95,), - >>> dtype: float64 - """ - - def __init__(self, topofile: Path | str) -> None: - """ - Initialise the TopoFileHelper object. - - Parameters - ---------- - topofile : Path | str - Path to the .topostats file. - """ - self.topofile: Path = Path(topofile) - with h5py.File(self.topofile, "r") as f: - self.data: dict = hdf5_to_dict(open_hdf5_file=f, group_path="/") - - def search_partial_matches(self, data: dict, keys: list, current_path: list | None = None) -> list: - """ - Find partial matches to the keys in the dictionary. - - Recursively search through nested dictionaries and keep only the paths that match the keys in the correct order, - allowing gaps between the keys. - - Parameters - ---------- - data : dict - The dictionary to search through. - keys : list - The list of keys to search for. - current_path : list, optional - The current path in the dictionary, by default []. - - Returns - ------- - list - A list of paths that match the keys in the correct order. - """ - if current_path is None: - # Need to initialise the empty list here and not as a default argument since it is mutable - current_path = [] - - partial_matches = [] - - def recursive_partial_search(data, keys, current_path) -> None: - """ - Recursively find partial matches to the keys in the dictionary. - - Recursive function to search through the dictionary and keep only the paths - that match the keys in the correct order, - allowing gaps between the keys. - - Parameters - ---------- - data : dict - The dictionary to search through. - keys : list - The list of keys to search for. - current_path : list - The current path in the dictionary. - """ - # If have reached the end of the current dictionary, return - if not keys: - partial_matches.append(current_path) - return - - current_key = keys[0] - - if isinstance(data, dict): - for k, v in data.items(): - new_path = current_path + [k] - try: - # Check if the current key can be converted to an integer - current_key_int = int(current_key) - k_int = int(k) - # If the current key and the key in the dictionary can be converted to integers, - # check if they are equal - if current_key_int == k_int: - # If the current key is in the key list of the dictionary, continue searching - # but remove the current key from the list - remaining_keys = keys[1:] - recursive_partial_search(v, remaining_keys, new_path) - except ValueError: - # If the current key cannot be converted to an integer, allow for partial matches - if current_key in k: - # If the current key is in the key list of the dictionary, continue searching - # but remove the current key from the list - remaining_keys = keys[1:] - recursive_partial_search(v, remaining_keys, new_path) - else: - # If the current key is not in the key list of the dictionary, continue searching - # but don't remove the current key from the list as it might be deeper in the dictionary - recursive_partial_search(v, keys, new_path) - - recursive_partial_search(data, keys, current_path) - return partial_matches - - def find_data(self, search_keys: list) -> None: - """ - Find the data in the dictionary that matches the list of keys. - - Parameters - ---------- - search_keys : list - The list of keys to search for. - """ - # Find the best match for the list of keys - # First check if there is a direct match - LOGGER.info(f"[ Searching for {search_keys} in {self.topofile} ]") - - try: - current_data = self.data - for key in search_keys: - current_data = current_data[key] - - LOGGER.info("| [search] Direct match found") - except KeyError: - LOGGER.info("| [search] No direct match found.") - - # If no direct match is found, try to find a partial match - LOGGER.info("| [search] Searching for partial matches.") - partial_matches = self.search_partial_matches(data=self.data, keys=search_keys) - if partial_matches: - LOGGER.info(f"| [search] !! [ {len(partial_matches)} Partial matches found] !!") - for index, match in enumerate(partial_matches): - match_str = "/".join(match) - if index == len(partial_matches) - 1: - prefix = "| [search] └" - else: - prefix = "| [search] ├" - LOGGER.info(f"{prefix} {match_str}") - else: - LOGGER.info("| [search] No partial matches found.") - LOGGER.info("└ [End of search]") - - def get_data(self, location: str) -> int | float | str | np.ndarray | dict | None: - """ - Retrieve data from the dictionary using a '/' separated string. - - Parameters - ---------- - location : str - The location of the data in the dictionary, separated by '/'. - - Returns - ------- - int | float | str | np.ndarray | dict - The data at the location. - """ - # If there's a trailing '/', remove it - if location[-1] == "/": - location = location[:-1] - keys = location.split("/") - - try: - current_data = self.data - for key in keys: - current_data = current_data[key] - LOGGER.info(f"[ Get data ] Data found at {location}, type: {type(current_data)}") - return current_data - except KeyError as e: - LOGGER.error(f"[ Get data ] Key not found: {e}, please check the location string.") - return None - - def data_info(self, location: str, verbose: bool = False) -> None: - """ - Get information about the data at a location. - - Parameters - ---------- - location : str - The location of the data in the dictionary, separated by '/'. - - verbose : bool, optional - Print more detailed information about the data, by default False. - """ - # If there's a trailing '/', remove it - if location[-1] == "/": - location = location[:-1] - keys = location.split("/") - - try: - current_data = self.data - for key in keys: - current_data = current_data[key] - except KeyError as e: - LOGGER.error(f"[ Info ] Key not found: {e}, please check the location string.") - return - - if isinstance(current_data, dict): - key_types = {type(k) for k in current_data.keys()} - value_types = {type(v) for v in current_data.values()} - LOGGER.info( - f"[ Info ] Data at {location} is a dictionary with {len(current_data)} " - f"keys of types {key_types} and values " - f"of types {value_types}" - ) - if verbose: - for k, v in current_data.items(): - LOGGER.info(f" {k}: {type(v)}") - elif isinstance(current_data, np.ndarray): - LOGGER.info( - f"[ Info ] Data at {location} is a numpy array with shape: {current_data.shape}, " - f"dtype: {current_data.dtype}" - ) - else: - LOGGER.info(f"[ Info ] Data at {location} is {type(current_data)}") - - return diff --git a/topostats/topostats_file_helper.py b/topostats/topostats_file_helper.py new file mode 100644 index 00000000000..daac507b036 --- /dev/null +++ b/topostats/topostats_file_helper.py @@ -0,0 +1,304 @@ +"""For helper scripts in handling .topostats files.""" + +from pathlib import Path +import logging + +import h5py +import numpy.typing as npt + +from topostats.io import hdf5_to_dict +from topostats.logs.logs import LOGGER_NAME + +LOGGER = logging.getLogger(LOGGER_NAME) + + +class TopoFileHelper: + """ + Helper class for searching through the data in a .topostats (hdf5) file. + + Parameters + ---------- + topofile : Path + Path to the .topostats file. + + Examples + -------- + This class should be used in a Jupyter Notebook or an interactive Python session. + + Creating a helper object. + .. code-block:: RST + from topostats.io import TopoFileHelper + + topofile = "path/to/topostats_file.topostats" + helper = TopoFileHelper(topofile) + + + Find the data you're looking for using H5Glance (only works in Jupyter Notebooks). + .. code-block:: RST + from H5Glance import H5Glance + + file = "path/to/topostats_file.topostats" + H5Glance(file) + + Which prints an interactive explorer of the file structure, eg: + + .. code-block:: RST + ../tests/resources/process_scan_topostats_file_regtest.topostats + grain_masks + grain_trace_data + height_profiles + filename [📋]: scalar entries, dtype: ASCII string + image [📋]: 64 x 64 entries, dtype: float64 + image_original [📋]: 64 x 64 entries, dtype: float64 + pixel_to_nm_scaling [📋]: scalar entries, dtype: float64 + topostats_file_version [📋]: scalar entries, dtype: float64 + + Where each entry can be clicked on for more information. + + Finding data in a file. + .. code-block:: RST + from topostats.io import TopoFileHelper + + topofile = "path/to/topostats_file.topostats" + helper = TopoFileHelper(topofile) + helper.find_data(["ordered_trace_heights", "0"]) + + .. code-block:: RST + >>> [ Searching for ['ordered_trace_heights', '0'] in ./path/to/topostats_file.topostats ] + >>> | [search] No direct match found. + >>> | [search] Searching for partial matches. + >>> | [search] !! [ 1 Partial matches found] !! + >>> | [search] └ grain_trace_data/above/ordered_trace_heights/0 + >>> └ [End of search] + + Get data from a file. + .. code-block:: RST + from topostats.io import TopoFileHelper + + topofile = "path/to/topostats_file.topostats" + helper = TopoFileHelper(topofile) + + data = helper.get_data("ordered_trace_heights/0") + + .. code-block:: RST + >>> [ Get data ] Data found at grain_trace_data/above/ordered_trace_heights/0, type: + + Get data information + .. code-block:: RST + from topostats.io import TopoFileHelper + + topofile = "path/to/topostats_file.topostats" + helper = TopoFileHelper(topofile) + + helper.data_info("grain_trace_data/above/ordered_trace_heights/0") + + .. code-block:: RST + >>> [ Info ] Data at grain_trace_data/above/ordered_trace_heights/0 is a numpy array with shape: (95,), + >>> dtype: float64 + """ + + def __init__(self, topofile: Path | str) -> None: + """ + Initialise the TopoFileHelper object. + + Parameters + ---------- + topofile : Path | str + Path to the .topostats file. + """ + self.topofile: Path = Path(topofile) + with h5py.File(self.topofile, "r") as f: + self.data: dict = hdf5_to_dict(open_hdf5_file=f, group_path="/") + + def search_partial_matches(self, data: dict, keys: list, current_path: list | None = None) -> list: + """ + Find partial matches to the keys in the dictionary. + + Recursively search through nested dictionaries and keep only the paths that match the keys in the correct order, + allowing gaps between the keys. + + Parameters + ---------- + data : dict + The dictionary to search through. + keys : list + The list of keys to search for. + current_path : list, optional + The current path in the dictionary, by default []. + + Returns + ------- + list + A list of paths that match the keys in the correct order. + """ + if current_path is None: + # Need to initialise the empty list here and not as a default argument since it is mutable + current_path = [] + + partial_matches = [] + + def recursive_partial_search(data, keys, current_path) -> None: + """ + Recursively find partial matches to the keys in the dictionary. + + Recursive function to search through the dictionary and keep only the paths + that match the keys in the correct order, + allowing gaps between the keys. + + Parameters + ---------- + data : dict + The dictionary to search through. + keys : list + The list of keys to search for. + current_path : list + The current path in the dictionary. + """ + # If have reached the end of the current dictionary, return + if not keys: + partial_matches.append(current_path) + return + + current_key = keys[0] + + if isinstance(data, dict): + for k, v in data.items(): + new_path = current_path + [k] + try: + # Check if the current key can be converted to an integer + current_key_int = int(current_key) + k_int = int(k) + # If the current key and the key in the dictionary can be converted to integers, + # check if they are equal + if current_key_int == k_int: + # If the current key is in the key list of the dictionary, continue searching + # but remove the current key from the list + remaining_keys = keys[1:] + recursive_partial_search(v, remaining_keys, new_path) + except ValueError: + # If the current key cannot be converted to an integer, allow for partial matches + if current_key in k: + # If the current key is in the key list of the dictionary, continue searching + # but remove the current key from the list + remaining_keys = keys[1:] + recursive_partial_search(v, remaining_keys, new_path) + else: + # If the current key is not in the key list of the dictionary, continue searching + # but don't remove the current key from the list as it might be deeper in the dictionary + recursive_partial_search(v, keys, new_path) + + recursive_partial_search(data, keys, current_path) + return partial_matches + + def find_data(self, search_keys: list) -> None: + """ + Find the data in the dictionary that matches the list of keys. + + Parameters + ---------- + search_keys : list + The list of keys to search for. + """ + # Find the best match for the list of keys + # First check if there is a direct match + LOGGER.info(f"[ Searching for {search_keys} in {self.topofile} ]") + + try: + current_data = self.data + for key in search_keys: + current_data = current_data[key] + + LOGGER.info("| [search] Direct match found") + except KeyError: + LOGGER.info("| [search] No direct match found.") + + # If no direct match is found, try to find a partial match + LOGGER.info("| [search] Searching for partial matches.") + partial_matches = self.search_partial_matches(data=self.data, keys=search_keys) + if partial_matches: + LOGGER.info(f"| [search] !! [ {len(partial_matches)} Partial matches found] !!") + for index, match in enumerate(partial_matches): + match_str = "/".join(match) + if index == len(partial_matches) - 1: + prefix = "| [search] └" + else: + prefix = "| [search] ├" + LOGGER.info(f"{prefix} {match_str}") + else: + LOGGER.info("| [search] No partial matches found.") + LOGGER.info("└ [End of search]") + + def get_data(self, location: str) -> int | float | str | npt.NDArray | dict | None: + """ + Retrieve data from the dictionary using a '/' separated string. + + Parameters + ---------- + location : str + The location of the data in the dictionary, separated by '/'. + + Returns + ------- + int | float | str | np.ndarray | dict + The data at the location. + """ + # If there's a trailing '/', remove it + if location[-1] == "/": + location = location[:-1] + keys = location.split("/") + + try: + current_data = self.data + for key in keys: + current_data = current_data[key] + LOGGER.info(f"[ Get data ] Data found at {location}, type: {type(current_data)}") + return current_data + except KeyError as e: + LOGGER.error(f"[ Get data ] Key not found: {e}, please check the location string.") + return None + + def data_info(self, location: str, verbose: bool = False) -> None: + """ + Get information about the data at a location. + + Parameters + ---------- + location : str + The location of the data in the dictionary, separated by '/'. + + verbose : bool, optional + Print more detailed information about the data, by default False. + """ + # If there's a trailing '/', remove it + if location[-1] == "/": + location = location[:-1] + keys = location.split("/") + + try: + current_data = self.data + for key in keys: + current_data = current_data[key] + except KeyError as e: + LOGGER.error(f"[ Info ] Key not found: {e}, please check the location string.") + return + + if isinstance(current_data, dict): + key_types = {type(k) for k in current_data.keys()} + value_types = {type(v) for v in current_data.values()} + LOGGER.info( + f"[ Info ] Data at {location} is a dictionary with {len(current_data)} " + f"keys of types {key_types} and values " + f"of types {value_types}" + ) + if verbose: + for k, v in current_data.items(): + LOGGER.info(f" {k}: {type(v)}") + elif isinstance(current_data, npt.NDArray): + LOGGER.info( + f"[ Info ] Data at {location} is a numpy array with shape: {current_data.shape}, " + f"dtype: {current_data.dtype}" + ) + else: + LOGGER.info(f"[ Info ] Data at {location} is {type(current_data)}") + + return From b4ade4a9dcdda6b968db831c966c9fcdbeb9f617 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 22 Jan 2025 11:20:06 +0000 Subject: [PATCH 16/16] [pre-commit.ci] Fixing issues with pre-commit --- notebooks/topostats_file_helper_example.ipynb | 8 +++----- tests/test_io.py | 1 - topostats/topostats_file_helper.py | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/notebooks/topostats_file_helper_example.ipynb b/notebooks/topostats_file_helper_example.ipynb index 11d989b3aa4..4011665f03e 100644 --- a/notebooks/topostats_file_helper_example.ipynb +++ b/notebooks/topostats_file_helper_example.ipynb @@ -2,17 +2,15 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Import needed libraries\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", - "from IPython.display import clear_output\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from h5glance import H5Glance" + "from h5glance import H5Glance\n", + "from IPython.display import clear_output" ] }, { diff --git a/tests/test_io.py b/tests/test_io.py index e71483aae99..6637cc19755 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -17,7 +17,6 @@ from topostats.io import ( LoadScans, - TopoFileHelper, convert_basename_to_relative_paths, dict_to_hdf5, dict_to_json, diff --git a/topostats/topostats_file_helper.py b/topostats/topostats_file_helper.py index daac507b036..17b906767b7 100644 --- a/topostats/topostats_file_helper.py +++ b/topostats/topostats_file_helper.py @@ -1,7 +1,7 @@ """For helper scripts in handling .topostats files.""" -from pathlib import Path import logging +from pathlib import Path import h5py import numpy.typing as npt