Add data tests

yupidevs · Jun 7, 2023 · 22c5e72 · 22c5e72
1 parent 49f2ed4
commit 22c5e72
Show file tree

Hide file tree

Showing 11 changed files with 213 additions and 27 deletions.
diff --git a/examples/example_02.py b/examples/example_02.py
@@ -27,7 +27,7 @@
     save_best_only=True,
     mode="min",
 )
-model.train(train, epochs=150, batch_size=64, checkpoint=checkpoint)
+model.train(train, dataset, epochs=150, batch_size=64, checkpoint=checkpoint)
 
 # Evaluate the model on a test dataset
 evaluation = model.evaluate(test)

diff --git a/examples/mnist_stroke_example.py b/examples/mnist_stroke_example.py
@@ -65,7 +65,7 @@
     save_best_only=True,
     mode="min",
 )
-lstm.train(train, epochs=20, checkpoint=checkpoint)
+lstm.train(train, dataset, epochs=20, checkpoint=checkpoint)
 evaluation = lstm.evaluate(test)
 evaluation.show()
 
@@ -77,6 +77,6 @@
     save_best_only=True,
     mode="min",
 )
-transformer.train(train, epochs=150, checkpoint=checkpoint)
+transformer.train(train, dataset, epochs=150, checkpoint=checkpoint)
 evaluation = transformer.evaluate(test)
 evaluation.show()
diff --git a/examples/trasnformer_example.py b/examples/trasnformer_example.py
@@ -1,3 +1,5 @@
+from typing import Tuple
+
 from tensorflow import keras
 
 from pactus import Dataset, featurizers
@@ -18,16 +20,16 @@
 ]
 
 
-def dataset_splitter(ds: Data) -> tuple[Data, Data]:
-    if ds.dataset.name == "geolife":
+def dataset_splitter(ds: Data) -> Tuple[Data, Data]:
+    if ds.dataset_name == "geolife":
         use_classes = {"car", "taxi-bus", "walk", "bike", "subway", "train"}
         return (
             ds.filter(lambda traj, _: len(traj) > 10 and traj.dt < 8)
             .map(lambda _, lbl: (_, "taxi-bus" if lbl in ("bus", "taxi") else lbl))
             .filter(lambda _, lbl: lbl in use_classes)
             .split(train_size=0.7, random_state=SEED)
         )
-    if ds.dataset.name == "mnist_stroke":
+    if ds.dataset_name == "mnist_stroke":
         ds = ds.take(10_000)
     return ds.filter(
         lambda traj, _: len(traj) >= 5 and traj.r.delta.norm.sum() > 0
@@ -50,6 +52,6 @@ def dataset_splitter(ds: Data) -> tuple[Data, Data]:
         optimizer=keras.optimizers.Adam(learning_rate=1e-4),
     )
 
-    model.train(train, epochs=150, batch_size=64)
+    model.train(train, dataset, epochs=150, batch_size=64)
     evaluation = model.evaluate(test)
     evaluation.show()
diff --git a/pactus/dataset/dataset.py b/pactus/dataset/dataset.py
@@ -26,23 +26,27 @@ class Data:
 
     Parameters
     ----------
-    dataset : Dataset
-        Base dataset from where the data is being used.
     trajs: List[Trajectory]
         A list that contains a subset of the dataset trajectories.
     labels: List[Any]
         A list that contains the label of each trajectory from the subset.
+    dataset_name: str
+        Name of the dataset the trajectories come from (default "custom").
+        Note: cut, split, map and filter do not forward it to their results.
     """
 
     def __init__(
-        self, dataset: Dataset, trajs: List[Trajectory], labels: List[Any]
+        self,
+        trajs: List[Trajectory],
+        labels: List[Any],
+        dataset_name: str = "custom",
     ) -> None:
-        self.dataset = dataset
         self.trajs = trajs
         self.labels = labels
         self.label_counts = Counter(labels)
         self.feats = None
         self.last_featurizer = None
+        self.dataset_name = dataset_name
 
     @property
     def classes(self) -> List[Any]:
@@ -79,7 +83,30 @@ def take(
         shuffle: bool = True,
         random_state: Union[int, None] = None,
     ) -> Data:
-        """Takes a subset of the dataset."""
+        """
+        Takes a subset of the dataset.
+        
+        Parameters
+        ----------
+        size : Union[float, int]
+            If float, it should be between 0 and 1 and is interpreted as the
+            proportion of the dataset to take. If int, it must be strictly
+            between 0 and the dataset size (both exclusive) and is interpreted
+            as the number of trajectories to take.
+        stratify : bool, optional
+            If True, the dataset will be stratified by the labels, by default
+            True.
+        shuffle : bool, optional
+            If True, the dataset will be shuffled before taking the subset,
+            by default True.
+        random_state : Union[int, None], optional
+            Random state to be used, by default None.
+        
+        Returns
+        -------
+        Data
+            A new Data object with the subset of the dataset.
+        """
         if isinstance(size, int):
             assert 0 < size < len(self), "size should be within 0 and len(self)"
             size /= len(self)
@@ -89,10 +116,24 @@ def take(
         )
         return ans
 
-    def cut(self, size: Union[float, int]):
+    def cut(self, size: Union[float, int]) -> Tuple[Data, Data]:
         """
         Similar to split, but without shuffle, stratify, etc. Just slices the
         dataset into two parts.
+
+        Parameters
+        ----------
+        size : Union[float, int]
+            If float, it must be strictly between 0 and 1 and is interpreted
+            as the proportion of the dataset placed in the first part. If int,
+            it should be between 0 and the dataset size and is interpreted as
+            the number of trajectories placed in the first part.
+        
+        Returns
+        -------
+        Tuple[Data, Data]
+            A tuple with two Data objects, the first one with the first part
+            of the cut and the second one with the second part.
         """
         if isinstance(size, float):
             assert 0 < size < 1, "size should be within 0 and 1 if float"
@@ -104,8 +145,8 @@ def cut(self, size: Union[float, int]):
 
         left, right = self.trajs[:size], self.trajs[size:]
         left_labels, right_labels = self.labels[:size], self.labels[size:]
-        left_d = Data(self.dataset, left, left_labels)
-        right_d = Data(self.dataset, right, right_labels)
+        left_d = Data(left, left_labels)
+        right_d = Data(right, right_labels)
         return left_d, right_d
 
     def split(
@@ -156,8 +197,8 @@ def split(
             shuffle=shuffle,
         )
 
-        train_data = Data(self.dataset, x_train, y_train)
-        test_data = Data(self.dataset, x_test, y_test)
+        train_data = Data(x_train, y_train)
+        test_data = Data(x_test, y_test)
         return train_data, test_data
 
     def map(self, func: Callable[[Trajectory, Any], Tuple[Trajectory, Any]]) -> Data:
@@ -182,7 +223,7 @@ def map(self, func: Callable[[Trajectory, Any], Tuple[Trajectory, Any]]) -> Data
             traj, label = func(traj, label)
             trajs.append(traj)
             labels.append(label)
-        return Data(self.dataset, trajs, labels)
+        return Data(trajs, labels)
 
     def filter(self, func: Callable[[Trajectory, Any], bool]) -> Data:
         """
@@ -204,7 +245,7 @@ def filter(self, func: Callable[[Trajectory, Any], bool]) -> Data:
                 trajs.append(traj)
                 labels.append(label)
         logging.info("Filtered %d of %d trajectories", len(trajs), len(self))
-        return Data(self.dataset, trajs, labels)
+        return Data(trajs, labels)
 
 
 class Dataset(Data):
@@ -236,7 +277,7 @@ def __init__(
         self.version = version
         self.trajs = trajs
         self.labels = labels
-        super().__init__(self, trajs, labels)
+        super().__init__(trajs, labels)
 
     def __len__(self):
         return len(self.trajs)

diff --git a/pactus/models/evaluation.py b/pactus/models/evaluation.py
@@ -89,7 +89,7 @@ def from_data(
         model_summary: dict,
     ) -> Evaluation:
         return Evaluation(
-            dataset_name=data.dataset.name,
+            dataset_name=data.dataset_name,
             trajs_ids=[traj.traj_id for traj in data.trajs if traj.traj_id is not None],
             y_true=data.labels,
             y_pred=predictions,

diff --git a/pactus/models/lstm_model.py b/pactus/models/lstm_model.py
@@ -8,6 +8,7 @@
 from tensorflow.keras import layers
 from yupi import Trajectory
 
+from pactus import Dataset
 from pactus import config as cfg
 from pactus.dataset import Data
 from pactus.models.evaluation import Evaluation
@@ -35,6 +36,7 @@ def __init__(
         super().__init__(NAME)
         self.masking_value = cfg.MASK_VALUE if masking_value is None else masking_value
         self.encoder: Union[LabelEncoder, None] = None
+        self.dataset: Union[Dataset, None] = None
         self.model: keras.Secuential
         self.max_len = 0
         metrics = ["accuracy"] if metrics is None else metrics
@@ -86,18 +88,21 @@ def _get_model(self, input_shape, n_classes):
         return model
 
     def _prepare_data(self, data: Data) -> Tuple[np.ndarray, np.ndarray]:
+        assert self.dataset is not None, "Dataset is not set"
+
         self.encoder = LabelEncoder()
         self.encoder.fit(data.labels)
         encoded_labels = self.encoder.transform(data.labels)
         y_data = np.array(encoded_labels)
 
-        self.max_len = max(map(len, data.dataset.trajs))
+        self.max_len = max(map(len, self.dataset.trajs))
         x_data = self._get_x_data(self.max_len, data.trajs)
         return x_data, y_data
 
     def train(
         self,
         data: Data,
+        dataset: Dataset,
         cross_validation=0,
         epochs=10,
         batch_size=None,
@@ -110,6 +115,7 @@ def train(
         self.set_summary(epochs=epochs, validation_split=validation_split)
         callbacks = DEFAULT_CALLBACKS.copy() if callbacks is None else callbacks
         model_path = None
+        self.dataset = dataset
         if checkpoint is not None:
             callbacks.append(checkpoint)
             if Path(checkpoint.filepath).exists():

diff --git a/pactus/models/transformer_model.py b/pactus/models/transformer_model.py
@@ -8,6 +8,7 @@
 from tensorflow import keras
 
 import pactus.config as cfg
+from pactus import Dataset
 from pactus.dataset import Data
 from pactus.models import Model
 from pactus.models.evaluation import Evaluation
@@ -57,6 +58,7 @@ def __init__(
         self.mask_value = mask_value
         self.encoder: Union[LabelEncoder, None] = None
         self.labels: Union[List[Any], None] = None
+        self.dataset: Union[Dataset, None] = None
         self.set_summary(
             head_size=self.head_size,
             num_heads=self.num_heads,
@@ -75,6 +77,7 @@ def __init__(
     def train(
         self,
         data: Data,
+        dataset: Dataset,
         cross_validation: int = 0,
         epochs: int = 10,
         validation_split: float = 0.2,
@@ -90,6 +93,7 @@ def train(
         )
         self.encoder = None
         self.labels = data.labels
+        self.dataset = dataset
         x_train, y_train = self._get_input_data(data)
         n_classes = len(data.classes)
         input_shape = x_train.shape[1:]
@@ -208,8 +212,10 @@ def _encode_labels(self, data: Data) -> np.ndarray:
 
     def _extract_raw_data(self, data: Data) -> np.ndarray:
         """Extracts the raw data from the yupi trajectories"""
+        assert self.dataset is not None, "Dataset must be set"
+
         trajs = data.trajs
-        max_len = np.max([len(traj) for traj in data.dataset.trajs])
+        max_len = np.max([len(traj) for traj in self.dataset.trajs])
         if self.max_traj_len > 0:
             max_len = self.max_traj_len
         raw_data = [np.hstack((traj.r, np.reshape(traj.t, (-1, 1)))) for traj in trajs]

diff --git a/pactus/models/xgboost_model.py b/pactus/models/xgboost_model.py
@@ -43,6 +43,7 @@ def train(self, data: Data, cross_validation: int = 0):
     def predict(self, data: Data) -> List[Any]:
         x_data = data.featurize(self.featurizer)
         predicted = self.grid.predict(x_data)
+        assert self.encoder is not None
         return self.encoder.inverse_transform(predicted)
 
     def predict_single(self, traj: Trajectory) -> Any:

diff --git a/pyproject.toml b/pyproject.toml
@@ -14,7 +14,8 @@ classifiers = [
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
 ]
-keywords = ["trajectory", "classification"]
+keywords = ["trajectory",
+    "classification"]
 dependencies = [
     "numpy >= 1.20.0",
     "yupi >= 0.11.2",
@@ -25,8 +26,21 @@ dependencies = [
 ]
 requires-python = ">=3.8"
 
+[options]
+packages = [
+    { name = "pactus", include = true },
+]
+
 [project.optional-dependencies]
-dev = ["black", "pylint", "bumpver", "isort", "pytest"]
+dev = [
+    "mypy",
+    "black",
+    "pylint",
+    "bumpver",
+    "isort",
+    "pytest",
+    "tensor-annotations-tensorflow-stubs"
+]
 
 [project.urls]
 Homepage = "https://github.com/yupidevs/pactus"
@@ -36,7 +50,7 @@ profile = "black"
 known_first_party = ["pactus"]
 
 [tool.black]
-target-version = ["py37"]
+target-version = ["py38"]
 
 [tool.pylint."MESSAGES CONTROL"]
 max-line-length = 88
@@ -58,6 +72,16 @@ push = false
 ]
 
 [build-system]
-requires      = ["setuptools>=61.0.0", "wheel"]
+requires      = ["setuptools>=61.0.0",
+    "wheel"]
 build-backend = "setuptools.build_meta"
 
+[tool.mypy]
+python_version = "3.8"
+
+[[tool.mypy.overrides]]
+module = [
+    "yupi.*",
+    "sklearn.*",
+]
+ignore_missing_imports = true
diff --git a/setup.cfg b/setup.cfg
@@ -0,0 +1,5 @@
+[metadata]
+name = pactus
+
+[options]
+packages = find: