diff --git a/examples/example_02.py b/examples/example_02.py index 517790f..2b9f10a 100644 --- a/examples/example_02.py +++ b/examples/example_02.py @@ -27,7 +27,7 @@ save_best_only=True, mode="min", ) -model.train(train, epochs=150, batch_size=64, checkpoint=checkpoint) +model.train(train, dataset, epochs=150, batch_size=64, checkpoint=checkpoint) # Evaluate the model on a test dataset evaluation = model.evaluate(test) diff --git a/examples/mnist_stroke_example.py b/examples/mnist_stroke_example.py index cd4a94c..ef67411 100644 --- a/examples/mnist_stroke_example.py +++ b/examples/mnist_stroke_example.py @@ -65,7 +65,7 @@ save_best_only=True, mode="min", ) -lstm.train(train, epochs=20, checkpoint=checkpoint) +lstm.train(train, dataset, epochs=20, checkpoint=checkpoint) evaluation = lstm.evaluate(test) evaluation.show() @@ -77,6 +77,6 @@ save_best_only=True, mode="min", ) -transformer.train(train, epochs=150, checkpoint=checkpoint) +transformer.train(train, dataset, epochs=150, checkpoint=checkpoint) evaluation = transformer.evaluate(test) evaluation.show() diff --git a/examples/trasnformer_example.py b/examples/trasnformer_example.py index c0980f6..8d34a9b 100644 --- a/examples/trasnformer_example.py +++ b/examples/trasnformer_example.py @@ -1,3 +1,5 @@ +from typing import Tuple + from tensorflow import keras from pactus import Dataset, featurizers @@ -18,8 +20,8 @@ ] -def dataset_splitter(ds: Data) -> tuple[Data, Data]: - if ds.dataset.name == "geolife": +def dataset_splitter(ds: Data) -> Tuple[Data, Data]: + if ds.dataset_name == "geolife": use_classes = {"car", "taxi-bus", "walk", "bike", "subway", "train"} return ( ds.filter(lambda traj, _: len(traj) > 10 and traj.dt < 8) @@ -27,7 +29,7 @@ def dataset_splitter(ds: Data) -> tuple[Data, Data]: .filter(lambda _, lbl: lbl in use_classes) .split(train_size=0.7, random_state=SEED) ) - if ds.dataset.name == "mnist_stroke": + if ds.dataset_name == "mnist_stroke": ds = ds.take(10_000) return ds.filter( lambda traj, _: len(traj) >= 5 and traj.r.delta.norm.sum() > 0 @@ -50,6 +52,6 @@ def dataset_splitter(ds: Data) -> tuple[Data, Data]: optimizer=keras.optimizers.Adam(learning_rate=1e-4), ) - model.train(train, epochs=150, batch_size=64) + model.train(train, dataset, epochs=150, batch_size=64) evaluation = model.evaluate(test) evaluation.show() diff --git a/pactus/dataset/dataset.py b/pactus/dataset/dataset.py index 7665b67..5a61901 100644 --- a/pactus/dataset/dataset.py +++ b/pactus/dataset/dataset.py @@ -26,23 +26,27 @@ class Data: Parameters ---------- - dataset : Dataset - Base dataset from where the data is being used. trajs: List[Trajectory] A list that contains a subset of the dataset trajectories. labels: List[Any] A list that contains the label of each trajectory from the subset. + dataset_name: str + Name of the dataset where the trajectories come from. If not provided, + it will be set to "custom". """ def __init__( - self, dataset: Dataset, trajs: List[Trajectory], labels: List[Any] + self, + trajs: List[Trajectory], + labels: List[Any], + dataset_name: str = "custom", ) -> None: - self.dataset = dataset self.trajs = trajs self.labels = labels self.label_counts = Counter(labels) self.feats = None self.last_featurizer = None + self.dataset_name = dataset_name @property def classes(self) -> List[Any]: @@ -79,7 +83,30 @@ def take( shuffle: bool = True, random_state: Union[int, None] = None, ) -> Data: - """Takes a subset of the dataset.""" + """ + Takes a subset of the dataset. + + Parameters + ---------- + size : Union[float, int] + If float, it should be between 0 and 1 and it will be interpreted + as the proportion of the dataset to be taken. If int, it should be + between 0 and the dataset size and it will be interpreted as the + number of trajectories to be taken. + stratify : bool, optional + If True, the dataset will be stratified by the labels, by default + True. + shuffle : bool, optional + If True, the dataset will be shuffled before taking the subset, + by default True. + random_state : Union[int, None], optional + Random state to be used, by default None. + + Returns + ------- + Data + A new Data object with the subset of the dataset. + """ if isinstance(size, int): assert 0 < size < len(self), "size should be within 0 and len(self)" size /= len(self) @@ -89,10 +116,24 @@ def take( ) return ans - def cut(self, size: Union[float, int]): + def cut(self, size: Union[float, int]) -> Tuple[Data, Data]: """ Similar to split, but without shuffle, stratify, etc. Just slices the dataset into two parts. + + Parameters + ---------- + size : Union[float, int] + If float, it should be between 0 and 1 and it will be interpreted + as the proportion of the dataset to be taken. If int, it should be + between 0 and the dataset size and it will be interpreted as the + number of trajectories to be taken. + + Returns + ------- + Tuple[Data, Data] + A tuple with two Data objects, the first one with the first part + of the cut and the second one with the second part. """ if isinstance(size, float): assert 0 < size < 1, "size should be within 0 and 1 if float" @@ -104,8 +145,8 @@ def cut(self, size: Union[float, int]): left, right = self.trajs[:size], self.trajs[size:] left_labels, right_labels = self.labels[:size], self.labels[size:] - left_d = Data(self.dataset, left, left_labels) - right_d = Data(self.dataset, right, right_labels) + left_d = Data(left, left_labels) + right_d = Data(right, right_labels) return left_d, right_d def split( @@ -156,8 +197,8 @@ def split( shuffle=shuffle, ) - train_data = Data(self.dataset, x_train, y_train) - test_data = Data(self.dataset, x_test, y_test) + train_data = Data(x_train, y_train) + test_data = Data(x_test, y_test) return train_data, test_data def map(self, func: Callable[[Trajectory, Any], Tuple[Trajectory, Any]]) -> Data: @@ -182,7 +223,7 @@ def map(self, func: Callable[[Trajectory, Any], Tuple[Trajectory, Any]]) -> Data traj, label = func(traj, label) trajs.append(traj) labels.append(label) - return Data(self.dataset, trajs, labels) + return Data(trajs, labels) def filter(self, func: Callable[[Trajectory, Any], bool]) -> Data: """ @@ -204,7 +245,7 @@ def filter(self, func: Callable[[Trajectory, Any], bool]) -> Data: trajs.append(traj) labels.append(label) logging.info("Filtered %d of %d trajectories", len(trajs), len(self)) - return Data(self.dataset, trajs, labels) + return Data(trajs, labels) class Dataset(Data): @@ -236,7 +277,7 @@ def __init__( self.version = version self.trajs = trajs self.labels = labels - super().__init__(self, trajs, labels) + super().__init__(trajs, labels) def __len__(self): return len(self.trajs) diff --git a/pactus/models/evaluation.py b/pactus/models/evaluation.py index 407cf27..5e6fbd2 100644 --- a/pactus/models/evaluation.py +++ b/pactus/models/evaluation.py @@ -89,7 +89,7 @@ def from_data( model_summary: dict, ) -> Evaluation: return Evaluation( - dataset_name=data.dataset.name, + dataset_name=data.dataset_name, trajs_ids=[traj.traj_id for traj in data.trajs if traj.traj_id is not None], y_true=data.labels, y_pred=predictions, diff --git a/pactus/models/lstm_model.py b/pactus/models/lstm_model.py index ef07048..a6ce995 100644 --- a/pactus/models/lstm_model.py +++ b/pactus/models/lstm_model.py @@ -8,6 +8,7 @@ from tensorflow.keras import layers from yupi import Trajectory +from pactus import Dataset from pactus import config as cfg from pactus.dataset import Data from pactus.models.evaluation import Evaluation @@ -35,6 +36,7 @@ def __init__( super().__init__(NAME) self.masking_value = cfg.MASK_VALUE if masking_value is None else masking_value self.encoder: Union[LabelEncoder, None] = None + self.dataset: Union[Dataset, None] = None self.model: keras.Secuential self.max_len = 0 metrics = ["accuracy"] if metrics is None else metrics @@ -86,18 +88,21 @@ def _get_model(self, input_shape, n_classes): return model def _prepare_data(self, data: Data) -> Tuple[np.ndarray, np.ndarray]: + assert self.dataset is not None, "Dataset is not set" + self.encoder = LabelEncoder() self.encoder.fit(data.labels) encoded_labels = self.encoder.transform(data.labels) y_data = np.array(encoded_labels) - self.max_len = max(map(len, data.dataset.trajs)) + self.max_len = max(map(len, self.dataset.trajs)) x_data = self._get_x_data(self.max_len, data.trajs) return x_data, y_data def train( self, data: Data, + dataset: Dataset, cross_validation=0, epochs=10, batch_size=None, @@ -110,6 +115,7 @@ def train( self.set_summary(epochs=epochs, validation_split=validation_split) callbacks = DEFAULT_CALLBACKS.copy() if callbacks is None else callbacks model_path = None + self.dataset = dataset if checkpoint is not None: callbacks.append(checkpoint) if Path(checkpoint.filepath).exists(): diff --git a/pactus/models/transformer_model.py b/pactus/models/transformer_model.py index 67e9eab..4385122 100644 --- a/pactus/models/transformer_model.py +++ b/pactus/models/transformer_model.py @@ -8,6 +8,7 @@ from tensorflow import keras import pactus.config as cfg +from pactus import Dataset from pactus.dataset import Data from pactus.models import Model from pactus.models.evaluation import Evaluation @@ -57,6 +58,7 @@ def __init__( self.mask_value = mask_value self.encoder: Union[LabelEncoder, None] = None self.labels: Union[List[Any], None] = None + self.dataset: Union[Dataset, None] = None self.set_summary( head_size=self.head_size, num_heads=self.num_heads, @@ -75,6 +77,7 @@ def __init__( def train( self, data: Data, + dataset: Dataset, cross_validation: int = 0, epochs: int = 10, validation_split: float = 0.2, @@ -90,6 +93,7 @@ def train( ) self.encoder = None self.labels = data.labels + self.dataset = dataset x_train, y_train = self._get_input_data(data) n_classes = len(data.classes) input_shape = x_train.shape[1:] @@ -208,8 +212,10 @@ def _encode_labels(self, data: Data) -> np.ndarray: def _extract_raw_data(self, data: Data) -> np.ndarray: """Extracts the raw data from the yupi trajectories""" + assert self.dataset is not None, "Dataset must be set" + trajs = data.trajs - max_len = np.max([len(traj) for traj in data.dataset.trajs]) + max_len = np.max([len(traj) for traj in self.dataset.trajs]) if self.max_traj_len > 0: max_len = self.max_traj_len raw_data = [np.hstack((traj.r, np.reshape(traj.t, (-1, 1)))) for traj in trajs] diff --git a/pactus/models/xgboost_model.py b/pactus/models/xgboost_model.py index bed7560..a89bd57 100644 --- a/pactus/models/xgboost_model.py +++ b/pactus/models/xgboost_model.py @@ -43,6 +43,7 @@ def train(self, data: Data, cross_validation: int = 0): def predict(self, data: Data) -> List[Any]: x_data = data.featurize(self.featurizer) predicted = self.grid.predict(x_data) + assert self.encoder is not None return self.encoder.inverse_transform(predicted) def predict_single(self, traj: Trajectory) -> Any: diff --git a/pyproject.toml b/pyproject.toml index 631946c..bdc31f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,8 @@ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3", ] -keywords = ["trajectory", "classification"] +keywords = ["trajectory", + "classification"] dependencies = [ "numpy >= 1.20.0", "yupi >= 0.11.2", @@ -25,8 +26,21 @@ dependencies = [ ] requires-python = ">=3.8" +[options] +packages = [ + { name = "pactus", include = true }, +] + [project.optional-dependencies] -dev = ["black", "pylint", "bumpver", "isort", "pytest"] +dev = [ + "mypy", + "black", + "pylint", + "bumpver", + "isort", + "pytest", + "tensor-annotations-tensorflow-stubs" +] [project.urls] Homepage = "https://github.com/yupidevs/pactus" @@ -36,7 +50,7 @@ profile = "black" known_first_party = ["pactus"] [tool.black] -target-version = ["py37"] +target-version = ["py38"] [tool.pylint."MESSAGES CONTROL"] max-line-length = 88 @@ -58,6 +72,16 @@ push = false ] [build-system] -requires = ["setuptools>=61.0.0", "wheel"] +requires = ["setuptools>=61.0.0", + "wheel"] build-backend = "setuptools.build_meta" +[tool.mypy] +python_version = "3.8" + +[[tool.mypy.overrides]] +module = [ + "yupi.*", + "sklearn.*", +] +ignore_missing_imports = true \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..197e53a --- /dev/null +++ b/setup.cfg @@ -0,0 +1,5 @@ +[metadata] +name = pactus + +[options] +packages = find: \ No newline at end of file diff --git a/tests/test_data.py b/tests/test_data.py new file mode 100644 index 0000000..3c753e7 --- /dev/null +++ b/tests/test_data.py @@ -0,0 +1,101 @@ +from re import S + +import numpy as np +import pytest +import yupi +from yupi.core.featurizers import SpatialFeaturizer + +from pactus.dataset import Data + +np.random.seed(42) + + +@pytest.fixture +def data(): + trajs = [ + yupi.Trajectory(x=np.random.rand(100), y=np.random.rand(100)) + for _ in range(1000) + ] + labels = np.random.choice(["a", "b", "c"], size=1000) + return Data(trajs, labels) + + +def test_data(data: Data): + assert len(data) == 1000 + assert len(data.labels) == 1000 + assert len(data.trajs) == 1000 + assert set(data.classes) == {"a", "b", "c"} + + +def test_data_float_cut(data: Data): + left, right = data.cut(0.5) + assert len(left) == 500 + assert len(right) == 500 + + with pytest.raises(AssertionError): + data.cut(1.5) + + with pytest.raises(AssertionError): + data.cut(-0.5) + + +def test_data_int_cut(data: Data): + left, right = data.cut(500) + assert len(left) == 500 + assert len(right) == 500 + + with pytest.raises(AssertionError): + data.cut(1500) + + with pytest.raises(AssertionError): + data.cut(-500) + + +def test_data_float_split(data: Data): + train, test = data.split(0.8) + assert len(train) == 800 + assert len(test) == 200 + + with pytest.raises(AssertionError): + data.split(1.5) + + with pytest.raises(AssertionError): + data.split(-0.5) + + +def test_data_int_split(data: Data): + train, test = data.split(800) + assert len(train) == 800 + assert len(test) == 200 + + with pytest.raises(AssertionError): + data.split(1500) + + with pytest.raises(AssertionError): + data.split(-500) + +def test_data_float_take(data: Data): + sub_data = data.take(0.8) + assert len(sub_data) == 800 + + with pytest.raises(AssertionError): + data.take(1.5) + + with pytest.raises(AssertionError): + data.take(-0.5) + +def test_data_int_take(data: Data): + sub_data = data.take(800) + assert len(sub_data) == 800 + + with pytest.raises(AssertionError): + data.take(1500) + + with pytest.raises(AssertionError): + data.take(-500) + +def test_data_featurize(data: Data): + featurizer = SpatialFeaturizer() + featurized_data = data.featurize(featurizer) + assert len(featurized_data) == 1000 + assert len(featurized_data[0]) == featurizer.count \ No newline at end of file