Skip to content

Commit

Permalink
Add data tests
Browse files (browse the repository at this point in the history)
  • Loading branch information
jmorgadov committed Jun 7, 2023
1 parent 49f2ed4 commit 22c5e72
Show file tree
Hide file tree
Showing 11 changed files with 213 additions and 27 deletions.
2 changes: 1 addition & 1 deletion examples/example_02.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
save_best_only=True,
mode="min",
)
model.train(train, epochs=150, batch_size=64, checkpoint=checkpoint)
model.train(train, dataset, epochs=150, batch_size=64, checkpoint=checkpoint)

# Evaluate the model on a test dataset
evaluation = model.evaluate(test)
Expand Down
4 changes: 2 additions & 2 deletions examples/mnist_stroke_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
save_best_only=True,
mode="min",
)
lstm.train(train, epochs=20, checkpoint=checkpoint)
lstm.train(train, dataset, epochs=20, checkpoint=checkpoint)
evaluation = lstm.evaluate(test)
evaluation.show()

Expand All @@ -77,6 +77,6 @@
save_best_only=True,
mode="min",
)
transformer.train(train, epochs=150, checkpoint=checkpoint)
transformer.train(train, dataset, epochs=150, checkpoint=checkpoint)
evaluation = transformer.evaluate(test)
evaluation.show()
10 changes: 6 additions & 4 deletions examples/trasnformer_example.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Tuple

from tensorflow import keras

from pactus import Dataset, featurizers
Expand All @@ -18,16 +20,16 @@
]


def dataset_splitter(ds: Data) -> tuple[Data, Data]:
if ds.dataset.name == "geolife":
def dataset_splitter(ds: Data) -> Tuple[Data, Data]:
if ds.dataset_name == "geolife":
use_classes = {"car", "taxi-bus", "walk", "bike", "subway", "train"}
return (
ds.filter(lambda traj, _: len(traj) > 10 and traj.dt < 8)
.map(lambda _, lbl: (_, "taxi-bus" if lbl in ("bus", "taxi") else lbl))
.filter(lambda _, lbl: lbl in use_classes)
.split(train_size=0.7, random_state=SEED)
)
if ds.dataset.name == "mnist_stroke":
if ds.dataset_name == "mnist_stroke":
ds = ds.take(10_000)
return ds.filter(
lambda traj, _: len(traj) >= 5 and traj.r.delta.norm.sum() > 0
Expand All @@ -50,6 +52,6 @@ def dataset_splitter(ds: Data) -> tuple[Data, Data]:
optimizer=keras.optimizers.Adam(learning_rate=1e-4),
)

model.train(train, epochs=150, batch_size=64)
model.train(train, dataset, epochs=150, batch_size=64)
evaluation = model.evaluate(test)
evaluation.show()
67 changes: 54 additions & 13 deletions pactus/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,23 +26,27 @@ class Data:
Parameters
----------
dataset : Dataset
Base dataset from where the data is being used.
trajs: List[Trajectory]
A list that contains a subset of the dataset trajectories.
labels: List[Any]
A list that contains the label of each trajectory from the subset.
dataset_name: str
Name of the dataset where the trajectories come from. If not provided,
it will be set to "custom".
"""

def __init__(
self, dataset: Dataset, trajs: List[Trajectory], labels: List[Any]
self,
trajs: List[Trajectory],
labels: List[Any],
dataset_name: str = "custom",
) -> None:
self.dataset = dataset
self.trajs = trajs
self.labels = labels
self.label_counts = Counter(labels)
self.feats = None
self.last_featurizer = None
self.dataset_name = dataset_name

@property
def classes(self) -> List[Any]:
Expand Down Expand Up @@ -79,7 +83,30 @@ def take(
shuffle: bool = True,
random_state: Union[int, None] = None,
) -> Data:
"""Takes a subset of the dataset."""
"""
Takes a subset of the dataset.
Parameters
----------
size : Union[float, int]
If float, it should be between 0 and 1 and it will be interpreted
as the proportion of the dataset to be taken. If int, it should be
between 0 and the dataset size and it will be interpreted as the
number of trajectories to be taken.
stratify : bool, optional
If True, the dataset will be stratified by the labels, by default
True.
shuffle : bool, optional
If True, the dataset will be shuffled before taking the subset,
by default True.
random_state : Union[int, None], optional
Random state to be used, by default None.
Returns
-------
Data
A new Data object with the subset of the dataset.
"""
if isinstance(size, int):
assert 0 < size < len(self), "size should be within 0 and len(self)"
size /= len(self)
Expand All @@ -89,10 +116,24 @@ def take(
)
return ans

def cut(self, size: Union[float, int]):
def cut(self, size: Union[float, int]) -> Tuple[Data, Data]:
"""
Similar to split, but without shuffle, stratify, etc. Just slices the
dataset into two parts.
Parameters
----------
size : Union[float, int]
If float, it should be between 0 and 1 and it will be interpreted
as the proportion of the dataset to be taken. If int, it should be
between 0 and the dataset size and it will be interpreted as the
number of trajectories to be taken.
Returns
-------
Tuple[Data, Data]
A tuple with two Data objects, the first one with the first part
of the cut and the second one with the second part.
"""
if isinstance(size, float):
assert 0 < size < 1, "size should be within 0 and 1 if float"
Expand All @@ -104,8 +145,8 @@ def cut(self, size: Union[float, int]):

left, right = self.trajs[:size], self.trajs[size:]
left_labels, right_labels = self.labels[:size], self.labels[size:]
left_d = Data(self.dataset, left, left_labels)
right_d = Data(self.dataset, right, right_labels)
left_d = Data(left, left_labels)
right_d = Data(right, right_labels)
return left_d, right_d

def split(
Expand Down Expand Up @@ -156,8 +197,8 @@ def split(
shuffle=shuffle,
)

train_data = Data(self.dataset, x_train, y_train)
test_data = Data(self.dataset, x_test, y_test)
train_data = Data(x_train, y_train)
test_data = Data(x_test, y_test)
return train_data, test_data

def map(self, func: Callable[[Trajectory, Any], Tuple[Trajectory, Any]]) -> Data:
Expand All @@ -182,7 +223,7 @@ def map(self, func: Callable[[Trajectory, Any], Tuple[Trajectory, Any]]) -> Data
traj, label = func(traj, label)
trajs.append(traj)
labels.append(label)
return Data(self.dataset, trajs, labels)
return Data(trajs, labels)

def filter(self, func: Callable[[Trajectory, Any], bool]) -> Data:
"""
Expand All @@ -204,7 +245,7 @@ def filter(self, func: Callable[[Trajectory, Any], bool]) -> Data:
trajs.append(traj)
labels.append(label)
logging.info("Filtered %d of %d trajectories", len(trajs), len(self))
return Data(self.dataset, trajs, labels)
return Data(trajs, labels)


class Dataset(Data):
Expand Down Expand Up @@ -236,7 +277,7 @@ def __init__(
self.version = version
self.trajs = trajs
self.labels = labels
super().__init__(self, trajs, labels)
super().__init__(trajs, labels)

def __len__(self):
return len(self.trajs)
Expand Down
2 changes: 1 addition & 1 deletion pactus/models/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def from_data(
model_summary: dict,
) -> Evaluation:
return Evaluation(
dataset_name=data.dataset.name,
dataset_name=data.dataset_name,
trajs_ids=[traj.traj_id for traj in data.trajs if traj.traj_id is not None],
y_true=data.labels,
y_pred=predictions,
Expand Down
8 changes: 7 additions & 1 deletion pactus/models/lstm_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from tensorflow.keras import layers
from yupi import Trajectory

from pactus import Dataset
from pactus import config as cfg
from pactus.dataset import Data
from pactus.models.evaluation import Evaluation
Expand Down Expand Up @@ -35,6 +36,7 @@ def __init__(
super().__init__(NAME)
self.masking_value = cfg.MASK_VALUE if masking_value is None else masking_value
self.encoder: Union[LabelEncoder, None] = None
self.dataset: Union[Dataset, None] = None
self.model: keras.Secuential
self.max_len = 0
metrics = ["accuracy"] if metrics is None else metrics
Expand Down Expand Up @@ -86,18 +88,21 @@ def _get_model(self, input_shape, n_classes):
return model

def _prepare_data(self, data: Data) -> Tuple[np.ndarray, np.ndarray]:
assert self.dataset is not None, "Dataset is not set"

self.encoder = LabelEncoder()
self.encoder.fit(data.labels)
encoded_labels = self.encoder.transform(data.labels)
y_data = np.array(encoded_labels)

self.max_len = max(map(len, data.dataset.trajs))
self.max_len = max(map(len, self.dataset.trajs))
x_data = self._get_x_data(self.max_len, data.trajs)
return x_data, y_data

def train(
self,
data: Data,
dataset: Dataset,
cross_validation=0,
epochs=10,
batch_size=None,
Expand All @@ -110,6 +115,7 @@ def train(
self.set_summary(epochs=epochs, validation_split=validation_split)
callbacks = DEFAULT_CALLBACKS.copy() if callbacks is None else callbacks
model_path = None
self.dataset = dataset
if checkpoint is not None:
callbacks.append(checkpoint)
if Path(checkpoint.filepath).exists():
Expand Down
8 changes: 7 additions & 1 deletion pactus/models/transformer_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from tensorflow import keras

import pactus.config as cfg
from pactus import Dataset
from pactus.dataset import Data
from pactus.models import Model
from pactus.models.evaluation import Evaluation
Expand Down Expand Up @@ -57,6 +58,7 @@ def __init__(
self.mask_value = mask_value
self.encoder: Union[LabelEncoder, None] = None
self.labels: Union[List[Any], None] = None
self.dataset: Union[Dataset, None] = None
self.set_summary(
head_size=self.head_size,
num_heads=self.num_heads,
Expand All @@ -75,6 +77,7 @@ def __init__(
def train(
self,
data: Data,
dataset: Dataset,
cross_validation: int = 0,
epochs: int = 10,
validation_split: float = 0.2,
Expand All @@ -90,6 +93,7 @@ def train(
)
self.encoder = None
self.labels = data.labels
self.dataset = dataset
x_train, y_train = self._get_input_data(data)
n_classes = len(data.classes)
input_shape = x_train.shape[1:]
Expand Down Expand Up @@ -208,8 +212,10 @@ def _encode_labels(self, data: Data) -> np.ndarray:

def _extract_raw_data(self, data: Data) -> np.ndarray:
"""Extracts the raw data from the yupi trajectories"""
assert self.dataset is not None, "Dataset must be set"

trajs = data.trajs
max_len = np.max([len(traj) for traj in data.dataset.trajs])
max_len = np.max([len(traj) for traj in self.dataset.trajs])
if self.max_traj_len > 0:
max_len = self.max_traj_len
raw_data = [np.hstack((traj.r, np.reshape(traj.t, (-1, 1)))) for traj in trajs]
Expand Down
1 change: 1 addition & 0 deletions pactus/models/xgboost_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def train(self, data: Data, cross_validation: int = 0):
def predict(self, data: Data) -> List[Any]:
x_data = data.featurize(self.featurizer)
predicted = self.grid.predict(x_data)
assert self.encoder is not None
return self.encoder.inverse_transform(predicted)

def predict_single(self, traj: Trajectory) -> Any:
Expand Down
32 changes: 28 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ classifiers = [
"Programming Language :: Python",
"Programming Language :: Python :: 3",
]
keywords = ["trajectory", "classification"]
keywords = ["trajectory",
"classification"]
dependencies = [
"numpy >= 1.20.0",
"yupi >= 0.11.2",
Expand All @@ -25,8 +26,21 @@ dependencies = [
]
requires-python = ">=3.8"

[options]
packages = [
{ name = "pactus", include = true },
]

[project.optional-dependencies]
dev = ["black", "pylint", "bumpver", "isort", "pytest"]
dev = [
"mypy",
"black",
"pylint",
"bumpver",
"isort",
"pytest",
"tensor-annotations-tensorflow-stubs"
]

[project.urls]
Homepage = "https://github.com/yupidevs/pactus"
Expand All @@ -36,7 +50,7 @@ profile = "black"
known_first_party = ["pactus"]

[tool.black]
target-version = ["py37"]
target-version = ["py38"]

[tool.pylint."MESSAGES CONTROL"]
max-line-length = 88
Expand All @@ -58,6 +72,16 @@ push = false
]

[build-system]
requires = ["setuptools>=61.0.0", "wheel"]
requires = ["setuptools>=61.0.0",
"wheel"]
build-backend = "setuptools.build_meta"

[tool.mypy]
python_version = "3.8"

[[tool.mypy.overrides]]
module = [
"yupi.*",
"sklearn.*",
]
ignore_missing_imports = true
5 changes: 5 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[metadata]
name = pactus

[options]
packages = find:
Loading

0 comments on commit 22c5e72

Please sign in to comment.