From a0f9452af39adc3819fcf309e8731466bd4935b1 Mon Sep 17 00:00:00 2001 From: Deklan Webster Date: Tue, 19 Jan 2021 13:14:19 -0500 Subject: [PATCH 1/5] Add CEASE --- recbole/model/general_recommender/__init__.py | 1 + recbole/model/general_recommender/cease.py | 124 ++++++++++++++++++ recbole/properties/model/CEASE.yaml | 3 + run_test_example.py | 4 + tests/model/test_model_auto.py | 6 + 5 files changed, 138 insertions(+) create mode 100644 recbole/model/general_recommender/cease.py create mode 100644 recbole/properties/model/CEASE.yaml diff --git a/recbole/model/general_recommender/__init__.py b/recbole/model/general_recommender/__init__.py index 7f6de17cb..c3823476c 100644 --- a/recbole/model/general_recommender/__init__.py +++ b/recbole/model/general_recommender/__init__.py @@ -1,4 +1,5 @@ from recbole.model.general_recommender.bpr import BPR +from recbole.model.general_recommender.cease import CEASE from recbole.model.general_recommender.convncf import ConvNCF from recbole.model.general_recommender.dgcf import DGCF from recbole.model.general_recommender.dmf import DMF diff --git a/recbole/model/general_recommender/cease.py b/recbole/model/general_recommender/cease.py new file mode 100644 index 000000000..7e28942de --- /dev/null +++ b/recbole/model/general_recommender/cease.py @@ -0,0 +1,124 @@ + +r""" +C-EASE +################################################ +Reference: + Olivier Jeunen, et al. "Closed-Form Models for Collaborative Filtering with Side-Information". + +Reference code: + https://github.com/olivierjeunen/ease-side-info-recsys-2020/ +""" + + +from recbole.utils.enum_type import ModelType, FeatureType +import numpy as np +import scipy.sparse as sp +import torch + +from recbole.utils import InputType +from recbole.model.abstract_recommender import GeneralRecommender + +from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder + + +def encode_categorical_item_features(dataset, included_features): + item_features = dataset.get_item_feature() + + mlb = MultiLabelBinarizer(sparse_output=True) + ohe = OneHotEncoder(sparse=True) + + encoded_feats = [] + + for feat in included_features: + t = dataset.field2type[feat] + feat_frame = item_features[feat].numpy() + + if t == FeatureType.TOKEN: + encoded = ohe.fit_transform(feat_frame.reshape(-1, 1)) + encoded_feats.append(encoded) + elif t == FeatureType.TOKEN_SEQ: + encoded = mlb.fit_transform(feat_frame) + + # drop first column which corresponds to the padding 0; real categories start at 1 + # convert to csc first? + encoded = encoded[:, 1:] + encoded_feats.append(encoded) + else: + raise Warning( + f'CEASE/A-EASE only supports token or token_seq types. 
[{feat}] is of type [{t}].') + + if not encoded_feats: + raise ValueError( + f'No valid token or token_seq features to include.') + + return sp.hstack(encoded_feats).T.astype(np.float32) + + +class CEASE(GeneralRecommender): + input_type = InputType.POINTWISE + type = ModelType.TRADITIONAL + + def __init__(self, config, dataset): + super().__init__(config, dataset) + + # need at least one param + self.dummy_param = torch.nn.Parameter(torch.zeros(1)) + + B = dataset.inter_matrix( + form='csr').astype(np.float32) + + item_feat_weight = config['item_feat_weight'] + reg_weight = config['reg_weight'] + included_features = config['included_features'] + + T = encode_categorical_item_features(dataset, included_features) + T *= item_feat_weight + + # just directly calculate the entire score matrix in init + # (can't be done incrementally) + + X = sp.vstack([B, T]).tocsr() + + # gram matrix + G = X.T @ X + + # add reg to diagonal + G += reg_weight * sp.identity(G.shape[0]) + + # convert to dense because inverse will be dense + G = G.todense() + + # invert. this takes most of the time + P = np.linalg.inv(G) + B = P / (-np.diag(P)) + # zero out diag + np.fill_diagonal(B, 0.) + + # instead of computing and storing the entire score matrix, just store B and compute the scores on demand + # more memory efficient for a larger number of users + # but if there's a large number of items not much one can do: + # still have to compute B all at once + # S = X @ B + # self.score_matrix = torch.from_numpy(S).to(self.device) + + # torch doesn't support sparse tensor slicing, so will do everything with np/scipy + self.item_similarity = B + self.interaction_tag_matrix = X + + def forward(self): + pass + + def calculate_loss(self, interaction): + return torch.nn.Parameter(torch.zeros(1)) + + def predict(self, interaction): + user = interaction[self.USER_ID].cpu().numpy() + item = interaction[self.ITEM_ID].cpu().numpy() + + return torch.from_numpy((self.interaction_tag_matrix[user, :].multiply(self.item_similarity[:, item].T)).sum(axis=1).getA1()) + + def full_sort_predict(self, interaction): + user = interaction[self.USER_ID].cpu().numpy() + + r = self.interaction_tag_matrix[user, :] @ self.item_similarity + return torch.from_numpy(r.flatten()) diff --git a/recbole/properties/model/CEASE.yaml b/recbole/properties/model/CEASE.yaml new file mode 100644 index 000000000..86c1d4312 --- /dev/null +++ b/recbole/properties/model/CEASE.yaml @@ -0,0 +1,3 @@ +item_feat_weight: 10.0 +reg_weight: 350.0 +included_features: ['class'] \ No newline at end of file diff --git a/run_test_example.py b/run_test_example.py index 82aacfd92..e4cc4da8c 100644 --- a/run_test_example.py +++ b/run_test_example.py @@ -146,6 +146,10 @@ 'model': 'MacridVAE', 'dataset': 'ml-100k', }, + 'Test CEASE': { + 'model': 'CEASE', + 'dataset': 'ml-100k', + }, # Context-aware Recommendation 'Test FM': { diff --git a/tests/model/test_model_auto.py b/tests/model/test_model_auto.py index 7c2b3b308..0b449eb05 100644 --- a/tests/model/test_model_auto.py +++ b/tests/model/test_model_auto.py @@ -175,6 +175,12 @@ def test_CDAE(self): } quick_test(config_dict) + def test_CEASE(self): + config_dict = { + 'model': 'CEASE', + } + quick_test(config_dict) + class TestContextRecommender(unittest.TestCase): # todo: more complex context information should be test, such as criteo dataset From 1ed6cc0c77111f53cb71c00adf956d5d6f729d0c Mon Sep 17 00:00:00 2001 From: Deklan Webster Date: Tue, 19 Jan 2021 15:41:04 -0500 Subject: [PATCH 2/5] Clean up CEASE --- 
recbole/model/general_recommender/cease.py | 55 +++++++++++----------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/recbole/model/general_recommender/cease.py b/recbole/model/general_recommender/cease.py index 7e28942de..107710ea3 100644 --- a/recbole/model/general_recommender/cease.py +++ b/recbole/model/general_recommender/cease.py @@ -45,7 +45,7 @@ def encode_categorical_item_features(dataset, included_features): encoded_feats.append(encoded) else: raise Warning( - f'CEASE/A-EASE only supports token or token_seq types. [{feat}] is of type [{t}].') + f'CEASE only supports token or token_seq types. [{feat}] is of type [{t}].') if not encoded_feats: raise ValueError( @@ -54,6 +54,25 @@ def encode_categorical_item_features(dataset, included_features): return sp.hstack(encoded_feats).T.astype(np.float32) +def ease_like(M, reg_weight): + # gram matrix + G = M.T @ M + + # add reg to diagonal + G += reg_weight * sp.identity(G.shape[0]) + + # convert to dense because inverse will be dense + G = G.todense() + + # invert. this takes most of the time + P = np.linalg.inv(G) + B = P / (-np.diag(P)) + # zero out diag + np.fill_diagonal(B, 0.) + + return B + + class CEASE(GeneralRecommender): input_type = InputType.POINTWISE type = ModelType.TRADITIONAL @@ -64,46 +83,28 @@ def __init__(self, config, dataset): # need at least one param self.dummy_param = torch.nn.Parameter(torch.zeros(1)) - B = dataset.inter_matrix( + inter_matrix = dataset.inter_matrix( form='csr').astype(np.float32) item_feat_weight = config['item_feat_weight'] reg_weight = config['reg_weight'] included_features = config['included_features'] - T = encode_categorical_item_features(dataset, included_features) - T *= item_feat_weight + tag_item_matrix = item_feat_weight * encode_categorical_item_features(dataset, included_features) # just directly calculate the entire score matrix in init # (can't be done incrementally) - X = sp.vstack([B, T]).tocsr() - - # gram matrix - G = X.T @ X - - # add reg to diagonal - G += reg_weight * sp.identity(G.shape[0]) - - # convert to dense because inverse will be dense - G = G.todense() + X = sp.vstack([inter_matrix, tag_item_matrix]).tocsr() - # invert. this takes most of the time - P = np.linalg.inv(G) - B = P / (-np.diag(P)) - # zero out diag - np.fill_diagonal(B, 0.) 
+ item_similarity = ease_like(X, reg_weight) # instead of computing and storing the entire score matrix, just store B and compute the scores on demand # more memory efficient for a larger number of users - # but if there's a large number of items not much one can do: - # still have to compute B all at once - # S = X @ B - # self.score_matrix = torch.from_numpy(S).to(self.device) # torch doesn't support sparse tensor slicing, so will do everything with np/scipy - self.item_similarity = B - self.interaction_tag_matrix = X + self.item_similarity = item_similarity + self.interaction_matrix = inter_matrix def forward(self): pass @@ -115,10 +116,10 @@ def predict(self, interaction): user = interaction[self.USER_ID].cpu().numpy() item = interaction[self.ITEM_ID].cpu().numpy() - return torch.from_numpy((self.interaction_tag_matrix[user, :].multiply(self.item_similarity[:, item].T)).sum(axis=1).getA1()) + return torch.from_numpy((self.interaction_matrix[user, :].multiply(self.item_similarity[:, item].T)).sum(axis=1).getA1()) def full_sort_predict(self, interaction): user = interaction[self.USER_ID].cpu().numpy() - r = self.interaction_tag_matrix[user, :] @ self.item_similarity + r = self.interaction_matrix[user, :] @ self.item_similarity return torch.from_numpy(r.flatten()) From 7fbcd3620a77d0bee560b1b5764b8c381fd32fc2 Mon Sep 17 00:00:00 2001 From: Deklan Webster Date: Tue, 19 Jan 2021 15:41:29 -0500 Subject: [PATCH 3/5] Format CEASE --- recbole/model/general_recommender/cease.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/recbole/model/general_recommender/cease.py b/recbole/model/general_recommender/cease.py index 107710ea3..b12c2bb19 100644 --- a/recbole/model/general_recommender/cease.py +++ b/recbole/model/general_recommender/cease.py @@ -90,7 +90,8 @@ def __init__(self, config, dataset): reg_weight = config['reg_weight'] included_features = config['included_features'] - tag_item_matrix = item_feat_weight * encode_categorical_item_features(dataset, included_features) + tag_item_matrix = item_feat_weight * \ + encode_categorical_item_features(dataset, included_features) # just directly calculate the entire score matrix in init # (can't be done incrementally) From 77c5ae2565b1ba000cd3bfa8b3bede42ccafbf4f Mon Sep 17 00:00:00 2001 From: Deklan Webster Date: Tue, 19 Jan 2021 15:46:53 -0500 Subject: [PATCH 4/5] Add ADDEASE --- recbole/model/general_recommender/__init__.py | 1 + recbole/model/general_recommender/addease.py | 124 ++++++++++++++++++ recbole/properties/model/ADDEASE.yaml | 4 + run_test_example.py | 4 + tests/model/test_model_auto.py | 6 + 5 files changed, 139 insertions(+) create mode 100644 recbole/model/general_recommender/addease.py create mode 100644 recbole/properties/model/ADDEASE.yaml diff --git a/recbole/model/general_recommender/__init__.py b/recbole/model/general_recommender/__init__.py index c3823476c..c07b3cb2b 100644 --- a/recbole/model/general_recommender/__init__.py +++ b/recbole/model/general_recommender/__init__.py @@ -1,3 +1,4 @@ +from recbole.model.general_recommender.addease import ADDEASE from recbole.model.general_recommender.bpr import BPR from recbole.model.general_recommender.cease import CEASE from recbole.model.general_recommender.convncf import ConvNCF diff --git a/recbole/model/general_recommender/addease.py b/recbole/model/general_recommender/addease.py new file mode 100644 index 000000000..632334eff --- /dev/null +++ b/recbole/model/general_recommender/addease.py @@ -0,0 +1,124 @@ + +r""" +ADD-EASE 
+################################################ +Reference: + Olivier Jeunen, et al. "Closed-Form Models for Collaborative Filtering with Side-Information". + +Reference code: + https://github.com/olivierjeunen/ease-side-info-recsys-2020/ +""" + + +from recbole.utils.enum_type import ModelType, FeatureType +import numpy as np +import scipy.sparse as sp +import torch + +from recbole.utils import InputType +from recbole.model.abstract_recommender import GeneralRecommender + +from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder + + +def encode_categorical_item_features(dataset, included_features): + item_features = dataset.get_item_feature() + + mlb = MultiLabelBinarizer(sparse_output=True) + ohe = OneHotEncoder(sparse=True) + + encoded_feats = [] + + for feat in included_features: + t = dataset.field2type[feat] + feat_frame = item_features[feat].numpy() + + if t == FeatureType.TOKEN: + encoded = ohe.fit_transform(feat_frame.reshape(-1, 1)) + encoded_feats.append(encoded) + elif t == FeatureType.TOKEN_SEQ: + encoded = mlb.fit_transform(feat_frame) + + # drop first column which corresponds to the padding 0; real categories start at 1 + # convert to csc first? + encoded = encoded[:, 1:] + encoded_feats.append(encoded) + else: + raise Warning( + f'ADD-EASE only supports token or token_seq types. [{feat}] is of type [{t}].') + + if not encoded_feats: + raise ValueError( + f'No valid token or token_seq features to include.') + + return sp.hstack(encoded_feats).T.astype(np.float32) + + +def ease_like(M, reg_weight): + # gram matrix + G = M.T @ M + + # add reg to diagonal + G += reg_weight * sp.identity(G.shape[0]) + + # convert to dense because inverse will be dense + G = G.todense() + + # invert. this takes most of the time + P = np.linalg.inv(G) + B = P / (-np.diag(P)) + # zero out diag + np.fill_diagonal(B, 0.) 
+ + return B + + +class ADDEASE(GeneralRecommender): + input_type = InputType.POINTWISE + type = ModelType.TRADITIONAL + + def __init__(self, config, dataset): + super().__init__(config, dataset) + + # need at least one param + self.dummy_param = torch.nn.Parameter(torch.zeros(1)) + + inter_matrix = dataset.inter_matrix( + form='csr').astype(np.float32) + + item_feat_proportion = config['item_feat_proportion'] + inter_reg_weight = config['inter_reg_weight'] + item_reg_weight = config['item_reg_weight'] + included_features = config['included_features'] + + tag_item_matrix = encode_categorical_item_features( + dataset, included_features) + + inter_S = ease_like(inter_matrix, inter_reg_weight) + item_S = ease_like(tag_item_matrix, item_reg_weight) + + # instead of computing and storing the entire score matrix, just store B and compute the scores on demand + # more memory efficient for a larger number of users + + # torch doesn't support sparse tensor slicing, so will do everything with np/scipy + self.item_similarity = (1-item_feat_proportion) * \ + inter_S + item_feat_proportion * item_S + self.interaction_matrix = inter_matrix + + def forward(self): + pass + + def calculate_loss(self, interaction): + return torch.nn.Parameter(torch.zeros(1)) + + def predict(self, interaction): + user = interaction[self.USER_ID].cpu().numpy() + item = interaction[self.ITEM_ID].cpu().numpy() + + return torch.from_numpy((self.interaction_matrix[user, :].multiply(self.item_similarity[:, item].T)).sum(axis=1).getA1()) + + def full_sort_predict(self, interaction): + user = interaction[self.USER_ID].cpu().numpy() + + r = self.interaction_matrix[user, :] @ self.item_similarity + return torch.from_numpy(r.flatten()) diff --git a/recbole/properties/model/ADDEASE.yaml b/recbole/properties/model/ADDEASE.yaml new file mode 100644 index 000000000..f3c45ff9d --- /dev/null +++ b/recbole/properties/model/ADDEASE.yaml @@ -0,0 +1,4 @@ +item_feat_proportion: 0.001 +inter_reg_weight: 350.0 +item_reg_weight: 150.0 +included_features: ['class'] \ No newline at end of file diff --git a/run_test_example.py b/run_test_example.py index e4cc4da8c..7787776f3 100644 --- a/run_test_example.py +++ b/run_test_example.py @@ -150,6 +150,10 @@ 'model': 'CEASE', 'dataset': 'ml-100k', }, + 'Test ADDEASE': { + 'model': 'ADDEASE', + 'dataset': 'ml-100k', + }, # Context-aware Recommendation 'Test FM': { diff --git a/tests/model/test_model_auto.py b/tests/model/test_model_auto.py index 0b449eb05..3941d44b7 100644 --- a/tests/model/test_model_auto.py +++ b/tests/model/test_model_auto.py @@ -181,6 +181,12 @@ def test_CEASE(self): } quick_test(config_dict) + def test_ADDEASE(self): + config_dict = { + 'model': 'ADDEASE', + } + quick_test(config_dict) + class TestContextRecommender(unittest.TestCase): # todo: more complex context information should be test, such as criteo dataset From a64b263d2fac20ecfa27a574f86db5833b310702 Mon Sep 17 00:00:00 2001 From: Deklan Webster Date: Tue, 19 Jan 2021 18:08:06 -0500 Subject: [PATCH 5/5] Rename to `selected_features` --- recbole/model/general_recommender/addease.py | 8 ++++---- recbole/model/general_recommender/cease.py | 8 ++++---- recbole/properties/model/ADDEASE.yaml | 2 +- recbole/properties/model/CEASE.yaml | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/recbole/model/general_recommender/addease.py b/recbole/model/general_recommender/addease.py index 632334eff..2ee41c09c 100644 --- a/recbole/model/general_recommender/addease.py +++ b/recbole/model/general_recommender/addease.py @@ 
-21,7 +21,7 @@ from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder -def encode_categorical_item_features(dataset, included_features): +def encode_categorical_item_features(dataset, selected_features): item_features = dataset.get_item_feature() mlb = MultiLabelBinarizer(sparse_output=True) @@ -29,7 +29,7 @@ def encode_categorical_item_features(dataset, included_features): encoded_feats = [] - for feat in included_features: + for feat in selected_features: t = dataset.field2type[feat] feat_frame = item_features[feat].numpy() @@ -89,10 +89,10 @@ def __init__(self, config, dataset): item_feat_proportion = config['item_feat_proportion'] inter_reg_weight = config['inter_reg_weight'] item_reg_weight = config['item_reg_weight'] - included_features = config['included_features'] + selected_features = config['selected_features'] tag_item_matrix = encode_categorical_item_features( - dataset, included_features) + dataset, selected_features) inter_S = ease_like(inter_matrix, inter_reg_weight) item_S = ease_like(tag_item_matrix, item_reg_weight) diff --git a/recbole/model/general_recommender/cease.py b/recbole/model/general_recommender/cease.py index b12c2bb19..0e0763373 100644 --- a/recbole/model/general_recommender/cease.py +++ b/recbole/model/general_recommender/cease.py @@ -21,7 +21,7 @@ from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder -def encode_categorical_item_features(dataset, included_features): +def encode_categorical_item_features(dataset, selected_features): item_features = dataset.get_item_feature() mlb = MultiLabelBinarizer(sparse_output=True) @@ -29,7 +29,7 @@ def encode_categorical_item_features(dataset, included_features): encoded_feats = [] - for feat in included_features: + for feat in selected_features: t = dataset.field2type[feat] feat_frame = item_features[feat].numpy() @@ -88,10 +88,10 @@ def __init__(self, config, dataset): item_feat_weight = config['item_feat_weight'] reg_weight = config['reg_weight'] - included_features = config['included_features'] + selected_features = config['selected_features'] tag_item_matrix = item_feat_weight * \ - encode_categorical_item_features(dataset, included_features) + encode_categorical_item_features(dataset, selected_features) # just directly calculate the entire score matrix in init # (can't be done incrementally) diff --git a/recbole/properties/model/ADDEASE.yaml b/recbole/properties/model/ADDEASE.yaml index f3c45ff9d..379308502 100644 --- a/recbole/properties/model/ADDEASE.yaml +++ b/recbole/properties/model/ADDEASE.yaml @@ -1,4 +1,4 @@ item_feat_proportion: 0.001 inter_reg_weight: 350.0 item_reg_weight: 150.0 -included_features: ['class'] \ No newline at end of file +selected_features: ['class'] \ No newline at end of file diff --git a/recbole/properties/model/CEASE.yaml b/recbole/properties/model/CEASE.yaml index 86c1d4312..265f7f6bf 100644 --- a/recbole/properties/model/CEASE.yaml +++ b/recbole/properties/model/CEASE.yaml @@ -1,3 +1,3 @@ item_feat_weight: 10.0 reg_weight: 350.0 -included_features: ['class'] \ No newline at end of file +selected_features: ['class'] \ No newline at end of file
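
As a quick end-to-end check of the new models, something like the following should exercise CEASE through RecBole's quick-start entry point. This is a minimal sketch, not part of the patches above: it assumes a checkout with these patches applied and RecBole's bundled ml-100k dataset, and the hyperparameter values simply restate the defaults shipped in CEASE.yaml (with the renamed `selected_features` key from patch 5).

    # minimal usage sketch for the CEASE model added in this series
    from recbole.quick_start import run_recbole

    config_dict = {
        'item_feat_weight': 10.0,        # weight applied to the encoded item-feature rows stacked onto X
        'reg_weight': 350.0,             # L2 regularization added to the diagonal of the Gram matrix
        'selected_features': ['class'],  # token / token_seq item fields to one-hot / multi-hot encode
    }

    run_recbole(model='CEASE', dataset='ml-100k', config_dict=config_dict)

ADDEASE can be run the same way by swapping the model name and supplying its own configuration keys (`item_feat_proportion`, `inter_reg_weight`, `item_reg_weight`, `selected_features`); the difference is that CEASE stacks the weighted tag-item matrix onto the interaction matrix before one EASE fit, while ADD-EASE fits the two matrices separately and blends the resulting item-similarity matrices.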