Very low topk %, real-world accuracy around 40% at k=5000
Sorry if this is the wrong place to ask; I couldn't find a lead anywhere after weeks of googling.
I've implemented a retrieval model that creates embeddings for cosine-similarity search in a job board application. The retrieval pairs are users (query) and jobs (candidate); the application surfaces likely-positive jobs for a user, then passes those jobs into a ranking model for better recommendations.
In our real-world testing, we're seeing around 35-40% accuracy against observed user behaviour when retrieving the top 5000 candidates using cosine similarity (as the model improves we plan to lower this, with a goal of 500). The top-k metrics during training are also quite poor (<4%).
In our dataset, we have about:
We're at a loss on how to improve the model using the recommenders library and the retrieval algorithm.
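For reference, the real-world number above is computed roughly like this. This is a simplified sketch, not our exact code: `index` stands in for a tfrs.layers.factorized_top_k.BruteForce built over all job embeddings, and `eval_pairs` is a hypothetical iterable of (user-features, positive-job-ids) pairs.

hits, total = 0, 0
for user_features, positive_job_ids in eval_pairs:
    # Retrieve the top 5000 jobs for this user (user_features is a batch-of-1
    # dict of the user features listed below).
    _, retrieved_ids = index(user_features, k=5000)
    retrieved = set(retrieved_ids.numpy()[0].tolist())
    hits += sum(1 for job_id in positive_job_ids if job_id in retrieved)
    total += len(positive_job_ids)
print(f"recall@5000: {hits / total:.2%}")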
Here's the model code:
from typing import Dict, Text, List

import tensorflow as tf
import tensorflow_recommenders as tfrs

from ..utils import get_buckets, _make_text_vect_layer, CONFIG

# left side is model layer name, right side is key in dataset
# {layer_name: data_key}
user_model_feature_to_data_map = {
"user_skills_layer": "user_skills",
"user_specialization_layer": "user_current_specialization",
"user_job_title_layer": "user_job_title",
"user_min_salary_layer": "user_min_salary",
"user_seniority_layer": "user_seniority",
"user_remote_pref_layer": "user_remote",
"user_job_perks_layer": "user_job_perks",
"user_countries_layer": "user_countries",
"user_country_states_layer": "user_country_states",
}
job_model_feature_to_data_map = {
"job_job_title_layer": "job_job_title",
"job_seniority_layer": "job_required_seniority",
"job_remote_pref_layer": "job_remote",
"job_required_skills_layer": "job_required_skills",
"job_normalized_title_layer": "job_normalized_title",
"job_culture_values": "job_culture_values",
"job_perks_layer": "job_perks",
"job_country_layer": "job_country",
"job_country_state_layer": "job_country_state",
"job_must_reside_in_layer": "job_must_reside_in",
"job_min_salary_layer": "job_min_salary",
"job_max_salary_layer": "job_max_salary",
}
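# NOTE: UserModel.call and JobModel.call look these layers up with getattr, so
# the attribute names on each model must match the layer-name keys above exactly.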
class UserModel(tf.keras.Model):
    """Build a sequential model for each feature, pass the outputs to
    dense/cross layers, and concatenate them. The produced embedding
    represents the features of a user known at query time.
    """

    def __init__(self, vocab_dict, embedding_dim, seed, layer_sizes):
super().__init__()
        self.user_skills_layer = _make_text_vect_layer(
vocab_dict,
"user_skills",
embedding_dim,
layer_name="user_skills",
no_embedding=True,
)
        self.user_specialization_layer = _make_text_vect_layer(
vocab_dict, "user_current_specializations", no_embedding=True
)
        self.user_job_title_layer = _make_text_vect_layer(
vocab_dict, "user_job_titles", no_embedding=True
)
        self.user_seniority_layer = _make_text_vect_layer(
vocab_dict, "user_seniorities", no_embedding=True
)
        self.user_remote_pref_layer = _make_text_vect_layer(
vocab_dict, "user_remotes", no_embedding=True
)
        user_min_salary_max_val = CONFIG["salary_bucket_max"]
        user_min_salary_bucket_step = CONFIG["salary_bucket_step"]
        user_min_salary_bucket_num = (
            user_min_salary_max_val // user_min_salary_bucket_step
        )
        self.user_min_salary_layer = tf.keras.Sequential(
[
tf.keras.layers.Discretization(
get_buckets(
min_val=1,
max_val=user_min_salary_max_val,
buckets_num=user_min_salary_bucket_num,
),
output_mode="one_hot",
),
                # XXX(Phong): only use the embedding layer if output_mode is "int"
                # tf.keras.layers.Embedding(
                #     input_dim=user_min_salary_bucket_num + 1,
                #     output_dim=embedding_dim,
                #     name="user_min_salary_emb_layer",
                #     mask_zero=False,
                # ),
],
name="user_min_salary_layer",
)
        self.user_job_perks_layer = _make_text_vect_layer(
vocab_dict, "user_job_perks", no_embedding=True
)
        self.user_countries_layer = _make_text_vect_layer(
vocab_dict, "user_countries", no_embedding=True
)
        self.user_country_states_layer = _make_text_vect_layer(
vocab_dict, "user_country_states", embedding_dim=embedding_dim
)
        self.dense_layers = tf.keras.Sequential(name="user_dense_layers")
        for layer_size in layer_sizes:
self.dense_layers.add(
tf.keras.layers.Dense(
units=layer_size,
activation="relu",
kernel_initializer=tf.keras.initializers.GlorotUniform(seed=seed),
)
)
        # if use_dropout:
        #     self.dense_layers.add(tf.keras.layers.Dropout(rate=0.2))

        # ADDING L2 NORM AT THE END to fix the vector magnitude (fixed-length
        # vector), good for cosine similarity comparisons
        self.dense_layers.add(
            tf.keras.layers.Lambda(
                lambda x: tf.nn.l2_normalize(x, axis=-1), name="l2_normalization"
            )
        )
        # self.dense_layers.add(
        #     tf.keras.layers.LayerNormalization(name="normalize_dense")
        # )

    def call(self, data):
        all_embs = tf.concat(
            [
                getattr(self, layer_name)(data[data_key])
                for layer_name, data_key in user_model_feature_to_data_map.items()
            ],
            axis=1,
        )
        return self.dense_layers(all_embs)
class JobModel(tf.keras.Model):
    """Build a sequential model for each feature, pass the outputs to
    dense/cross layers, and concatenate them. The produced embedding
    represents the features of a job (the retrieval candidate).
    """

    def __init__(self, vocab_dict, embedding_dim, seed, layer_sizes):
super().__init__()
        # Output mode == "int" is very important, will run VERY slow if not set
        self.job_job_title_layer = _make_text_vect_layer(
vocab_dict,
"job_job_titles",
embedding_dim,
layer_name="job_job_title",
output_mode="int",
)
        self.job_seniority_layer = _make_text_vect_layer(
vocab_dict, "job_seniorities", no_embedding=True
)
        self.job_remote_pref_layer = _make_text_vect_layer(
vocab_dict, "job_remotes", no_embedding=True
)
        self.job_required_skills_layer = _make_text_vect_layer(
vocab_dict, "job_required_skills", no_embedding=True
)
        self.job_normalized_title_layer = _make_text_vect_layer(
vocab_dict, "job_normalized_titles", no_embedding=True
)
        self.job_culture_values = _make_text_vect_layer(
vocab_dict, "job_culture_values", no_embedding=True
)
        self.job_perks_layer = _make_text_vect_layer(
vocab_dict, "job_perks", no_embedding=True
)
        self.job_country_layer = _make_text_vect_layer(
vocab_dict, "job_countries", no_embedding=True
)
        self.job_country_state_layer = _make_text_vect_layer(
vocab_dict, "job_country_states", embedding_dim=embedding_dim
)
        self.job_must_reside_in_layer = _make_text_vect_layer(
vocab_dict, "job_must_reside_in", no_embedding=True
)
        salary_bucket_max = CONFIG["salary_bucket_max"]
        salary_bucket_step = CONFIG["salary_bucket_step"]
        salary_bucket_num = salary_bucket_max // salary_bucket_step
        self.job_min_salary_layer = tf.keras.Sequential(
[
tf.keras.layers.Discretization(
get_buckets(
min_val=1,
max_val=salary_bucket_max,
buckets_num=salary_bucket_num,
),
output_mode="one_hot",
),
],
name="job_min_salary_layer",
)
        self.job_max_salary_layer = tf.keras.Sequential(
[
tf.keras.layers.Discretization(
get_buckets(
min_val=1,
max_val=salary_bucket_max,
buckets_num=salary_bucket_num,
),
output_mode="one_hot",
),
],
name="job_max_salary_layer",
)
        self.dense_layers = tf.keras.Sequential(name="job_dense_layers")
        for layer_size in layer_sizes:
self.dense_layers.add(
tf.keras.layers.Dense(
units=layer_size,
activation="relu",
kernel_initializer=tf.keras.initializers.GlorotUniform(seed=seed),
)
)
        # if use_dropout:
        #     self.dense_layers.add(tf.keras.layers.Dropout(rate=0.2))

        # ADDING L2 NORM AT THE END to fix the vector magnitude (fixed-length
        # vector), good for cosine similarity comparisons
        self.dense_layers.add(
            tf.keras.layers.Lambda(
                lambda x: tf.nn.l2_normalize(x, axis=-1), name="l2_normalization"
            )
        )
        # self.dense_layers.add(
        #     tf.keras.layers.LayerNormalization(name="normalize_dense")
        # )

    def call(self, data):
        all_embs = tf.concat(
            [
                getattr(self, layer_name)(data[data_key])
                for layer_name, data_key in job_model_feature_to_data_map.items()
            ],
            axis=1,
        )
        return self.dense_layers(all_embs)
class RetrievalModel(tfrs.Model):
    def __init__(self, vocab_dict, dataset):
        super().__init__()
        self.embedding_dim = 128
        # XXX(Phong): this is the output layer, if you change this, you need to
        # run a migration to update the db embeddings dims on helix-service
        self.layer_sizes = [256]
        self.seed = 42
        self.query_model: tf.keras.Model = UserModel(
vocab_dict=vocab_dict,
embedding_dim=self.embedding_dim,
seed=self.seed,
layer_sizes=self.layer_sizes,
)
        self.candidate_model: tf.keras.Model = JobModel(
vocab_dict=vocab_dict,
embedding_dim=self.embedding_dim,
seed=self.seed,
layer_sizes=self.layer_sizes,
)
        self.task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                # XXX(Phong): all occurrences of the candidate in the dataset,
                # not affected by train/test split
                candidates=dataset.batch(512).map(
                    lambda data: (
                        data["job_id"],
                        # XXX(Phong): need to pull out the relevant keys
                        # or the model will infer the inputs from the wrong ones
                        self.candidate_model(
{
key: data[key]
                                for key in job_model_feature_to_data_map.values()
}
),
)
)
)
)
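        # FactorizedTopK reports top-k categorical accuracy over the full
        # candidate set (by default at k = 1, 5, 10, 50, and 100).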
""" XXX(Phong): `data` is the training data being passed into the RetrievalModel when you run model.fit() """defcompute_loss(self, data: Dict[Text, tf.Tensor], training=False) ->tf.Tensor:
CANDIDATE_ID_KEY="job_id"# XXX(Phong): need to pull out the relevant training data for each sub-# model, otherwise it will infer the inputs from the keys of the datauser_model_train_data= {
key: data[key] forkeyinuser_model_feature_to_data_map.values()
}
        query_embeddings = self.query_model(user_model_train_data)
        job_model_train_data = {
            key: data[key] for key in job_model_feature_to_data_map.values()
}
        candidate_embeddings = self.candidate_model(job_model_train_data)
        return self.task(
query_embeddings,
candidate_embeddings,
candidate_ids=data[CANDIDATE_ID_KEY],
)
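For completeness, this is roughly how we fit the model and build the serving index. It's a simplified sketch: `vocab_dict` and `dataset` are built by our pipeline, and the optimizer, learning rate, batch size, and epoch count shown here are illustrative stand-ins rather than our exact production setup.

model = RetrievalModel(vocab_dict, dataset)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
model.fit(dataset.batch(4096), epochs=5)

# Brute-force top-k index over all job embeddings, queried with user features.
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model, k=5000)
index.index_from_dataset(
    dataset.batch(512).map(
        lambda data: (
            data["job_id"],
            model.candidate_model(
                {key: data[key] for key in job_model_feature_to_data_map.values()}
            ),
        )
    )
)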
Are we using the library wrong? There are some differences from the MovieLens example, as we're following this repo.