-
Notifications
You must be signed in to change notification settings - Fork 1
/
create_index.py
37 lines (31 loc) · 1.4 KB
/
create_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from config import settings
from connectelastic import connect_elastic
from index_map import insert_df_row
from preprocessing import StemTokenizer
def process_df(path: str, preprocess: StemTokenizer, model: SentenceTransformer, es):
    """
    Index every movie row of a CSV file into Elasticsearch.

    Reads the CSV at *path* and, for each row, builds a document holding the
    movie name, genre, the sentence embedding of the preprocessed feature
    text, and the row index as the movie id, then inserts it with
    ``insert_df_row``.

    :param path: Path to the CSV file containing the movie data
    :param preprocess: Callable that tokenizes/stems the raw feature text
    :param model: Sentence-transformer (SBERT) model used to encode the text
    :param es: Elasticsearch client the documents are inserted into
    """
    df = pd.read_csv(path, sep=",")
    # `iterrows` yields (index, row); the frame index doubles as the movie id.
    # Loop variable renamed from `id` so the builtin is not shadowed.
    for row_id, row in df.iterrows():
        doc = {
            "movie_name": row["title"],
            "genre": row["genres"],
            # .tolist() turns the numpy vector into plain floats for JSON.
            "embedding": model.encode(preprocess(row["features"])).tolist(),
            "movie_id": row_id,
        }
        insert_df_row(doc, es)
        # Fixed typo ("Sucessfully") and missing space in the progress message.
        print(f"MovieId {row_id} indexed successfully.")
if __name__ == "__main__":
preprocess = StemTokenizer()
model = SentenceTransformer(settings.MODEL_NAME)
es = connect_elastic(settings.ENDPOINT, settings.ELASTIC_PASSWORD)
process_df(settings.DATA_PATH, preprocess, model, es)