-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
43 lines (39 loc) · 1.82 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import pandas as pd
def bm25okapi_search(tokenized_query, bm25, corpus, n_results = 1):
"""
Function that takes a tokenized query and prints the first 100 words of the
n_results most relevant results found in the corpus, based on the BM25
method.
Parameters
----------
@param tokenized_query: list, array-like
A valid list containing the tokenized query.
@param bm25: BM25 object,
A valid object of type BM25 (BM25Okapi or BM25Plus) from the library
`rank-bm25`, initialized with a valid corpus.
@param corpus: list, array-like
A valid list containing the corpus from which the BM25 object has been
initialized. As returned from function read_corpus().
@param n_results: int, default = 1
The number of top results to print.
"""
# Get top results for the query
top_results = bm25.get_top_n(tokenized_query, corpus, n = n_results)
# Take words from each result
top_results_id = [int(' '.join(top_result).split("-->")[-1])
for top_result in top_results]
# Return results
return top_results_id
def search(query_df, ir_model, tokenize_lyric, n_results= 5):
#return result dataframe
search_results= pd.DataFrame()
for i, query in query_df.iterrows():
#return relevance lyric_id rank array
lyrics_id=bm25okapi_search(query['query_token'], ir_model, tokenize_lyric, n_results)
for rank,lyric_id in enumerate(lyrics_id):
search_results= pd.concat([search_results,
pd.DataFrame([{'rank': rank+1,
'id': lyric_id,
'qid': query['qid']}])
]).reset_index(drop=True)
return search_results