-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathupdate_similarity_index.py
94 lines (80 loc) · 3.21 KB
/
update_similarity_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import json
from pathlib import Path
from common import get_logger
from config import Config
from db_similarity_index import SimilarityIndex, get_highest_id, get_session
from get_matches import get_matches_from_data
from rust_server import get_matches_from_rust_server
# Directory that contains this script; the log file is written next to it.
HERE = Path(__file__).parent
log_file_path = HERE.joinpath("update_similarity_index.log")
logger = get_logger(__file__, log_file_path)

# Probe the Rust match server once, at import time, with a throwaway lookup.
# Any exception from the probe is treated as "server unavailable"; main()
# reads the resulting flag to choose between the Rust server and the
# in-process matcher.
try:
    get_matches_from_rust_server(ord_id=1, file_hash=None)
    logger.info("Rust server is running")
    is_rust_server_running = True
except Exception as probe_error:
    logger.error(f"Rust server is not running {probe_error}")
    is_rust_server_running = False
# Number of best matches kept per entry in the similarity index.
_TOP_N = 20
# Progress is logged and pending writes are committed once per this many entries.
_BATCH_SIZE = 100


def _to_index_format(matches) -> list[list]:
    """Convert match dicts into the ``[ord_id, match_sum]`` pairs stored in the index."""
    return [[int(match["ord_id"]), match["match_sum"]] for match in matches]


def _index_new_entries(session, data: dict[str, str], not_indexed_data: dict[str, str]) -> None:
    """Add a SimilarityIndex row for every entry of ``not_indexed_data`` lacking one."""
    total = len(not_indexed_data)
    for progress, (ord_id, average_hash) in enumerate(not_indexed_data.items()):
        if progress % _BATCH_SIZE == 0:
            logger.info(f"New update progress {progress} / {total}")
            # Commit the previous batch here, before any `continue`, so a
            # skipped entry can never cause a batch commit to be missed.
            session.commit()
        if session.query(SimilarityIndex).get(int(ord_id)) is not None:
            # A row for this id already exists; leave it untouched.
            continue
        if is_rust_server_running:
            matches = get_matches_from_rust_server(ord_id=None, file_hash=average_hash)
        else:
            # Fallback: compute the matches in-process against the full data set.
            matches = get_matches_from_data(
                data, ord_id=None, file_hash=average_hash, top_n=_TOP_N
            )
        session.add(
            SimilarityIndex(
                id=int(ord_id),
                list_of_lists=json.dumps(_to_index_format(matches)),
            )
        )
    # Persist the trailing partial batch; without this the rows added after
    # the last multiple of _BATCH_SIZE were never committed.
    session.commit()


def _refresh_indexed_entries(
    session, already_indexed_data: dict[str, str], not_indexed_data: dict[str, str]
) -> None:
    """Merge matches against the new entries into the rows of already-indexed entries."""
    total = len(already_indexed_data)
    for progress, (old_ord_id, old_average_hash) in enumerate(
        already_indexed_data.items()
    ):
        if progress % _BATCH_SIZE == 0:
            logger.info(f"Old update progress {progress} / {total}")
            # Same batching rule as for new entries: commit before any `continue`.
            session.commit()
        # Only the new entries are searched: they are the only candidates
        # that were not considered when this row was last written.
        new_matches = get_matches_from_data(
            not_indexed_data, ord_id=None, file_hash=old_average_hash, top_n=_TOP_N
        )
        old_matches = SimilarityIndex.list_by_id(int(old_ord_id))
        all_matches_together = old_matches + _to_index_format(new_matches)
        # Highest match_sum first, then keep the best _TOP_N.
        all_matches_together.sort(key=lambda pair: pair[1], reverse=True)
        new_top_matches = all_matches_together[:_TOP_N]
        if old_matches == new_top_matches:
            # Nothing changed for this entry; avoid a pointless write.
            continue
        obj = session.query(SimilarityIndex).get(int(old_ord_id))
        if obj is None:
            continue
        obj.list_of_lists = json.dumps(new_top_matches)
    # Persist the trailing partial batch of modified rows.
    session.commit()


def main():
    """Bring the similarity index up to date with the average-hash database.

    Loads the ``"data"`` mapping (ord_id string -> average hash) from the JSON
    file at ``Config.AVERAGE_HASH_DB`` and splits it at the value returned by
    ``get_highest_id()``. Entries with a higher id get a fresh index row;
    entries at or below it have their stored matches merged with matches
    against the new entries only.

    Writes are committed in batches and once more after each phase, so no
    pending row is left uncommitted. The session is closed on exit, including
    when an exception propagates.
    """
    highest_indexed_id = get_highest_id()
    logger.info(f"highest_indexed_id {highest_indexed_id}")
    with open(Config.AVERAGE_HASH_DB, "r") as f:
        data = json.load(f)["data"]

    # Partition in a single pass, preserving the file's key order.
    already_indexed_data: dict[str, str] = {}
    not_indexed_data: dict[str, str] = {}
    for key, average_hash in data.items():
        if int(key) <= highest_indexed_id:
            already_indexed_data[key] = average_hash
        else:
            not_indexed_data[key] = average_hash

    session = get_session()
    try:
        # calculate the index for all the ord_ids higher than highest_indexed_id
        _index_new_entries(session, data, not_indexed_data)
        # calculate the similarities for older ones - but just with new ones
        _refresh_indexed_entries(session, already_indexed_data, not_indexed_data)
    finally:
        session.close()
if __name__ == "__main__":
    # Script entry point. Any failure in main() is recorded, with its
    # traceback, through the module logger instead of propagating.
    try:
        main()
    except Exception as error:
        logger.exception(error)