-
Notifications
You must be signed in to change notification settings - Fork 2
/
utils.py
125 lines (102 loc) · 4.04 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import os
import pickle
import logging
from bertopic import BERTopic
from typing import List
from langchain_core.documents import Document
from configData import representationModelType, saveFolder, useKeyBERT
def dataSaver(data, config, dataType, saveNameAppend=""):
    """
    Save the data based on the specified configuration.

    The target path is built as
    ``<saveFolder>/<dataType>/<videoToUse>_<representationModelType><suffix>``.
    When KeyBERT is enabled and the generation model is "BERTopic", the suffix
    is prefixed with ``_KeyBERT``. Topic models are written through the
    object's own ``save`` method; every other data type is pickled to the same
    path with a ``.p`` extension.

    Args:
        data: The data to be saved. For ``dataType == "topicModel"`` it must
            provide a ``save`` method accepting the keyword arguments used below.
        config: The configuration object; ``videoToUse`` and
            ``generationModel`` are read from it.
        dataType: The type of data being saved; also used as the sub-folder name.
        saveNameAppend: An optional string to append to the save name.

    Returns:
        True if the data was saved, False if saving failed (a warning is
        logged and the error is not re-raised).
    """
    if useKeyBERT and config.generationModel == "BERTopic":
        saveNameAppend = f"_KeyBERT{saveNameAppend}"
    saveName = f"{config.videoToUse}_{representationModelType}{saveNameAppend}"
    savePath = os.path.join(saveFolder, dataType, saveName)
    try:
        # Make sure the per-data-type folder exists so the first save of a
        # new data type does not fail just because the folder is missing.
        saveDir = os.path.dirname(savePath)
        if saveDir:
            os.makedirs(saveDir, exist_ok=True)
        if dataType == "topicModel":
            data.save(
                savePath,
                serialization="safetensors",
                save_ctfidf=True,
                save_embedding_model="sentence-transformers/all-MiniLM-L6-v2",
            )
        else:
            # Context manager guarantees the handle is flushed and closed even
            # if pickling raises part-way through.
            with open(savePath + ".p", "wb") as saveFile:
                pickle.dump(data, saveFile)
        return True
    except Exception as e:
        # Saving is a best-effort cache: report the failure and carry on.
        logging.warning(
            f"Error saving {dataType} for {config.videoToUse}: {e}. Data will need to be reloaded next run."
        )
        return False
def dataLoader(config, dataType, saveNameAppend=""):
    """
    Load previously saved data for the configured video and data type.

    The path is built as
    ``<saveFolder>/<dataType>/<videoToUse>_<representationModelType><suffix>``.
    When KeyBERT is enabled and the generation model is "BERTopic", the suffix
    is prefixed with ``_KeyBERT``; every data type other than "topicModel"
    additionally gets a ``.p`` extension.

    Parameters:
    - config: The configuration object; ``videoToUse`` and ``generationModel``
      are read from it.
    - dataType: The type of data to load; also used as the sub-folder name.
    - saveNameAppend: An optional appendix to add to the save name.

    Returns:
    - The loaded data, or None when nothing is saved at the expected path or
      loading fails (a warning is logged and the error is not re-raised).
    """
    if useKeyBERT and config.generationModel == "BERTopic":
        saveNameAppend = f"_KeyBERT{saveNameAppend}"
    if dataType != "topicModel":
        saveNameAppend = f"{saveNameAppend}.p"
    saveName = f"{config.videoToUse}_{representationModelType}{saveNameAppend}"
    savePath = os.path.join(saveFolder, dataType, saveName)

    # Nothing cached yet: signal the caller to regenerate the data.
    if not os.path.exists(savePath):
        return None

    try:
        if dataType == "topicModel":
            return BERTopic.load(savePath)
        # NOTE: pickle can execute arbitrary code while loading; only files
        # written by this project into saveFolder should ever be read here.
        with open(savePath, "rb") as savedFile:
            return pickle.load(savedFile)
    except Exception as e:
        # Loading is best-effort: a corrupt or incompatible file is reported
        # and treated the same as a missing one.
        logging.warning(
            f"Error loading {dataType} for {config.videoToUse}: {e}. Data will need to be reloaded."
        )
        return None
def getBinCount(combinedTranscript, windowSize=120):
    """
    Work out how many whole windows fit into the span covered by a transcript.

    The span runs from the first row's "Start" value to the last row's "End"
    value; their difference must provide ``total_seconds()``.

    Parameters:
        combinedTranscript (DataFrame): Transcript with "Start" and "End" columns.
        windowSize (int): Length of one window in seconds. Default is 120.

    Returns:
        int: Number of complete windows in the span (any remainder is dropped).
    """
    lastEnd = combinedTranscript["End"].iloc[-1]
    firstStart = combinedTranscript["Start"].iloc[0]
    spanSeconds = (lastEnd - firstStart).total_seconds()
    # Floor division drops the trailing partial window.
    return int(spanSeconds // windowSize)
def formatDocs(docs: List[Document]) -> str:
    """
    Render documents as one text blob.

    Each document becomes a four-line entry (ID, start time, end time, text)
    built from its ``metadata`` keys "ID", "Start", "End" and its
    ``page_content``. Entries are separated by a blank line and the result
    always begins with a blank line, even when ``docs`` is empty.
    """
    entries = []
    for document in docs:
        meta = document.metadata
        entry = "".join(
            (
                f"Text ID: {meta['ID']}",
                f"\nText Start Time: {meta['Start']}",
                f"\nText End Time: {meta['End']}",
                f"\nText: {document.page_content}",
            )
        )
        entries.append(entry)
    separator = "\n\n"
    return separator + separator.join(entries)
def getMetadata(transcript):
    """
    Turn the transcript's time columns into "HH:MM:SS" strings and add an ID column.

    The dataframe is modified in place: every value in "Start" and "End" is
    replaced by the result of its ``strftime("%H:%M:%S")``, and an "ID" column
    is set from the dataframe's index.

    Args:
        transcript (pandas.DataFrame): Transcript with "Start" and "End"
            columns whose values provide ``strftime``.

    Returns:
        pandas.DataFrame: The same dataframe object that was passed in.
    """

    def toClockString(moment):
        return moment.strftime("%H:%M:%S")

    transcript["Start"] = transcript["Start"].apply(toClockString)
    transcript["End"] = transcript["End"].apply(toClockString)
    transcript["ID"] = transcript.index
    return transcript