-
Notifications
You must be signed in to change notification settings - Fork 0
/
word2vecupload.py
executable file
·150 lines (124 loc) · 5.23 KB
/
word2vecupload.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
from flask import Blueprint, Flask, render_template, request
import nltk
from gensim.models import Word2Vec
import fitz
import numpy as np
from io import BytesIO
import faiss
import os
import sqlite3
from gensim.models import KeyedVectors # Import KeyedVectors
import re
from concurrent.futures import ThreadPoolExecutor
word2vecupload = Blueprint("word2vecupload", __name__, template_folder="templates")
nltk.download('punkt')
index_file_path = "my_index.index"
database_file_path = "metadata.db"
dimension = 300 # Adjust the dimension based on your Word2Vec model
faiss_index = faiss.IndexFlatL2(dimension)
conn = sqlite3.connect(database_file_path)
cursor = conn.cursor()
cursor.execute('''
CREATE TABLE IF NOT EXISTS metadata (
id INTEGER PRIMARY KEY,
filename TEXT,
pagenumber INTEGER,
chunknumber INTEGER,
chunktext TEXT,
embedding TEXT -- Add a new column to store embeddings as text
)
''')
conn.commit()
conn.close()
home_directory = os.path.expanduser("~")
# Specify the relative path to the Downloads folder and the model file
downloads_folder = "Downloads"
model_filename = "word2vec-google-news-300.gz"
# Combine the paths to create the full model_file_path
model_file_path = os.path.join(home_directory, downloads_folder, model_filename)
try:
# Load the Word2Vec model from the downloaded file
word2vec_model = KeyedVectors.load_word2vec_format(model_file_path, binary=True)
except FileNotFoundError:
print(f"Model file not found at '{model_file_path}'.")
def get_word2vec_embedding(text, model):
# Tokenize the text into words
words = nltk.word_tokenize(text)
# Remove punctuation and convert to lowercase
words = [re.sub(r'[^\w\s]', '', word).lower() for word in words]
# Initialize an empty list to store word vectors
word_vectors = []
# Compute the word embeddings for each word
for word in words:
try:
word_vector = model[word]
word_vectors.append(word_vector)
except KeyError:
# Handle the case where a word is not in the vocabulary
pass
# Convert the list of word vectors to a numpy array
if word_vectors:
return np.mean(word_vectors, axis=0) # Average the word vectors
else:
return np.zeros(model.vector_size) # Return a zero vector if no valid words are found
def convert_embedding_to_string(embedding):
# Convert the embedding numpy array to a string
return ' '.join(map(str, embedding))
def store_embedding(embedding, file_name, page_number, chunk_number, chunk_text):
# Check if the dimension of the embedding matches the FAISS index dimension
if len(embedding) != dimension:
print(f"Warning: Embedding dimension ({len(embedding)}) does not match the FAISS index dimension ({dimension}). Skipping this embedding.")
return
# Convert the embedding to a float32 array
embedding = embedding.astype('float32')
# Convert the embedding to a string format
embedding_str = convert_embedding_to_string(embedding)
faiss_index.add(np.array([embedding]))
conn = sqlite3.connect(database_file_path)
cursor = conn.cursor()
cursor.execute('''
INSERT INTO metadata (filename, pagenumber, chunknumber, chunktext, embedding)
VALUES (?, ?, ?, ?, ?)
''', (file_name, page_number, chunk_number, chunk_text, embedding_str))
conn.commit()
conn.close()
@word2vecupload.route('/word2vecupload', methods=['GET', 'POST'])
def index():
uploaded_file = request.files.get('pdf_file')
if request.method == 'POST':
unique_id = 1
if uploaded_file:
pdf_data = BytesIO(uploaded_file.read())
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
# Create a thread pool for parallel processing
with ThreadPoolExecutor(max_workers=4) as executor:
futures = []
for page_num in range(len(pdf_document)):
page = pdf_document.load_page(page_num)
pagetext = page.get_text()
words = nltk.word_tokenize(pagetext)
chunk_size = 50
for i in range(0, len(words), chunk_size):
chunk = ' '.join(words[i:i + chunk_size])
futures.append(executor.submit(process_chunk, chunk, uploaded_file.filename, page_num))
# Wait for all futures to complete
for future in futures:
future.result()
# Save the FAISS index (if needed)
faiss.write_index(faiss_index, index_file_path)
return "PDF file uploaded and processed."
else:
return "No PDF file uploaded."
return render_template('index.html')
def process_chunk(chunk, filename, page_num):
chunk_embedding = get_word2vec_embedding(chunk, word2vec_model)
# Check if the dimension of the embedding matches before storing
if len(chunk_embedding) == dimension:
store_embedding(
chunk_embedding,
filename,
page_num,
chunk
)
else:
print(f"Warning: Embedding dimension ({len(chunk_embedding)}) does not match the FAISS index dimension ({dimension}). Skipping this embedding.")