-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathapp-final.py
181 lines (155 loc) · 7.75 KB
/
app-final.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# Streamlit RAG demo: Snowflake Cortex supplies embeddings + LLM completion,
# Streamlit renders the UI, Snowpark runs the SQL.
import streamlit as st
from snowflake.snowpark.context import get_active_session  # NOTE(review): imported but unused here; create_session() builds its own session
from snowflake.snowpark import Session
from streamlit_extras import add_vertical_space as avs
import pandas as pd
import re

# Render full cell contents (no truncation) in pandas-backed tables.
pd.set_option("max_colwidth", None)
num_chunks = 6 # Num-chunks provided as context. Adjust as needed for your use case.
def create_session():
    """Open a Snowpark session from the ``snowflake`` entry in Streamlit secrets."""
    connection_params = st.secrets["snowflake"]
    return Session.builder.configs(connection_params).create()
# One Snowpark session opened at script start; reused by every query below.
session = create_session()

# Page chrome; keep this before any other st.* rendering call.
st.set_page_config(
    page_title="Ex-stream-ly Cool App",
    page_icon="🧊",
    layout="wide",
)
def load_data(table_name):
    """Return the first 20 rows of *table_name* as a list of Snowpark Row objects."""
    return session.table(table_name).limit(20).collect()
# Table rendered in the "Comparison" tab — per the UI caption it holds
# responses from different models to the same prompt.
table_name = "CC_QUICKSTART_CORTEX_DOCS.DATA.comp_data_bkp"
def clean_text(text):
    """Collapse each run of non-word characters in *text* to one space; lower-case the result."""
    collapsed = re.sub(r'\W+', ' ', text)
    return collapsed.lower()
def is_relevant(prompt_context, prompt):
    """True when the cleaned context and the cleaned question share at least one word."""
    context_words = set(clean_text(prompt_context).split())
    question_words = set(clean_text(prompt).split())
    # Disjoint word sets -> no overlap -> not relevant.
    return not context_words.isdisjoint(question_words)
def create_prompt(myquestion, rag, stage, selected_doc):
    """Build the prompt sent to the LLM, optionally augmented with document chunks.

    Parameters
    ----------
    myquestion : str
        The user's question.
    rag : int or bool
        Truthy -> retrieve chunks of ``selected_doc`` as context (RAG);
        falsy -> plain question prompt with no retrieval.
    stage : str
        Knowledge-base label from the UI ('History Knowledge Base' or
        'Technical Knowledge Base'); selects the stage used for the
        presigned document link.
    selected_doc : str
        Stage path of the chosen document, possibly carrying a stage prefix.

    Returns
    -------
    tuple[str, str, str]
        (prompt, url_link, relative_path); url_link/relative_path are the
        string "None" when no document link applies.
    """
    # Strip a leading stage prefix so the path matches the RELATIVE_PATH
    # values stored in docs_chunks_table.
    known_prefixes = ['docs/', 'history_docs/']
    for prefix in known_prefixes:
        if selected_doc.startswith(prefix):
            selected_doc = selected_doc[len(prefix):]
            break

    # BUGFIX: the UI offers 'History Knowledge Base' (see the stage selectbox),
    # but this previously compared against 'Marketing Knowledge Base', so the
    # history branch never matched and links always came from @...docs.
    stage_table = ('@CC_QUICKSTART_CORTEX_DOCS.DATA.history_docs'
                   if stage == 'History Knowledge Base'
                   else '@CC_QUICKSTART_CORTEX_DOCS.DATA.docs')

    if not rag:
        # No retrieval requested: plain question, no document link.
        prompt = f"'Question:\n{myquestion}\nAnswer: '"
        return prompt, "None", "None"

    # Retrieve the top-{num_chunks} chunks of the selected document, ranked by
    # cosine similarity between the question embedding and each chunk vector.
    cmd = f"""
    WITH results AS (
        SELECT RELATIVE_PATH, VECTOR_COSINE_SIMILARITY(docs_chunks_table.chunk_vec,
            snowflake.cortex.embed_text_768('e5-base-v2', ?)) AS distance, chunk
        FROM CC_QUICKSTART_CORTEX_DOCS.DATA.docs_chunks_table
        where RELATIVE_PATH = ?
        ORDER BY distance DESC
        LIMIT {num_chunks}
    )
    SELECT chunk, relative_path FROM results
    """
    df_context = session.sql(cmd, params=[myquestion, selected_doc]).to_pandas()

    # Concatenate every retrieved chunk; single quotes are stripped so the
    # context cannot break the quoted prompt template below.
    prompt_context = "".join(df_context['CHUNK']).replace("'", "")
    relative_path = df_context.loc[0, 'RELATIVE_PATH'] if not df_context.empty else "Not Found"

    # Only ground the answer in the context when it actually overlaps the question.
    if is_relevant(prompt_context, myquestion):
        prompt = f"""
        'You are an expert assistance extracting information from context provided.
        Answer the question based on the context. Be concise and do not hallucinate.
        If you don’t have the information just say so.
        Context: {prompt_context}
        Question:
        {myquestion}
        Answer: '
        """
        # Presigned URL so the UI can link to the source document.
        # relative_path is bound as a parameter rather than interpolated.
        cmd2 = f"SELECT GET_PRESIGNED_URL({stage_table}, ?, 360) AS URL_LINK FROM directory({stage_table})"
        df_url_link = session.sql(cmd2, params=[relative_path]).to_pandas()
        url_link = df_url_link.loc[0, 'URL_LINK'] if not df_url_link.empty else "URL not available"
    else:
        # Context retrieved but irrelevant: answer unassisted, show no link.
        prompt_context = "No relevant context found in the selected document."
        prompt = f"""
        'You are an expert assistant. Answer the question as best as you can with the given information.
        If you don’t have the information just say so.
        Context: {prompt_context}
        Question: {myquestion}
        Answer: '
        """
        url_link = "None"  # no relevant document context, don't show link

    return prompt, url_link, relative_path
def complete(myquestion, model_name, rag, stage, selected_doc):
    """Build the prompt (optionally RAG-augmented) and run Cortex COMPLETE.

    Returns (result rows, url_link, relative_path).
    """
    prompt, url_link, relative_path = create_prompt(myquestion, rag, stage, selected_doc)
    query = f"SELECT snowflake.cortex.complete(?,?) AS response"
    df_response = session.sql(query, params=[model_name, prompt]).collect()
    return df_response, url_link, relative_path
# ---------------------------------------------------------------------------
# Streamlit UI (flat script).
# NOTE(review): the pasted source lost all leading whitespace; the indentation
# below is reconstructed from the widget nesting — confirm against the
# original app-final.py.
# ---------------------------------------------------------------------------
st.title("Build a Retrieval Augmented Generation (RAG) based LLM assistant using Snowflake Cortex and Streamlit:")
st.write("You can ask questions and decide if you want to use your documents for context or allow the model to create their own response.")

# Model picker (narrow left column; thumb_col is reused as a spacer).
col11, thumb_col = st.columns([0.45, 1.5])
with col11:
    model = st.selectbox('**Choose LLM model**:', (
        'mistral-7b', 'mistral-large', 'mixtral-8x7b', 'gemma-7b',))
avs.add_vertical_space(4)

tab1, tab2 = st.tabs(["**Unstructured**", "**Structured**"])
with tab1:
    # Knowledge-base (stage) picker — drives which Snowflake stage is listed.
    col22, thumb_col = st.columns([0.45, 1.5])
    with col22:
        stage = st.selectbox('**Business Function(Stage)**', (
            'History Knowledge Base',
            'Technical Knowledge Base'
        ))
    stage_table = '@CC_QUICKSTART_CORTEX_DOCS.DATA.history_docs' if stage == 'History Knowledge Base' else '@CC_QUICKSTART_CORTEX_DOCS.DATA.docs'
    # List the documents currently present on the selected stage.
    docs_available = session.sql(f"ls {stage_table}").collect()
    list_docs = [doc["name"] for doc in docs_available]
    col33, thumb_col = st.columns([0.45, 1.5])
    with col33:
        selected_doc = st.selectbox('**Context(Dataset)**', list_docs)
    avs.add_vertical_space(4)

    # NOTE(review): tab1/tab2 are rebound here, shadowing the outer tabs.
    tab1, tab2 = st.tabs(["**Interactive**", "**Comparison**"])
    with tab1:
        prompt_options = ["", "What is calculated columns", "Is there any special lubricant to be used with the premium bike?", "What is the warranty for the premium bike?", "What is the impact of The East India Company on India in 1900"]
        col44, thumb_col = st.columns([1, 1.5])
        with col44:
            prompt = st.selectbox('**Choose prompt**', prompt_options, index=0, format_func=lambda x: 'Select prompt...' if x == '' else x)
        col55, thumb_col = st.columns([1, 1.5])
        with col55:
            question = st.text_input("**Or Enter Your Own Prompt**")
        # A chosen canned prompt wins over the free-text question.
        if st.button(':red[**Submit**]'):
            actual_question = prompt if prompt else question
            if actual_question:
                # Persist inputs in session_state so results survive reruns.
                st.session_state['actual_question'] = actual_question
                st.session_state['stage'] = stage
                st.session_state['selected_doc'] = selected_doc
                st.session_state['submitted'] = True
        if 'submitted' in st.session_state:
            # Side-by-side comparison: plain LLM answer vs RAG-grounded answer.
            col1, thumb_col, col2 = st.columns([3.5, 1, 3.5])
            with col1:
                st.header("Vanilla Response from LLM")
                # rag=0: no document context.
                response, _, _ = complete(st.session_state['actual_question'], model, 0, st.session_state['stage'], st.session_state['selected_doc'])
                st.markdown(response[0].RESPONSE)
            with thumb_col:
                st.header("Like")
                if st.button('👍'):
                    # Snow only on the first like of the session.
                    if 'clicked' not in st.session_state:
                        st.session_state['clicked'] = True
                        st.snow()
            with col2:
                st.header("RAG powered Response from LLM")
                # rag=1: ground the answer in the selected document.
                response, url_link, relative_path = complete(st.session_state['actual_question'], model, 1, st.session_state['stage'], st.session_state['selected_doc'])
                st.markdown(response[0].RESPONSE)
                # Only link the source document when real context was used.
                if url_link != "None" and "No relevant context" not in response[0].RESPONSE:
                    display_url = f"Link to [{relative_path}]({url_link}) that may be useful"
                    st.markdown(display_url)
    with tab2:
        # Pre-computed cross-model responses from the comparison table.
        df = load_data(table_name)
        st.write("This table shows the responses from different models to the same prompt.")
        st.dataframe(df)