Skip to content

Commit

Permalink
removed trigram
Browse files Browse the repository at this point in the history
  • Loading branch information
choccccy committed Nov 28, 2023
1 parent 5713b48 commit 8c5abe6
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 53 deletions.
109 changes: 60 additions & 49 deletions demo/PGvector_demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@
"for index, text in enumerate(pacer_copypasta): # For each line in the copypasta\n",
" await pacerDB.insert( # Insert the line into the table\n",
" content=text, # The text to be embedded\n",
" title=f'Pacer Copypasta line {index}', # Title metadata\n",
" title='Pacer Copypasta', # Title metadata\n",
" page_numbers=[index], # Page number metadata\n",
" tags=['fitness', 'pacer', 'copypasta'], # Tag metadata\n",
" )\n",
Expand Down Expand Up @@ -199,7 +199,7 @@
"output_type": "stream",
"text": [
"RETURNED TITLE:\n",
"\"Pacer Copypasta line 3\"\n",
"\"Pacer Copypasta\"\n",
"RETURNED CONTENT:\n",
"\"[beep] A single lap should be completed each time you hear this sound.\"\n",
"RETURNED COSINE SIMILARITY:\n",
Expand Down Expand Up @@ -228,9 +228,9 @@
{
"data": {
"text/plain": [
"[<Record cosine_similarity=0.0 title='Pacer Copypasta line 3' content='[beep] A single lap should be completed each time you hear this sound.' page_numbers=[3] tags=['fitness', 'pacer', 'copypasta']>,\n",
" <Record cosine_similarity=0.31445924384770496 title='Pacer Copypasta line 5' content='The second time you fail to complete a lap before the sound, your test is over.' page_numbers=[5] tags=['fitness', 'pacer', 'copypasta']>,\n",
" <Record cosine_similarity=0.634082588486436 title='Pacer Copypasta line 2' content='The running speed starts slowly, but gets faster each minute after you hear this signal.' page_numbers=[2] tags=['fitness', 'pacer', 'copypasta']>]"
"[<Record cosine_similarity=0.0 title='Pacer Copypasta' content='[beep] A single lap should be completed each time you hear this sound.' page_numbers=[3] tags=['fitness', 'pacer', 'copypasta']>,\n",
" <Record cosine_similarity=0.31445924384770496 title='Pacer Copypasta' content='The second time you fail to complete a lap before the sound, your test is over.' page_numbers=[5] tags=['fitness', 'pacer', 'copypasta']>,\n",
" <Record cosine_similarity=0.634082588486436 title='Pacer Copypasta' content='The running speed starts slowly, but gets faster each minute after you hear this signal.' page_numbers=[2] tags=['fitness', 'pacer', 'copypasta']>]"
]
},
"execution_count": 8,
Expand Down Expand Up @@ -285,7 +285,7 @@
"output_type": "stream",
"text": [
"RETURNED TITLE:\n",
"\"Pacer Copypasta line 4\"\n",
"\"Pacer Copypasta\"\n",
"RETURNED CONTENT:\n",
"\"[ding] Remember to run in a straight line, and run as long as possible.\"\n",
"RETURNED COSINE SIMILARITY:\n",
Expand Down Expand Up @@ -314,9 +314,9 @@
{
"data": {
"text/plain": [
"[<Record cosine_similarity=0.7157614573027005 title='Pacer Copypasta line 4' content='[ding] Remember to run in a straight line, and run as long as possible.' page_numbers=[4] tags=['fitness', 'pacer', 'copypasta']>,\n",
" <Record cosine_similarity=0.8959717930563745 title='Pacer Copypasta line 6' content='The test will begin on the word start. On your mark, get ready, start.' page_numbers=[6] tags=['fitness', 'pacer', 'copypasta']>,\n",
" <Record cosine_similarity=0.9200870391648666 title='Pacer Copypasta line 2' content='The running speed starts slowly, but gets faster each minute after you hear this signal.' page_numbers=[2] tags=['fitness', 'pacer', 'copypasta']>]"
"[<Record cosine_similarity=0.7157614573027005 title='Pacer Copypasta' content='[ding] Remember to run in a straight line, and run as long as possible.' page_numbers=[4] tags=['fitness', 'pacer', 'copypasta']>,\n",
" <Record cosine_similarity=0.8959717930563745 title='Pacer Copypasta' content='The test will begin on the word start. On your mark, get ready, start.' page_numbers=[6] tags=['fitness', 'pacer', 'copypasta']>,\n",
" <Record cosine_similarity=0.9200870391648666 title='Pacer Copypasta' content='The running speed starts slowly, but gets faster each minute after you hear this signal.' page_numbers=[2] tags=['fitness', 'pacer', 'copypasta']>]"
]
},
"execution_count": 11,
Expand All @@ -325,10 +325,18 @@
}
],
"source": [
"print(f'RAW RETURN:') \n",
"print('RAW RETURN:') \n",
"sim_search"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Searching the document utilizing filters"
]
},
{
"cell_type": "code",
"execution_count": 12,
Expand All @@ -348,8 +356,11 @@
"print(f'Semantic Searching data using search string:\\n\"{search_string}\"')\n",
"\n",
"sim_search = await pacerDB.search(\n",
" query_string=search_string, # string to search by\n",
" query_page_numbers=[0, 1, 2], # Page number metadata\n",
" query_string=search_string, # string to search by\n",
" query_title='Pacer Copypasta', # title to search by\n",
" query_page_numbers=[3, 4, 5], # Page number metadata\n",
" query_tags=['fitness'], # Tag metadata\n",
" conjunctive=False, # Conjunctive search\n",
")"
]
},
Expand All @@ -363,11 +374,11 @@
"output_type": "stream",
"text": [
"RETURNED TITLE:\n",
"\"Pacer Copypasta line 2\"\n",
"\"Pacer Copypasta\"\n",
"RETURNED CONTENT:\n",
"\"The running speed starts slowly, but gets faster each minute after you hear this signal.\"\n",
"\"[ding] Remember to run in a straight line, and run as long as possible.\"\n",
"RETURNED COSINE SIMILARITY:\n",
"0.92\n"
"0.72\n"
]
}
],
Expand All @@ -392,7 +403,7 @@
{
"data": {
"text/plain": [
"[<Record cosine_similarity=0.9200870391648666 title='Pacer Copypasta line 2' content='The running speed starts slowly, but gets faster each minute after you hear this signal.' page_numbers=[2] tags=['fitness', 'pacer', 'copypasta']>]"
"[<Record cosine_similarity=0.7157614573027005 title='Pacer Copypasta' content='[ding] Remember to run in a straight line, and run as long as possible.' page_numbers=[4] tags=['fitness', 'pacer', 'copypasta']>]"
]
},
"execution_count": 14,
Expand All @@ -401,7 +412,7 @@
}
],
"source": [
"print(f'RAW RETURN:') \n",
"print('RAW RETURN:') \n",
"sim_search"
]
},
Expand Down Expand Up @@ -522,7 +533,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Inserted 22 lines of dialog into the table with history key \"8f600b20-f617-4af5-a47d-f22a11ad8267\".\n"
"Inserted 22 lines of dialog into the table with history key \"0a52c9d2-0636-46f5-8313-8951ad5da31a\".\n"
]
}
],
Expand Down Expand Up @@ -588,7 +599,7 @@
"output_type": "stream",
"text": [
"RETURNED TIMESTAMP:\n",
"2023-11-28 00:36:53.725661+00:00\n",
"2023-11-28 18:31:36.558134+00:00\n",
"RETURNED MESSAGE:\n",
"assistant: Nicknames, nicknames. Now, on the St. Louis team we have Who's on first, What's on second, I Don't Know is on third--\n",
"RETURNED COSINE SIMILARITY:\n",
Expand Down Expand Up @@ -617,17 +628,17 @@
{
"data": {
"text/plain": [
"[{'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 725661, tzinfo=datetime.timezone.utc),\n",
"[{'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 558134, tzinfo=datetime.timezone.utc),\n",
" 'role': 'assistant',\n",
" 'content': \"Nicknames, nicknames. Now, on the St. Louis team we have Who's on first, What's on second, I Don't Know is on third--\",\n",
" 'metadata': {'genre': 'comedy', 'year': 1938},\n",
" 'cosine_similarity': 0.4910637432970839},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 697560, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 537433, tzinfo=datetime.timezone.utc),\n",
" 'role': 'user',\n",
" 'content': 'Funny names?',\n",
" 'metadata': {'genre': 'comedy', 'year': 1938},\n",
" 'cosine_similarity': 0.561316881258301},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 54, 13270, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 823459, tzinfo=datetime.timezone.utc),\n",
" 'role': 'assistant',\n",
" 'content': \"That's the man's name.\",\n",
" 'metadata': {'genre': 'comedy', 'year': 1938},\n",
Expand Down Expand Up @@ -686,7 +697,7 @@
"output_type": "stream",
"text": [
"RETURNED TIMESTAMP:\n",
"2023-11-28 00:36:53.725661+00:00\n",
"2023-11-28 18:31:36.558134+00:00\n",
"\n",
"RETURNED MESSAGE:\n",
"assistant: Nicknames, nicknames. Now, on the St. Louis team we have Who's on first, What's on second, I Don't Know is on third--\n",
Expand Down Expand Up @@ -718,17 +729,17 @@
{
"data": {
"text/plain": [
"[{'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 725661, tzinfo=datetime.timezone.utc),\n",
"[{'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 558134, tzinfo=datetime.timezone.utc),\n",
" 'role': 'assistant',\n",
" 'content': \"Nicknames, nicknames. Now, on the St. Louis team we have Who's on first, What's on second, I Don't Know is on third--\",\n",
" 'metadata': {'genre': 'comedy', 'year': 1938},\n",
" 'cosine_similarity': 0.4910637432970839},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 697560, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 537433, tzinfo=datetime.timezone.utc),\n",
" 'role': 'user',\n",
" 'content': 'Funny names?',\n",
" 'metadata': {'genre': 'comedy', 'year': 1938},\n",
" 'cosine_similarity': 0.561316881258301},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 54, 13270, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 823459, tzinfo=datetime.timezone.utc),\n",
" 'role': 'assistant',\n",
" 'content': \"That's the man's name.\",\n",
" 'metadata': {'genre': 'comedy', 'year': 1938},\n",
Expand Down Expand Up @@ -762,7 +773,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Retreiving chatlog \"8f600b20-f617-4af5-a47d-f22a11ad8267\" from database\n"
"Retreiving chatlog \"0a52c9d2-0636-46f5-8313-8951ad5da31a\" from database\n"
]
}
],
Expand Down Expand Up @@ -827,91 +838,91 @@
{
"data": {
"text/plain": [
"[{'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 644738, tzinfo=datetime.timezone.utc),\n",
"[{'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 464581, tzinfo=datetime.timezone.utc),\n",
" 'role': 'system',\n",
" 'content': \"The user is considering becoming a ballplayer. The assistant wants to make sure they knows what they're getting into.\",\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 677880, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 506643, tzinfo=datetime.timezone.utc),\n",
" 'role': 'assistant',\n",
" 'content': 'Strange as it may seem, they give ball players nowadays very peculiar names.',\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 697560, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 537433, tzinfo=datetime.timezone.utc),\n",
" 'role': 'user',\n",
" 'content': 'Funny names?',\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 725661, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 558134, tzinfo=datetime.timezone.utc),\n",
" 'role': 'assistant',\n",
" 'content': \"Nicknames, nicknames. Now, on the St. Louis team we have Who's on first, What's on second, I Don't Know is on third--\",\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 748778, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 582300, tzinfo=datetime.timezone.utc),\n",
" 'role': 'user',\n",
" 'content': \"That's what I want to find out. I want you to tell me the names of the fellows on the St. Louis team.\",\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 771415, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 602338, tzinfo=datetime.timezone.utc),\n",
" 'role': 'assistant',\n",
" 'content': \"I'm telling you. Who is on first. What's on second. I Don't Know's on third--\",\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 790258, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 619885, tzinfo=datetime.timezone.utc),\n",
" 'role': 'user',\n",
" 'content': \"You know the fellows' names?\",\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 813689, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 636019, tzinfo=datetime.timezone.utc),\n",
" 'role': 'assistant',\n",
" 'content': 'Yes.',\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 827505, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 650436, tzinfo=datetime.timezone.utc),\n",
" 'role': 'user',\n",
" 'content': \"Well, then who's playing first?\",\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 842297, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 666144, tzinfo=datetime.timezone.utc),\n",
" 'role': 'assistant',\n",
" 'content': 'Yes.',\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 857449, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 679987, tzinfo=datetime.timezone.utc),\n",
" 'role': 'user',\n",
" 'content': \"I mean the fellow's name on first base.\",\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 877878, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 695743, tzinfo=datetime.timezone.utc),\n",
" 'role': 'assistant',\n",
" 'content': 'Who.',\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 895393, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 710776, tzinfo=datetime.timezone.utc),\n",
" 'role': 'user',\n",
" 'content': \"The fellow playin' first base.\",\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 911530, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 727459, tzinfo=datetime.timezone.utc),\n",
" 'role': 'assistant',\n",
" 'content': 'Who.',\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 927484, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 741377, tzinfo=datetime.timezone.utc),\n",
" 'role': 'user',\n",
" 'content': 'The guy on first base.',\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 943138, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 758579, tzinfo=datetime.timezone.utc),\n",
" 'role': 'assistant',\n",
" 'content': 'Who is on first.',\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 959678, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 774288, tzinfo=datetime.timezone.utc),\n",
" 'role': 'user',\n",
" 'content': \"Well, what are you askin' me for?\",\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 980016, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 790076, tzinfo=datetime.timezone.utc),\n",
" 'role': 'assistant',\n",
" 'content': \"I'm not asking you--I'm telling you. Who is on first.\",\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 53, 998745, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 807238, tzinfo=datetime.timezone.utc),\n",
" 'role': 'user',\n",
" 'content': \"I'm asking you--who's on first?\",\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 54, 13270, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 823459, tzinfo=datetime.timezone.utc),\n",
" 'role': 'assistant',\n",
" 'content': \"That's the man's name.\",\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 54, 28419, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 839281, tzinfo=datetime.timezone.utc),\n",
" 'role': 'user',\n",
" 'content': \"That's who's name?\",\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}},\n",
" {'ts': datetime.datetime(2023, 11, 28, 0, 36, 54, 51377, tzinfo=datetime.timezone.utc),\n",
" {'ts': datetime.datetime(2023, 11, 28, 18, 31, 36, 855569, tzinfo=datetime.timezone.utc),\n",
" 'role': 'assistant',\n",
" 'content': 'Yes.',\n",
" 'metadata': {'genre': 'comedy', 'year': 1938}}]"
Expand Down
7 changes: 3 additions & 4 deletions pylib/embedding/pgvector.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@
LIMIT {limit};
'''

TITLE_WHERE_CLAUSE = 'title % {query_title} -- Trigram operator (default similarity threshold is 0.3) \n'
TITLE_WHERE_CLAUSE = 'title = {query_title} -- Equals operator \n'

PAGE_NUMBERS_WHERE_CLAUSE = 'page_numbers && {query_page_numbers} -- Overlap operator \n'

Expand Down Expand Up @@ -363,10 +363,9 @@ async def search(
query_string (str): string to compare against items in the table.
This will be a vector/fuzzy/nearest-neighbor type search.
query_title (str, optional): title of the document to compare against items in the table
(uses a less fuzzy matching operator than query_string)
query_title (str, optional): title of the document to compare against items in the table (must match exactly).
query_page_numbers (list[int], optional): target page number in the document for query string comparison
query_page_numbers (list[int], optional): target page numbers in the document for query string comparison (matches items whose page numbers overlap this list).
query_tags (list[str], optional): tags associated with the document to compare against items in the table.
Each individual tag must match exactly, but see the conjunctive param
Expand Down

0 comments on commit 8c5abe6

Please sign in to comment.