Langchain libs update (#769)
* LLMs with latest langchain dev libraries

* conflict resolved

* all llm models with latest library changes
aashipandya authored Sep 27, 2024
1 parent 501ece4 commit ba6a9d2
Showing 6 changed files with 83 additions and 34 deletions.
33 changes: 16 additions & 17 deletions backend/requirements.txt
@@ -69,22 +69,22 @@ jsonpath-python==1.0.6
jsonpointer==2.4
json-repair==0.25.2
kiwisolver==1.4.5
-langchain
-langchain-aws
-langchain-anthropic
-langchain-fireworks
-langchain-google-genai
-langchain-community
-langchain-core
-langchain-experimental
-langchain-google-vertexai
-langchain-groq
-langchain-openai
-langchain-text-splitters
+langchain==0.3.0
+langchain-aws==0.2.1
+langchain-anthropic==0.2.1
+langchain-fireworks==0.2.0
+langchain-google-genai==2.0.0
+langchain-community==0.3.0
+langchain-core==0.3.5
+langchain-experimental==0.3.1
+langchain-google-vertexai==2.0.1
+langchain-groq==0.2.0
+langchain-openai==0.2.0
+langchain-text-splitters==0.3.0
langdetect==1.0.9
-langsmith==0.1.83
+langsmith==0.1.128
layoutparser==0.3.4
-langserve==0.2.2
+langserve==0.3.0
#langchain-cli==0.0.25
lxml==5.1.0
MarkupSafe==2.1.5
@@ -100,7 +100,7 @@ numpy==1.26.4
omegaconf==2.3.0
onnx==1.16.1
onnxruntime==1.18.1
-openai==1.35.10
+openai==1.47.1
opencv-python==4.8.0.76
orjson==3.9.15
packaging==23.2
@@ -144,7 +144,6 @@ shapely==2.0.3
six==1.16.0
sniffio==1.3.1
soupsieve==2.5
-SQLAlchemy==2.0.28
starlette==0.37.2
sse-starlette==2.1.2
starlette-session==0.4.3
@@ -159,7 +158,7 @@ transformers==4.42.3
types-protobuf
types-requests
typing-inspect==0.9.0
-typing_extensions==4.9.0
+typing_extensions==4.12.2
tzdata==2024.1
unstructured==0.14.9
unstructured-client==0.23.8
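The pins above move the backend onto the langchain 0.3.x line, which runs on Pydantic 2 and deprecates the langchain_core.pydantic_v1 bridge (see the schema_extraction.py change below). A minimal sketch of a startup guard against drifting away from these pins; the helper and its name are hypothetical, not part of this commit:

```python
from importlib.metadata import version

# Hypothetical guard: fail fast if the installed langchain stack drifts from
# the 0.3.x line pinned in backend/requirements.txt.
EXPECTED_PREFIXES = {
    "langchain": "0.3.",
    "langchain-core": "0.3.",
    "langchain-community": "0.3.",
}

def check_langchain_pins() -> None:
    for package, prefix in EXPECTED_PREFIXES.items():
        installed = version(package)
        if not installed.startswith(prefix):
            raise RuntimeError(f"{package} {installed} installed, expected {prefix}x")
```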
10 changes: 7 additions & 3 deletions backend/src/llm.py
@@ -9,13 +9,14 @@
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from langchain_experimental.graph_transformers import LLMGraphTransformer
+from langchain_core.prompts import ChatPromptTemplate
from langchain_anthropic import ChatAnthropic
from langchain_fireworks import ChatFireworks
from langchain_aws import ChatBedrock
from langchain_community.chat_models import ChatOllama
import boto3
import google.auth
-from src.shared.constants import MODEL_VERSIONS
+from src.shared.constants import MODEL_VERSIONS, PROMPT_TO_ALL_LLMs


def get_llm(model: str):
@@ -28,7 +29,7 @@ def get_llm(model: str):
model_name = MODEL_VERSIONS[model]
llm = ChatVertexAI(
model_name=model_name,
-convert_system_message_to_human=True,
+#convert_system_message_to_human=True,
credentials=credentials,
project=project_id,
temperature=0,
@@ -149,8 +150,9 @@ def get_graph_document_list(
if "diffbot_api_key" in dir(llm):
llm_transformer = llm
else:
-if "get_name" in dir(llm) and llm.get_name() == "ChatOllama":
+if "get_name" in dir(llm) and llm.get_name() not in ("ChatOpenAI", "ChatVertexAI", "AzureChatOpenAI"):
node_properties = False
+relationship_properties = False
else:
node_properties = ["description"]
relationship_properties = ["description"]
@@ -160,6 +162,8 @@
relationship_properties=relationship_properties,
allowed_nodes=allowedNodes,
allowed_relationships=allowedRelationship,
+ignore_tool_usage=True,
+#prompt = ChatPromptTemplate.from_messages(["system",PROMPT_TO_ALL_LLMs])
)
with ThreadPoolExecutor(max_workers=10) as executor:
for chunk in combined_chunk_document_list:
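For orientation, a hypothetical sketch of how a transformer configured with these options is driven; the helper name and chunk handling are illustrative, while the real call sites live in get_graph_document_list:

```python
from langchain_core.documents import Document
from langchain_experimental.graph_transformers import LLMGraphTransformer

def extract_graph_documents(llm, chunk_texts):
    """Illustrative only: run an LLMGraphTransformer over plain-text chunks."""
    transformer = LLMGraphTransformer(
        llm=llm,
        node_properties=["description"],
        relationship_properties=["description"],
        ignore_tool_usage=True,  # use prompt-based extraction instead of tool calling
    )
    documents = [Document(page_content=text) for text in chunk_texts]
    return transformer.convert_to_graph_documents(documents)
```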
16 changes: 8 additions & 8 deletions backend/src/main.py
@@ -433,14 +433,14 @@ def processing_chunks(chunkId_chunkDoc_list,graph,uri, userName, password, datab
node_type= node.type
if (node_id, node_type) not in distinct_nodes:
distinct_nodes.add((node_id, node_type))
-#get all relations
-for relation in graph_document.relationships:
-relations.append(relation.type)
-
-node_count += len(distinct_nodes)
-rel_count += len(relations)
-print(f'node count internal func:{node_count}')
-print(f'relation count internal func:{rel_count}')
+#get all relations
+for relation in graph_document.relationships:
+relations.append(relation.type)
+
+node_count += len(distinct_nodes)
+rel_count += len(relations)
+print(f'node count internal func:{node_count}')
+print(f'relation count internal func:{rel_count}')
return node_count,rel_count

def get_chunkId_chunkDoc_list(graph, file_name, pages, retry_condition):
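The changed block in processing_chunks appears to be an indentation-only move (the line text is unchanged); as a reading aid, here is a standalone sketch of the counting it performs, written as a hypothetical helper rather than the function in main.py:

```python
def count_nodes_and_relationships(graph_document_list):
    """Illustrative only: tally distinct nodes and all relationship types."""
    distinct_nodes = set()
    relations = []
    for graph_document in graph_document_list:
        for node in graph_document.nodes:
            distinct_nodes.add((node.id, node.type))
        for relation in graph_document.relationships:
            relations.append(relation.type)
    return len(distinct_nodes), len(relations)
```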
52 changes: 48 additions & 4 deletions backend/src/shared/constants.py
@@ -1,15 +1,16 @@
MODEL_VERSIONS = {
"openai-gpt-3.5": "gpt-3.5-turbo-0125",
"gemini-1.0-pro": "gemini-1.0-pro-001",
-"gemini-1.5-pro": "gemini-1.5-pro-preview-0514",
+"gemini-1.5-pro": "gemini-1.5-pro-002",
+"gemini-1.5-flash": "gemini-1.5-flash-002",
"openai-gpt-4": "gpt-4-turbo-2024-04-09",
"diffbot" : "gpt-4-turbo-2024-04-09",
"openai-gpt-4o-mini": "gpt-4o-mini-2024-07-18",
"openai-gpt-4o":"gpt-4o-2024-08-06",
"groq-llama3" : "llama3-70b-8192"
}
OPENAI_MODELS = ["openai-gpt-3.5", "openai-gpt-4o", "openai-gpt-4o-mini"]
-GEMINI_MODELS = ["gemini-1.0-pro", "gemini-1.5-pro"]
+GEMINI_MODELS = ["gemini-1.0-pro", "gemini-1.5-pro", "gemini-1.5-flash"]
GROQ_MODELS = ["groq-llama3"]
BUCKET_UPLOAD = 'llm-graph-builder-upload'
BUCKET_FAILED_FILE = 'llm-graph-builder-failed'
@@ -92,14 +93,14 @@
CHAT_DOC_SPLIT_SIZE = 3000
CHAT_EMBEDDING_FILTER_SCORE_THRESHOLD = 0.10
CHAT_TOKEN_CUT_OFF = {
-("openai-gpt-3.5",'azure_ai_gpt_35',"gemini-1.0-pro","gemini-1.5-pro","groq-llama3",'groq_llama3_70b','anthropic_claude_3_5_sonnet','fireworks_llama_v3_70b','bedrock_claude_3_5_sonnet', ) : 4,
+("openai-gpt-3.5",'azure_ai_gpt_35',"gemini-1.0-pro","gemini-1.5-pro","gemini-1.5-flash","groq-llama3",'groq_llama3_70b','anthropic_claude_3_5_sonnet','fireworks_llama_v3_70b','bedrock_claude_3_5_sonnet', ) : 4,
("openai-gpt-4","diffbot" ,'azure_ai_gpt_4o',"openai-gpt-4o", "openai-gpt-4o-mini") : 28,
("ollama_llama3") : 2
}


CHAT_TOKEN_CUT_OFF = {
-("openai-gpt-3.5",'azure_ai_gpt_35',"gemini-1.0-pro","gemini-1.5-pro","groq-llama3",'groq_llama3_70b','anthropic_claude_3_5_sonnet','fireworks_llama_v3_70b','bedrock_claude_3_5_sonnet', ) : 4,
+("openai-gpt-3.5",'azure_ai_gpt_35',"gemini-1.0-pro","gemini-1.5-pro", "gemini-1.5-flash","groq-llama3",'groq_llama3_70b','anthropic_claude_3_5_sonnet','fireworks_llama_v3_70b','bedrock_claude_3_5_sonnet', ) : 4,
("openai-gpt-4","diffbot" ,'azure_ai_gpt_4o',"openai-gpt-4o", "openai-gpt-4o-mini") : 28,
("ollama_llama3") : 2
}
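CHAT_TOKEN_CUT_OFF is keyed by tuples of model names (and, for "ollama_llama3", a bare string), so callers have to scan the keys rather than index directly. A hypothetical lookup helper, shown for illustration only:

```python
def get_chat_token_cut_off(model: str, cut_off_map: dict, default: int = 4) -> int:
    # Keys may be a tuple of model names or a single model-name string.
    for models, cut_off in cut_off_map.items():
        if model == models or (isinstance(models, tuple) and model in models):
            return cut_off
    return default
```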
@@ -476,3 +477,46 @@
START_FROM_BEGINNING = "start_from_beginning"
DELETE_ENTITIES_AND_START_FROM_BEGINNING = "delete_entities_and_start_from_beginning"
START_FROM_LAST_PROCESSED_POSITION = "start_from_last_processed_position"
+
+PROMPT_TO_ALL_LLMs = """
+"# Knowledge Graph Instructions for LLMs\n"
+"## 1. Overview\n"
+"You are a top-tier algorithm designed for extracting information in structured "
+"formats to build a knowledge graph.\n"
+"Try to capture as much information from the text as possible without "
+"sacrificing accuracy. Do not add any information that is not explicitly "
+"mentioned in the text.\n"
+"- **Nodes** represent entities and concepts.\n"
+"- The aim is to achieve simplicity and clarity in the knowledge graph, making it\n"
+"accessible for a vast audience.\n"
+"## 2. Labeling Nodes\n"
+"- **Consistency**: Ensure you use available types for node labels.\n"
+"Ensure you use basic or elementary types for node labels.\n"
+"- For example, when you identify an entity representing a person, "
+"always label it as **'person'**. Avoid using more specific terms "
+"like 'mathematician' or 'scientist'."
+"- **Node IDs**: Never utilize integers as node IDs. Node IDs should be "
+"names or human-readable identifiers found in the text.\n"
+"- **Relationships** represent connections between entities or concepts.\n"
+"Ensure consistency and generality in relationship types when constructing "
+"knowledge graphs. Instead of using specific and momentary types "
+"such as 'BECAME_PROFESSOR', use more general and timeless relationship types "
+"like 'PROFESSOR'. Make sure to use general and timeless relationship types!\n"
+"## 3. Coreference Resolution\n"
+"- **Maintain Entity Consistency**: When extracting entities, it's vital to "
+"ensure consistency.\n"
+'If an entity, such as "John Doe", is mentioned multiple times in the text '
+'but is referred to by different names or pronouns (e.g., "Joe", "he"),'
+"always use the most complete identifier for that entity throughout the "
+'knowledge graph. In this example, use "John Doe" as the entity ID.\n'
+"Remember, the knowledge graph should be coherent and easily understandable, "
+"so maintaining consistency in entity references is crucial.\n"
+"## 4. Node Properties\n"
+"- Dates, URLs, Time, and Numerical Values: Instead of creating separate nodes for
+these elements, represent them as properties of existing nodes."
+"- Example: Instead of creating a node labeled "2023-03-15" and connecting it to another node
+with the relationship "BORN_ON", add a property called "born_on" to the person node with the
+value "2023-03-15"."
+"## 5. Strict Compliance\n"
+"Adhere to the rules strictly. Non-compliance will result in termination."
+"""
3 changes: 2 additions & 1 deletion backend/src/shared/schema_extraction.py
@@ -1,5 +1,6 @@
from typing import List
-from langchain_core.pydantic_v1 import BaseModel, Field
+#from langchain_core.pydantic_v1 import BaseModel, Field
+from pydantic.v1 import BaseModel, Field
from src.llm import get_llm
from src.shared.constants import MODEL_VERSIONS
from langchain_core.prompts import ChatPromptTemplate
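langchain 0.3 deprecates the langchain_core.pydantic_v1 bridge in favour of importing v1-style models straight from pydantic, which is what the swap above does. A minimal, hypothetical sketch of the pattern; the class and field names are illustrative, not the ones defined in schema_extraction.py:

```python
from typing import List
from pydantic.v1 import BaseModel, Field  # v1-style models still work under pydantic 2

class GraphSchema(BaseModel):
    """Illustrative structured-output model in the style this file uses."""
    labels: List[str] = Field(description="Distinct node labels found in the text")
    relationship_types: List[str] = Field(description="Distinct relationship types found in the text")
```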
3 changes: 2 additions & 1 deletion frontend/src/utils/Constants.ts
@@ -45,12 +45,13 @@ export const llms =
'openai-gpt-4o-mini',
'gemini-1.0-pro',
'gemini-1.5-pro',
+'gemini-1.5-flash',
'azure_ai_gpt_35',
'azure_ai_gpt_4o',
'ollama_llama3',
'groq_llama3_70b',
'anthropic_claude_3_5_sonnet',
-'fireworks_v3p1_405b',
+'fireworks_llama_v3p2_90b',
'bedrock_claude_3_5_sonnet',
];

