Skip to content

Commit cb36f62

Browse files
Merge branch 'dev' of https://github.com/OpenLLM-France/RAGondin into dev
2 parents 26f61d6 + 13dbd5e commit cb36f62

File tree

12 files changed

+123
-120
lines changed

12 files changed

+123
-120
lines changed

.env.example

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ RETRIEVER_TOP_K=5
2929

3030

3131
# EMBEDDER
32-
EMBEDDER_MODEL=HIT-TMG/KaLM-embedding-multilingual-mini-v1
32+
EMBEDDER_MODEL=jinaai/jina-embeddings-v3
3333

3434
# RERANKER
3535
RERANKER_MODEL=jinaai/jina-colbert-v2

.hydra_config/chunker/recursive_splitter.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
defaults:
22
- base
33
name: recursive_splitter
4-
chunk_size: 1500
5-
chunk_overlap: 300
4+
chunk_size: 1000
5+
chunk_overlap: 200
66

77

88
# https://chat.deepseek.com/a/chat/s/28913c5d-1f62-40b0-9247-4655994fe16b

.hydra_config/config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ semaphore:
2828

2929
embedder:
3030
type: huggingface
31-
model_name: ${oc.env:EMBEDDER_MODEL_NAME, HIT-TMG/KaLM-embedding-multilingual-mini-v1}
31+
model_name: ${oc.env:EMBEDDER_MODEL_NAME, jinaai/jina-embeddings-v3}
3232

3333
vectordb:
3434
host: ${oc.env:VDB_HOST, milvus}

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ chunk_overlap: 300
3434
Here, the **`chunk_size`** and **`chunk_overlap`** are measured in tokens rather than characters. For improved retrieval performance, you can enable the contextual retrieval feature. This technique, known as "Contextual Retrieval," was introduced by Anthropic to enhance retrieval quality (see [Contextual Retrieval](https://www.anthropic.com/news/contextual-retrieval) for more details). To activate this feature, set **`CONTEXT_RETRIEVAL=true`** in your **`.env`** file. Refer to the **`Usage`** section for additional instructions.
3535

3636
- **Indexing & Search**
37-
After chunking, the data is indexed in the **Milvus** vector database using the multilingual embedding model `HIT-TMG/KaLM-embedding-multilingual-mini-v1`, which performs well on the [MTEB benchmark](https://huggingface.co/spaces/mteb/leaderboard). Developers can customize the embedding model by setting the **`EMBEDDER_MODEL`** variable in the *`.env`* file to any compatible model from Huggingface, such as `"sentence-transformers/all-MiniLM-L6-v2"` for faster processing.
37+
After chunking, the data is indexed in the **Milvus** vector database using the multilingual embedding model `jinaai/jina-embeddings-v3`, which performs well on the [MTEB benchmark](https://huggingface.co/spaces/mteb/leaderboard). Developers can customize the embedding model by setting the **`EMBEDDER_MODEL`** variable in the *`.env`* file to any compatible model from Huggingface, such as `"sentence-transformers/all-MiniLM-L6-v2"` for faster processing.
3838

3939
**Note**: When selecting an embedding model, consider the language of your documents and the model's context length (token limit). The default model supports both French and English. The same model is also used to embed user queries for semantic (dense) search.
4040

converter_tests.py

Lines changed: 0 additions & 40 deletions
This file was deleted.

ragondin/chainlit/app_front.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ async def on_chat_start():
6666
chat_profile = cl.user_session.get("chat_profile")
6767
settings = {
6868
"model": chat_profile,
69-
"temperature": 0,
69+
"temperature": 0.1,
7070
"stream": True,
7171
}
7272

@@ -97,6 +97,9 @@ async def __fetch_page_content(chunk_url):
9797

9898

9999
async def __format_sources(metadata_sources, only_txt=False):
100+
if not metadata_sources:
101+
return None, None
102+
100103
elements = []
101104
source_names = []
102105
for s in metadata_sources:
@@ -153,12 +156,16 @@ async def on_message(message: cl.Message):
153156
headers=headers,
154157
json=payload,
155158
) as resp:
156-
metadata_sources = json.loads(resp.headers.get("X-Metadata-Sources"))
157-
if metadata_sources:
158-
elements, source_names = await __format_sources(metadata_sources)
159-
msg = cl.Message(content="", elements=elements)
160-
else:
161-
msg = cl.Message(content="")
159+
try:
160+
metadata = resp.headers.get("X-Metadata-Sources")
161+
logger.debug(f"Metadata: {metadata}")
162+
metadata_sources = json.loads(metadata)
163+
except Exception as e:
164+
metadata_sources = None
165+
pass
166+
167+
elements, source_names = await __format_sources(metadata_sources)
168+
msg = cl.Message(content="", elements=elements)
162169

163170
# STREAM Response
164171
await msg.send()
@@ -188,9 +195,10 @@ async def on_message(message: cl.Message):
188195
cl.user_session.set("messages", messages)
189196

190197
# Show sources
191-
s = "\n\n" + "-" * 50 + "\n\nSources: \n" + "\n".join(source_names)
192-
await msg.stream_token(s)
193-
await msg.update()
198+
if source_names:
199+
s = "\n\n" + "-" * 50 + "\n\nSources: \n" + "\n".join(source_names)
200+
await msg.stream_token(s)
201+
await msg.update()
194202

195203

196204
if __name__ == "__main__":

ragondin/models/openai.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ class OpenAIChatCompletionRequest(BaseModel):
1919
temperature: Optional[float] = Field(0.3)
2020
top_p: Optional[float] = Field(1.0)
2121
stream: Optional[bool] = Field(False)
22-
max_tokens: Optional[int] = Field(500)
22+
max_tokens: Optional[int] = Field(1024)
2323
logprobs: Optional[int] = Field(None)
2424

2525

@@ -75,7 +75,7 @@ class OpenAICompletionRequest(BaseModel):
7575
frequency_penalty: Optional[float] = Field(0.0)
7676
logit_bias: Optional[dict] = Field(None)
7777
logprobs: Optional[int] = Field(None)
78-
max_tokens: Optional[int] = Field(100)
78+
max_tokens: Optional[int] = Field(512)
7979
n: Optional[int] = Field(1)
8080
presence_penalty: Optional[float] = Field(0.0)
8181
seed: Optional[int] = Field(None)

ragondin/routers/extract.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,29 @@
1-
from typing import List, Optional
2-
31
from fastapi import APIRouter, Depends, HTTPException, Query, Request, status
42
from fastapi.responses import JSONResponse
53
from utils.dependencies import Indexer, get_indexer, vectordb
4+
from loguru import logger
65

76
# Create an APIRouter instance
87
router = APIRouter()
98

9+
1010
@router.get("/{extract_id}")
1111
async def get_extract(extract_id: str, indexer: Indexer = Depends(get_indexer)):
1212
try:
1313
doc = vectordb.get_chunk_by_id(extract_id)
1414
if doc is None:
1515
raise HTTPException(
1616
status_code=status.HTTP_404_NOT_FOUND,
17-
detail=f"Extract '{extract_id}' not found."
17+
detail=f"Extract '{extract_id}' not found.",
1818
)
1919
except Exception as e:
20+
err_str = f"Failed to retrieve extract: {str(e)}"
21+
logger.debug(err_str)
2022
raise HTTPException(
21-
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
22-
detail=f"Failed to retrieve extract: {str(e)}"
23+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=err_str
2324
)
2425

2526
return JSONResponse(
2627
status_code=status.HTTP_200_OK,
27-
content={
28-
"page_content": doc.page_content,
29-
"metadata": doc.metadata
30-
}
31-
)
28+
content={"page_content": doc.page_content, "metadata": doc.metadata},
29+
)

ragondin/routers/indexer.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from loguru import logger
2121
from ray.util.state import get_task
2222
from utils.dependencies import Indexer, get_indexer, vectordb
23+
from loguru import logger
2324

2425
# load config
2526
config = load_config()
@@ -106,9 +107,11 @@ async def add_file(
106107
with open(file_path, "wb") as buffer:
107108
buffer.write(await file.read())
108109
except Exception as e:
110+
err_str = f"Failed to save file: {str(e)}"
111+
logger.debug(err_str)
109112
raise HTTPException(
110113
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
111-
detail=f"Failed to save file: {str(e)}",
114+
detail=err_str,
112115
)
113116

114117
# Queue the file for indexing
@@ -118,9 +121,11 @@ async def add_file(
118121
)
119122
# TODO: More specific errors with details and appropriate error codes
120123
except Exception as e:
124+
err_str = f"Indexing error: {str(e)}"
125+
logger.debug(err_str)
121126
raise HTTPException(
122127
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
123-
detail=f"Indexing error: {str(e)}",
128+
detail=err_str,
124129
)
125130

126131
return JSONResponse(
@@ -143,9 +148,11 @@ async def delete_file(
143148
try:
144149
deleted = ray.get(indexer.delete_file.remote(file_id, partition))
145150
except Exception as e:
151+
err_str = f"Error while deleting file '{file_id}': {str(e)}"
152+
logger.debug(err_str)
146153
raise HTTPException(
147154
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
148-
detail=f"Error while deleting file '{file_id}': {str(e)}",
155+
detail=err_str,
149156
)
150157

151158
if not deleted:
@@ -178,9 +185,11 @@ async def put_file(
178185
ray.get(indexer.delete_file.remote(file_id, partition))
179186
logger.info(f"File {file_id} deleted.")
180187
except Exception as e:
188+
err_str = f"Failed to delete existing file: {str(e)}"
189+
logger.debug(err_str)
181190
raise HTTPException(
182191
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
183-
detail=f"Failed to delete existing file: {str(e)}",
192+
detail=err_str,
184193
)
185194

186195
metadata["file_id"] = file_id
@@ -195,9 +204,11 @@ async def put_file(
195204
with open(file_path, "wb") as buffer:
196205
buffer.write(await file.read())
197206
except Exception as e:
207+
err_str = f"Failed to save file: {str(e)}"
208+
logger.debug(err_str)
198209
raise HTTPException(
199210
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
200-
detail=f"Failed to save file: {str(e)}",
211+
detail=err_str,
201212
)
202213

203214
# Queue indexing task
@@ -206,9 +217,11 @@ async def put_file(
206217
path=file_path, metadata=metadata, partition=partition
207218
)
208219
except Exception as e:
220+
err_str = f"Indexing error: {str(e)}"
221+
logger.debug(err_str)
209222
raise HTTPException(
210223
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
211-
detail=f"Indexing error: {str(e)}",
224+
detail=err_str,
212225
)
213226

214227
return JSONResponse(
@@ -241,9 +254,11 @@ async def patch_file(
241254
try:
242255
ray.get(indexer.update_file_metadata.remote(file_id, metadata, partition))
243256
except Exception as e:
257+
err_str = f"Failed to update metadata: {str(e)}"
258+
logger.debug(err_str)
244259
raise HTTPException(
245260
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
246-
detail=f"Failed to update metadata: {str(e)}",
261+
detail=err_str,
247262
)
248263

249264
return JSONResponse(

0 commit comments

Comments (0)