Commit 858d018

Merge branch 'main' into cloud-issue-369/-enhancement-logs-are-not-fetched-2

bekossy committed Jul 6, 2024
2 parents 8ced5ae + dd12f19
Showing 26 changed files with 493 additions and 191 deletions.
10 changes: 10 additions & 0 deletions .all-contributorsrc
@@ -437,6 +437,16 @@
"bug",
"code"
]
+    },
+    {
+      "login": "jp-agenta",
+      "name": "jp-agenta",
+      "avatar_url": "https://avatars.githubusercontent.com/u/174311389?v=4",
+      "profile": "https://github.com/jp-agenta",
+      "contributions": [
+        "code",
+        "bug"
+      ]
}
],
"contributorsPerLine": 7,
57 changes: 30 additions & 27 deletions README.md
@@ -42,7 +42,6 @@
</br>
</p>

-
<p align="center">
<a href="https://join.slack.com/t/agenta-hq/shared_invite/zt-1zsafop5i-Y7~ZySbhRZvKVPV5DO_7IA">
<img src="https://img.shields.io/badge/JOIN US ON SLACK-4A154B?style=for-the-badge&logo=slack&logoColor=white" />
@@ -55,7 +54,6 @@
</a>
</p>


</br>

<a href="https://cloud.agenta.ai">
@@ -64,7 +62,6 @@
</picture>
</a>

-
<br>
<br />
<br />
@@ -96,54 +93,58 @@

# ⭐️ Why Agenta?

-Agenta is an end-to-end LLM developer platform. It provides the tools for **prompt engineering and management**, ⚖️ **evaluation**, **human annotation**, and :rocket: **deployment**. All without imposing any restrictions on your choice of framework, library, or model.
+Agenta is an end-to-end LLM developer platform. It provides the tools for **prompt engineering and management**, ⚖️ **evaluation**, **human annotation**, and :rocket: **deployment**. All without imposing any restrictions on your choice of framework, library, or model.

-Agenta allows developers and product teams to collaborate in building production-grade LLM-powered applications in less time.
+Agenta allows developers and product teams to collaborate in building production-grade LLM-powered applications in less time.

### With Agenta, you can:

-- [🧪 **Experiment** and **compare** prompts](https://docs.agenta.ai/basic_guides/prompt_engineering) on [any LLM workflow](https://docs.agenta.ai/advanced_guides/custom_applications) (chain-of-prompts, Retrieval Augmented Generation (RAG), LLM agents...)
-- ✍️ Collect and [**annotate golden test sets**](https://docs.agenta.ai/basic_guides/test_sets) for evaluation
-- 📈 [**Evaluate** your application](https://docs.agenta.ai/basic_guides/automatic_evaluation) with pre-existing or [**custom evaluators**](https://docs.agenta.ai/advanced_guides/using_custom_evaluators)
-- [🔍 **Annotate** and **A/B test**](https://docs.agenta.aibasic_guides/human_evaluation) your applications with **human feedback**
-- [🤝 **Collaborate with product teams**](https://docs.agenta.ai/basic_guides/team_management) for prompt engineering and evaluation
-- [🚀 **Deploy your application**](https://docs.agenta.ai/basic_guides/deployment) in one-click in the UI, through CLI, or through github workflows.
+- [🧪 **Experiment** and **compare** prompts](https://docs.agenta.ai/prompt_management/prompt_engineering) on [any LLM workflow](https://docs.agenta.ai/prompt_management/custom_applications) (chain-of-prompts, Retrieval Augmented Generation (RAG), LLM agents...)
+- ✍️ Collect and [**annotate golden test sets**](https://docs.agenta.ai/evaluation/test_sets) for evaluation
+- 📈 [**Evaluate** your application](https://docs.agenta.ai/evaluation/automatic_evaluation) with pre-existing or [**custom evaluators**](https://docs.agenta.ai/evaluation/custom_evaluator)
+- [🔍 **Annotate** and **A/B test**](https://docs.agenta.ai/evaluation/human_evaluation) your applications with **human feedback**
+- [🤝 **Collaborate with product teams**](https://docs.agenta.ai/misc/team_management) for prompt engineering and evaluation
+- [🚀 **Deploy your application**](https://docs.agenta.ai/prompt_management/deployment) in one-click in the UI, through CLI, or through github workflows.

### Works with any LLM app workflow

Agenta enables prompt engineering and evaluation on any LLM app architecture:

- Chain of prompts
- RAG
- Agents
- ...

-It works with any framework such as [Langchain](https://langchain.com), [LlamaIndex](https://www.llamaindex.ai/) and any LLM provider (openAI, Cohere, Mistral).
-
-[Jump here to see how to use your own custom application with agenta](/advanced_guides/custom_applications)
+It works with any framework such as [Langchain](https://langchain.com), [LlamaIndex](https://www.llamaindex.ai/) and any LLM provider (openAI, Cohere, Mistral).

# Quick Start

### [Get started for free](https://cloud.agenta.ai?utm_source=github&utm_medium=readme&utm_campaign=github)
-### [Explore the Docs](https://docs.agenta.ai)
-### [Create your first application in one-minute](https://docs.agenta.ai/quickstart/getting-started-ui)
-### [Create an application using Langchain](https://docs.agenta.ai/tutorials/first-app-with-langchain)
+
+### [Explore the Docs](https://docs.agenta.ai/getting_started/introduction)
+
+### [Create your first application in one-minute](https://docs.agenta.ai/getting_started/quick-start)
+
+### [Create an application using Langchain](https://docs.agenta.ai/guides/tutorials/first-app-with-langchain)

### [Self-host agenta](https://docs.agenta.ai/self-host/host-locally)
-### [Check the Cookbook](https://docs.agenta.ai/cookbook)
-
-# Features
+### [Check the Cookbook](https://docs.agenta.ai/guides/evaluation_from_sdk)
+
+# Features

-| Playground | Evaluation |
-| ------- | ------- |
-| Compare and version prompts for any LLM app, from single prompt to agents. <br/> <video src="https://github.com/Agenta-AI/agenta/assets/4510758/8b736d2b-7c61-414c-b534-d95efc69134c" controls="controls" style="max-width:100%;"> | Define test sets, then evaluate manually or programmatically your different variants.<br/> <video src="https://github.com/Agenta-AI/agenta/assets/4510758/8c6997c6-da87-46ad-a81f-e15e277263d2" controls="controls" style="max-width:100%;">|
-| Human annotation | Deployment |
-| Use Human annotator to A/B test and score your LLM apps. <br/> <img width="750" alt="Screenshot 2024-01-28 at 12 57 46" src="https://github.com/Agenta-AI/agenta/assets/4510758/bf62a697-bf19-4ba9-850e-742fbfb75424"> | When you are ready, deploy your LLM applications as APIs in one click.<br/>![](https://github.com/Agenta-AI/agenta/blob/main/docs/images/endpoint.gif) |
+| Playground | Evaluation |
+| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Compare and version prompts for any LLM app, from single prompt to agents. <br/> <video src="https://github.com/Agenta-AI/agenta/assets/4510758/8b736d2b-7c61-414c-b534-d95efc69134c" controls="controls" style="max-width:100%;"> | Define test sets, then evaluate manually or programmatically your different variants.<br/> <video src="https://github.com/Agenta-AI/agenta/assets/4510758/8c6997c6-da87-46ad-a81f-e15e277263d2" controls="controls" style="max-width:100%;"> |
+| Human annotation | Deployment |
+| Use Human annotator to A/B test and score your LLM apps. <br/> <img width="750" alt="Screenshot 2024-01-28 at 12 57 46" src="https://github.com/Agenta-AI/agenta/assets/4510758/bf62a697-bf19-4ba9-850e-742fbfb75424"> | When you are ready, deploy your LLM applications as APIs in one click.<br/>![](https://github.com/Agenta-AI/agenta/blob/main/docs/images/endpoint.gif) |

# Enterprise Support

Contact us here for enterprise support and early access to agenta self-managed enterprise with Kubernetes support. <br/><br/>
<a href="https://cal.com/mahmoud-mabrouk-ogzgey/demo"><img src="https://cal.com/book-with-cal-dark.svg" alt="Book us"></a>

# Disabling Anonymized Tracking

By default, Agenta automatically reports anonymized basic usage statistics. This helps us understand how Agenta is used and track its overall usage and growth. This data does not include any sensitive information.

To disable anonymized telemetry, follow these steps:
@@ -154,6 +155,7 @@ To disable anonymized telemetry, follow these steps:
After making this change, restart Agenta Compose.

# ⭐️ Join Our Team
+
- [Founding Lead Software Engineer Backend](https://agentaai.notion.site/Founding-Lead-Software-Engineer-Backend-d70bfefed6d543778bc4aa38b543a678)
- [Founding Product Engineer Frontend](https://agentaai.notion.site/Founding-Product-Engineer-Frontend-b6d26a3e9b254be6b6c2bfffbf0b53c5)
- [Founding Product Designer](https://agentaai.notion.site/Founding-Product-Designer-96b1e760ff0241fd96632578d533a778)
@@ -164,12 +166,12 @@ We warmly welcome contributions to Agenta. Feel free to submit issues, fork the

We are usually hanging in our Slack. Feel free to [join our Slack and ask us anything](https://join.slack.com/t/agenta-hq/shared_invite/zt-1zsafop5i-Y7~ZySbhRZvKVPV5DO_7IA)

-Check out our [Contributing Guide](https://docs.agenta.ai/contributing/getting-started) for more information.
+Check out our [Contributing Guide](https://docs.agenta.ai/misc/contributing/getting-started) for more information.

## Contributors ✨

<!-- ALL-CONTRIBUTORS-BADGE:START - Do not remove or modify this section -->
-[![All Contributors](https://img.shields.io/badge/all_contributors-46-orange.svg?style=flat-square)](#contributors-)
+[![All Contributors](https://img.shields.io/badge/all_contributors-47-orange.svg?style=flat-square)](#contributors-)
<!-- ALL-CONTRIBUTORS-BADGE:END -->

Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
@@ -238,6 +240,7 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d
<td align="center" valign="top" width="14.28%"><a href="https://github.com/youcefs21"><img src="https://avatars.githubusercontent.com/u/34604972?v=4?s=100" width="100px;" alt="Youcef Boumar"/><br /><sub><b>Youcef Boumar</b></sub></a><br /><a href="https://github.com/Agenta-AI/agenta/commits?author=youcefs21" title="Documentation">📖</a></td>
<td align="center" valign="top" width="14.28%"><a href="https://github.com/LucasTrg"><img src="https://avatars.githubusercontent.com/u/47852577?v=4?s=100" width="100px;" alt="LucasTrg"/><br /><sub><b>LucasTrg</b></sub></a><br /><a href="https://github.com/Agenta-AI/agenta/commits?author=LucasTrg" title="Code">💻</a> <a href="https://github.com/Agenta-AI/agenta/issues?q=author%3ALucasTrg" title="Bug reports">🐛</a></td>
<td align="center" valign="top" width="14.28%"><a href="https://ashrafchowdury.me"><img src="https://avatars.githubusercontent.com/u/87828904?v=4?s=100" width="100px;" alt="Ashraf Chowdury"/><br /><sub><b>Ashraf Chowdury</b></sub></a><br /><a href="https://github.com/Agenta-AI/agenta/issues?q=author%3Aashrafchowdury" title="Bug reports">🐛</a> <a href="https://github.com/Agenta-AI/agenta/commits?author=ashrafchowdury" title="Code">💻</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/jp-agenta"><img src="https://avatars.githubusercontent.com/u/174311389?v=4?s=100" width="100px;" alt="jp-agenta"/><br /><sub><b>jp-agenta</b></sub></a><br /><a href="https://github.com/Agenta-AI/agenta/commits?author=jp-agenta" title="Code">💻</a> <a href="https://github.com/Agenta-AI/agenta/issues?q=author%3Ajp-agenta" title="Bug reports">🐛</a></td>
</tr>
</tbody>
</table>
1 change: 1 addition & 0 deletions agenta-backend/agenta_backend/celery_config.py
@@ -7,6 +7,7 @@
CELERY_ACCEPT_CONTENT = ["json"]
CELERY_RESULT_SERIALIZER = "json"
CELERY_TIMEZONE = "UTC"
+CELERY_TASK_TRACK_STARTED = True

CELERY_QUEUES = (
Queue(
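For context on the one-line change above: Celery's `task_track_started` option (the modern spelling of `CELERY_TASK_TRACK_STARTED`) makes workers report a transient `STARTED` state, so a poller can tell a queued task apart from a running one. A minimal, self-contained sketch — the app name, broker/backend URLs, and task are placeholders, not Agenta's actual setup:

```python
from celery import Celery

# Placeholder broker/backend URLs; swap in your own infrastructure.
app = Celery(
    "example",
    broker="redis://localhost:6379/0",
    backend="redis://localhost:6379/1",
)
app.conf.task_track_started = True  # same knob as CELERY_TASK_TRACK_STARTED

@app.task
def long_evaluation():
    ...  # stand-in for a long-running evaluation job

# With track_started on, AsyncResult.state moves PENDING -> STARTED -> SUCCESS
# instead of sitting at PENDING until the task finishes.
result = long_evaluation.delay()
print(result.state)
```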
@@ -33,6 +33,7 @@ class EvaluationStatusEnum(str, Enum):
    EVALUATION_FINISHED = "EVALUATION_FINISHED"
    EVALUATION_FINISHED_WITH_ERRORS = "EVALUATION_FINISHED_WITH_ERRORS"
    EVALUATION_FAILED = "EVALUATION_FAILED"
+    EVALUATION_AGGREGATION_FAILED = "EVALUATION_AGGREGATION_FAILED"


class EvaluationScenarioStatusEnum(str, Enum):
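A side note on the enum style above: because `EvaluationStatusEnum` subclasses `str`, new members such as `EVALUATION_AGGREGATION_FAILED` serialize as plain strings without a custom encoder. A small illustrative snippet (trimmed to two members; not taken from the repo):

```python
import json
from enum import Enum

class EvaluationStatusEnum(str, Enum):
    # Trimmed to two members for illustration.
    EVALUATION_FAILED = "EVALUATION_FAILED"
    EVALUATION_AGGREGATION_FAILED = "EVALUATION_AGGREGATION_FAILED"

# str subclassing means json.dumps emits the raw value directly.
print(json.dumps({"status": EvaluationStatusEnum.EVALUATION_AGGREGATION_FAILED}))
# -> {"status": "EVALUATION_AGGREGATION_FAILED"}
```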
12 changes: 8 additions & 4 deletions agenta-backend/agenta_backend/models/converters.py
@@ -254,10 +254,14 @@ def evaluation_scenario_db_to_pydantic(
            EvaluationScenarioOutput(**scenario_output.dict())
            for scenario_output in evaluation_scenario_db.outputs
        ],
-        correct_answers=[
-            CorrectAnswer(**correct_answer.dict())
-            for correct_answer in evaluation_scenario_db.correct_answers
-        ],
+        correct_answers=(
+            [
+                CorrectAnswer(**correct_answer.dict())
+                for correct_answer in evaluation_scenario_db.correct_answers
+            ]
+            if evaluation_scenario_db.correct_answers is not None
+            else None
+        ),
        is_pinned=evaluation_scenario_db.is_pinned or False,
        note=evaluation_scenario_db.note or "",
        results=evaluation_scenarios_results_to_pydantic(
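The hunk above turns an unconditional list comprehension into one guarded by `is not None`, preserving the difference between "no correct answers recorded" (`None`) and "an empty list of answers". A hedged illustration of the same pattern, using hypothetical stand-in types rather than Agenta's actual models:

```python
from typing import List, Optional

def to_pydantic_list(raw: Optional[List[dict]]) -> Optional[List[str]]:
    # Preserve the distinction between "field absent" (None) and "empty list":
    # an unguarded list comprehension would raise TypeError on None.
    return [item["value"] for item in raw] if raw is not None else None

assert to_pydantic_list(None) is None
assert to_pydantic_list([]) == []
assert to_pydantic_list([{"value": "a"}]) == ["a"]
```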
2 changes: 2 additions & 0 deletions agenta-backend/agenta_backend/routers/evaluation_router.py
@@ -1,5 +1,6 @@
import secrets
import logging
+import traceback
from typing import Any, List

from fastapi.responses import JSONResponse
@@ -270,6 +271,7 @@ async def fetch_evaluation_scenarios(
        return eval_scenarios

    except Exception as exc:
+        logger.error(str(traceback.format_exc()))
        raise HTTPException(status_code=500, detail=str(exc))


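The two added lines apply a common FastAPI pattern: log the full traceback server-side, then re-raise as an HTTP 500 that carries only the exception message. A minimal sketch under that assumption — the route path and the `load_scenarios` helper are hypothetical, not Agenta's actual handler:

```python
import logging
import traceback

from fastapi import FastAPI, HTTPException

logger = logging.getLogger(__name__)
app = FastAPI()

def load_scenarios():
    raise RuntimeError("backend unavailable")  # stand-in failure

@app.get("/evaluation_scenarios")
async def fetch_evaluation_scenarios():
    try:
        return load_scenarios()
    except Exception as exc:
        # Full stack trace goes to the server logs...
        logger.error(traceback.format_exc())
        # ...while the client sees only the message, not the internals.
        raise HTTPException(status_code=500, detail=str(exc))
```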
49 changes: 28 additions & 21 deletions agenta-backend/agenta_backend/services/aggregation_service.py
@@ -15,26 +15,33 @@ def aggregate_ai_critique(results: List[Result]) -> Result:
        Result: aggregated result
    """

-    numeric_scores = []
-    for result in results:
-        # Extract the first number found in the result value
-        match = re.search(r"\d+", result.value)
-        if match:
-            try:
-                score = int(match.group())
-                numeric_scores.append(score)
-            except ValueError:
-                # Ignore if the extracted value is not an integer
-                continue
-
-    # Calculate the average of numeric scores if any are present
-    average_value = (
-        sum(numeric_scores) / len(numeric_scores) if numeric_scores else None
-    )
-    return Result(
-        type="number",
-        value=average_value,
-    )
+    try:
+        numeric_scores = []
+        for result in results:
+            # Extract the first number found in the result value
+            match = re.search(r"\d+", result.value)
+            if match:
+                try:
+                    score = int(match.group())
+                    numeric_scores.append(score)
+                except ValueError:
+                    # Ignore if the extracted value is not an integer
+                    continue
+
+        # Calculate the average of numeric scores if any are present
+        average_value = (
+            sum(numeric_scores) / len(numeric_scores) if numeric_scores else None
+        )
+        return Result(
+            type="number",
+            value=average_value,
+        )
+    except Exception as exc:
+        return Result(
+            type="error",
+            value=None,
+            error=Error(message=str(exc), stacktrace=str(traceback.format_exc())),
+        )


def aggregate_binary(results: List[Result]) -> Result:
@@ -71,7 +78,7 @@ def aggregate_float(results: List[Result]) -> Result:
        return Result(
            type="error",
            value=None,
-            error=Error(message="Failed", stacktrace=str(traceback.format_exc())),
+            error=Error(message=str(exc), stacktrace=str(traceback.format_exc())),
        )


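Beyond the new try/except wrapper, the core logic of `aggregate_ai_critique` is unchanged: pull the first integer out of each free-text critique and average the hits. A self-contained sketch of that behavior, with a simplified `Result` stand-in rather than Agenta's real model:

```python
import re
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Result:
    # Simplified stand-in for the backend's Result model.
    type: str
    value: Optional[float]

def average_critique_scores(raw_outputs: List[str]) -> Result:
    # Keep the first number found in each output; skip outputs with none.
    scores = [
        int(m.group())
        for text in raw_outputs
        if (m := re.search(r"\d+", text))
    ]
    return Result(
        type="number",
        value=sum(scores) / len(scores) if scores else None,
    )

print(average_critique_scores(["Score: 8/10", "7", "no score given"]))
# Result(type='number', value=7.5)
```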
@@ -545,7 +545,7 @@ async def create_new_evaluation(
        user=app.user,
        testset=testset,
        status=Result(
-            value=EvaluationStatusEnum.EVALUATION_STARTED, type="status", error=None
+            value=EvaluationStatusEnum.EVALUATION_INITIALIZED, type="status", error=None
        ),
        variant=variant_id,
        variant_revision=str(variant_revision.id),