Merge branch 'main' into add-local-runtime

All-Hands-AI · Nov 30, 2024 · af56cea
2 parents a136ab1 + 4c432d3
commit af56cea
Show file tree

Hide file tree

Showing 7 changed files with 28 additions and 18 deletions.
diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml
@@ -141,8 +141,8 @@ jobs:
         id: create_comment
         uses: KeisukeYamashita/create-comment@v1
         with:
-          # if triggered by PR, use PR number, otherwise use 5077 as fallback issue number for manual triggers
-          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5077 }}
+          # if triggered by a PR, use the PR number; otherwise (manual trigger or nightly scheduled run) use 5318 as the fallback issue number
+          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5318 }}
           unique: false
           comment: |
               Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }}
@@ -155,4 +155,4 @@ jobs:
               DeepSeek LLM Test Results:
               ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
               ---
-              Download evaluation outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})
+              Download testing outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})
diff --git a/docs/src/components/HomepageHeader/HomepageHeader.tsx b/docs/src/components/HomepageHeader/HomepageHeader.tsx
@@ -23,7 +23,7 @@ export function HomepageHeader() {
           <a href="https://codecov.io/github/All-Hands-AI/OpenHands?branch=main"><img alt="CodeCov" src="https://img.shields.io/codecov/c/github/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" /></a>
           <a href="https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE"><img src="https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="MIT License" /></a>
           <br/>
-          <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community" /></a>
+          <a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2tom0er4l-JeNUGHt_AxpEfIBstbLPiw"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community" /></a>
           <a href="https://discord.gg/ESHStjSjD4"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community" /></a>
           <a href="https://github.com/All-Hands-AI/OpenHands/blob/main/CREDITS.md"><img src="https://img.shields.io/badge/Project-Credits-blue?style=for-the-badge&color=FFE165&logo=github&logoColor=white" alt="Credits" /></a>
           <br/>

diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py
@@ -218,6 +218,8 @@ def load_integration_tests() -> pd.DataFrame:
     )
 
     df = pd.read_json(output_file, lines=True, orient='records')
+
+    # record success and reason for failure for the final report
     df['success'] = df['test_result'].apply(lambda x: x['success'])
     df['reason'] = df['test_result'].apply(lambda x: x['reason'])
     logger.info('-' * 100)
@@ -231,9 +233,16 @@ def load_integration_tests() -> pd.DataFrame:
     )
     logger.info('-' * 100)
 
+    # record cost for each instance, with 3 decimal places
+    df['cost'] = df['metrics'].apply(lambda x: round(x['accumulated_cost'], 3))
+    logger.info(f'Total cost: USD {df["cost"].sum():.2f}')
+
     report_file = os.path.join(metadata.eval_output_dir, 'report.md')
     with open(report_file, 'w') as f:
         f.write(
             f'Success rate: {df["success"].mean():.2%} ({df["success"].sum()}/{len(df)})\n'
         )
-        f.write(df[['instance_id', 'success', 'reason']].to_markdown(index=False))
+        f.write(f'\nTotal cost: USD {df["cost"].sum():.2f}\n')
+        f.write(
+            df[['instance_id', 'success', 'reason', 'cost']].to_markdown(index=False)
+        )
diff --git a/openhands/runtime/impl/e2b/sandbox.py b/openhands/runtime/impl/e2b/sandbox.py
@@ -4,7 +4,7 @@
 from glob import glob
 
 from e2b import Sandbox as E2BSandbox
-from e2b.sandbox.exception import TimeoutException
+from e2b.sandbox import TimeoutException
 
 from openhands.core.config import SandboxConfig
 from openhands.core.logger import openhands_logger as logger

diff --git a/openhands/server/socket.py b/openhands/server/socket.py
@@ -8,7 +8,6 @@
 from openhands.events.observation import (
     NullObservation,
 )
-from openhands.events.observation.error import ErrorObservation
 from openhands.events.serialization import event_to_dict
 from openhands.events.stream import AsyncEventStreamWrapper
 from openhands.server.auth import get_sid_from_token, sign_token
@@ -42,14 +41,7 @@ async def init_connection(connection_id: str, data: dict):
     if token:
         sid = get_sid_from_token(token, config.jwt_secret)
         if sid == '':
-            await sio.emit(
-                'oh_event',
-                event_to_dict(
-                    ErrorObservation(
-                        content='Invalid token! Please ensure a valid jwt_secret is specified or use -e JWT_TOKEN when running with Docker.'
-                    )
-                ),
-            )
+            await sio.emit('oh_event', {'error': 'Invalid token', 'error_code': 401})
             return
         logger.info(f'Existing session: {sid}')
     else:

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -30,7 +30,7 @@ numpy = "*"
 json-repair = "*"
 browsergym = "0.10.2" # integrate browsergym as the browsing interface
 html2text = "*"
-e2b = "^0.17.1"
+e2b = ">=0.17.1,<1.1.0"
 pexpect = "*"
 jinja2 = "^3.1.3"
 python-multipart = "*"
@@ -97,6 +97,7 @@ reportlab = "*"
 [tool.coverage.run]
 concurrency = ["gevent"]
 
+
 [tool.poetry.group.runtime.dependencies]
 jupyterlab = "*"
 notebook = "*"
@@ -127,6 +128,7 @@ ignore = ["D1"]
 [tool.ruff.lint.pydocstyle]
 convention = "google"
 
+
 [tool.poetry.group.evaluation.dependencies]
 streamlit = "*"
 whatthepatch = "*"