From ea994b6209d11e32710d4b5ed223b87a149f78ed Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Fri, 29 Nov 2024 16:39:03 +0100 Subject: [PATCH 1/4] More integration tests info (#5319) --- .github/workflows/integration-runner.yml | 6 +++--- evaluation/integration_tests/run_infer.py | 11 ++++++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 4a41ab28c97..120572aa0cd 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -141,8 +141,8 @@ jobs: id: create_comment uses: KeisukeYamashita/create-comment@v1 with: - # if triggered by PR, use PR number, otherwise use 5077 as fallback issue number for manual triggers - number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5077 }} + # if triggered by PR, use PR number, otherwise use 5318 as fallback issue number for manual triggers + number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5318 }} unique: false comment: | Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }} @@ -155,4 +155,4 @@ jobs: DeepSeek LLM Test Results: ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }} --- - Download evaluation outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }}) + Download testing outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }}) diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index 3b6f1c6ff2c..2da68b9b82b 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -218,6 +218,8 @@ def load_integration_tests() -> pd.DataFrame: ) df = pd.read_json(output_file, lines=True, orient='records') + + # record success and reason for failure for the final report df['success'] = df['test_result'].apply(lambda x: x['success']) df['reason'] = df['test_result'].apply(lambda x: x['reason']) logger.info('-' * 100) @@ -231,9 +233,16 @@ def load_integration_tests() -> pd.DataFrame: ) logger.info('-' * 100) + # record cost for each instance, with 3 decimal places + df['cost'] = df['metrics'].apply(lambda x: round(x['accumulated_cost'], 3)) + logger.info(f'Total cost: USD {df["cost"].sum():.2f}') + report_file = os.path.join(metadata.eval_output_dir, 'report.md') with open(report_file, 'w') as f: f.write( f'Success rate: {df["success"].mean():.2%} ({df["success"].sum()}/{len(df)})\n' ) - f.write(df[['instance_id', 'success', 'reason']].to_markdown(index=False)) + f.write(f'\nTotal cost: USD {df["cost"].sum():.2f}\n') + f.write( + df[['instance_id', 'success', 'reason', 'cost']].to_markdown(index=False) + ) From 16a7dd52aeaddace83f8ef4ee163bc06a58d6067 Mon Sep 17 00:00:00 2001 From: tofarr Date: Fri, 29 Nov 2024 09:08:47 -0700 Subject: [PATCH 2/4] Fix: Session expired (#5305) --- openhands/server/socket.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/openhands/server/socket.py b/openhands/server/socket.py index 35549548db5..19a4993a294 100644 --- a/openhands/server/socket.py +++ b/openhands/server/socket.py @@ -8,7 +8,6 @@ from openhands.events.observation import ( NullObservation, ) -from openhands.events.observation.error import ErrorObservation from openhands.events.serialization import event_to_dict from openhands.events.stream import AsyncEventStreamWrapper from openhands.server.auth import get_sid_from_token, sign_token @@ -42,14 +41,7 @@ async def init_connection(connection_id: str, data: dict): if token: sid = get_sid_from_token(token, config.jwt_secret) if sid == '': - await sio.emit( - 'oh_event', - event_to_dict( - ErrorObservation( - content='Invalid token! Please ensure a valid jwt_secret is specified or use -e JWT_TOKEN when running with Docker.' - ) - ), - ) + await sio.emit('oh_event', {'error': 'Invalid token', 'error_code': 401}) return logger.info(f'Existing session: {sid}') else: From 7afdf0659e2cb70197ebf41910e2c8455eb30f0a Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Fri, 29 Nov 2024 20:28:24 +0100 Subject: [PATCH 3/4] Update e2b (#5321) --- openhands/runtime/impl/e2b/sandbox.py | 2 +- poetry.lock | 11 +++++++++-- pyproject.toml | 4 +++- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/openhands/runtime/impl/e2b/sandbox.py b/openhands/runtime/impl/e2b/sandbox.py index d145dac3511..783028ebb0e 100644 --- a/openhands/runtime/impl/e2b/sandbox.py +++ b/openhands/runtime/impl/e2b/sandbox.py @@ -4,7 +4,7 @@ from glob import glob from e2b import Sandbox as E2BSandbox -from e2b.sandbox.exception import TimeoutException +from e2b.sandbox import TimeoutException from openhands.core.config import SandboxConfig from openhands.core.logger import openhands_logger as logger diff --git a/poetry.lock b/poetry.lock index 99025dcc349..d97ef683fe6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. [[package]] name = "aenum" @@ -6601,6 +6601,7 @@ description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs optional = false python-versions = ">=3.8" files = [ + {file = "pyasn1-0.6.1-py3-none-any.whl", hash = "sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629"}, {file = "pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034"}, ] @@ -6611,6 +6612,7 @@ description = "A collection of ASN.1-based protocols modules" optional = false python-versions = ">=3.8" files = [ + {file = "pyasn1_modules-0.4.1-py3-none-any.whl", hash = "sha256:49bfa96b45a292b711e986f222502c1c9a5e1f4e568fc30e2574a6c7d07838fd"}, {file = "pyasn1_modules-0.4.1.tar.gz", hash = "sha256:c28e2dbf9c06ad61c71a075c7e0f9fd0f1b0bb2d2ad4377f240d33ac2ab60a7c"}, ] @@ -8182,6 +8184,11 @@ files = [ {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"}, + {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, @@ -10343,4 +10350,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "062b51ed5e0cdfaedaa873e24db2e422c047c63c536cea7eedd58222fe1ce3f3" +content-hash = "ff370b7b5077720b73fe3b90cc1b7fb9c7a262bfbd35885bb717369061e8a466" diff --git a/pyproject.toml b/pyproject.toml index 440b4c58795..ec148baadc5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ numpy = "*" json-repair = "*" browsergym = "0.10.2" # integrate browsergym as the browsing interface html2text = "*" -e2b = "^0.17.1" +e2b = ">=0.17.1,<1.1.0" pexpect = "*" jinja2 = "^3.1.3" python-multipart = "*" @@ -97,6 +97,7 @@ reportlab = "*" [tool.coverage.run] concurrency = ["gevent"] + [tool.poetry.group.runtime.dependencies] jupyterlab = "*" notebook = "*" @@ -127,6 +128,7 @@ ignore = ["D1"] [tool.ruff.lint.pydocstyle] convention = "google" + [tool.poetry.group.evaluation.dependencies] streamlit = "*" whatthepatch = "*" From 4c432d35e22ca514347ba19b593a3f68eadabd55 Mon Sep 17 00:00:00 2001 From: mamoodi Date: Fri, 29 Nov 2024 14:28:48 -0500 Subject: [PATCH 4/4] Fix slack link in docs (#5329) --- docs/src/components/HomepageHeader/HomepageHeader.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/components/HomepageHeader/HomepageHeader.tsx b/docs/src/components/HomepageHeader/HomepageHeader.tsx index f421b2897ad..aabbef67df5 100644 --- a/docs/src/components/HomepageHeader/HomepageHeader.tsx +++ b/docs/src/components/HomepageHeader/HomepageHeader.tsx @@ -23,7 +23,7 @@ export function HomepageHeader() { CodeCov MIT License
- Join our Slack community + Join our Slack community Join our Discord community Credits