Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
93 commits
Select commit Hold shift + click to select a range
026177f
WIP namo-v3
stevehuang52 Dec 30, 2025
c871eb7
Merge remote-tracking branch 'origin/main' into heh/va_nano3
stevehuang52 Jan 8, 2026
1e79403
Merge remote-tracking branch 'origin/main' into heh/va_nano3
stevehuang52 Jan 15, 2026
1ab37a3
update tts tool prompt
stevehuang52 Jan 15, 2026
921b4c3
add nano-v3 support
stevehuang52 Jan 16, 2026
49f6d2b
update for vllm
stevehuang52 Jan 16, 2026
6d1d953
refactor and improve tts
stevehuang52 Jan 18, 2026
9b21333
update title
stevehuang52 Jan 20, 2026
bb8799e
update readme
stevehuang52 Jan 20, 2026
2f59910
fix tts reset
stevehuang52 Jan 21, 2026
63d3891
add tts pre-fetch all
stevehuang52 Jan 21, 2026
8f01fee
update tts and prompt
stevehuang52 Jan 26, 2026
a206083
fix tts
stevehuang52 Jan 26, 2026
d1fc2a5
upgrade to pipecat==0.0.98
stevehuang52 Jan 26, 2026
7e75d38
update for magpie-tts
stevehuang52 Jan 26, 2026
26bf617
update readme
stevehuang52 Jan 26, 2026
8cfe37a
update readme
stevehuang52 Jan 26, 2026
b41cdf3
drop redundant code and make websocket port configurable
stevehuang52 Jan 26, 2026
48f1b52
clean up
stevehuang52 Jan 26, 2026
9be6f68
fix linting
stevehuang52 Jan 27, 2026
59aaf3b
move tool calling folder
stevehuang52 Jan 27, 2026
ddb0619
fix tts server config
stevehuang52 Jan 27, 2026
f08fbd0
initial draft
stevehuang52 Jan 27, 2026
14337b1
allow no model config by putting everything in server config
stevehuang52 Jan 28, 2026
c89f451
v1 implementation with ~78ms audio output gaps, using a user thread …
stevehuang52 Feb 4, 2026
79e377d
v2 better working solution
stevehuang52 Feb 4, 2026
1459b97
first working solution
stevehuang52 Feb 5, 2026
8bb1b8f
clean up
stevehuang52 Feb 6, 2026
d741a30
WIP conversation logging
stevehuang52 Feb 6, 2026
23d160b
fix audio and conversation log, ~100ms diff
stevehuang52 Feb 7, 2026
c625596
refactor save audio
stevehuang52 Feb 9, 2026
be7e0c0
add noise aug and refactor bridge
stevehuang52 Feb 9, 2026
d3d9b37
improve seglst and refactor
stevehuang52 Feb 9, 2026
0b6b22b
fix noise aug
stevehuang52 Feb 9, 2026
6616630
refactor and update
stevehuang52 Feb 10, 2026
49eceae
refactor and fix residual audio
stevehuang52 Feb 12, 2026
2582e8c
refactor
stevehuang52 Feb 12, 2026
60d3244
refactor to only use control signals for turn logging
stevehuang52 Feb 12, 2026
bbd14d0
refactor
stevehuang52 Feb 12, 2026
ca7ffbd
add tool call for eval
stevehuang52 Feb 13, 2026
32aa2b8
add nv llm endpoint
stevehuang52 Feb 13, 2026
eb7dd78
update nv llm examples
stevehuang52 Feb 13, 2026
61f2f5e
WIP add scenario and refactor
stevehuang52 Feb 20, 2026
73283e2
update bridge, runner, scenario and runner
stevehuang52 Feb 23, 2026
6a83e70
refactor and add task success rate
stevehuang52 Feb 24, 2026
cbd3e7d
clean up
stevehuang52 Feb 24, 2026
8af47ae
add vllm reasoning budget control
stevehuang52 Feb 25, 2026
e5a7182
add context logging
stevehuang52 Feb 26, 2026
16205d9
refactor
stevehuang52 Mar 5, 2026
4fdc9e2
update
stevehuang52 Mar 6, 2026
c9d8567
fix for non-EOU ASR models
stevehuang52 Mar 6, 2026
c8365c0
update
stevehuang52 Apr 8, 2026
472b2e4
add more scenarios
stevehuang52 Apr 20, 2026
db8169b
Apply isort and black reformatting
stevehuang52 Apr 20, 2026
1e00c83
Merge remote-tracking branch 'origin/main' into heh/va_eval
stevehuang52 Apr 20, 2026
40f3a98
clean up
stevehuang52 Apr 20, 2026
0ce517d
refactor
stevehuang52 Apr 20, 2026
39ffdb2
refactor
stevehuang52 Apr 20, 2026
0424819
change to use uv to manage env
stevehuang52 Apr 20, 2026
cb25e21
fix pylint
stevehuang52 Apr 20, 2026
5fb0857
fix pylint
stevehuang52 Apr 20, 2026
72f9d1a
refactor bot construction
stevehuang52 Apr 21, 2026
1cdf442
clean up
stevehuang52 Apr 21, 2026
144ee5e
update doc
stevehuang52 Apr 21, 2026
0568e1e
fix toolcall mixin
stevehuang52 Apr 22, 2026
c9f9c29
refactor customer_service domain
stevehuang52 Apr 23, 2026
c5ffa67
update cs domain
stevehuang52 Apr 23, 2026
7741642
update doc
stevehuang52 Apr 23, 2026
ccc486c
update client app to pre-allocate user slot when user starts speaking
stevehuang52 Apr 29, 2026
748a17d
add nemotron-voice-agent
stevehuang52 Apr 29, 2026
60cbca1
update logging for nemotron-voice-agent
stevehuang52 Apr 29, 2026
06d55c9
remove extra logging
stevehuang52 Apr 29, 2026
e400bf4
fix tool calling for nemotron
stevehuang52 May 1, 2026
61c8287
add direct support for nvidia nim to nemo voice agent
stevehuang52 May 1, 2026
a905b80
fix create_nvidia_context_aggregator missing llm adapter
stevehuang52 May 4, 2026
e027811
update prompt
stevehuang52 May 4, 2026
50030d3
add thinking_budget control
stevehuang52 May 4, 2026
83e4181
add thinking_budget control
stevehuang52 May 4, 2026
47d087f
add thinking_budget control
stevehuang52 May 4, 2026
a47aa5f
update thinking budget control
stevehuang52 May 4, 2026
ac0bcb9
update reasoning budget control for vllm compatibility
stevehuang52 May 5, 2026
8ca0ccd
add claude.md
stevehuang52 May 5, 2026
a2f06e9
fix config manager for turn-taking
stevehuang52 May 8, 2026
30e0a74
add Scenario.setup_shared_state hook and disallow_extra_items compara…
stevehuang52 May 8, 2026
7aedb5f
add get_scenario_summary RTVI action + DB-state hash scoring infra
stevehuang52 May 11, 2026
28f7550
add debug logging
stevehuang52 May 11, 2026
86bf6b9
add eva_airline evaluation domain with DB-state hash scoring, includin…
stevehuang52 May 11, 2026
769d29f
add 4 more eva_airline scenarios and refine prompt formatting
stevehuang52 May 11, 2026
718bf22
document eva_airline domain run flow and result-inspection pattern
stevehuang52 May 12, 2026
1e9c350
port remaining 45 eva_airline scenarios; reorganize as a package
stevehuang52 May 12, 2026
c04461e
update
stevehuang52 May 12, 2026
2e88fba
refactor
stevehuang52 May 12, 2026
a7a90d6
Fix eva_airline circular import
stevehuang52 May 13, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,10 @@ onelogger.*
node_modules/
.vite/
bot_server.*
eval_results/
audio_logs/
audio_logs_user/
audio_logs_agent/
audio_logs/
eval_results/

Expand Down
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ repos:
- id: check-case-conflict
- id: detect-private-key
- id: check-added-large-files
args: ['--maxkb=1000']
args: ['--maxkb=2000']
- id: requirements-txt-fixer

- repo: https://github.com/PyCQA/isort
Expand All @@ -49,4 +49,4 @@ repos:
# supported by your project here, or alternatively use
# pre-commit's default_language_version, see
# https://pre-commit.com/#top_level-default_language_version
language_version: python3.10
language_version: python3.12
166 changes: 166 additions & 0 deletions examples/voice_agent/CLAUDE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
# CLAUDE.md

This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.

## What this directory is

A self-contained example of a real-time voice agent built on **Pipecat** (`pipecat-ai==0.0.98`) wired together with NeMo speech models and either a HuggingFace or vLLM LLM backend. The example has its **own `pyproject.toml` + `uv.lock`** and is decoupled from the parent NeMo repo's install:

- Python **3.12–3.13** (not 3.10+ like the rest of NeMo).
- Default install pulls **CUDA 13.0** PyTorch/vLLM wheels (`torch-backend = "cu130"` in `pyproject.toml`). Override via `pyproject.toml` if you need cu128/cu124/cpu.
- The actual implementation lives in `nemo/agents/voice_agent/` at the repo root — this directory only contains the example entry-point (`server/server.py`), YAML configs, the browser client, evaluation harness, and tests. Source-code edits typically belong under `nemo/agents/voice_agent/`.

## Install & run

```bash
# One-shot install (apt deps + uv + venv): bash install.sh
uv sync
source .venv/bin/activate

# Server (terminal 1)
export PYTHONPATH=/path/to/NeMo:$PYTHONPATH # path to the repo root containing nemo/
# export SERVER_CONFIG_PATH=server/server_configs/default.yaml # override the default config
# export HF_TOKEN=hf_... # for gated HF models
python ./server/server.py

# Client (terminal 2)
cd client && npm install && npm run dev # vite on http://localhost:5173
```

The server binds two ports: a **WebSocket** for the audio pipeline (`WEBSOCKET_PORT`, default `8765`) and a **FastAPI** control plane (`FASTAPI_PORT`, default `7860`). Bind addresses come from `SERVER_HOST` (default `0.0.0.0`).

Browsers block mic access on plain HTTP — Chrome users must allow-list `http://<host>:5173/` via `chrome://flags/#unsafely-treat-insecure-origin-as-secure`.

## Tests

```bash
pytest tests/ -v # all
pytest tests/test_config_manager.py -v # config loader
pytest tests/test_reasoning_budget_logits_processor.py # vLLM reasoning-budget processor (needs CUDA + tokenizer download)
```

`tests/test_*.py` insert the repo root into `sys.path` so they always test the working-tree NeMo, not whatever is `pip install`ed.

## Server architecture

`server/server.py:run_bot_websocket()` is the whole show — it loads a YAML config and assembles a Pipecat pipeline:

```
ws.input → RTVI → STT → [Diar?] → TurnTaking → UserAggregator → LLM → TTS → ws.output → AssistantAggregator
```

Components are constructed via the **builder pattern** in `nemo/agents/voice_agent/pipecat/services/nemo/builders.py` (`build_stt`, `build_diar`, `build_llm`, `build_tts`, `build_turn_taking`, `build_vad_analyzer`, `build_ws_transport`, `build_audio_logger`, `build_context_and_aggregators`). The example file rarely needs editing — most behavioral changes happen in YAML or in the builders/services under `nemo/agents/voice_agent/`.

Key cross-cutting concepts:

- **`ConfigManager`** (`nemo/agents/voice_agent/utils/config_manager.py`) loads `server/server_configs/default.yaml`, then merges in the model-specific YAML referenced by each component's `model_config:` field (or auto-resolves via `server/model_registry.yaml` when `server.use_model_registry: true`). Configs use OmegaConf interpolation (e.g. `${llm.temperature}`) — be aware when adding new keys.
- **LLM backend selection.** `llm.type` is `auto | hf | vllm`. `auto` tries vLLM first and falls back to HF. vLLM is required for tool calling. When `start_vllm_on_init: true` the server spawns vLLM via `vllm serve` with the flags in `vllm_server_params`; otherwise you must start vLLM in another terminal (see README for the Nemotron-Nano-3 30B example).
- **Reasoning / thinking mode.** Off by default. Enable via `llm.enable_reasoning: true` (which switches to the sibling `*_think.yaml` config). `tts.think_tokens=["<think>","</think>"]` causes TTS to skip the reasoning span, so the user only hears the final answer. For vLLM, `--reasoning-parser` filters reasoning out of the OpenAI response entirely (see `server/parsers/nano_v3_reasoning_parser.py`).
- **Backchannels.** `turn_taking.backchannel_phrases_path` (or an inline list) prevents short utterances like "uh-huh" from interrupting the bot. Set to `null` to make any speech interrupt.
- **Single-connection server.** A new WebSocket connection disconnects the previous one (LLM context is preserved). Don't add multi-tenant logic here; this example is single-user by design.

## Config layout (`server/server_configs/`)

```
default.yaml # top-level: server/transport/vad/stt/diar/turn_taking/llm/tts
llm_configs/<model>.yaml # per-model llm sub-config (HF + vLLM params)
llm_configs/<model>_think.yaml # reasoning-mode variant of the same model
tts_configs/<model>.yaml # kokoro / fastpitch-hifigan / magpie
stt_configs/nemo_cache_aware_streaming.yaml
NVIDIA_NeMo_models.yaml # extra NeMo-hosted model defs
```

`server/example_prompts/*.txt` holds reusable system prompts referenceable from `llm.system_prompt` (path-or-literal).

## Tool calling

Two extension points (only works with `llm.type: vllm` + a model whose vLLM tool parser is configured):

1. **Direct functions** — write an async function and pass it to `register_direct_tools_to_llm(...)` in `server.py`. Example: `tool_get_city_weather` from `nemo/agents/voice_agent/utils/tool_calling/basic_tools.py`.
2. **Component-owned tools** — mix `ToolCallingMixin` into a service (STT/TTS/Diar/LLM/TurnTaking) and implement `setup_tool_calling()`. The mixin lives at `nemo/agents/voice_agent/utils/tool_calling/mixins.py`; `KokoroTTSService` in `pipecat/services/nemo/tts.py` is the canonical example (e.g. "speak faster", "switch to British accent").

## Evaluation harness (`evaluation/`)

A separate two-bot system: a **simulated user bot** talks to the **agent under test** via a bridge that shuttles audio between two WebSocket Pipecat servers, captures `<final_response>` payloads, and scores them. See `evaluation/README.md` for the full architecture, scenario authoring guide, and tool-system reference. Quick run:

```bash
# Three terminals: user bot (8766), agent bot (8765), bridge
python evaluation/bot_websocket_user.py # WEBSOCKET_PORT=8766, SERVER_CONFIG_PATH=server_configs/user.yaml
python evaluation/bot_websocket_agent.py # WEBSOCKET_PORT=8765, SERVER_CONFIG_PATH=server_configs/agent.yaml
python evaluation/run_evaluation.py --domain restaurant
```

Scenario classes live under `nemo/agents/voice_agent/evaluation/scenarios/data/` (one file per domain: `restaurant.py`, `customer_service.py`, `qa.py`, …) and tools under `nemo/agents/voice_agent/evaluation/tools/`. Adding a scenario: subclass the domain's `*BaseScenario`, decorate with `@register_eval_scenario`, override only what differs.

### Eval framework key concepts (read before editing)

The eval framework has evolved beyond a simple `<final_response>` capture. The pieces below are easy to miss if you only read `run_evaluation.py`.

**Shared state via RTVI.** Each side (user / agent) holds a per-scenario `shared_state` dict that the bridge seeds at scenario start. The handle is a `SharedStateRef` dataclass (mirrors `TaskRef`) published by `create_update_system_prompt_action(...)` in `nemo/agents/voice_agent/pipecat/processors/frameworks/rtvi_actions.py` — it gives later RTVI actions a mutable view of the same dict. The bridge passes initial state via the `shared_state_init` JSON-string argument of `update_system_prompt`. Scenarios populate state by overriding `Scenario.setup_shared_state(self, state, side)` (in `scenarios/classes.py`) — same method called twice with `side="user"` / `side="agent"`.

**Bridge-pull summary (not LLM-callable).** End-of-scenario state is **pulled** by the bridge after `<exit>`, not pushed by an LLM tool call. The bridge calls `_retrieve_scenario_summary(ws)` in `nemo/agents/voice_agent/evaluation/bridge.py`, which sends an RTVI `get_scenario_summary` action; the handler (`create_get_scenario_summary_action`) returns `{"actions": [...], "db": {...}}` read straight from `shared_state`. This eliminates the previous double-emit / forgot-to-call / mid-conversation-call class of bugs. **Don't reintroduce a `SubmitTransactionSummaryTool`-style LLM-callable summary.**

**DB-state hash matching (primary signal).** When a scenario sets `expected_scenario_db` (a `cached_property` on the class), the runner ignores the action-list comparator and instead hashes the agent's final `shared_state["db"]` via `get_dict_hash` (`nemo/agents/voice_agent/evaluation/db_hash.py`, adapted from eva 0.1.3 / tau-2-bench style). The hash normalizes floats (`1.0 → 1`), `"none" → None`, and uses `ORDER_INDEPENDENT_LIST_FIELDS` for set-like fields; `HASH_EXCLUDED_KEYS = {"session"}` skips per-run noise. On mismatch the runner writes a structured `db_state_diff` (tables → records → fields) via `compute_db_diff` for debugging. The action-list (`reference_answer`) remains as a secondary signal. Aggregate: `db_state_success_rate` printed by the runner.

**Auto-aggregated action records.** Each write tool extends `WriteAirlineTool` (in `nemo/agents/voice_agent/evaluation/tools/eva_airline_tools.py`) and calls `self._record_action({...})` on success — the record is appended to `shared_state["actions"]` so the bridge picks it up via the pull. The action `type` must come from the locked `AIRLINE_ACTION_TYPES` vocabulary (1:1 with eva tool names). Read tools don't record.

**Symmetric DB transfer.** The bridge sends the full original DB content (not a path) to the agent via `shared_state_init`. The agent mutates its in-memory copy through tools; the bridge pulls the full mutated DB back at end-of-scenario. There is also a `db_path` fallback for legacy paths — see the `state["db_path"]` branch in the action handler.

### `eva_airline` domain layout

```
nemo/agents/voice_agent/evaluation/
├── scenarios/data/eva_airline/ # package, not a single file
│ ├── __init__.py # re-exports EvaAirlineBaseScenario; imports group_Nx
│ ├── base.py # EvaAirlineBaseScenario + 5 hand-authored seeds
│ │ # (1.1.2, 2.1.1, 3.1.3, 5.1.1, 7.2.1)
│ └── group_{1..7}x.py # auto-scaffolded scenarios per eva sub-flow
├── tools/eva_airline_tools.py # 15 ported tools + WriteAirlineTool base
├── tools/eva_airline_params.py # Pydantic schemas for tool args
└── db_hash.py # eva-compatible normalize + hash
```

`EvaAirlineBaseScenario` derives everything from a single class attribute `eva_id` (e.g. `"1.1.2"`) via `cached_property`: `current_date`, `_scenario_db`, `expected_scenario_db`. The dataset metadata is read once per process via `_load_eva_airline_dataset_index()` (cached at module level). Subclasses only declare `name`, `eva_id`, `description`, `user_persona`, `user_task`, `user_actions`.

Voice-readability rule on `EvaAirlineBaseScenario.VOICE_ALPHANUMERIC_RULE`: alphanumerics are spelled **canonical-first**: `EPXYEK (spelled out as E, P, X, Y, E, K)`. Use this constant in both agent and user guidelines.

Fixtures live in `examples/voice_agent/evaluation/data/` (resolved by `get_eval_data_root()`, override via `EVAL_DATA_ROOT`). The directory has a `README.md` recording upstream source + license for each domain — append a section when adding a new source. The `get_eval_data_root()` helper is at `nemo/agents/voice_agent/evaluation/__init__.py` and uses `parents[4]` to walk from `nemo/agents/voice_agent/evaluation/__init__.py` to the repo root.

### Scaffolding more eva scenarios

The 5 seed scenarios in `base.py` are hand-authored; the rest are scaffolded from `eva_airline_dataset.jsonl` via:

```bash
python examples/voice_agent/nemo_experiments/generate_eva_airline_scaffolds.py --major 4 \
>> nemo/agents/voice_agent/evaluation/scenarios/data/eva_airline/group_4x.py
```

The generator (in `nemo_experiments/`, gitignored personal-scratch dir) emits one `@register_eval_scenario` class per dataset entry, applies the alphanumeric voice rule, and reads `must_have_criteria` / `negotiation_behavior` / `edge_cases` into guidelines. **The output is a starting point, not final** — hand-review prose and prune negotiation/edge-case bullets before committing.

### Running a single eva_airline scenario

```bash
# After both bots are running (see Quick Start above):
python evaluation/run_evaluation.py \
--scenarios eva_airline__1_1_2 \
--duration 900 # bump from default 600s — voice round-trips are ~10× slower than text
```

Scenario names map from eva ids: `"1.1.2" → "eva_airline__1_1_2"`, class names `"1.1.2" → "EvaAirline112"`.

### Known limitations

- **Parakeet STT misrecognizes spelled alphanumerics.** Letter sequences and digit-words (`"for"` vs `"four"`, `"B Z I W"`) frequently get mangled. Diagnose by checking `bot_logs_user/llm_context.json` to confirm the user simulator emitted the correct text before blaming the user-side LLM.
- **Action list lookups are case-sensitive.** Tool action `type` strings must match `AIRLINE_ACTION_TYPES` exactly.
- **DB diff isn't shown unless `expected_scenario_db` is set.** Scenarios without a ground-truth DB fall back to action-list comparison only.

## Code style

The parent repo's style rules (line length 119, black with `skip_string_normalization`, isort `profile=black`) apply. Most of this directory is **excluded from black's auto-format scope** in the parent `pyproject.toml`'s `extend-exclude` — only reformat files you're actively changing, and don't bulk-reformat unrelated code. Run lint via the repo-root command: `python setup.py style --scope examples/voice_agent --fix`.

## Gotchas

- **Don't run `uv sync` inside an active conda env** — `install.sh` exits early in that case because conda's gcc + system Python headers break C extensions like `cdifflib`. Run `conda deactivate` first.
- The egg-info dir (`nemo_voice_agent.egg-info/`), `.venv/`, `nemo_experiments/` (personal scratch + `.env`), and `*.log` files are local artifacts — don't commit changes to them.
- `server/parsers/*.py` are vLLM **plugins** loaded by path (`--tool-parser-plugin`, `--reasoning-parser-plugin`). They run inside the vLLM process, not the bot server, so logging/imports there have a different runtime than the rest of the codebase.
- `bot_server.log` saves the logs from the pipecat pipeline; by default it is rotated every day. For recent failures, check the newest `bot_server.<timestamp>.log`, not just `bot_server.log` (which may be from an in-flight run).
16 changes: 5 additions & 11 deletions examples/voice_agent/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,28 +61,22 @@ As of now, we only support English input and output, but more languages will be

### Install dependencies

First, install or update the npm and node.js to the latest version, for example:
First, install the `npm` and `nodejs` dependencies via:

```bash
sudo apt-get update
sudo apt-get install -y npm nodejs
```

or:
Second, create a venv with `uv`:

```bash
curl -fsSL https://fnm.vercel.app/install | bash
. ~/.bashrc
fnm use --install-if-missing 20
uv sync
```

Second, create a new conda environment with the dependencies:
Then you can activate the environment via `source .venv/bin/activate`.

```bash
conda env create -f environment.yaml
```

Then you can activate the environment via `conda activate nemo-voice`.
Alternatively, you can do all steps in one go by running `bash install.sh`.

### Configure the server

Expand Down
55 changes: 49 additions & 6 deletions examples/voice_agent/client/src/app.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ class WebsocketClientApp {
private volumeUpdateInterval: number | null = null;
private currentBotMessageElement: HTMLDivElement | null = null;
private currentBotMessage: string = '';
private currentUserMessageElement: HTMLDivElement | null = null;
private currentUserMessage: string = '';
/** ISO time for the active user row (set when the row is pre-allocated). */
private currentUserUtteranceIso: string = '';

// Server configurations
private readonly serverConfigs = {
Expand Down Expand Up @@ -125,6 +129,20 @@ class WebsocketClientApp {
return entry;
}

/**
 * Create a user transcript row in the debug log.
 *
 * Pre-allocated when VAD reports the user started speaking, then filled in
 * once the final transcript arrives. Also stamps `currentUserUtteranceIso`
 * so the row keeps the time speech began rather than the transcript time.
 *
 * @param initialText - Text to show immediately (usually empty until the final transcript).
 * @returns The appended row element, or `null` when the debug log is absent.
 */
private createUserMessageElement(initialText: string): HTMLDivElement | null {
  const log = this.debugLog;
  if (!log) return null;

  this.currentUserUtteranceIso = new Date().toISOString();

  const row = document.createElement('div');
  row.style.color = '#2196F3';
  row.textContent = `${this.currentUserUtteranceIso} - User: ${initialText}`;

  log.appendChild(row);
  log.scrollTop = log.scrollHeight; // keep the newest row in view
  return row;
}

/**
* Update the connection status display
*/
Expand Down Expand Up @@ -250,10 +268,29 @@ class WebsocketClientApp {
this.log(`Bot ready: ${JSON.stringify(data)}`);
this.setupMediaTracks();
},
onUserStartedSpeaking: () => {
  // Pre-allocate a transcript row as soon as VAD detects speech so the row's
  // timestamp reflects speech onset. Allocate when the previous utterance was
  // already finalized (non-empty message) or when no row exists yet; otherwise
  // an empty row is already waiting for its final transcript — reuse it.
  // (The original if/else-if branches had identical bodies; merged with ||.)
  if (this.currentUserMessage !== '' || !this.currentUserMessageElement) {
    this.currentUserMessage = '';
    this.currentUserMessageElement = this.createUserMessageElement('');
  }
},
onUserTranscript: (data) => {
if (data.final) {
this.log(`User: ${data.text}`);
if (!data.final) {
return;
}
if (!this.currentUserMessageElement) {
this.currentUserMessage = '';
this.currentUserMessageElement = this.createUserMessageElement('');
}
if (this.currentUserMessageElement) {
this.currentUserMessageElement.textContent = `${this.currentUserUtteranceIso} - User: ${data.text}`;
}
this.currentUserMessage = data.text;
this.debugLog?.scrollTo({ top: this.debugLog.scrollHeight, behavior: 'smooth' });
console.log(`User: ${data.text}`);
},
onBotTranscript: (data) => {
// If no current element exists, create one (fallback in case BOT_LLM_STARTED didn't fire)
Expand Down Expand Up @@ -358,10 +395,13 @@ class WebsocketClientApp {
// Clean up bot message state
this.currentBotMessage = '';
this.currentBotMessageElement = null;

this.currentUserMessage = '';
this.currentUserMessageElement = null;
this.currentUserUtteranceIso = '';

// Reset mute state
this.isMuted = false;

// Reset disconnecting flag
this.isDisconnecting = false;
}
Expand Down Expand Up @@ -406,10 +446,13 @@ class WebsocketClientApp {
// Clean up bot message state
this.currentBotMessage = '';
this.currentBotMessageElement = null;

this.currentUserMessage = '';
this.currentUserMessageElement = null;
this.currentUserUtteranceIso = '';

// Reset mute state
this.isMuted = false;

this.isDisconnecting = false;
}

Expand Down
Loading
Loading