Skip to content

Commit

Permalink
refactor(ux): update CLI, README and Makefile to improve usability
Browse files Browse the repository at this point in the history
- Make the installation process simple, and make subsequent crates easy to add here.
- Explain why we still need to run the server using a shell script.
- Simplify the client command-line interface. We didn't need the extra layer of indirection; we only have one command here.
  • Loading branch information
dev-msp committed Apr 23, 2024
1 parent 68ba8ca commit ab003af
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 67 deletions.
53 changes: 37 additions & 16 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,24 +1,45 @@
BIN_NAME = server
BUILD_MODE = release
BUILD_MODE:=release
PACKAGE=server client
BINARIES=$(addprefix target/$(BUILD_MODE)/, $(PACKAGE))
INSTALLED_BINARIES=$(addprefix $(INSTALL_DIR)/voice-, $(PACKAGE))

TARGET = $(BIN_NAME)_$(BUILD_MODE)
all: $(BINARIES)

BIN = target/$(BUILD_MODE)/$(BIN_NAME)
.PHONY: require-bin
require-bin:
ifndef INSTALL_DIR
$(error INSTALL_DIR is not set)
endif

all: $(TARGET)
.PHONY: install
install: require-bin $(INSTALLED_BINARIES)

.PHONY: clean
clean:
clean: clean-bin
cargo clean
rm -f $(TARGET)

$(TARGET): $(BIN)
cp $(BIN) $(TARGET)

$(BIN):
cargo build --$(BUILD_MODE) -p $(BIN_NAME)

DAEMON = local.personal.transcription

kick: $(TARGET)
# clean-bin: remove the locally built binaries and, when INSTALL_DIR is set,
# the copies that were installed under it.
.PHONY: clean-bin
clean-bin:
	rm -f $(BINARIES)
# `ifdef` takes a variable NAME. Writing `ifdef $(INSTALL_DIR)` would expand
# the variable first and then test whether a variable named after its value
# (e.g. "/usr/local/bin") is defined, so the installed copies would never be
# removed.
ifdef INSTALL_DIR
	rm -f $(INSTALLED_BINARIES)
endif

# Install rule: for any stem %, $(INSTALL_DIR)/voice-% is produced by copying
# target/$(BUILD_MODE)/% (`$<`, the first prerequisite) to the target path
# (`$@`). The installed name is therefore the built name prefixed with "voice-".
# NOTE(review): INSTALL_DIR is not assigned in the visible part of this file;
# presumably it is supplied by the caller's environment or command line.
$(INSTALL_DIR)/voice-%: target/$(BUILD_MODE)/%
cp $< $@

# Build rule for the cargo binaries.
#
# Rule header: when PACKAGE contains exactly one word the rule is written as
# the pattern rule `target/$(BUILD_MODE)/%`; otherwise it is an explicit rule
# whose targets are every path in $(BINARIES).
# NOTE(review): the reason for the single-package special case is not visible
# here — confirm before simplifying.
#
# Recipe: a single `cargo build` invocation that passes `-p <name>` for each
# word in PACKAGE. `--$(BUILD_MODE)` is only passed when BUILD_MODE is
# "release"; for any other value no profile flag is passed at all.
#
# The rule has no prerequisites, so make only runs it when a target file is
# missing.
ifeq ($(words $(PACKAGE)),1)
target/$(BUILD_MODE)/%:
else
$(BINARIES):
endif
ifeq ($(BUILD_MODE),release)
cargo build --$(BUILD_MODE) $(foreach p, $(PACKAGE),-p $p)
else
cargo build $(foreach p, $(PACKAGE),-p $p)
endif

DAEMON:=local.personal.transcription

kick: require-bin $(INSTALL_DIR)/voice-server
launchctl kickstart -k gui/$(shell id -u)/$(DAEMON)
51 changes: 25 additions & 26 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,35 +6,34 @@ Bind requests to hotkeys as you prefer. I'm running the server as a daemon with

If you don't already have a `whisper.cpp`-compatible model, follow that project's [quick-start instructions](https://github.com/ggerganov/whisper.cpp#quick-start) to get one.

Start the server:
`./run.sh macbook ggml-base.en.bin /tmp/whisper.sock`
## Quick start

Start recording: `curl -X POST -H "Content-Type: application/json" -d "$body" "http://127.0.0.1:8088/voice/$1"`
In your terminal:

Example request body:
```json
{
// partial name matches OK
"input_device": "MacBook Pro Microphone",
```sh
# Build the server and client
INSTALL_DIR=/something/on/your/PATH make install

// optional
"sample_rate": 44100,
}
# Start the server
# (see run.sh for why running the binary directly doesn't work yet)
./run.sh localhost:8088 $PATH_TO_MODEL
```
Response: `{ "type": "ack" }`

Stop recording: `curl -X POST http://localhost:8088/voice/stop`
Example response:
```json
{
"data": {
"content": "And we're recording and we're doing stuff and then we're going to send a stop message.",
"mode": {
"type": "live_typing"
}
},
"type": "transcription"
}

In a separate shell:

```sh
# Send a start command to the server.
#
# Note that `-i` is optional; without it the server will use the first
# compatible device. For example, you might pass "MacBook" if you want to use
# your laptop's built-in mic ("MacBook Pro Microphone").
voice-client localhost:8088 start -i $PARTIAL_INPUT_DEVICE_NAME
```

After executing this command, the server will start recording from the specified input. To get the results, send the stop command:

```sh
voice-client localhost:8088 stop
```

Note: the modes that you get back in the output are just metadata. Your client application that talks to the server should also handle processing the transcription differently based on the mode.
The results will be printed to stdout.
9 changes: 6 additions & 3 deletions client/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ pub enum Error {
Client(#[from] api::Error),
}

#[derive(Debug, clap::Subcommand)]
#[derive(Debug, Clone, clap::Subcommand)]
pub enum Commands {
Start {
#[clap(short, long)]
Expand All @@ -23,7 +23,7 @@ pub enum Commands {
},
}

#[derive(Debug, clap::Args)]
#[derive(Debug, clap::Parser)]
pub struct App {
#[command(subcommand)]
pub command: Commands,
Expand Down Expand Up @@ -59,7 +59,7 @@ impl RunningApp {
}
}

mod api {
pub mod api {
use serde::de::DeserializeOwned;

use voice::{
Expand All @@ -74,6 +74,9 @@ mod api {

#[error("JSON error: {0}")]
Json(#[from] serde_json::Error),

#[error("Unexpected response: {0}")]
UnexpectedResponse(Response),
}

pub struct Client {
Expand Down
43 changes: 27 additions & 16 deletions client/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,27 +1,38 @@
mod client;

use clap::Parser;
use client::RunningApp;

#[derive(Debug, clap::Parser)]
#[command(version, about, long_about = None)]
struct App {
#[command(subcommand)]
command: Commands,
}

#[derive(Debug, clap::Subcommand)]
enum Commands {
Client(client::App),
}
use client::{App, Commands, RunningApp};
use voice::app::response::Response;

#[tokio::main]
async fn main() -> Result<(), client::Error> {
env_logger::init();

match App::parse().command {
Commands::Client(client_command) => {
let resp = RunningApp::from(client_command).execute().await?;
let app = App::parse();
match &app.command {
Commands::Start { .. } => {
match RunningApp::from(app).execute().await? {
Response::Ack(_) => (),
r => return Err(client::api::Error::UnexpectedResponse(r).into()),
}
Ok(())
}
Commands::Stop => {
match RunningApp::from(app).execute().await? {
Response::Transcription { content, .. } => {
let Some(content) = content else {
eprintln!("No transcription available");
return Ok(());
};
println!("{content}");
}
r => return Err(client::api::Error::UnexpectedResponse(r).into()),
}
Ok(())
}

_ => {
let resp = RunningApp::from(app).execute().await?;
log::info!("{:?}", resp);
Ok(())
}
Expand Down
12 changes: 6 additions & 6 deletions run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@ fi

# Additionally, whisper-rs doesn't capture Metal-specific logs for some reason.
# So you'll still see those in the log output, even if you're suppressing.
export RUST_LOG=whisper_sys_log=error,voice=debug
export RUST_LOG=whisper_sys_log=error,voice=debug,server=info

# The address on which the HTTP server should listen (e.g. localhost:PORT)
addr="$1"

# See the whisper.cpp repo for details on how to get a model. I recommend using
# base or small for best results.
model_path="$2"

# The address on which the HTTP server should listen (e.g. localhost:PORT)
addr="$3"

./voice run-daemon \
--serve "$addr"
voice-server run-daemon \
--serve "$addr" \
--model "$model_path"

0 comments on commit ab003af

Please sign in to comment.