refactor(ux): update CLI, README and Makefile to improve usability

- Make the installation process simple, and make subsequent crates easy to add here.
- Explain why we still need to run the server using a shell script.
- Simplify the client command-line interface. We didn't need the extra layer of indirection; we only have one command here.
dev-msp · Apr 23, 2024 · ab003af · ab003af
1 parent 68ba8ca
commit ab003af
Show file tree

Hide file tree

Showing 5 changed files with 101 additions and 67 deletions.
diff --git a/Makefile b/Makefile
@@ -1,24 +1,45 @@
-BIN_NAME = server
-BUILD_MODE = release
+BUILD_MODE:=release
+PACKAGE=server client
+BINARIES=$(addprefix target/$(BUILD_MODE)/, $(PACKAGE))
+INSTALLED_BINARIES=$(addprefix $(INSTALL_DIR)/voice-, $(PACKAGE))
 
-TARGET = $(BIN_NAME)_$(BUILD_MODE)
+all: $(BINARIES)
 
-BIN = target/$(BUILD_MODE)/$(BIN_NAME)
+.PHONY: require-bin
+require-bin:
+ifndef INSTALL_DIR
+	$(error INSTALL_DIR is not set)
+endif
 
-all: $(TARGET)
+.PHONY: install
+install: require-bin $(INSTALLED_BINARIES)
 
 .PHONY: clean
-clean:
+clean: clean-bin
 	cargo clean
-	rm -f $(TARGET)
 
-$(TARGET): $(BIN)
-	cp $(BIN) $(TARGET)
-
-$(BIN):
-	cargo build --$(BUILD_MODE) -p $(BIN_NAME)
-
-DAEMON = local.personal.transcription
-
-kick: $(TARGET)
+.PHONY: clean-bin
+clean-bin: # Remove the built binaries, plus the installed copies when INSTALL_DIR is set.
+	rm -f $(BINARIES)
+ifdef INSTALL_DIR # ifdef takes a variable NAME; $(INSTALL_DIR) would test a variable named after its value
+	rm -f $(INSTALLED_BINARIES)
+endif
+
+$(INSTALL_DIR)/voice-%: target/$(BUILD_MODE)/%
+	cp $< $@
+
+ifeq ($(words $(PACKAGE)),1)
+target/$(BUILD_MODE)/%:
+else
+$(BINARIES):
+endif
+ifeq ($(BUILD_MODE),release)
+	cargo build --$(BUILD_MODE) $(foreach p, $(PACKAGE),-p $p)
+else
+	cargo build $(foreach p, $(PACKAGE),-p $p)
+endif
+
+DAEMON:=local.personal.transcription
+.PHONY: kick
+kick: require-bin $(INSTALL_DIR)/voice-server # Install voice-server, then kickstart the launchd service named by DAEMON.
 	launchctl kickstart -k gui/$(shell id -u)/$(DAEMON)
diff --git a/README.md b/README.md
@@ -6,35 +6,34 @@ Bind requests to hotkeys as you prefer. I'm running the server as a daemon with
 
 If you don't already have a `whisper.cpp`-compatible model, follow that project's [quick-start instructions](https://github.com/ggerganov/whisper.cpp#quick-start) to get one.
 
-Start the server:
-`./run.sh macbook ggml-base.en.bin /tmp/whisper.sock`
+## Quick start
 
-Start recording: `curl -X POST -H "Content-Type: application/json" -d "$body" "http://127.0.0.1:8088/voice/$1"`
+In your terminal:
 
-Example request body:
-```json
-{
-  // partial name matches OK
-  "input_device": "MacBook Pro Microphone",
+```sh
+# Build the server and client
+INSTALL_DIR=/something/on/your/PATH make install
 
-  // optional
-  "sample_rate": 44100,
-}
+# Start the server
+# (see run.sh for why running the binary directly doesn't work yet)
+./run.sh localhost:8088 $PATH_TO_MODEL
 ```
-Response: `{ "type": "ack" }`
-
-Stop recording: `curl -X POST http://localhost:8088/voice/stop`
-Example response:
-```json
-{
-  "data": {
-    "content": "And we're recording and we're doing stuff and then we're going to send a stop message.",
-    "mode": {
-      "type": "live_typing"
-    }
-  },
-  "type": "transcription"
-}
+
+In a separate shell:
+
+```sh
+# Send a start command to the server.
+#
+# Note that `-i` is optional; without it, the server will use the first
+# compatible device. For example, you might pass "MacBook" if you want to use
+# your laptop's built-in mic ("MacBook Pro Microphone").
+voice-client localhost:8088 start -i $PARTIAL_INPUT_DEVICE_NAME
+```
+
+After executing this command, the server will start recording from the specified input. To get the results, send the stop command:
+
+```sh
+voice-client localhost:8088 stop
 ```
 
-Note: the modes that you get back in the output are just metadata. Your client application that talks to the server should also handle processing the transcription differently based on the mode.
+The results will be printed to stdout.
diff --git a/client/src/client.rs b/client/src/client.rs
@@ -6,7 +6,7 @@ pub enum Error {
     Client(#[from] api::Error),
 }
 
-#[derive(Debug, clap::Subcommand)]
+#[derive(Debug, Clone, clap::Subcommand)]
 pub enum Commands {
     Start {
         #[clap(short, long)]
@@ -23,7 +23,7 @@ pub enum Commands {
     },
 }
 
-#[derive(Debug, clap::Args)]
+#[derive(Debug, clap::Parser)]
 pub struct App {
     #[command(subcommand)]
     pub command: Commands,
@@ -59,7 +59,7 @@ impl RunningApp {
     }
 }
 
-mod api {
+pub mod api {
     use serde::de::DeserializeOwned;
 
     use voice::{
@@ -74,6 +74,9 @@ mod api {
 
         #[error("JSON error: {0}")]
         Json(#[from] serde_json::Error),
+
+        #[error("Unexpected response: {0}")]
+        UnexpectedResponse(Response),
     }
 
     pub struct Client {

diff --git a/client/src/main.rs b/client/src/main.rs
@@ -1,27 +1,38 @@
 mod client;
 
 use clap::Parser;
-use client::RunningApp;
-
-#[derive(Debug, clap::Parser)]
-#[command(version, about, long_about = None)]
-struct App {
-    #[command(subcommand)]
-    command: Commands,
-}
-
-#[derive(Debug, clap::Subcommand)]
-enum Commands {
-    Client(client::App),
-}
+use client::{App, Commands, RunningApp};
+use voice::app::response::Response;
 
 #[tokio::main]
 async fn main() -> Result<(), client::Error> {
     env_logger::init();
 
-    match App::parse().command {
-        Commands::Client(client_command) => {
-            let resp = RunningApp::from(client_command).execute().await?;
+    let app = App::parse();
+    match &app.command {
+        Commands::Start { .. } => {
+            match RunningApp::from(app).execute().await? {
+                Response::Ack(_) => (),
+                r => return Err(client::api::Error::UnexpectedResponse(r).into()),
+            }
+            Ok(())
+        }
+        Commands::Stop => {
+            match RunningApp::from(app).execute().await? {
+                Response::Transcription { content, .. } => {
+                    let Some(content) = content else {
+                        eprintln!("No transcription available");
+                        return Ok(());
+                    };
+                    println!("{content}");
+                }
+                r => return Err(client::api::Error::UnexpectedResponse(r).into()),
+            }
+            Ok(())
+        }
+
+        _ => {
+            let resp = RunningApp::from(app).execute().await?;
             log::info!("{:?}", resp);
             Ok(())
         }

diff --git a/run.sh b/run.sh
@@ -12,15 +12,15 @@ fi
 
 # Additionally, whisper-rs doesn't capture Metal-specific logs for some reason.
 # So you'll still see those in the log output, even if you're suppressing.
-export RUST_LOG=whisper_sys_log=error,voice=debug
+export RUST_LOG=whisper_sys_log=error,voice=debug,server=info
+
+# The address on which the HTTP server should listen (e.g. localhost:PORT)
+addr="$1"
 
 # See the whisper.cpp repo for details on how to get a model. I recommend using
 # base or small for best results.
 model_path="$2"
 
-# The address on which the HTTP server should listen (e.g. localhost:PORT)
-addr="$3"
-
-./voice run-daemon \
-    --serve       "$addr"
+voice-server run-daemon \
+    --serve       "$addr" \
     --model       "$model_path"