whisper-stream

#!/bin/bash

# Default configuration and versioning
VERSION="1.1.0"

# Setting the default values for the script parameters.
# Every flag the script later tests is initialised here so that a variable of
# the same name inherited from the environment cannot change behaviour.
MIN_VOLUME="1%"                # Minimum volume threshold
SILENCE_LENGTH=1.5             # Minimum silence duration in seconds
ONESHOT=false                  # Flag to determine if the script should run once or continuously
DURATION=0                     # Duration of the recording in seconds (0 means continuous)
MODEL="whisper-1"              # Model for the OpenAI API
TOKEN=""                       # OpenAI API token
OUTPUT_DIR=""                  # Directory to save the transcriptions
PROMPT=""                      # Prompt for the API call
LANGUAGE=""                    # Language code for transcription
TRANSLATE=""                   # Flag to indicate translation to English (empty means off)
AUDIO_FILE=""                  # Specific audio file for transcription
PIPE_TO_CMD=""                 # Command to pipe the transcribed text to
QUIET_MODE=false               # Flag to determine if the banner and settings should be suppressed
GRANULARITIES="none"           # Timestamp granularities for transcription: segment or word
SHOW_VERSION=false             # Flag set by -V/--version to print the version and exit

# Display help information for script usage
# Writes the usage line (prefixed with the script's own $0) and one line per
# supported option to stdout, then terminates the script with status 0.
function display_help() {
  printf '%s\n' \
    "Usage: $0 [options]" \
    "Options:" \
    "  -v, --volume <value>        Set the minimum volume threshold (default: 1%)" \
    "  -s, --silence <value>       Set the minimum silence length (default: 1.5)" \
    "  -o, --oneshot               Enable one-shot mode" \
    "  -d, --duration <value>      Set the recording duration in seconds (default: 0, continuous)" \
    "  -t, --token <value>         Set the OpenAI API token" \
    "  -p, --path <value>          Set the output directory path to create the transcription file" \
    "  -g, --granularities <value> Set the timestamp granularities (segment or word)" \
    "  -r, --prompt <value>        Set the prompt for the API call" \
    "  -l, --language <value>      Set the language in ISO-639-1 format" \
    "  -f, --file <value>          Set the audio file to be transcribed" \
    "  -tr, --translate            Translate the transcribed text to English" \
    "  -p2, --pipe-to <cmd>        Pipe the transcribed text to the specified command (e.g., 'wc -m')" \
    "  -V, --version               Show the version number" \
    "  -q, --quiet                 Suppress the banner and settings" \
    "  -h, --help                  Display this help message" \
    "To stop the app, press Ctrl+C"
  exit 0
}

# Check the validity of the provided audio file
#
# Arguments:
#   $1 - path of the audio file to validate
# Outputs:
#   A one-line reason on stdout when the file is rejected.
# Returns:
#   0 when the file is acceptable. Otherwise the whole script exits with
#   status 1: missing file, empty file, file larger than 25MB, or an extension
#   outside the accepted list.
function check_audio_file() {
  local file=$1
  local max_bytes=26214400 # 25MB = 25 * 1024 * 1024 bytes

  # Check if the file exists
  if [ ! -f "$file" ]; then
    echo "File does not exist: $file"
    exit 1
  fi

  # Check if the file is not empty
  if [ ! -s "$file" ]; then
    echo "File is empty: $file"
    exit 1
  fi

  # Check if the file size is under 25MB.
  # `wc -c` is used instead of `stat` because the stat flags differ between
  # GNU and BSD and would force an OS whitelist; wc behaves the same everywhere.
  local filesize
  if ! filesize=$(wc -c <"$file"); then
    echo "Cannot read file: $file"
    exit 1
  fi
  # Some wc implementations pad the count with leading blanks
  filesize=${filesize//[[:space:]]/}
  if [ "$filesize" -gt "$max_bytes" ]; then
    echo "File size is over 25MB: $file"
    exit 1
  fi

  # Check if the file format is acceptable (judged by extension only)
  local ext="${file##*.}"
  case "$ext" in
    m4a|mp3|webm|mp4|mpga|wav|mpeg)
      ;;
    *)
      echo "File format is not acceptable: $file"
      exit 1
      ;;
  esac
}

# Abort the script unless an option was followed by a usable value.
#
# Arguments:
#   $1 - the option as typed by the user (used in the message)
#   $2 - the candidate value (may be empty or absent)
#   $3 - noun used in the message (default: "value")
# A value is rejected when it is empty or starts with "-", because it is then
# taken to be the next option rather than this option's argument.
function require_option_value() {
  local option=$1
  local value=${2-}
  local noun=${3:-value}
  if [[ -z "$value" || "$value" == -* ]]; then
    echo "Error: Missing $noun for $option"
    exit 1
  fi
}

# Parse command-line arguments to set script parameters
#
# Arguments:
#   "$@" - the script's command-line arguments
# Globals written:
#   MIN_VOLUME, SILENCE_LENGTH, ONESHOT, DURATION, TOKEN, OUTPUT_DIR,
#   GRANULARITIES, PROMPT, LANGUAGE, TRANSLATE, PIPE_TO_CMD, AUDIO_FILE,
#   SHOW_VERSION, QUIET_MODE
# Exits with status 1 on an unknown option, a missing value, a non-existent
# output directory, or a duration that is not a whole number of seconds.
function parse_arguments() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -v|--volume)
        require_option_value "$1" "${2-}"
        MIN_VOLUME="$2"
        # The threshold is handed to sox as a percentage; add the sign if omitted
        if [[ "$MIN_VOLUME" != *% ]]; then
          MIN_VOLUME+="%"
        fi
        shift 2
        ;;
      -s|--silence)
        require_option_value "$1" "${2-}"
        SILENCE_LENGTH="$2"
        shift 2
        ;;
      -o|--oneshot)
        ONESHOT=true
        shift
        ;;
      -d|--duration)
        require_option_value "$1" "${2-}"
        # The recording loop compares DURATION with an integer test, so any
        # other form would silently fall back to continuous recording.
        if [[ ! "$2" =~ ^[0-9]+$ ]]; then
          echo "Error: Duration must be a whole number of seconds: $2"
          exit 1
        fi
        DURATION="$2"
        shift 2
        ;;
      -t|--token)
        require_option_value "$1" "${2-}"
        TOKEN="$2"
        shift 2
        ;;
      -p|--path)
        require_option_value "$1" "${2-}"
        OUTPUT_DIR="$2"
        # check if the output directory exists
        if [ ! -d "$OUTPUT_DIR" ]; then
          echo "Directory does not exist: $OUTPUT_DIR"
          exit 1
        fi
        shift 2
        ;;
      -g|--granularities)
        require_option_value "$1" "${2-}"
        GRANULARITIES="$2"
        shift 2
        ;;
      -r|--prompt)
        require_option_value "$1" "${2-}"
        PROMPT="$2"
        shift 2
        ;;
      -l|--language)
        require_option_value "$1" "${2-}"
        LANGUAGE="$2"
        shift 2
        ;;
      -tr|--translate)
        TRANSLATE=true
        shift
        ;;
      -p2|--pipe-to)
        require_option_value "$1" "${2-}" "cmd"
        PIPE_TO_CMD="$2"
        shift 2
        ;;
      -f|--file)
        require_option_value "$1" "${2-}"
        AUDIO_FILE="$2"
        # Validate immediately so a bad file is reported before anything else runs
        check_audio_file "$AUDIO_FILE"
        shift 2
        ;;
      -V|--version)
        SHOW_VERSION=true
        shift
        ;;
      -q|--quiet)
        QUIET_MODE=true
        shift
        ;;
      -h|--help)
        # display_help exits the script, so no shift is needed
        display_help
        ;;
      *)
        echo "Unknown option: $1"
        exit 1
        ;;
    esac
  done
}

parse_arguments "$@"

# Version mode: print the version banner and stop before doing anything else
if [[ "${SHOW_VERSION}" == true ]]; then
  echo "Whisper Stream Speech-to-Text Transcriber (version: $VERSION)"
  exit
fi

# A token given on the command line wins; otherwise fall back to the
# OPENAI_API_KEY environment variable (empty when that is unset too)
: "${TOKEN:=${OPENAI_API_KEY:-}}"

# Neither source supplied a key: nothing can be transcribed, so give up
if [[ -z "${TOKEN}" ]]; then
  echo "No OpenAI API key provided. Please provide it as an argument or environment variable."
  exit 1
fi

# Names of the audio files recorded during this run
output_files=()

# Function to get the name of the current audio input device on macOS
# Prints whatever `SwitchAudioSource -t input -c` reports; prints nothing when
# the SwitchAudioSource executable cannot be found.
function get_macos_input_device() {
  local tool_path
  tool_path=$(command -v SwitchAudioSource)

  # Without the helper tool there is nothing to report
  [ -x "$tool_path" ] || return 0

  local device_name
  device_name=$(SwitchAudioSource -t input -c)
  echo "$device_name"
}

# Function to get the volume of the audio input on macOS
# Asks AppleScript (via osascript) for the input volume setting and prints the
# answer followed by a percent sign.
function get_macos_input_volume() {
  local level
  level=$(osascript -e "input volume of (get volume settings)")
  printf '%s%%\n' "$level"
}

# Function to get the name of the current audio input device on Linux
#
# Lists the ALSA capture cards reported by `arecord -l` as "hw:<card>" names.
# Each card is printed once even when it exposes several devices; multiple
# cards are joined with ", " on a single line.
# Outputs:
#   The card list on stdout, or nothing when arecord is not installed or
#   reports no capture card.
function get_linux_input_device() {
  # Nothing to report when arecord is not installed
  command -v arecord >/dev/null 2>&1 || return 0

  local cards
  # sed keeps the card number of every "card N: name [...]" line (plain POSIX
  # regex, so no PCRE-enabled grep is needed); awk de-duplicates and joins.
  cards=$(
    arecord -l \
      | sed -n 's/^card \([0-9][0-9]*\):[[:space:]].*\[.*/\1/p' \
      | awk '!seen[$0]++ { printf "%shw:%s", sep, $0; sep = ", " }'
  )

  # Stay silent rather than print a bare "hw:" when no card was found
  if [ -n "$cards" ]; then
    echo "$cards"
  fi
}

# Function to get the volume of the audio input on Linux
# Prints the first bracketed field of every "Left:" line that
# `amixer sget Capture` emits; prints nothing when the amixer executable
# cannot be found.
function get_linux_input_volume() {
  local mixer_path
  mixer_path=$(command -v amixer)

  # amixer must be present and executable, otherwise stay silent
  [ -x "$mixer_path" ] || return 0

  local level
  # Splitting on "[" and "]" makes the first bracketed value field 2
  level=$(amixer sget Capture | awk -F'[][]' '/Left:/ { print $2 }')
  echo "$level"
}

# Function to display current settings
# Prints the coloured banner followed by the active recording settings.
# Optional entries (translation, output directory, input device, input volume)
# appear only when they have a value. Prints nothing in quiet mode.
function display_settings() {
  # Quiet mode suppresses the whole banner
  [ "$QUIET_MODE" = true ] && return

  local rule=$'\e[1;33m-----------------------------------------------\e[0m'
  local device volume

  echo ""
  echo $'\e[1;34mWhisper Stream Speech-to-Text Transcriber\e[0m' ${VERSION}
  echo "$rule"
  echo "Current settings:"
  echo "  Volume threshold: $MIN_VOLUME"
  echo "  Silence length: $SILENCE_LENGTH seconds"
  echo "  Input language: ${LANGUAGE:-Not specified}"

  [ -z "$TRANSLATE" ] || echo "  Translate to English: $TRANSLATE"
  [ -z "$OUTPUT_DIR" ] || echo "  Output Dir: $OUTPUT_DIR"

  # Input device, resolved per operating system by get_input_device
  device=$(get_input_device)
  [ -z "$device" ] || echo "  Input device: $device"

  # Input volume, resolved per operating system by get_input_volume
  volume=$(get_input_volume)
  [ -z "$volume" ] || echo "  Input volume: $volume"

  echo "$rule"
  echo $'To stop the app, press \e[0;36mCtrl+C\e[0m'
  echo ""
}

# Get the name of the current audio input device based on OS
# Dispatches on the kernel name reported by `uname`: Darwin and Linux are
# forwarded to their platform helper, anything else prints a fixed notice.
function get_input_device() {
  local platform
  platform=$(uname)

  if [[ "$platform" == "Darwin" ]]; then
    get_macos_input_device
  elif [[ "$platform" == "Linux" ]]; then
    get_linux_input_device
  else
    echo "Unknown operating system"
  fi
}

# Get the volume level of the current audio input device based on OS
# Dispatches on the kernel name reported by `uname`: Darwin and Linux are
# forwarded to their platform helper, anything else prints a fixed notice.
function get_input_volume() {
  local platform
  platform=$(uname)

  if [[ "$platform" == "Darwin" ]]; then
    get_macos_input_volume
  elif [[ "$platform" == "Linux" ]]; then
    get_linux_input_volume
  else
    echo "Unknown operating system"
  fi
}

# Display a rotating spinner animation
#
# Arguments:
#   $1 - PID of the process to watch
# Outputs:
#   Redraws a red spinner frame at the start of the current line every 0.1
#   seconds for as long as `kill -0` can signal the PID, then clears the line.
function spinner() {
  local pid=$1
  local delay=0.1
  # Four frames; inside single quotes a backslash is already literal, so it
  # must not be doubled or that frame would be shown twice per rotation.
  local frames='|/-\'
  local rest

  while kill -0 "$pid" 2>/dev/null; do
    # %c prints only the first character of the frame string
    printf "\r\e[1;31m%c\e[0m" "$frames"
    # Rotate left by one: move the first character to the end
    rest=${frames#?}
    frames=$rest${frames%"$rest"}
    sleep "$delay"
  done

  # Erase the spinner once the watched process is gone
  printf "\r\e[K"
}

# Convert the audio to text using the OpenAI Whisper API
#
# Globals read:
#   TOKEN, MODEL, TRANSLATE, GRANULARITIES, PROMPT, LANGUAGE, PIPE_TO_CMD,
#   AUDIO_FILE
# Arguments:
#   $1 - path of the audio file to upload
# Outputs:
#   The transcription on stdout (the raw JSON response when GRANULARITIES is
#   not "none", otherwise the response's `.text`), followed by the output of
#   PIPE_TO_CMD when one is configured. A red dot is printed per retry.
#   Failures are reported on stderr.
# Side effects:
#   Appends the transcription to temp_transcriptions.txt in the current
#   directory (handle_exit collects it from there) and deletes the audio file
#   unless it was supplied by the user with -f.
# Returns:
#   0 on success; 1 when the request still fails after the retries or the API
#   answers with an error object.
function convert_audio_to_text() {
  local output_file=$1

  local base_url="https://api.openai.com/v1/audio/transcriptions"
  if [ -n "$TRANSLATE" ]; then
    base_url="https://api.openai.com/v1/audio/translations"
  fi

  # The command is kept as an array so every value reaches curl as exactly one
  # argument; quotes, spaces or $(...) in the prompt, token or file name are
  # never re-parsed by the shell. Text fields use --form-string so a leading
  # "@" or "<" or a ";type=" inside the value is sent literally.
  local -a curl_command=(
    curl -s --request POST
    --url "$base_url"
    --header "Authorization: Bearer $TOKEN"
    --header "Content-Type: multipart/form-data"
    --form "file=@$output_file"
    --form-string "model=$MODEL"
    --form-string "response_format=verbose_json"
  )

  # Add optional parameters to the curl command when GRANULARITIES is not set to none
  if [ "$GRANULARITIES" != "none" ]; then
    curl_command+=(--form-string "timestamp_granularities[]=${GRANULARITIES}")
  fi

  if [ -n "$PROMPT" ]; then
    curl_command+=(--form-string "prompt=$PROMPT")
  fi

  if [ -n "$LANGUAGE" ]; then
    curl_command+=(--form-string "language=$LANGUAGE")
  fi

  # Retry until curl succeeds or the retries are used up. The exit status is
  # captured right after each call so later commands cannot overwrite it.
  local response status
  local retries_left=3
  response=$("${curl_command[@]}")
  status=$?
  while [ "$status" -ne 0 ] && [ "$retries_left" -gt 0 ]; do
    # print a red dot to indicate a failed API call
    printf "\e[1;31m.\e[0m"
    response=$("${curl_command[@]}")
    status=$?
    retries_left=$((retries_left - 1))
  done

  local transcription=""
  local failure=""
  local api_error=""
  if [ "$status" -ne 0 ]; then
    failure="Error: API request failed (curl exit status $status)"
  else
    # curl -s exits 0 on HTTP errors too; the API then returns an error object
    api_error=$(jq -r '.error.message? // empty' <<<"$response" 2>/dev/null)
    if [ -n "$api_error" ]; then
      failure="Error: API returned an error: $api_error"
    elif [ "$GRANULARITIES" != "none" ]; then
      # With timestamps requested the whole JSON response is the result
      transcription=$response
    else
      # if GRANULARITIES is set to `none`, only `.text` is returned
      transcription=$(jq -r '.text' <<<"$response")
    fi
  fi

  # Clear the spinner line before printing anything
  printf "\r\e[K"

  # Remove the output audio file unless the `-f` option is specified
  if [ -z "$AUDIO_FILE" ]; then
    rm -f -- "$output_file"
  fi

  if [ -n "$failure" ]; then
    echo "$failure" >&2
    return 1
  fi

  printf '%s\n' "$transcription"

  if [ -n "$PIPE_TO_CMD" ]; then
    local result
    # PIPE_TO_CMD is split on whitespace on purpose so that a value such as
    # "wc -m" runs as a command with its arguments.
    # shellcheck disable=SC2086
    result=$(printf '%s\n' "$transcription" | $PIPE_TO_CMD)
    printf '%s\n' "$result"
  fi

  # Accumulate the transcribed text in a temporary file
  # this is necessary for the data to be available when the script terminates
  printf '%s\n' "$transcription" >> temp_transcriptions.txt
}

# Pipe text into a clipboard command and confirm the copy.
#
# Arguments:
#   $1   - text to copy (nothing happens when it is empty)
#   $2.. - clipboard command and its arguments; the text is fed on stdin
# Outputs:
#   A confirmation line on success, or a notice when the command is missing.
function copy_to_clipboard() {
  local text=$1
  shift

  [ -n "$text" ] || return 0

  if ! command -v "$1" >/dev/null 2>&1; then
    echo "Cannot copy to clipboard: $1 not found"
    return 1
  fi

  # The command's stdout is discarded so a clipboard tool that stays alive in
  # the background does not keep the terminal's stdout open.
  if printf '%s\n' "$text" | "$@" >/dev/null; then
    echo $'\e[0;36m'Transcription copied to clipboard.$'\e[0m'
  fi
}

# Handle the script termination: clean up and save transcriptions
#
# Globals read:
#   output_files  - recorded audio files to delete
#   OUTPUT_DIR    - when set, the transcription is saved there
#   GRANULARITIES - selects the saved file's extension
# Side effects:
#   Waits for background jobs, kills remaining child processes, deletes the
#   recorded audio files, consumes temp_transcriptions.txt from the current
#   directory, optionally writes a timestamped transcription file and copies
#   the text to the clipboard.
# Never returns: the script always exits from here.
function handle_exit() {
  # Wait for all background jobs to finish
  wait

  # Kill all child processes
  pkill -P "$$"

  # Remove all output audio files
  local file
  for file in "${output_files[@]}"; do
    rm -f -- "$file"
  done

  # Nothing was transcribed: clear the current line and stop
  if [ ! -f temp_transcriptions.txt ]; then
    printf "\r\e[K"
    exit
  fi

  # Read the accumulated transcriptions, then remove the temporary file
  local accumulated_text
  accumulated_text=$(<temp_transcriptions.txt)
  rm -f temp_transcriptions.txt
  # Clear the current line
  printf "\r\e[K\n"

  # If output directory is specified, create a file with the accumulated text in the specified directory
  if [ -n "$OUTPUT_DIR" ]; then
    local timestamp extension
    timestamp=$(date +"%Y-%m-%d_%H-%M-%S")
    # With timestamp granularities the saved text is the raw JSON responses;
    # with `none` it is plain text
    if [ "$GRANULARITIES" != "none" ]; then
      extension="json"
    else
      extension="txt"
    fi
    printf '%s\n' "$accumulated_text" > "$OUTPUT_DIR/transcription_$timestamp.$extension"
  fi

  # Copy the accumulated text to the clipboard. The text is piped straight to
  # the clipboard tool; no scratch file is written, so nothing in the current
  # directory can be overwritten.
  case "$(uname -s)" in
    Darwin)
      copy_to_clipboard "$accumulated_text" pbcopy
      ;;
    Linux)
      copy_to_clipboard "$accumulated_text" xclip -selection clipboard
      ;;
    CYGWIN*|MINGW32*|MSYS*|MINGW*)
      # Windows POSIX layers (Cygwin, MSYS, MinGW) expose the Windows clipboard tool
      copy_to_clipboard "$accumulated_text" clip.exe
      ;;
    *)
      echo "Unknown OS, cannot copy to clipboard"
      ;;
  esac

  exit
}

# File mode: when an audio file was supplied, transcribe just that file and
# finish through handle_exit instead of entering the recording loop
if [[ -n "$AUDIO_FILE" ]]; then

  # Show the banner and the file-mode settings unless quiet mode is enabled
  if [[ "$QUIET_MODE" == false ]]; then
    echo ""
    echo $'\e[1;34mWhisper Stream Transcriber\e[0m' ${VERSION}
    echo $'\e[1;33m-----------------------------------------------\e[0m'
    echo "Current settings:"
    echo "  Input language: ${LANGUAGE:-Not specified}"

    # Optional entries appear only when they have a value
    [[ -z "$TRANSLATE" ]] || echo "  Translate to English: $TRANSLATE"
    [[ -z "$OUTPUT_DIR" ]] || echo "  Output Dir: $OUTPUT_DIR"

    echo "  Input file: $AUDIO_FILE"
    echo $'\e[1;33m-----------------------------------------------\e[0m'
    echo $'\e[0;36mPlease wait ...\e[0m'
    echo ""
  fi

  convert_audio_to_text "$AUDIO_FILE"
  handle_exit
fi

# Show the banner and the active settings before recording starts
display_settings

# Route both Ctrl+C (SIGINT) and Ctrl+Z (SIGTSTP) through handle_exit
trap handle_exit SIGINT SIGTSTP

# Main loop: record until silence, transcribe in the background, repeat
while :; do
  # Name the recording after the current epoch second and remember it so it
  # can be cleaned up on exit
  OUTPUT_FILE="output_$(date +%s).mp3"
  output_files+=("$OUTPUT_FILE")

  # Add a prompt at the beginning of the recording
  # echo -n $'\e[1;32m'▶ $'\e[0m'

  # A positive DURATION adds a trim effect in front of the silence detection;
  # otherwise no extra effect is passed
  trim_effect=()
  if [ "$DURATION" -gt 0 ]; then
    trim_effect=(trim 0 "$DURATION")
  fi

  # Record raw audio with rec and let sox write it to the output file
  rec -q -V0 -e signed -L -c 1 -b 16 -r 44100 -t raw - \
    "${trim_effect[@]}" \
    silence 1 0.1 "$MIN_VOLUME" 1 "$SILENCE_LENGTH" "$MIN_VOLUME" \
    | sox -t raw -r 44100 -b 16 -e signed -c 1 - "$OUTPUT_FILE"

  if [ ! -s "$OUTPUT_FILE" ]; then
    echo "No audio recorded."
  else
    # Transcribe in the background so the next recording can start right away
    convert_audio_to_text "$OUTPUT_FILE" &

    # Animate a spinner for as long as that background job is alive
    pid=$!
    spinner "$pid" &
  fi

  # One-shot mode stops after the first recording
  [ "$ONESHOT" = true ] && break
done

# Handle final cleanup and exit
handle_exit