argmaxinc · keith4ever · Dec 23, 2024 · Dec 17, 2024 · Dec 18, 2024 · Dec 19, 2024
diff --git a/scripts/download_models.sh b/scripts/download_models.sh
@@ -11,27 +11,55 @@ SOURCE_DIR="$CURRENT_DIR/.."
 ARIA_OPTIONS="-x 8 -s 8 --continue --file-allocation=none"
 
 # Set directories
-MODELS_DIR="$SOURCE_DIR/models"
-INPUTS_DIR="$SOURCE_DIR/inputs"
+TINY_MODELS_DIR="$SOURCE_DIR/openai_whisper-tiny"    # one folder per model size; the
+BASE_MODELS_DIR="$SOURCE_DIR/openai_whisper-base"    # "openai_whisper-<size>" name matches the
+SMALL_MODELS_DIR="$SOURCE_DIR/openai_whisper-small"  # path suffix built in src/whisperax.cpp
 
 # Make sure folders exist
-mkdir -p "$MODELS_DIR"
-mkdir -p "$INPUTS_DIR"
+# mkdir -p succeeds silently when the directory already exists and creates
+# it (plus any missing parents) otherwise, so no existence check is needed.
+for models_dir in \
+    "$TINY_MODELS_DIR" \
+    "$BASE_MODELS_DIR" \
+    "$SMALL_MODELS_DIR"
+do
+    mkdir -p "$models_dir"
+done
 
 # Download Whisper auxiliary models
 HF_ARGMAX_URL="https://huggingface.co/argmaxinc/whisperkit-android/resolve/main"
 
-aria2c $ARIA_OPTIONS -d "$INPUTS_DIR" -o converted_vocab.json $HF_ARGMAX_URL/converted_vocab.json
-aria2c $ARIA_OPTIONS -d "$MODELS_DIR" -o melspectrogram.tflite $HF_ARGMAX_URL/melspectrogram.tflite
-aria2c $ARIA_OPTIONS -d "$MODELS_DIR" -o postproc.tflite $HF_ARGMAX_URL/postproc.tflite
-aria2c $ARIA_OPTIONS -d "$MODELS_DIR" -o voice_activity_detection.tflite $HF_ARGMAX_URL/voice_activity_detection.tflite
+# The auxiliary files are shared by every model size: download whichever ones
+# are missing into the tiny folder, then mirror them into the other folders.
+[ -f "$TINY_MODELS_DIR/converted_vocab.json" ] || aria2c $ARIA_OPTIONS -d "$TINY_MODELS_DIR" -o converted_vocab.json $HF_ARGMAX_URL/converted_vocab.json
+[ -f "$TINY_MODELS_DIR/MelSpectrogram.tflite" ] || aria2c $ARIA_OPTIONS -d "$TINY_MODELS_DIR" -o MelSpectrogram.tflite $HF_ARGMAX_URL/melspectrogram.tflite
+[ -f "$TINY_MODELS_DIR/postproc.tflite" ] || aria2c $ARIA_OPTIONS -d "$TINY_MODELS_DIR" -o postproc.tflite $HF_ARGMAX_URL/postproc.tflite
+[ -f "$TINY_MODELS_DIR/voice_activity_detection.tflite" ] || aria2c $ARIA_OPTIONS -d "$TINY_MODELS_DIR" -o voice_activity_detection.tflite $HF_ARGMAX_URL/voice_activity_detection.tflite
+# Copy file by file: a "*" glob would also copy tiny's encoder/decoder on a re-run.
+for dest_dir in "$BASE_MODELS_DIR" "$SMALL_MODELS_DIR"; do
+    for aux_file in converted_vocab.json MelSpectrogram.tflite postproc.tflite voice_activity_detection.tflite; do
+        [ -f "$dest_dir/$aux_file" ] || cp "$TINY_MODELS_DIR/$aux_file" "$dest_dir/"
+    done
+done
 
 # Download Qualcomm models
 HF_QUALCOMM_URL="https://huggingface.co/qualcomm"
 
-aria2c $ARIA_OPTIONS -d "$MODELS_DIR" -o decoder_tiny.tflite $HF_QUALCOMM_URL/Whisper-Tiny-En/resolve/main/WhisperDecoder.tflite
-aria2c $ARIA_OPTIONS -d "$MODELS_DIR" -o encoder_tiny.tflite $HF_QUALCOMM_URL/Whisper-Tiny-En/resolve/main/WhisperEncoder.tflite
-aria2c $ARIA_OPTIONS -d "$MODELS_DIR" -o decoder_base.tflite $HF_QUALCOMM_URL/Whisper-Base-En/resolve/main/WhisperDecoder.tflite
-aria2c $ARIA_OPTIONS -d "$MODELS_DIR" -o encoder_base.tflite $HF_QUALCOMM_URL/Whisper-Base-En/resolve/main/WhisperEncoder.tflite
-aria2c $ARIA_OPTIONS -d "$MODELS_DIR" -o decoder_small.tflite $HF_QUALCOMM_URL/Whisper-Small-En/resolve/main/WhisperDecoder.tflite
-aria2c $ARIA_OPTIONS -d "$MODELS_DIR" -o encoder_small.tflite $HF_QUALCOMM_URL/Whisper-Small-En/resolve/main/WhisperEncoder.tflite
+# Download one Qualcomm Whisper file unless it already exists locally.
+#   $1  destination folder
+#   $2  local file name (the name src/whisperax.cpp loads)
+#   $3  Hugging Face repository name under $HF_QUALCOMM_URL
+#   $4  file name inside that repository
+fetch_qualcomm() {
+    if [ ! -f "$1/$2" ]; then
+        aria2c $ARIA_OPTIONS -d "$1" -o "$2" "$HF_QUALCOMM_URL/$3/resolve/main/$4"
+    fi
+}
+
+# Paths are quoted so a checkout location containing spaces still works.
+fetch_qualcomm "$TINY_MODELS_DIR" TextDecoder.tflite Whisper-Tiny-En WhisperDecoder.tflite
+fetch_qualcomm "$TINY_MODELS_DIR" AudioEncoder.tflite Whisper-Tiny-En WhisperEncoder.tflite
+fetch_qualcomm "$BASE_MODELS_DIR" TextDecoder.tflite Whisper-Base-En WhisperDecoder.tflite
+fetch_qualcomm "$BASE_MODELS_DIR" AudioEncoder.tflite Whisper-Base-En WhisperEncoder.tflite
+fetch_qualcomm "$SMALL_MODELS_DIR" TextDecoder.tflite Whisper-Small-En WhisperDecoder.tflite
+fetch_qualcomm "$SMALL_MODELS_DIR" AudioEncoder.tflite Whisper-Small-En WhisperEncoder.tflite
diff --git a/src/whisperax.cpp b/src/whisperax.cpp
@@ -106,12 +106,18 @@ int tflite_init(string argstr){
         (int)args["freq"], (int)args["ch"], format
     );
 
-    string tokenizer_json = root_path + "/inputs/converted_vocab.json";
-    string audio_model = root_path + "/models/voice_activity_detection.tflite";
-    string melspectro_model = root_path + "/models/melspectrogram.tflite";
-    string encoder_model = root_path + "/models/encoder_" + model_size + ".tflite";
-    string decoder_model = root_path + "/models/decoder_" + model_size + ".tflite";
-    string postproc_model = root_path + "/models/postproc.tflite";
+    // All files for one model size live in <root_path>/openai_whisper-<size>/.
+    // The file names below must stay in sync with the "-o" names used by
+    // scripts/download_models.sh.
+    const std::string whisper_model_dir =
+        root_path + "/openai_whisper-" + model_size;
+
+    std::string tokenizer_json = whisper_model_dir + "/converted_vocab.json";
+    std::string audio_model = whisper_model_dir + "/voice_activity_detection.tflite";
+    std::string melspectro_model = whisper_model_dir + "/MelSpectrogram.tflite";
+    std::string encoder_model = whisper_model_dir + "/AudioEncoder.tflite";
+    std::string decoder_model = whisper_model_dir + "/TextDecoder.tflite";
+    std::string postproc_model = whisper_model_dir + "/postproc.tflite";
 
     melspectro = make_unique<MODEL_SUPER_CLASS>("mel_spectrogram");
     encoder = make_unique<MODEL_SUPER_CLASS>("whisper_encoder");