Skip to content

Commit

Permalink
micro_message
Browse files Browse the repository at this point in the history
  • Loading branch information
Rubiksman78 committed Dec 25, 2022
1 parent 3520512 commit 7e0f655
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 20 deletions.
5 changes: 5 additions & 0 deletions AI_submod/main_ai.rpy
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,11 @@ label monika_chatting:
$ step += 1
return

if use_voice:
$ begin_speak = receiveMessage()
if begin_speak == "yes":
m 1subfb "Okay, I'm listening."

python:
client_socket.setblocking(0)
k = 0
Expand Down
67 changes: 48 additions & 19 deletions combined_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,9 @@
from PIL import Image
from torchvision import transforms

from speech_to_text import stt
import sys

IMG_SIZE = 256
imgProcessing=FacialImageProcessing(False)
test_transforms = transforms.Compose(
[
transforms.Resize((IMG_SIZE,IMG_SIZE)),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
]
)

#from speech_to_text import stt
import speech_recognition as sr
import whisper

auth_dict = json.load(open("auth.json"))
USERNAME = auth_dict["USERNAME"]
Expand All @@ -52,7 +41,7 @@
help='use chatbot')
parser.add_argument('--use_emotion_detection', type=bool, default=True,
help='use emotion detection')
parser.add_argument('--use_audio', type=bool, default=False,
parser.add_argument('--use_audio', type=bool, default=True,
help='use audio')
parser.add_argument('--emotion_time', type=int, default=10,
help='time between camera captures')
Expand Down Expand Up @@ -99,11 +88,41 @@

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

#emotion_model = keras.models.load_model('models/mobilenet_7.h5')
###Load the emotion model#####
# EfficientNet-B2 checkpoint for 7-class emotion recognition, loaded once at
# startup and moved to the configured torch device.
# NOTE(review): torch.load of a full pickled model is unsafe on untrusted
# files and ties the checkpoint to this code layout — confirm the .pt file
# is trusted/first-party.
emotion_model = torch.load('models/enet_b2_7.pt').to(device)

###Load the speech recognizer#####
# Module-level flag read later during transcription: when True, the Whisper
# call forces language='english'.
english = True
def init_stt(model="base", english=True,energy=300, pause=0.8, dynamic_energy=False):
    """Build the speech-to-text pipeline.

    Loads a Whisper model (the English-only ".en" variant when *english*
    is set and the size is not "large") and a speech_recognition
    Recognizer configured with the given thresholds.

    Args:
        model: Whisper model size ("base", "small", ..., "large").
        english: use the English-only checkpoint when available.
        energy: initial microphone energy threshold for the recognizer.
        pause: seconds of silence that end an utterance.
        dynamic_energy: let the recognizer adapt its energy threshold.

    Returns:
        tuple: (speech_recognition.Recognizer, loaded Whisper model).
    """
    # "large" has no English-only checkpoint; every other size does.
    model_name = model + ".en" if english and model != "large" else model
    audio_model = whisper.load_model(model_name)

    # Configure the recognizer's initial energy and pause thresholds.
    recognizer = sr.Recognizer()
    recognizer.energy_threshold = energy
    recognizer.pause_threshold = pause
    recognizer.dynamic_energy_threshold = dynamic_energy

    return recognizer, audio_model

# Shared speech-to-text state: a Recognizer and a Whisper model built with the
# default settings (base English model, energy 300, pause 0.8s). Used by the
# "begin_record" message handler further down.
r,audio_model = init_stt()

# prevents openCL usage and unnecessary logging messages
cv2.ocl.setUseOpenCL(False)


# Side length (pixels) the face crop is resized to before emotion inference.
IMG_SIZE = 256
# Face detector/processor; False presumably disables its debug/verbose mode —
# TODO confirm against FacialImageProcessing's constructor.
imgProcessing=FacialImageProcessing(False)
# Preprocessing applied to each face crop: resize, convert to tensor, then
# normalize with the standard ImageNet channel mean/std (matches the
# pretrained EfficientNet backbone's training statistics).
test_transforms = transforms.Compose(
    [
        transforms.Resize((IMG_SIZE,IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ]
)

# dictionary which assigns each label an emotion (alphabetical order)
emotion_dict = {0: "Angry", 1: "Disgusted", 2: "Fearful", 3: "Happy", 4: "Neutral", 5: "Sad", 6: "Surprised"}

Expand Down Expand Up @@ -181,7 +200,19 @@ async def listenToClient(client):

#Speech to text
if received_msg == "begin_record":
received_msg = stt()
#received_msg = stt()

with sr.Microphone(sample_rate=16000) as source:
sendMessage("yes".encode("utf-8"))
#get and save audio to wav file
audio = r.listen(source)
torch_audio = torch.from_numpy(np.frombuffer(audio.get_raw_data(), np.int16).flatten().astype(np.float32) / 32768.0)
audio_data = torch_audio
if english:
result = audio_model.transcribe(audio_data,language='english')
else:
result = audio_model.transcribe(audio_data)
received_msg = result['text']

print("User: "+received_msg)

Expand Down Expand Up @@ -267,7 +298,6 @@ async def listenToClient(client):
break

elif received_msg == "camera_int":
emotion_model = torch.load('models/enet_b2_7.pt').to(device)
# start the webcam feed
cap = cv2.VideoCapture(0)
ret, frame = cap.read()
Expand Down Expand Up @@ -299,7 +329,6 @@ async def listenToClient(client):
sendMessage(msg)

else:
emotion_model = torch.load('models/enet_b2_7.pt').to(device)
counter = received_msg[6:]
counter = int(counter)
if counter % EMOTION_TIME == 0:
Expand Down
2 changes: 1 addition & 1 deletion speech_to_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import numpy as np


def stt(model="base", english=True, verbose=False, energy=300, pause=0.8, dynamic_energy=False, save_file=False):
def stt(model="base", english=True, verbose=False, energy=300, pause=1, dynamic_energy=False, save_file=False):
if save_file:
temp_dir = tempfile.mkdtemp()
save_path = os.path.join(temp_dir, "temp.wav")
Expand Down

0 comments on commit 7e0f655

Please sign in to comment.