add AWS Polly,Transcribe

liou666 · Jul 7, 2023 · 68a023c · 68a023c
1 parent 51040df
commit 68a023c
Show file tree

Hide file tree

Showing 7 changed files with 296 additions and 59 deletions.
diff --git a/package.json b/package.json
@@ -34,13 +34,19 @@
     "release:version": "npx standard-version && git push origin --follow-tags"
   },
   "dependencies": {
+    "@aws-sdk/client-cognito-identity": "^3.363.0",
+    "@aws-sdk/client-polly": "^3.363.0",
+    "@aws-sdk/client-transcribe-streaming": "^3.363.0",
+    "@aws-sdk/credential-provider-cognito-identity": "^3.363.0",
     "@iconify-json/svg-spinners": "^1.1.1",
     "@vueuse/core": "^9.13.0",
     "api2d": "^0.1.18",
+    "aws-sdk": "^2.1409.0",
     "dexie": "^3.2.3",
     "electron-updater": "^5.3.0",
     "element-plus": "^2.3.3",
     "eventsource-parser": "^0.1.0",
+    "microphone-stream": "^6.0.1",
     "microsoft-cognitiveservices-speech-sdk": "^1.26.0",
     "pinia": "^2.0.33",
     "pinia-plugin-persistedstate": "^3.1.0",

diff --git a/src/config.ts b/src/config.ts
@@ -190,6 +190,16 @@ export const supportLanguageMap = {
   'zh-TW': '中文(台湾普通话)',
 } as Record<string, string>
 
+export const awsRegions = [ // AWS region codes; NOTE(review): presumably the choices for the `awsRegion` setting — confirm
+  'us-east-1',
+  'us-east-2',
+  'us-west-1',
+  'us-west-2',
+  'ap-east-1',
+  'ap-southeast-1',
+  'eu-central-1',
+]
+
 export const azureRegions = [
   'australiaeast',
   'australiasoutheast',

diff --git a/src/constant.ts b/src/constant.ts
@@ -2,6 +2,8 @@ export const OPEN_KEY = 'openKey'
 export const OPEN_PROXY = 'openProxy'
 export const AZURE_REGION = 'azureRegion'
 export const AZURE_KEY = 'azureKey'
+export const AWS_REGION = 'awsRegion'
+export const AWS_COGNITO_IDENTITY_POOL_ID = 'awsCognitoIdentityPoolId'
 export const AZURE_TRANSLATE_KEY = 'azureTranslateKey'
 export const VOICE_API_NAME = 'voiceApiName'
 export const IS_ALWAYS_RECOGNITION = 'isAlwaysRecognition'

diff --git a/src/hooks/useGlobalSetting.ts b/src/hooks/useGlobalSetting.ts
@@ -1,4 +1,4 @@
-import { AUTO_PLAY, AZURE_KEY, AZURE_REGION, AZURE_TRANSLATE_KEY, CHAT_API_NAME, CHAT_REMEMBER_COUNT, IS_ALWAYS_RECOGNITION, OPEN_KEY, OPEN_MAX_TOKEN, OPEN_MODEL, OPEN_PROXY, SELF_AVATAR_URL, TTS_PASSWORD, VOICE_API_NAME } from '@/constant'
+import { AUTO_PLAY,AWS_COGNITO_IDENTITY_POOL_ID, AWS_KEY,AWS_REGION,AWS_SECRET_KEY,AZURE_KEY, AZURE_REGION, AZURE_TRANSLATE_KEY, CHAT_API_NAME, CHAT_REMEMBER_COUNT, IS_ALWAYS_RECOGNITION, OPEN_KEY, OPEN_MAX_TOKEN, OPEN_MODEL, OPEN_PROXY, SELF_AVATAR_URL, TTS_PASSWORD, VOICE_API_NAME } from '@/constant'
 
 import { getAvatarUrl } from '@/utils'
 
@@ -7,6 +7,9 @@ export const useGlobalSetting = () => {
   const openProxy = useLocalStorage(OPEN_PROXY, '')
   const azureRegion = useLocalStorage(AZURE_REGION, 'eastasia')
   const azureKey = useLocalStorage(AZURE_KEY, '')
+  const awsRegion = useLocalStorage(AWS_REGION, 'us-east-1')
+  const awsCognitoIdentityId = useLocalStorage(AWS_COGNITO_IDENTITY_POOL_ID, '')
+
   const openModel = useLocalStorage(OPEN_MODEL, 'gpt-3.5-turbo')
   const selfAvatar = useLocalStorage(SELF_AVATAR_URL, getAvatarUrl('self.png'))
   const chatApiName = useLocalStorage(CHAT_API_NAME, 'openAI')
@@ -22,6 +25,8 @@ export const useGlobalSetting = () => {
     openKey,
     openProxy,
     openModel,
+    awsRegion,
+    awsCognitoIdentityId,
     azureRegion,
     azureKey,
     selfAvatar,

diff --git a/src/hooks/useSpeechService.ts b/src/hooks/useSpeechService.ts
@@ -8,6 +8,16 @@ import {
   SpeechSynthesizer,
 } from 'microsoft-cognitiveservices-speech-sdk'
 
+import MicrophoneStream from 'microphone-stream';
+import { CognitoIdentityClient } from "@aws-sdk/client-cognito-identity";
+import {fromCognitoIdentityPool} from "@aws-sdk/credential-provider-cognito-identity";
+import { Polly,SynthesizeSpeechInput,DescribeVoicesCommand } from "@aws-sdk/client-polly";
+import {
+  TranscribeStreamingClient,
+  StartStreamTranscriptionCommand,
+} from '@aws-sdk/client-transcribe-streaming';
+
+
 const defaultAzureRegion = import.meta.env.VITE_REGION
 const defaultAzureKey = import.meta.env.VITE_SCRIPTION_KEY
 const accessPassword = import.meta.env.VITE_TTS_ACCESS_PASSWORD
@@ -17,8 +27,13 @@ interface Config {
   isFetchAllVoice?: boolean
 }
 export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'zh-CN', 'zh-HK', 'ko-KR', 'de-DE'], isFetchAllVoice = true }: Config = {}) => {
-  const { azureKey, azureRegion, ttsPassword } = useGlobalSetting()
+  const { azureKey, azureRegion, ttsPassword,voiceApiName } = useGlobalSetting()
+  const { awsCognitoIdentityId, awsRegion, } = useGlobalSetting()
 
+
+  if(voiceApiName.value==="AWS"){
+    isFetchAllVoice=false;
+  }
   const resultAzureKey = computed(() => {
     if (!azureKey.value) {
       if (accessPassword !== ttsPassword.value)
@@ -58,6 +73,7 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
   const audioBlob = ref<Blob>(new Blob())
 
   const allVoices = ref<VoiceInfo[]>([])
+  const allAWSVoices = ref<any[]>([])
 
   const recognizer = ref<SpeechRecognizer>(new SpeechRecognizer(speechConfig.value))
   const synthesizer = ref<SpeechSynthesizer>(new SpeechSynthesizer(speechConfig.value))
@@ -74,8 +90,28 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
     immediate: true,
   })
 
-  // 语音识别
 
+  // AWS Polly and Transcribe SDK initialization
+  const audioAWS = new Audio();
+  let micStream: MicrophoneStream | undefined = undefined
+
+  // Each client gets its own config object, Cognito client and credential provider,
+  // all built from the region and identity pool id read from the global settings.
+  const resolveAWSRegion = () => awsRegion.value ?? "us-east-1"
+  const buildAWSClientConfig = () => {
+    const cognitoClient = new CognitoIdentityClient({ region: resolveAWSRegion() })
+    const credentials = fromCognitoIdentityPool({
+      client: cognitoClient,
+      identityPoolId: awsCognitoIdentityId.value
+    })
+    return { region: resolveAWSRegion(), credentials }
+  }
+
+  const polly = new Polly(buildAWSClientConfig());
+  const transcribe = new TranscribeStreamingClient(buildAWSClientConfig());
+
+
+  // Azure speech recognition
   const audioRecorder = async () => {
     // 暂时通过 mediaRecorder 方式实现录音保存，后续可能会改为直接通过 SpeechRecognizer 实现保存
 
@@ -250,16 +286,41 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
       catch (error) {
         allVoices.value = []
       }
+      const res = await synthesizer.value.getVoicesAsync()
+      if (res.errorDetails) {
+        console.error(`获取语音列表失败：${res.errorDetails}, 请检查语音配置`)
+        return []
+      }
+      return res.voices
+    }else{
+      return []
     }
 
-    const res = await synthesizer.value.getVoicesAsync()
-    if (res.errorDetails) {
-      console.error(`获取语音列表失败：${res.errorDetails}, 请检查语音配置`)
-      return []
+
+  }
+
+  // Fetch the AWS Polly voice list and cache a trimmed copy in `allAWSVoices`.
+  async function getAWSVoices() {
+    const params = {
+      LanguageCode: "en-US" // only en-US voices are requested
+    };
+
+    try {
+      const data = await polly.describeVoices(params)
+      if(data.Voices){
+        allAWSVoices.value=data.Voices.map((item)=>{ // keep only the id and gender of each voice
+          return {"id":item.Id,"gender":item.Gender}
+        })
+      }  
+      return data.Voices??[]; // the caller receives the untrimmed voice objects
+    } catch (error) {
+      console.error("Error retrieving AWS voices:", error); // failures are logged and reported as an empty list
+      return [];
     }
-    return res.voices
+
   }
 
+
   function applySynthesizerConfiguration() {
     // 通过playback结束事件来判断播放结束
     player.value = new SpeakerAudioDestination()
@@ -279,6 +340,100 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
     synthesizer.value = new SpeechSynthesizer(speechConfig.value, speakConfig)
   }
 
+  /* AWS Voice service */
+  // Streams microphone audio to AWS Transcribe until the audio or result stream ends.
+  // `cb` receives every transcript update; resolves with the last transcript received.
+  const startAWSRecognizeSpeech = async (cb?: (text: string) => void) => {
+    // Keep a local handle: a later call may replace the shared `micStream`.
+    const mic = new MicrophoneStream();
+    micStream = mic;
+    mic.setStream(
+      await window.navigator.mediaDevices.getUserMedia({
+        video: false,
+        audio: true,
+      })
+    );
+
+    isRecognizing.value = true
+    const MAX_AUDIO_CHUNK_SIZE = 48000
+
+    // Encode one microphone chunk as 16-bit little-endian PCM.
+    const pcmEncodeChunk = (chunk: Buffer) => {
+      const input = MicrophoneStream.toRaw(chunk);
+      const buffer = new ArrayBuffer(input.length * 2);
+      const view = new DataView(buffer);
+      for (let i = 0; i < input.length; i++) {
+        const s = Math.max(-1, Math.min(1, input[i]));
+        view.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7fff, true);
+      }
+      return Buffer.from(buffer);
+    };
+
+    // Audio events sent to Transcribe; chunks above the size limit are skipped.
+    const audioStream = async function* () {
+      for await (const chunk of mic as unknown as AsyncIterable<Buffer>) {
+        if (chunk.length <= MAX_AUDIO_CHUNK_SIZE) {
+          yield {
+            AudioEvent: {
+              AudioChunk: pcmEncodeChunk(chunk),
+            },
+          }
+        }
+      }
+    };
+
+    const command = new StartStreamTranscriptionCommand({
+      LanguageCode: language.value,
+      MediaEncoding: "pcm",
+      // NOTE(review): assumes the microphone captures at 44.1 kHz — confirm against the AudioContext.
+      MediaSampleRateHertz: 44100,
+      AudioStream: audioStream(),
+    });
+
+    let resultText = ""
+    try {
+      const response = await transcribe.send(command);
+      for await (const event of response.TranscriptResultStream ?? []) {
+        for (const result of event.TranscriptEvent?.Transcript?.Results ?? []) {
+          for (const alternative of result.Alternatives ?? []) {
+            // `Items` may be absent; treat that as an empty transcript instead of throwing.
+            const transcript = (alternative.Items ?? []).map(item => item.Content).join(" ");
+            resultText = transcript;
+            cb?.(transcript)
+          }
+        }
+      }
+    } finally {
+      // Release the microphone and reset the flag even when the request fails.
+      mic.stop()
+      isRecognizing.value = false
+    }
+    return resultText
+  }
+
+  /** Stops the microphone stream assigned by startAWSRecognizeSpeech, if there is one. */
+  const stopAWSRecognizeSpeech = () => {
+    if (micStream) micStream.stop()
+  }
+
+  /** Speaks `text` through AWS Polly; `voice` is a Polly voice id and falls back to 'Joanna'. */
+  const awsTextToSpeak = async (text: string, voice?: string) => {
+    const params: SynthesizeSpeechInput = {
+      Text: text,
+      OutputFormat: 'mp3',
+      // `||` on purpose: an empty string is not a usable voice id either.
+      VoiceId: (voice || 'Joanna') as SynthesizeSpeechInput['VoiceId'],
+    };
+    const response = await polly.synthesizeSpeech(params);
+    if (response.AudioStream) {
+      const buffer = await response.AudioStream.transformToByteArray();
+      // Release the previous clip's object URL so repeated speech does not accumulate blobs.
+      if (audioAWS.src.startsWith('blob:')) URL.revokeObjectURL(audioAWS.src);
+      audioAWS.src = URL.createObjectURL(new Blob([buffer], { type: 'audio/mpeg' }));
+      audioAWS.play().catch(error => console.error("Error playing AWS audio:", error));
+    }
+  }
+
   return {
     languages,
     language,
@@ -289,16 +444,23 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
     isRecognizReadying,
     startRecognizeSpeech,
     stopRecognizeSpeech,
+    startAWSRecognizeSpeech,
+    stopAWSRecognizeSpeech,
     recognizeSpeech,
     textToSpeak,
+    awsTextToSpeak,
     ssmlToSpeak,
     stopTextToSpeak,
     getVoices,
+    getAWSVoices,
     allVoices,
+    allAWSVoices,
     isSynthesizing,
     rate,
     style,
     audioBlob,
     player,
+    audioAWS,
+
   }
 }