Skip to content

Commit

Permalink
finished audio transcription feature
Browse files Browse the repository at this point in the history
  • Loading branch information
aniturza committed Jan 11, 2024
1 parent 39fe12b commit 3258198
Show file tree
Hide file tree
Showing 14 changed files with 158 additions and 151 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -407,3 +407,4 @@ FodyWeavers.xsd

# JetBrains Rider
*.sln.iml
src/AIHub/appsettings.Development.json
226 changes: 99 additions & 127 deletions src/AIHub/Controllers/AudioTranscriptionController.cs
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
namespace MVCWeb.Controllers;

public class AudioTrancriptionController : Controller
using System;
using System.Net.Http;
using System.Net.Http.Headers;
using System.Threading.Tasks;
using Newtonsoft.Json;
using Microsoft.AspNetCore.Mvc;
using System.Net;
using Newtonsoft.Json.Linq;
using Microsoft.VisualBasic;

public class AudioTranscriptionController : Controller
{
private readonly ILogger<HomeController> _logger;
private readonly IConfiguration _config;
private string FormRecogEndpoint;
private string FormRecogSubscriptionKey;
private string AOAIendpoint;
private string AOAIsubscriptionKey;
private string SpeechRegion;
private string SpeechSubscriptionKey;
private string storageconnstring;
private readonly BlobServiceClient blobServiceClient;
private readonly BlobContainerClient containerClient;
Expand All @@ -16,28 +23,25 @@ public class AudioTrancriptionController : Controller


//Results
string result_image_front;
string result_message_front;



private FormAnalyzerModel model;
private AudioTranscriptionModel model;


public AudioTrancriptionController(IConfiguration config)
public AudioTranscriptionController(IConfiguration config)
{
_config = config;
FormRecogEndpoint = _config.GetValue<string>("FormAnalyzer:FormRecogEndpoint");
FormRecogSubscriptionKey = _config.GetValue<string>("FormAnalyzer:FormRecogSubscriptionKey");
AOAIendpoint = _config.GetValue<string>("FormAnalyzer:OpenAIEndpoint");
AOAIsubscriptionKey = _config.GetValue<string>("FormAnalyzer:OpenAISubscriptionKey");
SpeechRegion = _config.GetValue<string>("AudioTranscription:SpeechLocation");
SpeechSubscriptionKey = _config.GetValue<string>("AudioTranscription:SpeechSubscriptionKey");
storageconnstring = _config.GetValue<string>("Storage:ConnectionString");
BlobServiceClient blobServiceClient = new BlobServiceClient(storageconnstring);
containerClient = blobServiceClient.GetBlobContainerClient(_config.GetValue<string>("FormAnalyzer:ContainerName"));
containerClient = blobServiceClient.GetBlobContainerClient(_config.GetValue<string>("AudioTranscription:ContainerName"));
sasUri = containerClient.GenerateSasUri(Azure.Storage.Sas.BlobContainerSasPermissions.Read, DateTimeOffset.UtcNow.AddHours(1));
// Obtiene una lista de blobs en el contenedor
blobs = containerClient.GetBlobs();
model = new FormAnalyzerModel();
model = new AudioTranscriptionModel();
}

public IActionResult AudioTranscription()
Expand All @@ -46,127 +50,95 @@ public IActionResult AudioTranscription()
}

[HttpPost]
public async Task<IActionResult> TranscribeAudio(string image_url, string prompt)
public async Task<IActionResult> TranscribeAudio(string audio_url, IFormFile imageFile)
{

string audio = audio_url + sasUri.Query;

//1. Get Image
string image = image_url + sasUri.Query;
Console.WriteLine(image);
//ViewBag.PdfUrl = "http://docs.google.com/gview?url="+image+"&embedded=true";
ViewBag.PdfUrl = image;
string output_result;

HttpClient client = new HttpClient();
client.BaseAddress = new Uri(FormRecogEndpoint);

// Add an Accept header for JSON format.
client.DefaultRequestHeaders.Accept.Add(
new MediaTypeWithQualityHeaderValue("application/json"));
client.DefaultRequestHeaders.Add("Ocp-Apim-Subscription-Key", FormRecogSubscriptionKey);
// CALL 1: STT 3.1

var content = new
{
urlSource = image
};
var json = System.Text.Json.JsonSerializer.Serialize(content);
// Crear un HttpContent con el JSON y el tipo de contenido
HttpContent content_body = new StringContent(json, Encoding.UTF8, "application/json");
// List data response.
HttpResponseMessage response = await client.PostAsync(FormRecogEndpoint, content_body); // Blocking call! Program will wait here until a response is received or a timeout occurs.
var client = new HttpClient();
var request = new HttpRequestMessage(HttpMethod.Post, "https://"+SpeechRegion+".api.cognitive.microsoft.com/speechtotext/v3.1/transcriptions");
request.Headers.Add("Ocp-Apim-Subscription-Key", SpeechSubscriptionKey);
var content = new StringContent("{\r\n\"contentUrls\": [\r\n \"" + audio + "\"\r\n ],\r\n \"locale\": \"es-es\",\r\n \"displayName\": \"My Transcription\",\r\n \"model\": null,\r\n \"properties\": {\r\n \"wordLevelTimestampsEnabled\": true,\r\n \"languageIdentification\": {\r\n \"candidateLocales\": [\r\n \"en-US\", \"de-DE\", \"es-ES\"\r\n ]\r\n }\r\n }\r\n}", null, "application/json");
request.Content = content;
var response = await client.SendAsync(request);
response.EnsureSuccessStatusCode();

//string responseBody = await response.Content.ReadAsStringAsync();
string operation_location_url = response.Headers.GetValues("Operation-Location").FirstOrDefault();

//Console.WriteLine(await response.Content.ReadAsStringAsync());
var responsejson = JsonConvert.DeserializeObject<dynamic>(await response.Content.ReadAsStringAsync());
Console.WriteLine(responsejson);
var output_result = responsejson.self.ToString();
Console.WriteLine("SELF: "+output_result);

client.Dispose();


//llamar a GET OPERATION
HttpClient client2 = new HttpClient();
client2.BaseAddress = new Uri(operation_location_url);

// Add an Accept header for JSON format.
client2.DefaultRequestHeaders.Accept.Add(
new MediaTypeWithQualityHeaderValue("application/json"));
client2.DefaultRequestHeaders.Add("Ocp-Apim-Subscription-Key", FormRecogSubscriptionKey);

// Crear un HttpContent con el JSON y el tipo de contenido
// List data response.
HttpResponseMessage response2 = await client2.GetAsync(operation_location_url); // Blocking call! Program will wait here until a response is received or a timeout occurs.
Console.WriteLine(response2);
// CALL 2: CHECK FOR FINISH
var client2 = new HttpClient();
var request2 = new HttpRequestMessage(HttpMethod.Get, output_result);
client2.DefaultRequestHeaders.Add("Ocp-Apim-Subscription-Key", SpeechSubscriptionKey);
var content2 = new StringContent(string.Empty);
content2.Headers.ContentType = new MediaTypeHeaderValue("application/json");
request2.Content = content2;
var response2 = await client2.SendAsync(request2);
response2.EnsureSuccessStatusCode();
var responseBody = await response2.Content.ReadAsStringAsync();
var responsejson = JsonConvert.DeserializeObject<dynamic>(await response2.Content.ReadAsStringAsync());

//var analyzeresult = responseBody.analyzeResult;
while (responsejson.status != "succeeded")
{
Thread.Sleep(10000);
response2 = await client2.GetAsync(operation_location_url);
responsejson = JsonConvert.DeserializeObject<dynamic>(await response2.Content.ReadAsStringAsync());
}
output_result = responsejson.analyzeResult.content.ToString();

// Above three lines can be replaced with new helper method below
// string responseBody = await client.GetStringAsync(uri);

// Parse the response as JSON
// var operationLocation= await response.Headers.ReadAsStringAsync();

//Console.WriteLine(await response2.Content.ReadAsStringAsync());
var responsejson2 = JsonConvert.DeserializeObject<dynamic>(await response.Content.ReadAsStringAsync());
Console.WriteLine(responsejson2);
while (responsejson2.status != "Succeeded")
{
Thread.Sleep(10000);
response2 = await client2.GetAsync(output_result);
responsejson2 = JsonConvert.DeserializeObject<dynamic>(await response2.Content.ReadAsStringAsync());
Console.WriteLine(responsejson2.status);
}
client2.Dispose();


try
{
// CALL 3: GET RESULTS URL

var client3 = new HttpClient();
var request3 = new HttpRequestMessage(HttpMethod.Get, output_result+"/files/");
request3.Headers.Add("Ocp-Apim-Subscription-Key", SpeechSubscriptionKey);
var content3 = new StringContent(string.Empty);
content3.Headers.ContentType = new MediaTypeHeaderValue("application/json");
request3.Content = content3;
var response3 = await client3.SendAsync(request3);
response3.EnsureSuccessStatusCode();
var responsejson3 = JsonConvert.DeserializeObject<dynamic>(await response3.Content.ReadAsStringAsync());
Console.WriteLine(responsejson3);
// Extract contentUrl field
string output_result3 = (string)responsejson3["values"][0]["links"]["contentUrl"];
Console.WriteLine(output_result3);
client3.Dispose();

// CALL 4: GET RESULTS (TRANSCRIPTION)

var client4 = new HttpClient();
var request4 = new HttpRequestMessage(HttpMethod.Get, output_result3);
request4.Headers.Add("Ocp-Apim-Subscription-Key", SpeechSubscriptionKey);
var content4 = new StringContent(string.Empty);
content4.Headers.ContentType = new MediaTypeHeaderValue("application/json");
request4.Content = content4;
var response4 = await client4.SendAsync(request4);
response4.EnsureSuccessStatusCode();
Console.WriteLine(await response4.Content.ReadAsStringAsync());
var jsonObject4 = JsonConvert.DeserializeObject<JObject>(await response4.Content.ReadAsStringAsync());
string output_result4 = (string)jsonObject4["combinedRecognizedPhrases"][0]["lexical"];
Console.WriteLine(output_result4);
client4.Dispose();


//Show transcript results
ViewBag.Message = "TRANSCRIPTION RESULTS: \n\n"+output_result4;

OpenAIClient client_oai = new OpenAIClient(
new Uri(AOAIendpoint),
new AzureKeyCredential(AOAIsubscriptionKey));

// ### If streaming is not selected
Response<ChatCompletions> responseWithoutStream = await client_oai.GetChatCompletionsAsync(
"DemoBuild",
new ChatCompletionsOptions()
{
Messages =
{
new ChatMessage(ChatRole.System, @"You are specialized in understanding PDFs and answering questions about it. Document OCR result is: "+output_result),
new ChatMessage(ChatRole.User, @"User question: "+prompt ),
},
Temperature = (float)0.7,
MaxTokens = 1000,
NucleusSamplingFactor = (float)0.95,
FrequencyPenalty = 0,
PresencePenalty = 0,
});

ChatCompletions completions = responseWithoutStream.Value;
ChatChoice results_analisis = completions.Choices[0];
ViewBag.Message =
//"Hate severity: " + (response.Value.HateResult?.Severity ?? 0);
results_analisis.Message.Content
;

/* result_image_front=image;
Console.WriteLine("1) "+result_image_front);
Console.WriteLine("2) "+result_message_front);
/* ViewBag.Message =
results_analisis.Message.Content
; */
//ViewBag.Image=result_image_front+".jpg";

}
catch (RequestFailedException ex)
{
throw;
}

// var result = await _service.GetBuildingHomeAsync();
// return Ok(result);
return View("AudioTranscription", model);
}
/// <summary>
/// Simple data holder for a speech-to-text JSON payload that exposes a single
/// <c>text</c> field.
/// </summary>
/// <remarks>
/// NOTE(review): no code visible in this view deserializes into this type —
/// confirm whether it is still needed.
/// </remarks>
public class SpeechToTextResponse
{
    /// <summary>
    /// Value bound to the JSON property named <c>text</c>.
    /// </summary>
    [JsonProperty(PropertyName = "text")]
    public string Text { get; set; }
}

//Upload a file to my azure storage account
[HttpPost]
Expand All @@ -176,19 +148,19 @@ public async Task<IActionResult> UploadFile(IFormFile imageFile, string prompt)

if (CheckNullValues(imageFile))
{
ViewBag.Message = "You must upload an image";
ViewBag.Message = "You must upload an mp3 audio file";
return View("AudioTranscription");
}

//Upload file to azure storage account
string url = imageFile.FileName.ToString();
Console.WriteLine(url);
//Console.WriteLine(url);
url = url.Replace(" ", "");
Console.WriteLine(url);
//Console.WriteLine(url);
BlobClient blobClient = containerClient.GetBlobClient(url);
var httpHeaders = new BlobHttpHeaders
{
ContentType = "application/pdf",
ContentType = "audio/mpeg",
};
await blobClient.UploadAsync(imageFile.OpenReadStream(), new BlobUploadOptions { HttpHeaders = httpHeaders });

Expand All @@ -197,13 +169,13 @@ public async Task<IActionResult> UploadFile(IFormFile imageFile, string prompt)

if (CheckImageExtension(blobUrl.ToString()))
{
ViewBag.Message = "You must upload a document with .mp3 extension";
ViewBag.Message = "You must upload an audio file with .mp3 extension";
return View("AudioTranscription", model);
}


//Call EvaluateImage with the url
await TranscribeAudio(blobUrl.ToString(), prompt);
await TranscribeAudio(blobUrl.ToString(), imageFile);
ViewBag.Waiting = null;

return View("AudioTranscription", model);
Expand All @@ -229,7 +201,7 @@ private bool CheckNullValues(IFormFile imageFile)
private bool CheckImageExtension(string blobUri)
{
string uri_lower = blobUri;
if (uri_lower.Contains(".pdf", StringComparison.OrdinalIgnoreCase))
if (uri_lower.Contains(".mp3", StringComparison.OrdinalIgnoreCase))
{
return false;
}
Expand Down
20 changes: 6 additions & 14 deletions src/AIHub/Views/AudioTranscription/AudioTranscription.cshtml
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
<svg style="fill: var(--main-color)" xmlns="http://www.w3.org/2000/svg" height="4em"
viewBox="0 0 512 512"><!--! Font Awesome Free 6.4.2 by @@fontawesome - https://fontawesome.com License - https://fontawesome.com/license (Commercial License) Copyright 2023 Fonticons, Inc. -->
<path
d="M320 464c8.8 0 16-7.2 16-16V160H256c-17.7 0-32-14.3-32-32V48H64c-8.8 0-16 7.2-16 16V448c0 8.8 7.2 16 16 16H320zM0 64C0 28.7 28.7 0 64 0H229.5c17 0 33.3 6.7 45.3 18.7l90.5 90.5c12 12 18.7 28.3 18.7 45.3V448c0 35.3-28.7 64-64 64H64c-35.3 0-64-28.7-64-64V64z" />
<path d="M64 0C28.7 0 0 28.7 0 64V448c0 35.3 28.7 64 64 64H320c35.3 0 64-28.7 64-64V160H256c-17.7 0-32-14.3-32-32V0H64zM256 0V128H384L256 0zm2 226.3c37.1 22.4 62 63.1 62 109.7s-24.9 87.3-62 109.7c-7.6 4.6-17.4 2.1-22-5.4s-2.1-17.4 5.4-22C269.4 401.5 288 370.9 288 336s-18.6-65.5-46.5-82.3c-7.6-4.6-10-14.4-5.4-22s14.4-10 22-5.4zm-91.9 30.9c6 2.5 9.9 8.3 9.9 14.8V400c0 6.5-3.9 12.3-9.9 14.8s-12.9 1.1-17.4-3.5L113.4 376H80c-8.8 0-16-7.2-16-16V312c0-8.8 7.2-16 16-16h33.4l35.3-35.3c4.6-4.6 11.5-5.9 17.4-3.5zm51 34.9c6.6-5.9 16.7-5.3 22.6 1.3C249.8 304.6 256 319.6 256 336s-6.2 31.4-16.3 42.7c-5.9 6.6-16 7.1-22.6 1.3s-7.1-16-1.3-22.6c5.1-5.7 8.1-13.1 8.1-21.3s-3.1-15.7-8.1-21.3c-5.9-6.6-5.3-16.7 1.3-22.6z"/>
</svg>
<h1 class="sectionTitle">Audio Trancription</h1>
<p class="sectionSubTitle">Analiza tus audios usando Azure Speech Service</p>
<h1 class="sectionTitle">Audio Transcription</h1>
<p class="sectionSubTitle">Analiza tus audios usando Azure AI Speech</p>
<p class="sectionDetails">Sólo necesitas subir un audio (.mp3).</p>

</div>
Expand All @@ -22,11 +22,6 @@
@Html.Raw(ViewBag.Message.Replace("\n", "<br />"))
</div>
</div>
<div class="col-md-6">
<div class="alert alert-primary" role="alert">
<iframe src="@ViewBag.PdfUrl" style="width:100%;height:500px;" frameborder="0"></iframe>
</div>
</div>
</div>
}
<form asp-controller="AudioTranscription" asp-action="UploadFile" method="post" enctype="multipart/form-data">
Expand All @@ -42,15 +37,12 @@
<div class="col-md-6">

<div class="form-group">
<label for="imageFile">Image File:</label><br>
<label for="imageFile">Audio File:</label><br>
<input type="file" class="form-control-file" id="imageFile" name="imageFile" />
</br>
<label for="imageFile">Prompt:</label><br>
<textarea class="form-control" id="prompt" name="prompt"
rows="3"> @(Model?.Prompt ?? "Summarize document") </textarea>
</div>
<div id="loadingPanel" style="display: none;">Loading...</div>
<button type="submit" class="btn btn-primary" onclick="submitForm()">Upload PDF</button>
<div id="loadingPanel" style="display: none;">Transcribing...</div>
<button type="submit" class="btn btn-primary" onclick="submitForm()">Transcribe audio</button>

</div>
<script>
Expand Down
2 changes: 1 addition & 1 deletion src/AIHub/Views/Shared/_Layout.cshtml
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@

<footer class="border-top footer text-muted">
<div class="container">
&copy; 2023 - Customer Success Unit Spain - Authors: Roberto Arocha, Hector Blasco, Monica Calleja, Ane Iturzaeta - <a asp-area="" asp-controller="Home" asp-action="Privacy">Privacy</a>
&copy; 2024 - Customer Success Unit Spain - Authors: Roberto Arocha, Hector Blasco, Monica Calleja, Ane Iturzaeta - <a asp-area="" asp-controller="Home" asp-action="Privacy">Privacy</a>
</div>
</footer>
<script src="~/lib/jquery/dist/jquery.min.js"></script>
Expand Down
Loading

0 comments on commit 3258198

Please sign in to comment.