From f01011064ba62bbaa187b731c23337450df07ecf Mon Sep 17 00:00:00 2001
From: Luka Milic
Date: Wed, 27 Mar 2024 22:22:23 +0000
Subject: [PATCH 1/8] added support for the open ai whisper api

---
 src/subsai/configs.py                  |  9 +++
 src/subsai/models/whisper_api_model.py | 77 ++++++++++++++++++++++++++
 2 files changed, 86 insertions(+)
 create mode 100644 src/subsai/models/whisper_api_model.py

diff --git a/src/subsai/configs.py b/src/subsai/configs.py
index fc92980..50f1160 100644
--- a/src/subsai/configs.py
+++ b/src/subsai/configs.py
@@ -15,6 +15,7 @@
 from subsai.models.whispercpp_model import WhisperCppModel
 from subsai.utils import get_available_devices, available_translation_models
 from subsai.models.stable_ts_model import StableTsModel
+from subsai.models.whisper_api_model import WhisperAPIModel
 
 AVAILABLE_MODELS = {
     'openai/whisper': {
@@ -62,6 +63,14 @@
         'url': 'https://github.com/jianfch/stable-ts',
         'config_schema': StableTsModel.config_schema,
     },
+    'API/openai/whisper': {
+        'class': WhisperAPIModel,
+        'description': 'Whisper is a general-purpose speech recognition model. It is trained on a large dataset of '
+                       'diverse audio and is also a multi-task model that can perform multilingual speech recognition '
+                       'as well as speech translation and language identification.',
+        'url': 'https://github.com/openai/whisper',
+        'config_schema': WhisperAPIModel.config_schema,
+    },
 }
 
 BASIC_TOOLS_CONFIGS = {
diff --git a/src/subsai/models/whisper_api_model.py b/src/subsai/models/whisper_api_model.py
new file mode 100644
index 0000000..5bddb49
--- /dev/null
+++ b/src/subsai/models/whisper_api_model.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Whisper API Model
+
+See [openai/whisper](https://github.com/openai/whisper)
+"""
+
+from typing import Tuple
+import pysubs2
+from subsai.models.abstract_model import AbstractModel
+from subsai.utils import _load_config, get_available_devices
+
+from openai import OpenAI
+
+import subprocess
+import os
+import ffmpeg
+
+# result_cache = "1\n00:00:00,000 --> 00:00:05,280\nIt has been an interesting few days for the pursuit of Artificial General Intelligence,\n\n ..."  # hard-coded SRT transcript (~300 cues) left over from debugging, truncated here
+from pysubs2 import SSAFile
+def convert_video_to_audio_ffmpeg(video_file, output_ext="mp3"):
+    # Construct the output file name
+    filename, ext = os.path.splitext(video_file)
+    output_file = f"{filename}.{output_ext}"
+
+    # Execute the ffmpeg conversion
+    (
+        ffmpeg
+        .input(video_file)
+        .output(output_file)
+        .overwrite_output()
+        .run(quiet=True)
+    )
+
+    return output_file
+
+
+class WhisperAPIModel(AbstractModel):
+    model_name = 'openai/whisper'
+    config_schema = {
+        # load model config
+        'model_type': {
+            'type': list,
+            'description': "One of the official model names listed by `whisper.available_models()`, or "
+                           "path to a model checkpoint containing the model dimensions and the model "
+                           "state_dict.",
+            'options': ['whisper-1'],
+            'default': 'whisper-1'
+        },
+        'api_key': {
+            'type': str,
+            'description': "text or tokens to prefix the current context",
+            'options': None,
+            'default': None
+        },
+    }
+
+    def __init__(self, model_config):
+        # config
+        self.model_type = _load_config('model_type', model_config, self.config_schema)
+        self.api_key = _load_config('api_key', model_config, self.config_schema)
+
+        self.client = OpenAI(api_key=self.api_key)
+
+
+    def transcribe(self, media_file) -> str:
+        audio_file_path = convert_video_to_audio_ffmpeg(media_file)
+        audio_file = open(audio_file_path, "rb")
+        result = self.client.audio.transcriptions.create(
+            model=self.model_type,
+            file=audio_file,
+            response_format="srt"
+        )
+        return SSAFile.from_string(result)
+
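For context, trying the new model through subsai's high-level interface should look roughly like the sketch below. This is a minimal example and not part of the diff: the SubsAI.create_model / SubsAI.transcribe entry points are assumed from the project's existing README, while the 'API/openai/whisper' key and its config options come from the patch above.

    from subsai import SubsAI

    subs_ai = SubsAI()
    # create the API-backed model registered in configs.py above; the key is a placeholder
    model = subs_ai.create_model('API/openai/whisper',
                                 {'model_type': 'whisper-1', 'api_key': 'sk-...'})
    subs = subs_ai.transcribe('media.mp4', model)   # returns a pysubs2 SSAFile
    subs.save('media.srt')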
From 860fbd89864dc6268a2cd72f32292b5d2999f4e2 Mon Sep 17 00:00:00 2001
From: Luka Milic
Date: Wed, 27 Mar 2024 22:28:33 +0000
Subject: [PATCH 2/8] tidy up new model

---
 src/subsai/configs.py                  |  4 +---
 src/subsai/models/whisper_api_model.py | 18 ++++++------------
 2 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/src/subsai/configs.py b/src/subsai/configs.py
index 50f1160..0966ca0 100644
--- a/src/subsai/configs.py
+++ b/src/subsai/configs.py
@@ -65,9 +65,7 @@
     },
     'API/openai/whisper': {
         'class': WhisperAPIModel,
-        'description': 'Whisper is a general-purpose speech recognition model. It is trained on a large dataset of '
-                       'diverse audio and is also a multi-task model that can perform multilingual speech recognition '
-                       'as well as speech translation and language identification.',
+        'description': 'API variant of the OpenAI whisper model, just requires an api key',
         'url': 'https://github.com/openai/whisper',
         'config_schema': WhisperAPIModel.config_schema,
     },
diff --git a/src/subsai/models/whisper_api_model.py b/src/subsai/models/whisper_api_model.py
index 5bddb49..7cd36c3 100644
--- a/src/subsai/models/whisper_api_model.py
+++ b/src/subsai/models/whisper_api_model.py
@@ -4,22 +4,16 @@
 """
 Whisper API Model
 
-See [openai/whisper](https://github.com/openai/whisper)
+See [openai/whisper](https://platform.openai.com/docs/guides/speech-to-text)
 """
 
-from typing import Tuple
-import pysubs2
-from subsai.models.abstract_model import AbstractModel
-from subsai.utils import _load_config, get_available_devices
-
-from openai import OpenAI
-
-import subprocess
 import os
 import ffmpeg
-
-# result_cache = "1\n00:00:00,000 --> 00:00:05,280\nIt has been an interesting few days for the pursuit of Artificial General Intelligence,\n\n ..."  # hard-coded SRT transcript (~300 cues) left over from debugging, truncated here
+from subsai.models.abstract_model import AbstractModel
+from subsai.utils import _load_config
+from openai import OpenAI
 from pysubs2 import SSAFile
+
 def convert_video_to_audio_ffmpeg(video_file, output_ext="mp3"):
     # Construct the output file name
     filename, ext = os.path.splitext(video_file)
@@ -51,7 +45,7 @@ class WhisperAPIModel(AbstractModel):
         },
         'api_key': {
             'type': str,
-            'description': "text or tokens to prefix the current context",
+            'description': "Your OpenAI API key",
             'options': None,
             'default': None
         },
enjoyed some time to relax.\n\n299\n00:16:18,320 --> 00:16:21,679\nIt'll have been the slowest 12 months of AI progress\n\n300\n00:16:21,679 --> 00:16:23,440\nfor quite some time to come.\n\n301\n00:16:23,440 --> 00:16:27,119\nHopefully you'll join me as I cover that progress in the coming months.\n\n302\n00:16:27,119 --> 00:16:29,520\nThank you, as always, for watching to the end\n\n303\n00:16:29,520 --> 00:16:31,760\nand have a wonderful day.\n\n\n" +from subsai.models.abstract_model import AbstractModel +from subsai.utils import _load_config +from openai import OpenAI from pysubs2 import SSAFile + def convert_video_to_audio_ffmpeg(video_file, output_ext="mp3"): # Construct the output file name filename, ext = os.path.splitext(video_file) @@ -51,7 +45,7 @@ class WhisperAPIModel(AbstractModel): }, 'api_key': { 'type': str, - 'description': "text or tokens to prefix the current context", + 'description': "Your OpenAI API key", 'options': None, 'default': None }, From 15df3526480c63c1cbcf9acdde607e6c5e9db372 Mon Sep 17 00:00:00 2001 From: Luka Milic Date: Wed, 27 Mar 2024 23:25:29 +0000 Subject: [PATCH 3/8] update requirements.txt to include openai api package --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index fed3d6d..def5875 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,5 @@ pywhispercpp==1.1.1 dl_translate==0.3.0 faster_whisper whisperx @ git+https://github.com/m-bain/whisperx.git -stable-ts \ No newline at end of file +stable-ts +openai \ No newline at end of file From a77163ca02c3f883becc26d3c57aceeef4fc95aa Mon Sep 17 00:00:00 2001 From: Luka Milic Date: Thu, 28 Mar 2024 15:15:32 +0000 Subject: [PATCH 4/8] Added support for audio files greater than 25MB in OpenAI Whisper API --- src/subsai/models/whisper_api_model.py | 82 ++++++++++++++++++++++---- 1 file changed, 70 insertions(+), 12 deletions(-) diff --git a/src/subsai/models/whisper_api_model.py b/src/subsai/models/whisper_api_model.py index 7cd36c3..3e3a06a 100644 --- a/src/subsai/models/whisper_api_model.py +++ b/src/subsai/models/whisper_api_model.py @@ -9,16 +9,29 @@ import os import ffmpeg +import tempfile from subsai.models.abstract_model import AbstractModel from subsai.utils import _load_config from openai import OpenAI from pysubs2 import SSAFile +from pydub import AudioSegment + +TMPDIR = tempfile.gettempdir() + +def split_filename(filepath): + path, full_filename = os.path.split(filepath) + filename, ext = os.path.splitext(full_filename) + return path,filename,ext + +path,filename,ext = split_filename('/Users/luka/Desktop/y2mate.is - AGI Inches Closer 5 Key Quotes Altman Huang and The Most Interesting Year -fPzp_sdCf2Y-1080pp-1711573970.mp3') def convert_video_to_audio_ffmpeg(video_file, output_ext="mp3"): # Construct the output file name - filename, ext = os.path.splitext(video_file) - output_file = f"{filename}.{output_ext}" + path,filename,ext = split_filename(video_file) + output_file = os.path.join(TMPDIR,f"{filename}.{output_ext}") + + print('Saving audio to {} with ffmpeg...'.format(output_file)) # Execute the ffmpeg conversion ( ffmpeg @@ -37,9 +50,8 @@ class WhisperAPIModel(AbstractModel): # load model config 'model_type': { 'type': list, - 'description': "One of the official model names listed by `whisper.available_models()`, or " - "path to a model checkpoint containing the model dimensions and the model " - "state_dict.", + 'description': "OpenAI Whisper API, currently only supports large-v2 which is named as 
whisper-1/ \ + There is a 25mb upload limit so audio is chunked locally, this may lead to lower performance.", 'options': ['whisper-1'], 'default': 'whisper-1' }, @@ -57,15 +69,61 @@ def __init__(self, model_config): self.api_key = _load_config('api_key', model_config, self.config_schema) self.client = OpenAI(api_key=self.api_key) + + def chunk_audio(self,audio_file_path) -> list: + # Load the audio file + audio = AudioSegment.from_mp3(audio_file_path) + + # Desired chunk size in megabytes (MB) + chunk_size_mb = 5 + chunk_size_bits = chunk_size_mb * 1024 * 1024 * 8 + bitrate = audio.frame_rate * audio.frame_width + chunk_duration_ms = ((chunk_size_bits) / bitrate) * 1000 + + chunks = [] + + # Split the audio into chunks + current_ms = 0 + while current_ms < len(audio): + # Calculate the end of the current chunk + end_ms = current_ms + chunk_duration_ms + # Create a chunk from the current position to the end position + chunk = audio[current_ms:int(end_ms)] + # Add the chunk to the list of chunks + chunks.append((chunk,current_ms)) + # Update the current position + current_ms = end_ms + + return chunks def transcribe(self, media_file) -> str: + audio_file_path = convert_video_to_audio_ffmpeg(media_file) - audio_file = open(audio_file_path, "rb") - result = self.client.audio.transcriptions.create( - model=self.model_type, - file=audio_file, - response_format="srt" - ) - return SSAFile.from_string(result) + + chunks = self.chunk_audio(audio_file_path) + + # Export each chunk as needed + results = '' + + for i, (chunk,offset) in enumerate(chunks): + chunk_path = os.path.join(TMPDIR,f'chunk_{i}.mp3') + print(chunk_path) + chunk.export(chunk_path, format='mp3') + + audio_file = open(chunk_path, "rb") + result = self.client.audio.transcriptions.create( + model=self.model_type, + file=audio_file, + response_format="srt" + ) + # shift subtitles by offset + result = SSAFile.from_string(result) + print('SHIFTING {}'.format(offset)) + result.shift(ms=offset) + results += result.to_string('srt') + + results = ''.join(results) + + return SSAFile.from_string(results) From 11022660fde0ba4944839fb9f1d755cec61476f2 Mon Sep 17 00:00:00 2001 From: Luka Milic Date: Thu, 28 Mar 2024 15:52:10 +0000 Subject: [PATCH 5/8] Add more arguments to whisper api model --- src/subsai/configs.py | 4 +-- src/subsai/models/whisper_api_model.py | 42 ++++++++++++++++++++------ 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/src/subsai/configs.py b/src/subsai/configs.py index 0966ca0..e003974 100644 --- a/src/subsai/configs.py +++ b/src/subsai/configs.py @@ -65,8 +65,8 @@ }, 'API/openai/whisper': { 'class': WhisperAPIModel, - 'description': 'API variant of the OpenAI whisper model, just requires an api key', - 'url': 'https://github.com/openai/whisper', + 'description': 'API for the OpenAI large-v2 Whisper model, requires an API key.', + 'url': 'https://platform.openai.com/docs/guides/speech-to-text', 'config_schema': WhisperAPIModel.config_schema, }, } diff --git a/src/subsai/models/whisper_api_model.py b/src/subsai/models/whisper_api_model.py index 3e3a06a..079ca9d 100644 --- a/src/subsai/models/whisper_api_model.py +++ b/src/subsai/models/whisper_api_model.py @@ -17,6 +17,7 @@ from pydub import AudioSegment TMPDIR = tempfile.gettempdir() +OPENAI_API_SIZE_LIMIT_MB = 24 def split_filename(filepath): path, full_filename = os.path.split(filepath) @@ -40,10 +41,8 @@ def convert_video_to_audio_ffmpeg(video_file, output_ext="mp3"): .overwrite_output() .run(quiet=True) ) - return output_file - class 
WhisperAPIModel(AbstractModel): model_name = 'openai/whisper' config_schema = { @@ -61,12 +60,33 @@ class WhisperAPIModel(AbstractModel): 'options': None, 'default': None }, + 'language': { + 'type': str, + 'description': "The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.", + 'options': None, + 'default': None + }, + 'prompt': { + 'type': str, + 'description': "An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.", + 'options': None, + 'default': None + }, + 'temperature': { + 'type': float, + 'description': "The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.", + 'options': None, + 'default': 0 + } } def __init__(self, model_config): # config self.model_type = _load_config('model_type', model_config, self.config_schema) self.api_key = _load_config('api_key', model_config, self.config_schema) + self.language = _load_config('language', model_config, self.config_schema) + self.prompt = _load_config('prompt', model_config, self.config_schema) + self.temperature = _load_config('temperature', model_config, self.config_schema) self.client = OpenAI(api_key=self.api_key) @@ -75,8 +95,7 @@ def chunk_audio(self,audio_file_path) -> list: audio = AudioSegment.from_mp3(audio_file_path) # Desired chunk size in megabytes (MB) - chunk_size_mb = 5 - chunk_size_bits = chunk_size_mb * 1024 * 1024 * 8 + chunk_size_bits = OPENAI_API_SIZE_LIMIT_MB * 1024 * 1024 * 8 bitrate = audio.frame_rate * audio.frame_width chunk_duration_ms = ((chunk_size_bits) / bitrate) * 1000 @@ -89,7 +108,7 @@ def chunk_audio(self,audio_file_path) -> list: end_ms = current_ms + chunk_duration_ms # Create a chunk from the current position to the end position chunk = audio[current_ms:int(end_ms)] - # Add the chunk to the list of chunks + # Add the chunk to the list of chunks and include offset chunks.append((chunk,current_ms)) # Update the current position current_ms = end_ms @@ -103,23 +122,26 @@ def transcribe(self, media_file) -> str: chunks = self.chunk_audio(audio_file_path) - # Export each chunk as needed results = '' for i, (chunk,offset) in enumerate(chunks): chunk_path = os.path.join(TMPDIR,f'chunk_{i}.mp3') - print(chunk_path) + print('Saving audio chunk {} to {}'.format(i,chunk_path)) chunk.export(chunk_path, format='mp3') - audio_file = open(chunk_path, "rb") + + # Use OpenAI Whisper API result = self.client.audio.transcriptions.create( - model=self.model_type, + model=self.model_type, + language=self.language, + prompt=self.prompt, + temperature=self.temperature, file=audio_file, response_format="srt" ) + # shift subtitles by offset result = SSAFile.from_string(result) - print('SHIFTING {}'.format(offset)) result.shift(ms=offset) results += result.to_string('srt') From 0ee158f35d265bb2ace6dfe9bd400156c02daf9d Mon Sep 17 00:00:00 2001 From: Luka Milic Date: Thu, 28 Mar 2024 16:06:40 +0000 Subject: [PATCH 6/8] Added API model to the README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 2bf3f64..8edf605 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,8 @@ >- 🗣️ VAD preprocessing, reduces hallucination & batching with no WER degradation. 
* [x] :new: [jianfch/stable-ts](https://github.com/jianfch/stable-ts) * >**Stabilizing Timestamps for Whisper**: This library modifies [Whisper](https://github.com/openai/whisper) to produce more reliable timestamps and extends its functionality. + * [x] [API/openai/whisper](https://platform.openai.com/docs/guides/speech-to-text) + * > OpenAI Whisper via the official OpenAI API (requires an API key) + * Web UI * Fully offline, no third party services From 3cc15f482b242ba6382067d88b81fff1c11f9d6b Mon Sep 17 00:00:00 2001 From: Luka Milic Date: Thu, 28 Mar 2024 21:18:28 +0000 Subject: [PATCH 7/8] Add default api key value to whisper openai api model --- src/subsai/models/whisper_api_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/subsai/models/whisper_api_model.py index 079ca9d..877af7e 100644 --- a/src/subsai/models/whisper_api_model.py +++ b/src/subsai/models/whisper_api_model.py @@ -58,7 +58,7 @@ class WhisperAPIModel(AbstractModel): 'type': str, 'description': "Your OpenAI API key", 'options': None, - 'default': None + 'default': os.environ.get('OPENAI_KEY', None) }, From 50086f5dce1cffee4d5c7cf5b3383b1fa82b8279 Mon Sep 17 00:00:00 2001 From: Luka Milic Date: Fri, 29 Mar 2024 12:46:54 +0000 Subject: [PATCH 8/8] remove debug line and add better print messages --- src/subsai/models/whisper_api_model.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/subsai/models/whisper_api_model.py index 877af7e..4c6ff59 100644 --- a/src/subsai/models/whisper_api_model.py +++ b/src/subsai/models/whisper_api_model.py @@ -24,8 +24,6 @@ def split_filename(filepath): filename, ext = os.path.splitext(full_filename) return path,filename,ext -path,filename,ext = split_filename('/Users/luka/Desktop/y2mate.is - AGI Inches Closer 5 Key Quotes Altman Huang and The Most Interesting Year -fPzp_sdCf2Y-1080pp-1711573970.mp3') def convert_video_to_audio_ffmpeg(video_file, output_ext="mp3"): # Construct the output file name path,filename,ext = split_filename(video_file) @@ -126,7 +124,7 @@ def transcribe(self, media_file) -> str: for i, (chunk,offset) in enumerate(chunks): chunk_path = os.path.join(TMPDIR,f'chunk_{i}.mp3') - print('Saving audio chunk {} to {}'.format(i,chunk_path)) + print('Transcribing audio chunk {}/{}'.format(i,len(chunks))) chunk.export(chunk_path, format='mp3') audio_file = open(chunk_path, "rb") @@ -140,6 +138,9 @@ def transcribe(self, media_file) -> str: response_format="srt" ) + with open(chunk_path+'.srt','w') as f: + f.write(result) + # shift subtitles by offset result = SSAFile.from_string(result) result.shift(ms=offset)
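
Taken together, the patches above register a new 'API/openai/whisper' entry whose config schema exposes model_type, api_key, language, prompt and temperature, and whose transcribe() returns a pysubs2 SSAFile. The snippet below is a minimal usage sketch based only on that schema, instantiating the model class directly rather than through the subsai front end: the media file name is a placeholder, and the OPENAI_KEY environment variable fallback is the default added in PATCH 7/8.

    # Minimal usage sketch (assumes these patches are applied and the openai,
    # pydub, ffmpeg-python and pysubs2 dependencies are installed).
    import os
    from subsai.models.whisper_api_model import WhisperAPIModel

    model = WhisperAPIModel({
        'model_type': 'whisper-1',                # the only option exposed by the schema
        'api_key': os.environ.get('OPENAI_KEY'),  # schema default introduced in PATCH 7/8
        'language': 'en',                         # optional, ISO-639-1 code
        'prompt': None,                           # optional style/context hint
        'temperature': 0,                         # deterministic output
    })

    # 'talk.mp4' is a placeholder path: audio is extracted with ffmpeg, chunked,
    # transcribed chunk by chunk via the API, and merged into one subtitle file.
    subs = model.transcribe('talk.mp4')
    subs.save('talk.srt')                         # pysubs2 SSAFile, so any supported format works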
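
The key idea in PATCH 4/8 and 5/8 is working around the API upload limit: the extracted audio is cut into chunks that stay under the size cap, each chunk is transcribed to SRT separately, and each partial result is shifted by its chunk's start offset before the pieces are concatenated. Below is a condensed, self-contained sketch of that chunk-and-shift merge using the same pydub and pysubs2 calls as the patch; it is not the literal project code, transcribe_chunk() is a stand-in for the OpenAI call so the example runs without an API key, and the chunk duration is estimated from the uncompressed audio size, which keeps the exported mp3 chunks comfortably under the cap.

    # Sketch of the chunk-and-shift merge from PATCH 4/8 and 5/8.
    # transcribe_chunk() is a placeholder for
    # client.audio.transcriptions.create(..., response_format='srt').
    from pydub import AudioSegment
    from pysubs2 import SSAFile

    OPENAI_API_SIZE_LIMIT_MB = 24

    def chunk_audio(audio_file_path):
        """Split an mp3 into (chunk, start_offset_ms) pairs that stay below the size cap."""
        audio = AudioSegment.from_mp3(audio_file_path)
        bytes_per_second = audio.frame_rate * audio.frame_width   # raw PCM byte rate
        chunk_duration_ms = int(OPENAI_API_SIZE_LIMIT_MB * 1024 * 1024
                                / bytes_per_second * 1000)
        chunks, current_ms = [], 0
        while current_ms < len(audio):                             # len() is the duration in ms
            chunks.append((audio[current_ms:current_ms + chunk_duration_ms], current_ms))
            current_ms += chunk_duration_ms
        return chunks

    def transcribe_chunk(chunk) -> str:
        # Placeholder: return an SRT string for this chunk, timed from 00:00:00.
        return "1\n00:00:00,000 --> 00:00:01,000\n...\n"

    def transcribe(audio_file_path) -> SSAFile:
        merged_srt = ''
        for chunk, offset_ms in chunk_audio(audio_file_path):
            partial = SSAFile.from_string(transcribe_chunk(chunk))
            partial.shift(ms=offset_ms)            # re-anchor this chunk's timestamps
            merged_srt += partial.to_string('srt')
        return SSAFile.from_string(merged_srt)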