Merge pull request #45 from Huanshere/dev_v7
update language support
Huanshere authored Sep 15, 2024
2 parents 1f9e302 + c18aa8b commit a59bdf9
Showing 10 changed files with 122 additions and 135 deletions.
19 changes: 11 additions & 8 deletions README.md
@@ -63,17 +63,20 @@ https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7

- Audio length: currently only videos under 30 minutes are supported; we plan to extend this limit soon.

- Input language support (whisperX produces unstable timestamps and punctuation for some languages)
- Input language support:

| Input Language | Support Level | Demo Video |
|---------|---------|---------|
| English | 🤩 | [EN→ZH demo](https://github.com/user-attachments/assets/127373bb-c152-4b7a-8d9d-e586b2c62b4b) |
| Russian | 😊 | [RU→ZH demo](https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7) |
| French | 🤩 | [FR→JA demo](https://github.com/user-attachments/assets/3ce068c7-9854-4c72-ae77-f2484c7c6630)|
| German | ❓ (not yet tested) | |
| Spanish | ❓ (not yet tested) | |
| Japanese | 😖 ||
| Chinese | 😖 ||
| 🇬🇧🇺🇸 English | 🤩 | [EN→ZH demo](https://github.com/user-attachments/assets/127373bb-c152-4b7a-8d9d-e586b2c62b4b) |
| 🇷🇺 Russian | 😊 | [RU→ZH demo](https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7) |
| 🇫🇷 French | 🤩 | [FR→JA demo](https://github.com/user-attachments/assets/3ce068c7-9854-4c72-ae77-f2484c7c6630)|
| 🇩🇪 German | 🤩 | [DE→ZH demo](https://github.com/user-attachments/assets/07cb9d21-069e-4725-871d-c4d9701287a3) |
| 🇮🇹 Italian | 🤩 | [IT→ZH demo](https://github.com/user-attachments/assets/f1f893eb-dad3-4460-aaf6-10cac999195e) |
| 🇪🇸 Spanish | 🤩 | [ES→ZH demo](https://github.com/user-attachments/assets/c1d28f1c-83d2-4f13-a1a1-859bd6cc3553) |
| 🇯🇵 Japanese | 😐 | [JA→ZH demo](https://github.com/user-attachments/assets/856c3398-2da3-4e25-9c36-27ca2d1f68c2) |
| 🇨🇳 Chinese | 😖 ||

😖 whisper struggles to produce punctuation when generating word-level timestamps for Chinese.

- Output language support: VideoLingo supports translation into every language Claude knows.

15 changes: 7 additions & 8 deletions config.example.py
@@ -7,12 +7,12 @@
# For best results, please use claude-3-5-sonnet-20240620. A cheaper relay channel is available at https://api2.wlai.vip/register?aff=TXMB.
# For best results, please use claude-3-5-sonnet-20240620.
API_KEY = 'sk-xxx'
BASE_URL = 'https://api2.wlai.vip'
BASE_URL = 'https://api.wlai.vip'
MODEL = ['claude-3-5-sonnet-20240620']

# Replicate API settings
# Replicate API settings for using whisperX
REPLICATE_API_TOKEN = "r8_xxx"
REPLICATE_API_TOKEN = 'r8_xxx'

# Language settings, described in natural language
# Language settings, described in natural language
@@ -96,15 +96,14 @@
"en": "en_core_web_md",
"ru": "ru_core_news_md",
"fr": "fr_core_news_md",

# "es": "es_core_news_md",
# "de": "de_core_news_md",
# "it": "it_core_news_md",
"ja": "ja_core_news_md",
"es": "es_core_news_md",
"de": "de_core_news_md",
"it": "it_core_news_md",


# Not supported
# "zh": "zh_core_web_md",
# "ja": "ja_core_news_md",

}

# Languages whose words are separated by spaces
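The space-separated-language list is truncated in this view, but it backs the get_joiner helper that the transcription code imports from config. A minimal sketch of what such a helper might look like, with the language list hard-coded purely for illustration (the real constant in config is elided here):

def get_joiner(language: str) -> str:
    # Illustrative sketch only: languages written with spaces between words join
    # word chunks with " ", while languages like Chinese or Japanese join with "".
    space_separated = ["en", "ru", "fr", "es", "de", "it"]  # assumed list, not from this diff
    return " " if language in space_separated else ""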
63 changes: 1 addition & 62 deletions core/all_whisper_methods/whisperX.py
@@ -5,38 +5,10 @@
import pandas as pd
import json
from typing import Dict
import subprocess
import base64

sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from config import MODEL_DIR

def convert_video_to_audio(input_file: str) -> str:
os.makedirs('output/audio', exist_ok=True)
audio_file = 'output/audio/raw_full_audio.wav'

if not os.path.exists(audio_file):
ffmpeg_cmd = [
'ffmpeg',
'-i', input_file,
'-vn',
'-acodec', 'libmp3lame',
'-ar', '16000',
'-b:a', '64k',
audio_file
]
print(f"🎬➡️🎵 Converting to audio......")
subprocess.run(ffmpeg_cmd, check=True, stderr=subprocess.PIPE)
print(f"🎬➡️🎵 Converted <{input_file}> to <{audio_file}>\n")

return audio_file

def encode_file_to_base64(file_path: str) -> str:
print("🔄 Encoding audio file to base64...")
with open(file_path, 'rb') as file:
encoded = base64.b64encode(file.read()).decode('utf-8')
print("✅ File successfully encoded to base64")
return encoded
from core.all_whisper_methods.whisperXapi import process_transcription, convert_video_to_audio

def transcribe_audio(audio_file: str) -> Dict:
from config import WHISPER_LANGUAGE
@@ -69,39 +41,6 @@ def transcribe_audio(audio_file: str) -> Dict:
except Exception as e:
raise Exception(f"WhisperX processing error: {e}")

def process_transcription(result: Dict) -> pd.DataFrame:
from config import get_joiner, WHISPER_LANGUAGE
language = result['language'] if WHISPER_LANGUAGE == 'auto' else WHISPER_LANGUAGE # consider force english case
joiner = get_joiner(language)

all_words = []
for segment in result['segments']:
for word in segment['words']:
# ! For French, we need to convert guillemets to empty strings
word["word"] = word["word"].replace('»', '').replace('«', '')

if 'start' not in word and 'end' not in word:
if all_words:
# Merge with the previous word
all_words[-1]['text'] = f'{all_words[-1]["text"]}{joiner}{word["word"]}'
else:
# If it's the first word, temporarily save it and wait for the next word with a timestamp
temp_word = word["word"]
else:
# Normal case, with start and end times
word_dict = {
'text': f'{temp_word}{word["word"]}' if 'temp_word' in locals() else f'{word["word"]}',
'start': word.get('start', all_words[-1]['end'] if all_words else 0),
'end': word['end'],
'score': word.get('score', 0)
}

all_words.append(word_dict)
if 'temp_word' in locals():
del temp_word

return pd.DataFrame(all_words)

def save_results(df: pd.DataFrame):
os.makedirs('output/log', exist_ok=True)
excel_path = os.path.join('output/log', "cleaned_chunks.xlsx")
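With convert_video_to_audio and process_transcription now imported from whisperXapi, this module keeps only the whisperX-specific steps. A minimal sketch (not part of this commit) of how the pieces chain together; the wrapper name and video path are hypothetical:

def transcribe_video_demo(video_file: str = "input.mp4"):
    audio_file = convert_video_to_audio(video_file)  # shared helper from whisperXapi
    result = transcribe_audio(audio_file)            # local whisperX transcription
    df = process_transcription(result)               # shared word-level post-processing
    save_results(df)                                 # writes output/log/cleaned_chunks.xlsx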
31 changes: 19 additions & 12 deletions core/all_whisper_methods/whisperXapi.py
@@ -67,10 +67,6 @@ def transcribe_audio(audio_base64: str) -> Dict:
raise Exception(f"Error accessing whisperX API: {e} Please check your Replicate API key and internet connection.\n")

def process_transcription(result: Dict) -> pd.DataFrame:
from config import get_joiner, WHISPER_LANGUAGE
language = result['detected_language'] if WHISPER_LANGUAGE == 'auto' else WHISPER_LANGUAGE # consider force english case
joiner = get_joiner(language)

all_words = []
for segment in result['segments']:
for word in segment['words']:
@@ -79,23 +75,34 @@

if 'start' not in word and 'end' not in word:
if all_words:
# Merge with the previous word
all_words[-1]['text'] = f'{all_words[-1]["text"]}{joiner}{word["word"]}'
# Assign the end time of the previous word as the start and end time of the current word
word_dict = {
'text': word["word"],
'start': all_words[-1]['end'],
'end': all_words[-1]['end'],
}
all_words.append(word_dict)
else:
# If it's the first word, temporarily save it and wait for the next word with a timestamp
temp_word = word["word"]
# If it's the first word, look ahead for the next word that has a timestamp and use its times for the current word
next_word = next((w for w in segment['words'] if 'start' in w and 'end' in w), None)
if next_word:
word_dict = {
'text': word["word"],
'start': next_word["start"],
'end': next_word["end"],
}
all_words.append(word_dict)
else:
raise Exception(f"No next word with timestamp found for the current word : {word}")
else:
# Normal case, with start and end times
word_dict = {
'text': f'{temp_word}{word["word"]}' if 'temp_word' in locals() else f'{word["word"]}',
'text': f'{word["word"]}',
'start': word.get('start', all_words[-1]['end'] if all_words else 0),
'end': word['end'],
'score': word.get('score', 0)
}

all_words.append(word_dict)
if 'temp_word' in locals():
del temp_word

return pd.DataFrame(all_words)

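A minimal illustration (not part of this commit) of the new fallback: a word that arrives without timestamps now becomes its own row inheriting the previous word's end time, instead of being merged into the previous word. The sample words below are made up:

sample_result = {
    "segments": [{
        "words": [
            {"word": "Hello", "start": 0.00, "end": 0.40, "score": 0.99},
            {"word": "?"},  # no start/end -> gets start = end = 0.40 from the previous word
            {"word": "world", "start": 0.52, "end": 0.90, "score": 0.97},
        ]
    }]
}
df = process_transcription(sample_result)  # three rows, one per word, each with timestamps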
6 changes: 5 additions & 1 deletion core/ask_gpt.py
@@ -58,7 +58,7 @@ def make_api_call(client, model, messages, response_format):
response_format=response_format
)

def ask_gpt(prompt, model, response_json=True, valid_key='', log_title='default'):
def ask_gpt(prompt, model, response_json=True, valid_key='', valid_sub_key='', log_title='default'):
with LOCK:
if check_ask_gpt_history(prompt, model):
return check_ask_gpt_history(prompt, model)
@@ -82,6 +82,10 @@ def ask_gpt(prompt, model, response_json=True, valid_key='', log_title='default'
if valid_key and valid_key not in response_data:
print(f"❎ API response error: Missing '{valid_key}' key. Retrying...")
raise ValueError(f"Response missing '{valid_key}' key")
if valid_sub_key:
if not all(valid_sub_key in item for item in response_data.values()):
print(f"❎ API response error: Missing '{valid_sub_key}' sub-key in some items. Retrying...")
raise ValueError(f"Response missing '{valid_sub_key}' sub-key in some items")
break # Successfully accessed and parsed, break the loop
except Exception as e:
response_data = response.choices[0].message.content
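A hedged sketch (not part of this commit) of how the new valid_sub_key check might be called; the prompt contents and key names below are hypothetical:

# Expecting a JSON reply shaped like {"1": {"translation": ...}, "2": {"translation": ...}}
result = ask_gpt(
    prompt,
    model=MODEL[0],
    response_json=True,
    valid_key='1',                # this top-level key must exist in the parsed JSON
    valid_sub_key='translation',  # every value must contain this sub-key, or the call retries
    log_title='translate_demo'
)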
2 changes: 1 addition & 1 deletion core/prompts_storage.py
@@ -179,7 +179,6 @@ def get_prompt_faithfulness(lines, shared_prompt):
return prompt_faithfulness.strip()



def get_prompt_expressiveness(faithfulness_result, lines, shared_prompt):
from config import TARGET_LANGUAGE
json_format = {}
@@ -227,6 +226,7 @@ def get_prompt_expressiveness(faithfulness_result, lines, shared_prompt):
</subtitles>
### Output Format
Make sure to generate the correct Json format, don't output " in the value.
Please complete the following JSON data, where << >> represents placeholders that should not appear in your answer, and return your translation results in JSON format:
{json.dumps(json_format, ensure_ascii=False, indent=4)}
'''
10 changes: 2 additions & 8 deletions core/spacy_utils/load_nlp_model.py
@@ -21,16 +21,10 @@ def init_nlp():
nlp = spacy.load(model)
except:
print(f"Downloading {model} model...")
print("If download failed, please check your network and try again.")
download(model)
nlp = spacy.load(model)
except:
print(f"Language not detected, using en_core_web_sm model as fallback...")
model = "en_core_web_sm"
try:
nlp = spacy.load(model)
except:
print(f"Downloading {model} model...")
download(model)
nlp = spacy.load(model)
raise ValueError(f"❌ Failed to load NLP Spacy model: {model}")
print(f"✅ NLP Spacy model loaded successfully!")
return nlp
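Since init_nlp() now raises instead of silently falling back to en_core_web_sm, a caller sketch (not part of this commit) might surface the failure explicitly:

try:
    nlp = init_nlp()
except ValueError as e:
    # e.g. ask the user to install the missing spaCy model manually, then re-run
    print(f"spaCy model unavailable: {e}")
    raise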
70 changes: 55 additions & 15 deletions core/spacy_utils/split_by_connector.py
@@ -9,27 +9,67 @@ def analyze_connectors(doc, token):
Analyze whether a token is a connector that should trigger a sentence split.
Processing logic and order:
1. Check if the token is one of the target connectors (that, which, where, when).
2. For 'that', check if it's part of a contraction (e.g., that's, that'll).
3. For all connectors, check if they function as a 'mark' dependent of a verb.
4. For 'which', 'where', 'when', check if they function as determiners or pronouns
for nouns or proper nouns.
5. Default to splitting for 'which', 'where', 'when' if no other conditions are met.
6. For 'and', 'or', 'but', check if they connect two independent clauses.
1. Check if the token is one of the target connectors based on the language.
2. For 'that' (English), check if it's part of a contraction (e.g., that's, that'll).
3. For all connectors, check if they function as a specific dependency of a verb or noun.
4. Default to splitting for certain connectors if no other conditions are met.
5. For coordinating conjunctions, check if they connect two independent clauses.
"""
# Check if the token is one of the target connectors
if token.text.lower() not in ["that", "which", "where", "when", "because", "but", "and", "or"]:
lang = doc.lang_
if lang == "en":
connectors = ["that", "which", "where", "when", "because", "but", "and", "or"]
mark_dep = "mark"
det_pron_deps = ["det", "pron"]
verb_pos = "VERB"
noun_pos = ["NOUN", "PROPN"]
elif lang == "ja":
connectors = ["けれども", "しかし", "だから", "それで", "ので", "のに", "ため"]
mark_dep = "mark"
det_pron_deps = ["case"]
verb_pos = "VERB"
noun_pos = ["NOUN", "PROPN"]
elif lang == "fr":
connectors = ["que", "qui", "où", "quand", "parce que", "mais", "et", "ou"]
mark_dep = "mark"
det_pron_deps = ["det", "pron"]
verb_pos = "VERB"
noun_pos = ["NOUN", "PROPN"]
elif lang == "ru":
connectors = ["что", "который", "где", "когда", "потому что", "но", "и", "или"]
mark_dep = "mark"
det_pron_deps = ["det"]
verb_pos = "VERB"
noun_pos = ["NOUN", "PROPN"]
elif lang == "es":
connectors = ["que", "cual", "donde", "cuando", "porque", "pero", "y", "o"]
mark_dep = "mark"
det_pron_deps = ["det", "pron"]
verb_pos = "VERB"
noun_pos = ["NOUN", "PROPN"]
elif lang == "de":
connectors = ["dass", "welche", "wo", "wann", "weil", "aber", "und", "oder"]
mark_dep = "mark"
det_pron_deps = ["det", "pron"]
verb_pos = "VERB"
noun_pos = ["NOUN", "PROPN"]
elif lang == "it":
connectors = ["che", "quale", "dove", "quando", "perché", "ma", "e", "o"]
mark_dep = "mark"
det_pron_deps = ["det", "pron"]
verb_pos = "VERB"
noun_pos = ["NOUN", "PROPN"]
else:
return False, False

if token.text.lower() not in connectors:
return False, False

if token.text.lower() == "that":
if token.dep_ == "mark" and token.head.pos_ == "VERB":
# Split if 'that' is a 'mark' dependent of a verb
if lang == "en" and token.text.lower() == "that":
if token.dep_ == mark_dep and token.head.pos_ == verb_pos:
return True, False
else:
# Don't split for other uses of 'that'
return False, False
elif token.text.lower() != "that" and token.dep_ in ["det", "pron"] and token.head.pos_ in ["NOUN", "PROPN"]:
# Don't split if 'which', 'where', 'when' are determiners or pronouns for nouns
elif token.dep_ in det_pron_deps and token.head.pos_ in noun_pos:
return False, False
else:
return True, False
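A small usage sketch (not part of this commit) of the now language-aware connector analysis, assuming the first element of the returned tuple flags a split point; the sample sentence is arbitrary:

import spacy

nlp = spacy.load("en_core_web_md")
doc = nlp("I bought the book that you recommended and I liked it.")
for token in doc:
    should_split, _ = analyze_connectors(doc, token)
    if should_split:
        print(f"candidate split before: {token.text!r}")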
9 changes: 7 additions & 2 deletions core/spacy_utils/split_by_mark.py
@@ -23,8 +23,13 @@ def split_by_mark(nlp):
sentences_by_mark = [sent.text for sent in doc.sents]

with open("output/log/sentence_by_mark.txt", "w", encoding="utf-8") as output_file:
for sentence in sentences_by_mark:
output_file.write(sentence + "\n")
for i, sentence in enumerate(sentences_by_mark):
if i > 0 and sentence.strip() in [',', '.', ',', '。', '?', '!']:
# ! If the current line contains only punctuation, merge it with the previous line, this happens in Chinese, Japanese, etc.
output_file.seek(output_file.tell() - 1, os.SEEK_SET) # Move to the end of the previous line
output_file.write(sentence) # Add the punctuation
else:
output_file.write(sentence + "\n")

print("💾 Sentences split by punctuation marks saved to → `sentences_by_mark.txt`")

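One possible alternative (not part of this commit) to seeking backwards in the open text-mode file is to merge stray punctuation in memory first and write once; a minimal sketch using the same variables:

merged = []
for sentence in sentences_by_mark:
    if merged and sentence.strip() in [',', '.', ',', '。', '?', '!']:
        merged[-1] += sentence.strip()  # glue lone punctuation onto the previous sentence
    else:
        merged.append(sentence)
with open("output/log/sentence_by_mark.txt", "w", encoding="utf-8") as output_file:
    output_file.write("\n".join(merged) + "\n")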