Merge pull request #45 from Huanshere/dev_v7
update language support
Huanshere authored Sep 15, 2024
2 parents 1f9e302 + c18aa8b commit a59bdf9
Showing 10 changed files with 122 additions and 135 deletions.
19 changes: 11 additions & 8 deletions README.md
@@ -63,17 +63,20 @@ https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7

- Audio length: currently only videos under 30 minutes are supported; we plan to extend this limit soon.

- Input language support (whisperX produces unstable timestamps and punctuation for some languages)
- Input language support:

| Input Language | Support Level | Demo Video |
|---------|---------|---------|
| English | 🤩 | [EN→ZH demo](https://github.com/user-attachments/assets/127373bb-c152-4b7a-8d9d-e586b2c62b4b) |
| Russian | 😊 | [RU→ZH demo](https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7) |
| French | 🤩 | [FR→JA demo](https://github.com/user-attachments/assets/3ce068c7-9854-4c72-ae77-f2484c7c6630)|
| German | ❓ (not yet tested) | |
| Spanish | ❓ (not yet tested) | |
| Japanese | 😖 ||
| Chinese | 😖 ||
| 🇬🇧🇺🇸 English | 🤩 | [EN→ZH demo](https://github.com/user-attachments/assets/127373bb-c152-4b7a-8d9d-e586b2c62b4b) |
| 🇷🇺 Russian | 😊 | [RU→ZH demo](https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7) |
| 🇫🇷 French | 🤩 | [FR→JA demo](https://github.com/user-attachments/assets/3ce068c7-9854-4c72-ae77-f2484c7c6630)|
| 🇩🇪 German | 🤩 | [DE→ZH demo](https://github.com/user-attachments/assets/07cb9d21-069e-4725-871d-c4d9701287a3) |
| 🇮🇹 Italian | 🤩 | [IT→ZH demo](https://github.com/user-attachments/assets/f1f893eb-dad3-4460-aaf6-10cac999195e) |
| 🇪🇸 Spanish | 🤩 | [ES→ZH demo](https://github.com/user-attachments/assets/c1d28f1c-83d2-4f13-a1a1-859bd6cc3553) |
| 🇯🇵 Japanese | 😐 | [JA→ZH demo](https://github.com/user-attachments/assets/856c3398-2da3-4e25-9c36-27ca2d1f68c2) |
| 🇨🇳 Chinese | 😖 ||

😖 whisper struggles to produce punctuation when generating word-level timestamps for Chinese.

- Output language support: VideoLingo supports translation into every language Claude knows.

15 changes: 7 additions & 8 deletions config.example.py
@@ -7,12 +7,12 @@
# For best results, please use claude-3-5-sonnet-20240620. A cheaper relay channel is available at https://api2.wlai.vip/register?aff=TXMB.
# For best results, please use claude-3-5-sonnet-20240620.
API_KEY = 'sk-xxx'
BASE_URL = 'https://api2.wlai.vip'
BASE_URL = 'https://api.wlai.vip'
MODEL = ['claude-3-5-sonnet-20240620']

# Replicate API settings
# Replicate API settings for using whisperX
REPLICATE_API_TOKEN = "r8_xxx"
REPLICATE_API_TOKEN = 'r8_xxx'

# Language settings, described in natural language
# Language settings, described in natural language
@@ -96,15 +96,14 @@
"en": "en_core_web_md",
"ru": "ru_core_news_md",
"fr": "fr_core_news_md",

# "es": "es_core_news_md",
# "de": "de_core_news_md",
# "it": "it_core_news_md",
"ja": "ja_core_news_md",
"es": "es_core_news_md",
"de": "de_core_news_md",
"it": "it_core_news_md",


# Not supported
# "zh": "zh_core_web_md",
# "ja": "ja_core_news_md",

}

# Languages whose words are separated by spaces
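The space-separated-language list is truncated in this view, but it backs the get_joiner helper that the transcription code imports from config. A minimal sketch of what such a helper might look like, with the language list hard-coded purely for illustration (the real constant in config is elided here):

def get_joiner(language: str) -> str:
    # Illustrative sketch only: languages written with spaces between words join
    # word chunks with " ", while languages like Chinese or Japanese join with "".
    space_separated = ["en", "ru", "fr", "es", "de", "it"]  # assumed list, not from this diff
    return " " if language in space_separated else ""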
63 changes: 1 addition & 62 deletions core/all_whisper_methods/whisperX.py
@@ -5,38 +5,10 @@
import pandas as pd
import json
from typing import Dict
import subprocess
import base64

sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from config import MODEL_DIR

def convert_video_to_audio(input_file: str) -> str:
os.makedirs('output/audio', exist_ok=True)
audio_file = 'output/audio/raw_full_audio.wav'

if not os.path.exists(audio_file):
ffmpeg_cmd = [
'ffmpeg',
'-i', input_file,
'-vn',
'-acodec', 'libmp3lame',
'-ar', '16000',
'-b:a', '64k',
audio_file
]
print(f"🎬➡️🎵 Converting to audio......")
subprocess.run(ffmpeg_cmd, check=True, stderr=subprocess.PIPE)
print(f"🎬➡️🎵 Converted <{input_file}> to <{audio_file}>\n")

return audio_file

def encode_file_to_base64(file_path: str) -> str:
print("🔄 Encoding audio file to base64...")
with open(file_path, 'rb') as file:
encoded = base64.b64encode(file.read()).decode('utf-8')
print("✅ File successfully encoded to base64")
return encoded
from core.all_whisper_methods.whisperXapi import process_transcription, convert_video_to_audio

def transcribe_audio(audio_file: str) -> Dict:
from config import WHISPER_LANGUAGE
@@ -69,39 +41,6 @@ def transcribe_audio(audio_file: str) -> Dict:
except Exception as e:
raise Exception(f"WhisperX processing error: {e}")

def process_transcription(result: Dict) -> pd.DataFrame:
from config import get_joiner, WHISPER_LANGUAGE
language = result['language'] if WHISPER_LANGUAGE == 'auto' else WHISPER_LANGUAGE # consider force english case
joiner = get_joiner(language)

all_words = []
for segment in result['segments']:
for word in segment['words']:
# ! For French, we need to convert guillemets to empty strings
word["word"] = word["word"].replace('»', '').replace('«', '')

if 'start' not in word and 'end' not in word:
if all_words:
# Merge with the previous word
all_words[-1]['text'] = f'{all_words[-1]["text"]}{joiner}{word["word"]}'
else:
# If it's the first word, temporarily save it and wait for the next word with a timestamp
temp_word = word["word"]
else:
# Normal case, with start and end times
word_dict = {
'text': f'{temp_word}{word["word"]}' if 'temp_word' in locals() else f'{word["word"]}',
'start': word.get('start', all_words[-1]['end'] if all_words else 0),
'end': word['end'],
'score': word.get('score', 0)
}

all_words.append(word_dict)
if 'temp_word' in locals():
del temp_word

return pd.DataFrame(all_words)

def save_results(df: pd.DataFrame):
os.makedirs('output/log', exist_ok=True)
excel_path = os.path.join('output/log', "cleaned_chunks.xlsx")
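With convert_video_to_audio and process_transcription now imported from whisperXapi, this module keeps only the whisperX-specific steps. A minimal sketch (not part of this commit) of how the pieces chain together; the wrapper name and video path are hypothetical:

def transcribe_video_demo(video_file: str = "input.mp4"):
    audio_file = convert_video_to_audio(video_file)  # shared helper from whisperXapi
    result = transcribe_audio(audio_file)            # local whisperX transcription
    df = process_transcription(result)               # shared word-level post-processing
    save_results(df)                                 # writes output/log/cleaned_chunks.xlsx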
31 changes: 19 additions & 12 deletions core/all_whisper_methods/whisperXapi.py
@@ -67,10 +67,6 @@ def transcribe_audio(audio_base64: str) -> Dict:
raise Exception(f"Error accessing whisperX API: {e} Please check your Replicate API key and internet connection.\n")

def process_transcription(result: Dict) -> pd.DataFrame:
from config import get_joiner, WHISPER_LANGUAGE
language = result['detected_language'] if WHISPER_LANGUAGE == 'auto' else WHISPER_LANGUAGE # consider force english case
joiner = get_joiner(language)

all_words = []
for segment in result['segments']:
for word in segment['words']:
@@ -79,23 +75,34 @@

if 'start' not in word and 'end' not in word:
if all_words:
# Merge with the previous word
all_words[-1]['text'] = f'{all_words[-1]["text"]}{joiner}{word["word"]}'
# Assign the end time of the previous word as the start and end time of the current word
word_dict = {
'text': word["word"],
'start': all_words[-1]['end'],
'end': all_words[-1]['end'],
}
all_words.append(word_dict)
else:
# If it's the first word, temporarily save it and wait for the next word with a timestamp
temp_word = word["word"]
# If it's the first word, look ahead for the next word that has a timestamp and use its times for the current word
next_word = next((w for w in segment['words'] if 'start' in w and 'end' in w), None)
if next_word:
word_dict = {
'text': word["word"],
'start': next_word["start"],
'end': next_word["end"],
}
all_words.append(word_dict)
else:
raise Exception(f"No next word with timestamp found for the current word : {word}")
else:
# Normal case, with start and end times
word_dict = {
'text': f'{temp_word}{word["word"]}' if 'temp_word' in locals() else f'{word["word"]}',
'text': f'{word["word"]}',
'start': word.get('start', all_words[-1]['end'] if all_words else 0),
'end': word['end'],
'score': word.get('score', 0)
}

all_words.append(word_dict)
if 'temp_word' in locals():
del temp_word

return pd.DataFrame(all_words)

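A minimal illustration (not part of this commit) of the new fallback: a word that arrives without timestamps now becomes its own row inheriting the previous word's end time, instead of being merged into the previous word. The sample words below are made up:

sample_result = {
    "segments": [{
        "words": [
            {"word": "Hello", "start": 0.00, "end": 0.40, "score": 0.99},
            {"word": "?"},  # no start/end -> gets start = end = 0.40 from the previous word
            {"word": "world", "start": 0.52, "end": 0.90, "score": 0.97},
        ]
    }]
}
df = process_transcription(sample_result)  # three rows, one per word, each with timestamps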
6 changes: 5 additions & 1 deletion core/ask_gpt.py
@@ -58,7 +58,7 @@ def make_api_call(client, model, messages, response_format):
response_format=response_format
)

def ask_gpt(prompt, model, response_json=True, valid_key='', log_title='default'):
def ask_gpt(prompt, model, response_json=True, valid_key='', valid_sub_key='', log_title='default'):
with LOCK:
if check_ask_gpt_history(prompt, model):
return check_ask_gpt_history(prompt, model)
@@ -82,6 +82,10 @@ def ask_gpt(prompt, model, response_json=True, valid_key='', log_title='default'
if valid_key and valid_key not in response_data:
print(f"❎ API response error: Missing '{valid_key}' key. Retrying...")
raise ValueError(f"Response missing '{valid_key}' key")
if valid_sub_key:
if not all(valid_sub_key in item for item in response_data.values()):
print(f"❎ API response error: Missing '{valid_sub_key}' sub-key in some items. Retrying...")
raise ValueError(f"Response missing '{valid_sub_key}' sub-key in some items")
break # Successfully accessed and parsed, break the loop
except Exception as e:
response_data = response.choices[0].message.content
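A hedged sketch (not part of this commit) of how the new valid_sub_key check might be called; the prompt contents and key names below are hypothetical:

# Expecting a JSON reply shaped like {"1": {"translation": ...}, "2": {"translation": ...}}
result = ask_gpt(
    prompt,
    model=MODEL[0],
    response_json=True,
    valid_key='1',                # this top-level key must exist in the parsed JSON
    valid_sub_key='translation',  # every value must contain this sub-key, or the call retries
    log_title='translate_demo'
)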
2 changes: 1 addition & 1 deletion core/prompts_storage.py
@@ -179,7 +179,6 @@ def get_prompt_faithfulness(lines, shared_prompt):
return prompt_faithfulness.strip()



def get_prompt_expressiveness(faithfulness_result, lines, shared_prompt):
from config import TARGET_LANGUAGE
json_format = {}
@@ -227,6 +226,7 @@ def get_prompt_expressiveness(faithfulness_result, lines, shared_prompt):
</subtitles>
### Output Format
Make sure to generate the correct Json format, don't output " in the value.
Please complete the following JSON data, where << >> represents placeholders that should not appear in your answer, and return your translation results in JSON format:
{json.dumps(json_format, ensure_ascii=False, indent=4)}
'''
10 changes: 2 additions & 8 deletions core/spacy_utils/load_nlp_model.py
@@ -21,16 +21,10 @@ def init_nlp():
nlp = spacy.load(model)
except:
print(f"Downloading {model} model...")
print("If download failed, please check your network and try again.")
download(model)
nlp = spacy.load(model)
except:
print(f"Language not detected, using en_core_web_sm model as fallback...")
model = "en_core_web_sm"
try:
nlp = spacy.load(model)
except:
print(f"Downloading {model} model...")
download(model)
nlp = spacy.load(model)
raise ValueError(f"❌ Failed to load NLP Spacy model: {model}")
print(f"✅ NLP Spacy model loaded successfully!")
return nlp
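Since init_nlp() now raises instead of silently falling back to en_core_web_sm, a caller sketch (not part of this commit) might surface the failure explicitly:

try:
    nlp = init_nlp()
except ValueError as e:
    # e.g. ask the user to install the missing spaCy model manually, then re-run
    print(f"spaCy model unavailable: {e}")
    raise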
70 changes: 55 additions & 15 deletions core/spacy_utils/split_by_connector.py
@@ -9,27 +9,67 @@ def analyze_connectors(doc, token):
Analyze whether a token is a connector that should trigger a sentence split.
Processing logic and order:
1. Check if the token is one of the target connectors (that, which, where, when).
2. For 'that', check if it's part of a contraction (e.g., that's, that'll).
3. For all connectors, check if they function as a 'mark' dependent of a verb.
4. For 'which', 'where', 'when', check if they function as determiners or pronouns
for nouns or proper nouns.
5. Default to splitting for 'which', 'where', 'when' if no other conditions are met.
6. For 'and', 'or', 'but', check if they connect two independent clauses.
1. Check if the token is one of the target connectors based on the language.
2. For 'that' (English), check if it's part of a contraction (e.g., that's, that'll).
3. For all connectors, check if they function as a specific dependency of a verb or noun.
4. Default to splitting for certain connectors if no other conditions are met.
5. For coordinating conjunctions, check if they connect two independent clauses.
"""
# Check if the token is one of the target connectors
if token.text.lower() not in ["that", "which", "where", "when", "because", "but", "and", "or"]:
lang = doc.lang_
if lang == "en":
connectors = ["that", "which", "where", "when", "because", "but", "and", "or"]
mark_dep = "mark"
det_pron_deps = ["det", "pron"]
verb_pos = "VERB"
noun_pos = ["NOUN", "PROPN"]
elif lang == "ja":
connectors = ["けれども", "しかし", "だから", "それで", "ので", "のに", "ため"]
mark_dep = "mark"
det_pron_deps = ["case"]
verb_pos = "VERB"
noun_pos = ["NOUN", "PROPN"]
elif lang == "fr":
connectors = ["que", "qui", "où", "quand", "parce que", "mais", "et", "ou"]
mark_dep = "mark"
det_pron_deps = ["det", "pron"]
verb_pos = "VERB"
noun_pos = ["NOUN", "PROPN"]
elif lang == "ru":
connectors = ["что", "который", "где", "когда", "потому что", "но", "и", "или"]
mark_dep = "mark"
det_pron_deps = ["det"]
verb_pos = "VERB"
noun_pos = ["NOUN", "PROPN"]
elif lang == "es":
connectors = ["que", "cual", "donde", "cuando", "porque", "pero", "y", "o"]
mark_dep = "mark"
det_pron_deps = ["det", "pron"]
verb_pos = "VERB"
noun_pos = ["NOUN", "PROPN"]
elif lang == "de":
connectors = ["dass", "welche", "wo", "wann", "weil", "aber", "und", "oder"]
mark_dep = "mark"
det_pron_deps = ["det", "pron"]
verb_pos = "VERB"
noun_pos = ["NOUN", "PROPN"]
elif lang == "it":
connectors = ["che", "quale", "dove", "quando", "perché", "ma", "e", "o"]
mark_dep = "mark"
det_pron_deps = ["det", "pron"]
verb_pos = "VERB"
noun_pos = ["NOUN", "PROPN"]
else:
return False, False

if token.text.lower() not in connectors:
return False, False

if token.text.lower() == "that":
if token.dep_ == "mark" and token.head.pos_ == "VERB":
# Split if 'that' is a 'mark' dependent of a verb
if lang == "en" and token.text.lower() == "that":
if token.dep_ == mark_dep and token.head.pos_ == verb_pos:
return True, False
else:
# Don't split for other uses of 'that'
return False, False
elif token.text.lower() != "that" and token.dep_ in ["det", "pron"] and token.head.pos_ in ["NOUN", "PROPN"]:
# Don't split if 'which', 'where', 'when' are determiners or pronouns for nouns
elif token.dep_ in det_pron_deps and token.head.pos_ in noun_pos:
return False, False
else:
return True, False
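A small usage sketch (not part of this commit) of the now language-aware connector analysis, assuming the first element of the returned tuple flags a split point; the sample sentence is arbitrary:

import spacy

nlp = spacy.load("en_core_web_md")
doc = nlp("I bought the book that you recommended and I liked it.")
for token in doc:
    should_split, _ = analyze_connectors(doc, token)
    if should_split:
        print(f"candidate split before: {token.text!r}")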
9 changes: 7 additions & 2 deletions core/spacy_utils/split_by_mark.py
@@ -23,8 +23,13 @@ def split_by_mark(nlp):
sentences_by_mark = [sent.text for sent in doc.sents]

with open("output/log/sentence_by_mark.txt", "w", encoding="utf-8") as output_file:
for sentence in sentences_by_mark:
output_file.write(sentence + "\n")
for i, sentence in enumerate(sentences_by_mark):
if i > 0 and sentence.strip() in [',', '.', ',', '。', '?', '!']:
# ! If the current line contains only punctuation, merge it with the previous line, this happens in Chinese, Japanese, etc.
output_file.seek(output_file.tell() - 1, os.SEEK_SET) # Move to the end of the previous line
output_file.write(sentence) # Add the punctuation
else:
output_file.write(sentence + "\n")

print("💾 Sentences split by punctuation marks saved to → `sentences_by_mark.txt`")

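One possible alternative (not part of this commit) to seeking backwards in the open text-mode file is to merge stray punctuation in memory first and write once; a minimal sketch using the same variables:

merged = []
for sentence in sentences_by_mark:
    if merged and sentence.strip() in [',', '.', ',', '。', '?', '!']:
        merged[-1] += sentence.strip()  # glue lone punctuation onto the previous sentence
    else:
        merged.append(sentence)
with open("output/log/sentence_by_mark.txt", "w", encoding="utf-8") as output_file:
    output_file.write("\n".join(merged) + "\n")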