Bug fixes (#795)

* Bug fixes * Fix failing test * Fix tokenizers * Add tests for language tokenizer
MontrealCorpusTools · Apr 21, 2024 · 33b3127 · 33b3127
1 parent d35a9df
commit 33b3127
Show file tree

Hide file tree

Showing 8 changed files with 40 additions and 7 deletions.
diff --git a/docs/source/changelog/changelog_3.0.rst b/docs/source/changelog/changelog_3.0.rst
@@ -5,6 +5,14 @@
 3.0 Changelog
 *************
 
+3.0.6
+=====
+
+- Fixed an issue where alignment analysis would not produce data for speech log likelihood and phone duration deviation
+- Changed phone duration deviation metric to be maximum duration deviation rather than average across all phones in the utterance
+- Fixed a crash when an empty phone set was specified in phone groups configuration files
+- Fixed a crash when using the :code:`--language` flag with values other than :code:`japanese`, :code:`thai`, :code:`chinese` or :code:`korean`
+
 3.0.5
 =====
 

diff --git a/montreal_forced_aligner/acoustic_modeling/trainer.py b/montreal_forced_aligner/acoustic_modeling/trainer.py
@@ -625,6 +625,7 @@ def compute_phone_pdf_counts(self) -> None:
     def finalize_training(self):
         self.compute_phone_pdf_counts()
         self.collect_alignments()
+        self.analyze_alignments()
         self.train_phone_lm()
 
     def export_files(
@@ -707,7 +708,6 @@ def align(self) -> None:
                     {"done": True}
                 )
                 session.commit()
-            self.analyze_alignments()
         except Exception as e:
             with self.session() as session:
                 session.query(CorpusWorkflow).filter(CorpusWorkflow.id == wf.id).update(

diff --git a/montreal_forced_aligner/alignment/base.py b/montreal_forced_aligner/alignment/base.py
@@ -170,6 +170,9 @@ def analyze_alignments(self):
         if not config.USE_POSTGRES:
             logger.warning("Alignment analysis not available without using postgresql")
             return
+        workflow = self.current_workflow
+        if not workflow.alignments_collected:
+            self.collect_alignments()
         logger.info("Analyzing alignment quality...")
         begin = time.time()
         with self.session() as session:

diff --git a/montreal_forced_aligner/corpus/base.py b/montreal_forced_aligner/corpus/base.py
@@ -614,7 +614,7 @@ def _finalize_load(self, session: Session, import_data: DatabaseImportData):
         self._num_files = None
         session.commit()
 
-    def normalize_text_arguments(self):
+    def get_tokenizers(self):
         from montreal_forced_aligner.dictionary.mixins import DictionaryMixin
 
         if self.language is Language.unknown:
@@ -631,6 +631,16 @@ def normalize_text_arguments(self):
                 tokenizers = self.tokenizer
             else:
                 return None
+        return tokenizers
+
+    def get_tokenizer(self, dictionary_id: int):
+        tokenizers = self.get_tokenizers()
+        if not isinstance(tokenizers, dict):
+            return tokenizers
+        return tokenizers[dictionary_id]
+
+    def normalize_text_arguments(self):
+        tokenizers = self.get_tokenizers()
         from montreal_forced_aligner.corpus.multiprocessing import NormalizeTextArguments
 
         with self.session() as session:
@@ -642,7 +652,7 @@ def normalize_text_arguments(self):
                     self.split_directory.joinpath("log", f"normalize.{j.id}.log"),
                     tokenizers,
                     getattr(self, "g2p_model", None),
-                    self.ignore_case,
+                    getattr(self, "ignore_case", True),
                 )
                 for j in jobs
             ]

diff --git a/montreal_forced_aligner/corpus/multiprocessing.py b/montreal_forced_aligner/corpus/multiprocessing.py
@@ -313,7 +313,11 @@ def _run(self):
                             if isinstance(tokenized, tuple):
                                 normalized_text, pronunciation_form = tokenized
                             else:
-                                normalized_text, pronunciation_form = tokenized, tokenized
+                                if not isinstance(tokenized, str):
+                                    tokenized = " ".join([x.text for x in tokenized])
+                                if self.ignore_case:
+                                    tokenized = tokenized.lower()
+                                normalized_text, pronunciation_form = tokenized, tokenized.lower()
                             oovs = set()
                             self.callback(
                                 (
@@ -334,7 +338,11 @@ def _run(self):
                     .filter(Utterance.job_id == self.job_name)
                 )
                 for u_id, u_text in utterances:
-                    normalized_text, normalized_character_text, oovs = tokenizer(u_text)
+                    if tokenizer is None:
+                        normalized_text, normalized_character_text = u_text, u_text
+                        oovs = []
+                    else:
+                        normalized_text, normalized_character_text, oovs = tokenizer(u_text)
                     self.callback(
                         (
                             {

diff --git a/montreal_forced_aligner/dictionary/mixins.py b/montreal_forced_aligner/dictionary/mixins.py
@@ -18,9 +18,9 @@
 if TYPE_CHECKING:
     from montreal_forced_aligner.abc import MetaDict
 
-DEFAULT_PUNCTUATION = list(r'、。।，？！!@<>→"”()“„–,.:;—¿?¡：）؟!\\&%#*،~【】，…‥「」『』〝〟″⟨⟩♪・‚‘‹›«»～′$+=‘۔')
+DEFAULT_PUNCTUATION = list(r'、。।，？！!@<>→"”()“„–,.:;—¿?¡：）|؟!\\&%#*،~【】，…‥「」『』〝〟″⟨⟩♪・‚‘‹›«»～′$+=‘۔')
 
-DEFAULT_WORD_BREAK_MARKERS = list(r'？！!()，,.:;¡¿?“„"”&~%#—…‥、。【】$+=〝〟″‹›«»・⟨⟩،「」『』؟')
+DEFAULT_WORD_BREAK_MARKERS = list(r'？！!()，,.:;¡¿?“„"”&~%#—…‥、。|【】$+=〝〟″‹›«»・⟨⟩،「」『』؟')
 
 DEFAULT_QUOTE_MARKERS = list("“„\"”〝〟″「」『』‚ʻʿ‘′'")
 

diff --git a/montreal_forced_aligner/dictionary/multispeaker.py b/montreal_forced_aligner/dictionary/multispeaker.py
@@ -160,6 +160,8 @@ def load_phone_groups(self) -> None:
                 if isinstance(self._phone_groups, list):
                     self._phone_groups = {k: v for k, v in enumerate(self._phone_groups)}
                 for k, v in self._phone_groups.items():
+                    if not v:
+                        continue
                     self._phone_groups[k] = sorted(
                         set(x for x in v if x in self.non_silence_phones)
                     )

diff --git a/tests/test_commandline_align.py b/tests/test_commandline_align.py
@@ -671,6 +671,8 @@ def test_swedish_cv(
         swedish_cv_dictionary,
         swedish_cv_acoustic_model,
         output_dir,
+        "--language",
+        "swedish",
         "--config_path",
         basic_align_config_path,
         "-q",