Final bug fixes for 3.0 release (#761)
* Final bug fixes

* Fix for '--dither 0' flag being ignored
mmcauliffe authored Feb 27, 2024
1 parent 76b1ddd commit 76c539f
Showing 16 changed files with 170 additions and 265 deletions.
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/bug_report.md
@@ -9,7 +9,7 @@ assignees: mmcauliffe

**Debugging checklist**

[ ] Have you updated to latest MFA version?
[ ] Have you updated to latest MFA version? What is the output of `mfa version`?
[ ] Have you tried rerunning the command with the `--clean` flag?

**Describe the issue**
6 changes: 6 additions & 0 deletions docs/source/changelog/changelog_3.0.rst
@@ -5,6 +5,12 @@
3.0 Changelog
*************

3.0.0
=====

- Fixed a regression where :code:`--dither` was not being passed correctly
- Fixed a bug on Windows when symlink permissions were not present

3.0.0rc2
========

4 changes: 4 additions & 0 deletions docs/source/conf.py
@@ -85,6 +85,10 @@
"python-mecab-ko": ("python-mecab-ko", "https://github.com/jonghwanhyeon/python-mecab-ko"),
"jamo": ("jamo", "https://github.com/jdongian/python-jamo"),
"pythainlp": ("pythainlp", "https://pythainlp.github.io/"),
"sudachipy": (
"sudachipy",
"https://github.com/WorksApplications/sudachi.rs/tree/develop/python",
),
"click": ("click", "https://click.palletsprojects.com/en/8.1.x/"),
"pgvector": ("pgvector", "https://github.com/pgvector/pgvector"),
"pretrained_acoustic_models": (
14 changes: 11 additions & 3 deletions docs/source/installation.rst
@@ -20,6 +20,14 @@ General installation

3. Ensure you're in the new environment created (:code:`conda activate aligner`)

.. note::

I recommend using :code:`mamba` as the primary installer. Mamba is a drop-in replacement for the :code:`conda` command and must be installed in the conda base environment. You can install and use :code:`mamba` via:

1. :code:`conda activate base`
2. :code:`conda install -c conda-forge mamba`
3. :code:`mamba create -n aligner -c conda-forge montreal-forced-aligner`

Installing SpeechBrain
----------------------

@@ -44,9 +52,9 @@ If you need to use an older version of MFA, you can install it via:
More stable key versions:

* Stable 2.2 release: :code:`conda install -c conda-forge montreal-forced-aligner=2.2.17`
* Stable 2.1 release: :code:`conda install -c conda-forge montreal-forced-aligner=2.1.7`
* Stable 2.0 release: :code:`conda install -c conda-forge montreal-forced-aligner=2.0.6`
* Stable 2.2 release: :code:`conda install -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=5.5.1068`
* Stable 2.1 release: :code:`conda install -c conda-forge montreal-forced-aligner=2.1.7 openfst=1.8.2 kaldi=5.5.1068`
* Stable 2.0 release: :code:`conda install -c conda-forge montreal-forced-aligner=2.0.6 openfst=1.8.2 kaldi=5.5.1068`
* Stable 1.0 release: https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/tag/v1.0.1

.. _docker_installation:
4 changes: 4 additions & 0 deletions docs/source/user_guide/concepts/features.md
@@ -14,6 +14,10 @@ Still under construction, I hope to fill these sections out as I have time.
(features_mfcc)=
## Mel-Frequency Cepstrum Coefficients (MFCCs)

```{seealso}
* https://kaldi-asr.org/doc/feat.html#feat_mfcc
```


(features_pitch)=
## Pitch
2 changes: 2 additions & 0 deletions docs/source/user_guide/concepts/hmm.md
@@ -21,3 +21,5 @@ Still under construction, I hope to fill these sections out as I have time.
```

### MFA topology

MFA uses a variable 5-state topology for modeling phones. Each state has a likelihood to transition to the final state in addition to the next state. This means that each phone has a minimum duration of 10ms (corresponding to the default time step for MFCC generation), rather than the 30ms minimum of a more standard 3-state HMM. Having a shorter minimum duration reduces alignment errors from short or dropped phones (e.g., American English flaps or schwas) and can accommodate dictionary errors (though these should still be fixed).
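
As an illustrative sketch (not MFA's implementation), the minimum-duration arithmetic above can be written out directly:

```python
# Each emitting HMM state must consume at least one MFCC frame, so the
# minimum phone duration is (number of states that must be visited) *
# (frame shift). With early exit from every state, as in MFA's variable
# 5-state topology, a single state (one frame) suffices.

FRAME_SHIFT_MS = 10  # default MFCC time step

def min_duration_ms(num_states: int, early_exit: bool) -> int:
    """Minimum duration a phone can occupy, in milliseconds."""
    min_states = 1 if early_exit else num_states
    return min_states * FRAME_SHIFT_MS

# Standard 3-state HMM: all three states must be visited.
assert min_duration_ms(3, early_exit=False) == 30
# MFA's 5-state topology with early exit: one frame is enough.
assert min_duration_ms(5, early_exit=True) == 10
```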
1 change: 0 additions & 1 deletion docs/source/user_guide/configuration/global.rst
@@ -85,4 +85,3 @@ This section details configuration options related to how MFA normalizes text a
"quote_markers", "“„"”〝〟″「」『』‚ʻʿ‘′'", "Characters that are used as quotes in the language"
"word_break_markers", "?!(),,.:;¡¿?“„"”&~%#—…‥、。【】$+=〝〟″‹›«»・⟨⟩「」『』", "Characters to use in addition to white space when breaking transcripts into words"
"brackets", "('[', ']'), ('{', '}'), ('<', '>'), ('(', ')'), , ('<', '>')", "Punctuation to keep as bracketing a whole word, i.e., a restart, disfluency, etc"
"phone_set_type", "UNKNOWN", ":class:`~montreal_forced_aligner.data.PhoneSetType` to enable extra questions and more directed decision tree clustering during training"
18 changes: 12 additions & 6 deletions docs/source/user_guide/dictionary.rst
@@ -26,7 +26,7 @@ Text normalization and dictionary lookup

If a word is not found in the dictionary, and has no orthographic
markers for morpheme boundaries (apostrophes or hyphens), then it will
be replaced in the output with '<unk>' for unknown word.
be replaced in the output with :code:`<unk>` for unknown word.

.. note::

@@ -44,11 +44,9 @@ from the ends and beginnings of words, except for the :code:`brackets` specified

The definition of punctuation, clitic markers, and compound markers can be set in a config file, see :ref:`configuration_dictionary` for more details.

Dictionary lookup will attempt to generate the most maximal coverage of
novel forms if they use some overt morpheme boundary in the orthography.
Dictionary lookup will attempt to generate the most maximal coverage of novel forms if they use some overt morpheme boundary in the orthography.

For instance, in French, clitics are marked using apostrophes between the
bound clitic and the stem. Thus given a dictionary like:
For instance, in French, clitics are marked using apostrophes between the bound clitic and the stem. Thus given a dictionary like:

.. highlight:: none

@@ -196,7 +194,15 @@ Often in spontaneous speech, speakers will produce truncated or cut-off words of
1. The cutoff word matches the pattern of :code:`{start_bracket}(cutoff|hes)`, where :code:`{start_bracket}` is the set of all left side brackets defined in :code:`brackets` (:ref:`configuration_dictionary`). The following word must not be an OOV or non-speech word (silence, laughter, another cutoff, etc).
2. The cutoff word matches the pattern of :code:`{start_bracket}(cutoff|hes)[-_](word){end_bracket}`, where start and end brackets are defined in :code:`brackets` (:ref:`configuration_dictionary`). The :code:`word` will be used in place of the following word above, but needs to be present in the dictionary, otherwise the target word for the cutoff will default back to the following word.

The generated pronunciations
The generated pronunciations will be subsequences of the following word's pronunciation, along with an :code:`spn` pronunciation. For example, an utterance transcript like "<cutoff> cut off" will have the following pronunciations generated from the `English (US) MFA dictionary <https://mfa-models.readthedocs.io/en/latest/dictionary/English/English%20%28US%29%20MFA%20dictionary%20v3_0_0.html>`_:

::

<cutoff-cut> spn
<cutoff-cut> kʰ ɐ t
<cutoff-cut> kʰ ɐ
<cutoff-cut> kʰ
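
A minimal sketch of this prefix generation (the helper below is illustrative, not MFA's API): the candidates are the spoken-noise phone plus every non-empty prefix of the following word's pronunciation.

```python
def cutoff_pronunciations(following_pron: list[str]) -> list[list[str]]:
    """Candidate pronunciations for a cutoff word: the spoken-noise
    phone plus every non-empty prefix of the following word's
    pronunciation, longest first."""
    candidates = [["spn"]]
    for end in range(len(following_pron), 0, -1):
        candidates.append(following_pron[:end])
    return candidates

# "cut" in the English (US) MFA dictionary: kʰ ɐ t
for pron in cutoff_pronunciations(["kʰ", "ɐ", "t"]):
    print("<cutoff-cut>", " ".join(pron))
```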


.. _speaker_dictionaries:

226 changes: 30 additions & 196 deletions docs/source/user_guide/workflows/train_acoustic_model.rst
@@ -5,204 +5,24 @@ Train a new acoustic model ``(mfa train)``

You can train new :term:`acoustic models` from scratch using MFA, and export the final alignments as :term:`TextGrids` at the end. You don't need a ton of data to generate decent alignments (see `the blog post comparing alignments trained on various corpus sizes <https://memcauliffe.com/how-much-data-do-you-need-for-a-good-mfa-alignment.html>`_). At the end of the day, it comes down to trial and error, so I would recommend trying different workflows of pretrained models vs training your own or adapting a model to your data to see what performs best.

Phone set
=========
Phone topology
==============

The phone topology that MFA uses is different from the standard 3-state HMMs. Each phone can have a maximum of 5 states, but allows for early exiting, so each phone has a minimum duration of 10ms (one MFCC frame) rather than 30ms for the 3-state HMM (three MFCC frames).
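
For intuition, the early-exit structure can be sketched as a set of transition arcs (illustrative only; real Kaldi-style topologies also attach transition probabilities to each arc):

```python
def early_exit_arcs(num_states: int) -> set[tuple[int, int]]:
    """Arcs of a left-to-right HMM where every emitting state can
    self-loop, advance to the next state, or exit to the terminal
    (non-emitting) state."""
    final = num_states
    arcs = set()
    for state in range(num_states):
        arcs.add((state, state))      # self-loop: stay in this state
        arcs.add((state, state + 1))  # advance to the next state
        arcs.add((state, final))      # early exit to the terminal state
    return arcs

# With 5 emitting states, a phone can still end after a single frame,
# because state 0 has a direct arc to the terminal state.
assert (0, 5) in early_exit_arcs(5)
```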

.. seealso::

   See :doc:`HMM topologies <../concepts/hmm>` for more information on HMMs and phone topologies.
Phone groups
============

By default, each phone is treated independently of the others, which can lead to data sparsity issues or poorer contextual modeling of clearly related phones when modeling triphones (e.g., long/short vowels :ipa_inline:`ɑ/ɑː`, or stressed/unstressed versions :ipa_inline:`OY1/OY2/OY0`). Phone groups can be specified via the :code:`--phone_groups_path` flag. See :doc:`phone groups <../implementations/phone_groups>` for more information.

.. note::

See :doc:`phone groups <../implementations/phone_groups>` for how to customize phone groups to your specific needs rather than using the preset phone groups of the defined phone sets in this section.

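As a toy illustration of what grouping changes (the dictionary below is a hypothetical in-memory sketch, not the actual :code:`--phone_groups_path` file format):

```python
# Hypothetical phone groups: related phones share a group label so they
# can share contextual modeling instead of being treated independently.
phone_groups = {
    "a": ["ɑ", "ɑː"],             # long/short vowel pair
    "OY": ["OY0", "OY1", "OY2"],  # stress variants of one vowel
}

def group_of(phone: str, groups: dict[str, list[str]]) -> str:
    """Map a phone to its group label; ungrouped phones stand alone."""
    for label, phones in groups.items():
        if phone in phones:
            return label
    return phone

assert group_of("OY2", phone_groups) == "OY"  # grouped with OY0/OY1
assert group_of("s", phone_groups) == "s"     # ungrouped phone
```
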
The type of phone set can be specified through ``--phone_set``. Currently only ``IPA``, ``ARPA``, and ``PINYIN`` are supported, but I plan to make it more customizable in the future. The primary benefit of specifying the phone set is to create phone topologies that are more sensible than the defaults.

The default phone model uses 3 HMM states to represent phones, as that generally does a decent job of capturing the dynamic nature of phones. Something like an aspirated stop typically has three clear states: a closure, a burst, and an aspiration period. However, other phones like a tap, glottal stop, or unstressed schwa are so short that they can cause misalignment errors; for these, a single HMM state is more sensible, as it gives them a shorter minimum duration (each HMM state has a minimum 10ms duration). For vowels, 3 states generally make sense for monophthongs, where one state corresponds to the onset, one to the "steady state", and one to the offset. For diphthongs and triphthongs, three states don't map as cleanly, as you'll have an onset, a first target, a transition, a second target, and an offset (plus a third target for triphthongs). Specifying a phone set will use preset stops, affricates, diphthongs, triphthongs, and extra short segments. Certain diacritics (``ʱʼʰʲʷⁿˠ``) will result in one more state being added, as they represent quite different acoustics from the base phone.

An additional benefit is in guiding the decision tree clustering for triphone modeling, where using phone sets will add extra questions for allophonic variation, as well as for general classes of sounds (sibilant sounds, places of articulation, rhotics, groups of vowels, etc). These questions should be more appropriate than the default setting.

.. tab-set::

.. tab-item:: IPA
:sync: ipa


The IPA phone set generates base phone classes for extra short phones, stop phones, affricate phones, diphthongs, and triphthongs. Any phones below that are not used in the dictionary will be ignored.

.. list-table:: Non-default IPA Topologies
:header-rows: 1

* - Phone class
- HMM states
- Phones
* - Extra short phones
- 1
- ``ʔ ə ɚ ɾ p̚ t̚ k̚``
* - Stop phones
- 2
- ``p b t d ʈ ɖ c ɟ k ɡ q ɢ``
* - Affricate phones
- 4
- ``pf ts dz tʃ dʒ tɕ dʑ tʂ ʈʂ dʐ ɖʐ cç ɟʝ kx ɡɣ tç dʝ``
* - Diphthongs
- 5
- Two of: ``i u e ə a o y ɔ j w ɪ ʊ w ʏ ɯ ɤ ɑ æ ɐ ɚ ɵ ɘ ɛ ɜ ɝ ɛ ɞ ɑ ɨ ɪ̈ œ ɒ ɶ ø ʉ ʌ``
* - Triphthongs
- 6
- Three of: ``i u e ə a o y ɔ j w ɪ ʊ w ʏ ɯ ɤ ɑ æ ɐ ɚ ɵ ɘ ɛ ɜ ɝ ɛ ɞ ɑ ɨ ɪ̈ œ ɒ ɶ ø ʉ ʌ``

.. tab-item:: ARPA
:sync: arpa

For ARPA, we use the following topology calculation. Additionally, stress-marked vowels are collected under a single base phone (i.e., ``AA0 AA1 AA2`` are collected under ``AA``), so they will share states during training.

.. list-table:: Non-default ARPA Topologies
:header-rows: 1

* - Phone class
- HMM states
- Phones
* - Extra short phones
- 1
- ``AH0 IH0 ER0 UH0``
* - Stop phones
- 2
- ``B D G`` (``P T K`` not included because they include aspiration)
* - Affricate phones
- 4
- ``CH JH``
* - Diphthongs
- 5
- ``AY0 AY1 AY2 AW0 AW1 AW2 OY0 OY1 OY2 EY0 EY1 EY2 OW0 OW1 OW2``


.. list-table:: ARPA Extra Questions
:header-rows: 1

* - Question Group
- Phones
- Notes
* - Bilabial stops
- ``B P``
-
* - Dentals
- ``D DH``
- ``/ð/`` often is realized as ``/d/`` for high frequency words in many dialects of American English
* - Flapping
- ``D T``
-
* - Nasals
- ``M N NG``
-
* - Voiceless sibilants
- ``CH SH S``
-
* - Voiced sibilants
- ``JH ZH Z``
-
* - Voiceless fricatives
- ``F TH HH K``
- ``K`` is included for reductions to a more fricative realization ``/x/`` in casual speech
* - Voiced fricatives
- ``V DH HH G``
- G included for the same reason as above
* - Dorsals
- ``K G HH``
-
* - Rhotics
- ``ER0 ER1 ER2 R``
- ``ER`` vowels are really just ``/ɹ̩/``
* - Low back vowels
- ``AO0 AO1 AO2 AA0 AA1 AA2``
- Cot-caught merger
* - Central vowels
- ``ER0 ER1 ER2 AH0 AH1 AH2 UH0 UH1 UH2 IH0 IH1 IH2``
-
* - High back vowels
- ``UW1 UW2 UW0 UH1 UH2 UH0``
-
* - High front vowels
- ``IY1 IY2 IY0 IH0 IH1 IH2``
-
* - Mid front vowels
- ``EY1 EY2 EY0 EH0 EH1 EH2``
-
* - Primary stressed vowels
- ``AA1 AE1 AH1 AO1 AW1 AY1 EH1 ER1 EY1 IH1 IY1 OW1 OY1 UH1 UW1``
- Following the `Kaldi LibriSpeech recipe <https://github.com/kaldi-asr/kaldi/blob/master/egs/librispeech/s5/local/prepare_dict.sh#L125>`_
* - Secondary stressed vowels
- ``AA2 AE2 AH2 AO2 AW2 AY2 EH2 ER2 EY2 IH2 IY2 OW2 OY2 UH2 UW2``
-
* - Unstressed vowels
- ``AA0 AE0 AH0 AO0 AW0 AY0 EH0 ER0 EY0 IH0 IY0 OW0 OY0 UH0 UW0``
-



.. tab-item:: PINYIN
:sync: pinyin

.. list-table:: Non-default Pinyin Topologies
:header-rows: 1

* - Phone class
- HMM states
- Phones
* - Stop phones
- 2
- ``b d g`` (``p t k`` not included because they're aspirated)
* - Affricate phones
- 4
- ``z zh j``
* - Aspirated affricate phones
- 5
- ``c ch q``
* - Diphthongs
- 5
- Two of: ``i u y e w a o e ü``
* - Triphthongs
- 6
- Three of: ``i u y e w a o e ü``

.. list-table:: Pinyin Extra Questions
:header-rows: 1

* - Question Group
- Phones
- Notes
* - Bilabial stops
- ``b p``
-
* - Alveolar stops
- ``d t``
-
* - Nasals
- ``m n ng``
-
* - Voiceless sibilants
- ``z zh j c ch q s sh x``
-
* - Dorsals
- ``k g h``
- Pinyin ``h`` is a velar fricative ``/x/``
* - Rhotics
- ``r sh e``
- ``e`` is included to capture instances of ``ɚ``
* - Approximants
- ``l r y w``
-
* - Tone 1
- All monophthong, diphthongs, triphthongs with tone 1
-
* - Tone 2
- All monophthong, diphthongs, triphthongs with tone 2
-
* - Tone 3
- All monophthong, diphthongs, triphthongs with tone 3
-
* - Tone 4
- All monophthong, diphthongs, triphthongs with tone 4
-
* - Tone 5
- All monophthong, diphthongs, triphthongs with tone 5
-
.. deprecated:: 3.0.0

   Using the :code:`--phone_set` flag to generate phone groups is deprecated as of MFA 3.0; use the :code:`--phone_groups_path` flag to specify a phone groups configuration file instead.

Pronunciation modeling
======================
@@ -216,6 +36,20 @@ A recent experimental feature for training acoustic models is the ``--train_g2p``
See :doc:`phonological rules <../implementations/phonological_rules>` for how to specify regular expression-like phonological rules so you don't have to code every form for a regular rule.


Language tokenization
=====================

By specifying a language via the :code:`--language` flag, tokenization will occur as part of text normalization. This functionality is primarily useful for languages that do not use spaces to delimit words, such as Japanese, Thai, or Chinese. If you're also using :code:`--g2p_model_path` to generate pronunciations during training, note that the language setting will require a G2P model trained on the matching orthography (i.e., use :code:`mfa model download g2p korean_jamo_mfa` instead of :code:`mfa model download g2p korean_mfa`).


.. csv-table::
:header: "Language", "Pronunciation orthography", "Input", "Output", "Dependencies", "G2P model"

"Japanese", "Katakana", "これは日本語です", "コレ ワ ニホンゴ デス", ":xref:`sudachipy`", "`Katakana G2P <https://mfa-models.readthedocs.io/en/latest/g2p/Japanese/Japanese%20%28Katakana%29%20MFA%20G2P%20model%20v3_0_0.html>`_"
"Korean", "Jamo", "이건 한국어야", "이건 한국어 야", ":xref:`python-mecab-ko`, :xref:`jamo`", "`Jamo G2P <https://mfa-models.readthedocs.io/en/latest/g2p/Korean/Korean%20%28Jamo%29%20MFA%20G2P%20model%20v3_0_0.html>`_"
"Chinese", "Pinyin", "这是中文", "zhèshì zhōngwén", ":xref:`spacy-pkuseg`, :xref:`hanziconv`, :xref:`dragonmapper`", "`Pinyin G2P <https://mfa-models.readthedocs.io/en/latest/g2p/Mandarin/Mandarin%20%28China%20Pinyin%29%20MFA%20G2P%20model%20v3_0_0.html>`_"
"Thai", "Thai script", "นี่คือภาษาไทย", "นี่ คือ ภาษาไทย", ":xref:`pythainlp`", "`Thai G2P <https://mfa-models.readthedocs.io/en/latest/g2p/Thai/Thai%20MFA%20G2P%20model%20v3_0_0.html>`_"

Command reference
=================

2 changes: 1 addition & 1 deletion montreal_forced_aligner/abc.py
@@ -632,7 +632,7 @@ def parse_args(
unknown_dict[name] = val
for name, param_type in param_types.items():
if (name.endswith("_directory") and name != "audio_directory") or (
name.endswith("_path") and name not in {"rules_path", "groups_path"}
name.endswith("_path") and name not in {"rules_path", "phone_groups_path"}
):
continue
if args is not None and name in args and args[name] is not None:
2 changes: 1 addition & 1 deletion montreal_forced_aligner/alignment/mixins.py
@@ -301,7 +301,7 @@ def align_utterances(self, training=False) -> None:
}
)
if not training:
if len(update_mappings) == 0:
if len(update_mappings) == 0 or num_successful == 0:
raise NoAlignmentsError(self.num_current_utterances, self.beam, self.retry_beam)
with self.session() as session:
bulk_update(session, Utterance, update_mappings)
