Commit
Merge pull request #101 from r9y9/svs
[Frontend] [hts]: fixes for singing voice synthesis
r9y9 authored Apr 26, 2020
2 parents 39ac339 + 357966b commit 1cfaeca
Showing 8 changed files with 190 additions and 29 deletions.
2 changes: 1 addition & 1 deletion appveyor.yml
@@ -26,7 +26,7 @@ install:
- conda config --set always_yes yes --set changeps1 no
- conda update -q conda
- conda info -a
- "conda create -q -n test-environment python=%PYTHON_VERSION% numpy scipy cython nose pytorch -c pytorch"
- "conda create -q -n test-environment python=%PYTHON_VERSION% numpy scipy cython nose pytorch=1.4 -c pytorch"
- activate test-environment
- pip install scikit-learn==0.20.0

6 changes: 4 additions & 2 deletions docs/changelog.rst
@@ -3,7 +3,8 @@ Change log

v0.0.21 <2020-xx-xx>

- `#99`_: FIx future warning from sklearn
- `#99`_: Fix future warning from sklearn
- `#101`_: [hts][frontend] various fixes for singing voice synthesis. Our frontend now supports MIDI number extraction. HTSLabelFile supports list and slice indexing.

--------------------

@@ -198,4 +199,5 @@ v0.0.1 <2017-08-14>
.. _#91: https://github.com/r9y9/nnmnkwii/issues/91
.. _#95: https://github.com/r9y9/nnmnkwii/issues/95
.. _#98: https://github.com/r9y9/nnmnkwii/pull/98
.. _#99: https://github.com/r9y9/nnmnkwii/issues/99
.. _#99: https://github.com/r9y9/nnmnkwii/issues/99
.. _#101: https://github.com/r9y9/nnmnkwii/pull/101
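The changelog entry above summarizes the user-facing effect: with a singing-voice question set, continuous questions that capture absolute pitch contexts now come out as MIDI note numbers. A minimal end-to-end sketch under assumed inputs (the label and question file names here are hypothetical; `linguistic_features` itself is unchanged by this commit, and the new `HTSLabelFile` indexing is illustrated after the `nnmnkwii/io/hts.py` diff below):

```python
from nnmnkwii.io import hts
from nnmnkwii.frontend import merlin as fe

# Hypothetical singing-voice question set and full-context label file.
binary_dict, continuous_dict = hts.load_question_set("questions_svs.hed")
labels = hts.load("song_001.lab")

# Continuous questions that capture note names (e.g. "E3") now yield
# MIDI numbers in the resulting linguistic feature matrix.
features = fe.linguistic_features(labels, binary_dict, continuous_dict)
```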
21 changes: 21 additions & 0 deletions nnmnkwii/frontend/__init__.py
@@ -1 +1,22 @@
from __future__ import division, print_function, absolute_import

NOTE_MAPPING = {
'A0': 21, 'Bb0': 22, 'B0': 23, 'C1': 24, 'Db1': 25, 'D1': 26,
'Eb1': 27, 'E1': 28, 'F1': 29, 'Gb1': 30, 'G1': 31, 'Ab1': 32,
'A1': 33, 'Bb1': 34, 'B1': 35, 'C2': 36, 'Db2': 37, 'D2': 38,
'Eb2': 39, 'E2': 40, 'F2': 41, 'Gb2': 42, 'G2': 43, 'Ab2': 44,
'A2': 45, 'Bb2': 46, 'B2': 47, 'C3': 48, 'Db3': 49, 'D3': 50,
'Eb3': 51, 'E3': 52, 'F3': 53, 'Gb3': 54, 'G3': 55, 'Ab3': 56,
'A3': 57, 'Bb3': 58, 'B3': 59, 'C4': 60, 'Db4': 61, 'D4': 62,
'Eb4': 63, 'E4': 64, 'F4': 65, 'Gb4': 66, 'G4': 67, 'Ab4': 68,
'A4': 69, 'Bb4': 70, 'B4': 71, 'C5': 72, 'Db5': 73, 'D5': 74,
'Eb5': 75, 'E5': 76, 'F5': 77, 'Gb5': 78, 'G5': 79, 'Ab5': 80,
'A5': 81, 'Bb5': 82, 'B5': 83, 'C6': 84, 'Db6': 85, 'D6': 86,
'Eb6': 87, 'E6': 88, 'F6': 89, 'Gb6': 90, 'G6': 91, 'Ab6': 92,
'A6': 93, 'Bb6': 94, 'B6': 95, 'C7': 96, 'Db7': 97, 'D7': 98,
'Eb7': 99, 'E7': 100, 'F7': 101, 'Gb7': 102, 'G7': 103, 'Ab7': 104,
'A7': 105, 'Bb7': 106, 'B7': 107, 'C8': 108, 'Db8': 109, 'D8': 110,
'Eb8': 111, 'E8': 112, 'F8': 113, 'Gb8': 114, 'G8': 115, 'Ab8': 116,
'A8': 117, 'Bb8': 118, 'B8': 119, 'C9': 120, 'Db9': 121, 'D9': 122,
'Eb9': 123, 'E9': 124, 'F9': 125, 'Gb9': 126, 'G9': 127,
}
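The new `NOTE_MAPPING` table follows standard MIDI numbering (A0 = 21 up to G9 = 127, with flats spelled `b`). A small sketch of how it can be used directly; the reverse-lookup dict is not part of the commit and is shown only for illustration:

```python
from nnmnkwii.frontend import NOTE_MAPPING

# Note name -> MIDI note number
assert NOTE_MAPPING["C4"] == 60   # middle C
assert NOTE_MAPPING["A4"] == 69   # concert A (440 Hz)

# Hypothetical reverse lookup: MIDI number -> note name. Only flat spellings
# ("Db4" rather than "C#4") are recoverable, since that is how the keys above
# are written.
MIDI_TO_NOTE = {v: k for k, v in NOTE_MAPPING.items()}
print(MIDI_TO_NOTE[60])  # -> "C4"
```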
8 changes: 5 additions & 3 deletions nnmnkwii/frontend/merlin.py
@@ -45,6 +45,7 @@
import numpy as np

from nnmnkwii.io import hts
from nnmnkwii.frontend import NOTE_MAPPING


def get_frame_feature_size(subphone_features="full"):
@@ -155,6 +156,8 @@ def pattern_matching_continous_position(continuous_dict, label):
ms = current_compiled.search(label)
if ms is not None:
continuous_value = ms.group(1)
if continuous_value in NOTE_MAPPING:
continuous_value = NOTE_MAPPING[continuous_value]

lab_continuous_vector[0, i] = continuous_value

@@ -236,7 +239,6 @@ def load_labels_with_phone_alignment(hts_labels,
raise ValueError(
"Combination of subphone_features and add_frame_features is not supported: {}, {}".format(
subphone_features, add_frame_features))

label_feature_matrix[label_feature_index:label_feature_index +
frame_number, ] = current_block_binary_array
label_feature_index = label_feature_index + frame_number
@@ -587,7 +589,7 @@ def extract_dur_from_state_alignment_labels(hts_labels,
else:
pass

# dur_feature_matrix = dur_feature_matrix[0:dur_feature_index, ]
dur_feature_matrix = dur_feature_matrix[0:dur_feature_index, ]
return dur_feature_matrix


@@ -634,7 +636,7 @@ def extract_dur_from_phone_alignment_labels(hts_labels,
else:
assert False

# dur_feature_matrix = dur_feature_matrix[0:dur_feature_index]
dur_feature_matrix = dur_feature_matrix[0:dur_feature_index]
return dur_feature_matrix


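With the `pattern_matching_continous_position` change above, a continuous-feature question whose captured group is a note name (for example `E3`) is converted to its MIDI number before being written into `lab_continuous_vector`; plain numeric captures pass through unchanged. A rough sketch of that branch in isolation, using a hypothetical question regex and context label (real usage goes through `load_question_set` and `linguistic_features`):

```python
import re
from nnmnkwii.frontend import NOTE_MAPPING

# Hypothetical continuous question capturing an absolute pitch such as "E3".
note_question = re.compile(r"/E:([A-Z][b]?[0-9]+)\]")
label = "xx^xx-i+xx=xx/E:E3]"  # made-up singing-voice context label

ms = note_question.search(label)
if ms is not None:
    value = ms.group(1)
    # Mirrors the new branch: note names map to MIDI numbers.
    if value in NOTE_MAPPING:
        value = NOTE_MAPPING[value]
    print(value)  # -> 52 for "E3"
```

The same file also uncomments the trimming of `dur_feature_matrix`, so both duration extraction functions now return a matrix cut to the number of rows actually written.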
75 changes: 52 additions & 23 deletions nnmnkwii/io/hts.py
@@ -42,8 +42,7 @@

import numpy as np
import re

# TODO: consider two label alignmetn format
from copy import copy


class HTSLabelFile(object):
@@ -96,13 +95,29 @@ def __init__(self, frame_shift_in_micro_sec=50000):
self.start_times = []
self.end_times = []
self.contexts = []
frame_shift_in_micro_sec = frame_shift_in_micro_sec
self.frame_shift_in_micro_sec = frame_shift_in_micro_sec

def __len__(self):
return len(self.start_times)

def __getitem__(self, idx):
return self.start_times[idx], self.end_times[idx], self.contexts[idx]
if isinstance(idx, slice):
# yes, this is inefficient and there is probably a better way,
# but this is okay for now
current, stop, _ = idx.indices(len(self))
obj = copy(self)
obj.start_times = obj.start_times[current:stop]
obj.end_times = obj.end_times[current:stop]
obj.contexts = obj.contexts[current:stop]
return obj
elif isinstance(idx, list):
obj = copy(self)
obj.start_times = list(np.asarray(obj.start_times)[idx])
obj.end_times = list(np.asarray(obj.end_times)[idx])
obj.contexts = list(np.asarray(obj.contexts)[idx])
return obj
else:
return self.start_times[idx], self.end_times[idx], self.contexts[idx]

def __str__(self):
ret = ""
@@ -115,11 +130,18 @@ def __str__(self):
def __repr__(self):
return str(self)

def append(self, label):
def round_(self):
s = self.frame_shift_in_micro_sec
self.start_times = list(np.round(np.asarray(self.start_times) / s).astype(np.int) * s)
self.end_times = list(np.round(np.asarray(self.end_times) / s).astype(np.int) * s)
return self

def append(self, label, strict=True):
"""Append a single alignment label
Args:
label (tuple): tuple of (start_time, end_time, context).
strict (bool): strict mode.
Returns:
self
@@ -132,14 +154,15 @@ def append(self, label):
start_time = int(start_time)
end_time = int(end_time)

if start_time >= end_time:
raise ValueError(
"end_time ({}) must be larger than start_time ({}).".format(
end_time, start_time))
if len(self.end_times) > 0 and start_time != self.end_times[-1]:
raise ValueError(
"start_time ({}) must be equal to the last end_time ({}).".format(
start_time, self.end_times[-1]))
if strict:
if start_time >= end_time:
raise ValueError(
"end_time ({}) must be larger than start_time ({}).".format(
end_time, start_time))
if len(self.end_times) > 0 and start_time != self.end_times[-1]:
raise ValueError(
"start_time ({}) must be equal to the last end_time ({}).".format(
start_time, self.end_times[-1]))

self.start_times.append(start_time)
self.end_times.append(end_time)
@@ -152,14 +175,14 @@ def set_durations(self, durations, frame_shift_in_micro_sec=50000):
TODO:
this should be refactored
"""
offset = self.start_times[0]

# Unwrap state-axis
end_times = np.cumsum(
end_times = offset + np.cumsum(
durations.reshape(-1, 1) * frame_shift_in_micro_sec).astype(np.int)
if len(end_times) != len(self.end_times):
raise RuntimeError("Unexpected input, maybe")
# Assuming first label starts with time `0`
# Is this really true? probably no
start_times = np.hstack((0, end_times[:-1])).astype(np.int)
start_times = np.hstack((offset, end_times[:-1])).astype(np.int)
self.start_times, self.end_times = start_times, end_times

def load(self, path=None, lines=None):
Expand Down Expand Up @@ -302,7 +325,7 @@ def load(path=None, lines=None):
return labels.load(path, lines)


def wildcards2regex(question, convert_number_pattern=False):
def wildcards2regex(question, convert_number_pattern=False, convert_note_pattern=True):
"""subphone_features
Convert HTK-style question into regular expression for searching labels.
If convert_number_pattern, keep the following sequences unescaped for
Expand All @@ -329,17 +352,25 @@ def wildcards2regex(question, convert_number_pattern=False):
question = question.replace('\\(\\\\d\\+\\)', '(\d+)')
question = question.replace(
'\\(\\[\\\\d\\\\\\.\\]\\+\\)', '([\d\.]+)')
if convert_note_pattern:
question = question.replace(
'\\(\\[A\\-Z\\]\\[b\\]\\?\\[0\\-9\\]\\+\\)', '([A-Z][b]?[0-9]+)')
question = question.replace('\\(\\\\NOTE\\)', '([A-Z][b]?[0-9]+)')
return question


def load_question_set(qs_file_name):
def load_question_set(qs_file_name, append_hat_for_LL=True):
"""Load HTS-style question and convert it to binary/continuous feature
extraction regexes.
This code was taken from Merlin.
Args:
qs_file_name (str): Input HTS-style question file path
append_hat_for_LL (bool): Append ^ for LL regex search.
Note that the left-most context is assumed to be phoneme identity
before the previous phoneme (i.e. LL-xx). This parameter should be False
for the HTS-demo_NIT-SONG070-F001 demo.
Returns:
(binary_dict, continuous_dict): Binary/continuous feature extraction
@@ -356,8 +387,7 @@ def load_question_set(qs_file_name):
continuous_qs_index = 0
binary_dict = {}
continuous_dict = {}
# I guess `LL` means Left-left, but it doesn't seem to be docmented
# anywhere

LL = re.compile(re.escape('LL-'))

for line in lines:
@@ -383,10 +413,9 @@
continuous_qs_index = continuous_qs_index + 1
elif temp_list[0] == 'QS':
re_list = []
# import ipdb; ipdb.set_trace()
for temp_question in question_list:
processed_question = wildcards2regex(temp_question)
if LL.search(question_key) and processed_question[0] != '^':
if append_hat_for_LL and LL.search(question_key) and processed_question[0] != '^':
processed_question = '^' + processed_question
re_list.append(re.compile(processed_question))

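Taken together, the `HTSLabelFile` changes above add slice and list indexing, an optional non-strict `append`, frame-shift rounding via `round_()`, and a `set_durations` that preserves the original start offset instead of assuming the first label starts at time 0. A small sketch of the new surface, built from made-up in-memory labels rather than a real `.lab` file:

```python
from nnmnkwii.io import hts

labels = hts.HTSLabelFile()
# strict=True (the default) requires increasing, contiguous segments.
labels.append((0, 1000000, "sil"))
labels.append((1000000, 2000000, "a"))
# strict=False skips those checks (handy for rough, zero-length, or overlapping input).
labels.append((2000000, 2000000, "pau"), strict=False)

# New: slicing returns another HTSLabelFile ...
print(labels[:2])
# ... and list indexing picks arbitrary rows.
print(labels[[0, 2]])

# New: snap start/end times to the frame shift (50000 by default).
labels.round_()

# For question sets whose left-most context is not "phoneme before the previous
# phoneme" (e.g. the HTS-demo_NIT-SONG070-F001 demo), the new flag applies;
# the question file name here is hypothetical:
# binary_dict, continuous_dict = hts.load_question_set(
#     "nit_song070_questions.hed", append_hat_for_LL=False)
```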

