Fix some bugs (#12153) (#12201)

ko3n1g · blisc · web-flow · commit 798b6763f349 · 2025-02-17T10:19:49.000+01:00
* update outdated image to np code; update random sampling



* code cleanup



---------

Signed-off-by: Jason <jasoli@nvidia.com>
Co-authored-by: Jason <jasoli@nvidia.com>
diff --git a/nemo/collections/tts/data/dataset.py b/nemo/collections/tts/data/dataset.py
@@ -190,10 +190,10 @@ def __init__(
             self.phoneme_probability = getattr(self.text_tokenizer, "phoneme_probability", None)
         else:
             if text_tokenizer_pad_id is None:
-                raise ValueError(f"text_tokenizer_pad_id must be specified if text_tokenizer is not BaseTokenizer")
+                raise ValueError("text_tokenizer_pad_id must be specified if text_tokenizer is not BaseTokenizer")
 
             if tokens is None:
-                raise ValueError(f"tokens must be specified if text_tokenizer is not BaseTokenizer")
+                raise ValueError("tokens must be specified if text_tokenizer is not BaseTokenizer")
 
             self.text_tokenizer_pad_id = text_tokenizer_pad_id
         self.cache_text = True if self.phoneme_probability is None else False
@@ -496,7 +496,7 @@ def add_reference_audio(self, **kwargs):
                 speaker_to_index_map[d["speaker_id"]].add(i)
             # Random sample a reference audio from the same speaker
             self.get_reference_for_sample = lambda sample: self.data[
-                random.sample(speaker_to_index_map[sample["speaker_id"]], 1)[0]
+                random.choice(tuple(speaker_to_index_map[sample["speaker_id"]]))  # set -> tuple so it is indexable
             ]
         elif reference_audio_type == "ground-truth":
             # Use ground truth audio as reference audio
@@ -679,7 +679,7 @@ def __getitem__(self, index):
                     sample_pitch_mean = pitch_stats["pitch_mean"]
                     sample_pitch_std = pitch_stats["pitch_std"]
                 else:
-                    raise ValueError(f"Missing statistics for pitch normalization.")
+                    raise ValueError("Missing statistics for pitch normalization.")
 
                 pitch -= sample_pitch_mean
                 pitch[pitch == -sample_pitch_mean] = 0.0  # Zero out values that were previously zero
diff --git a/nemo/collections/tts/parts/utils/helpers.py b/nemo/collections/tts/parts/utils/helpers.py
@@ -632,10 +632,8 @@ def plot_gate_outputs_to_numpy(gate_targets, gate_outputs):
 
 
 def save_figure_to_numpy(fig):
-    # save it to a numpy array.
-    data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
-    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
-    return data
+    img_array = np.array(fig.canvas.renderer.buffer_rgba())
+    return img_array
 
 
 @rank_zero_only
@@ -802,8 +800,7 @@ def clip_grad_value_(parameters, clip_value, norm_type=2):
 
 
 def convert_pad_shape(pad_shape):
-    l = pad_shape[::-1]
-    pad_shape = [item for sublist in l for item in sublist]
+    pad_shape = [item for sublist in pad_shape[::-1] for item in sublist]
     return pad_shape