@@ -49,6 +49,12 @@ def preprocess(model, in_dir, out_dir, text, audio_filename, mel_filename,
4949 model .make_generation_fast_ ()
5050
5151 mel_org = np .load (join (in_dir , mel_filename ))
52+ # zero pad
53+ b_pad = r # imitates initial state
54+ e_pad = r - len (mel_org ) % r if len (mel_org ) % r > 0 else 0
55+ mel_org = np .pad (mel_org , [(b_pad , e_pad ), (0 , 0 )],
56+ mode = "constant" , constant_values = 0 )
57+
5258 mel = Variable (torch .from_numpy (mel_org )).unsqueeze (0 ).contiguous ()
5359
5460 # Downsample mel spectrogram
@@ -78,10 +84,10 @@ def preprocess(model, in_dir, out_dir, text, audio_filename, mel_filename,
7884 frame_positions = frame_positions , speaker_ids = speaker_ids )
7985
8086 mel_output = mel_outputs [0 ].data .cpu ().numpy ()
81-
8287 # **Time resolution adjustment**
83- # remove begenning audio used for first mel prediction
84- wav = np .load (join (in_dir , audio_filename ))[hparams .hop_size * downsample_step :]
88+ mel_output = mel_output [:- (b_pad + e_pad )]
89+
90+ wav = np .load (join (in_dir , audio_filename ))
8591 assert len (wav ) % hparams .hop_size == 0
8692
8793 # Coarse upsample just for convenience
@@ -102,8 +108,6 @@ def preprocess(model, in_dir, out_dir, text, audio_filename, mel_filename,
102108 timesteps = len (wav )
103109
104110 # save
105- np .save (join (out_dir , audio_filename ), wav .astype (np .int16 ),
106- allow_pickle = False )
107111 np .save (join (out_dir , mel_filename ), mel_output .astype (np .float32 ),
108112 allow_pickle = False )
109113
0 commit comments