ubuntu@ip-172-31-93-8:~/radtts$ python3 train.py -c ./configs/config_ljs_decoder.json -p train_config.output_directory=./output
train_config.output_directory=./output
output_directory=./output
overriding output_directory with ./output
{'train_config': {'output_directory': './output', 'epochs': 10000000, 'optim_algo': 'RAdam', 'learning_rate': 0.0001, 'weight_decay': 1e-06, 'sigma': 1.0, 'iters_per_checkpoint': 2500, 'batch_size': 16, 'seed': None, 'checkpoint_path': '', 'ignore_layers': [], 'ignore_layers_warmstart': [], 'finetune_layers': [], 'include_layers': [], 'vocoder_config_path': 'models/hifigan_config_22khz.json', 'vocoder_checkpoint_path': 'models/hifigan_ljs_generator_v1', 'log_attribute_samples': False, 'log_decoder_samples': True, 'warmstart_checkpoint_path': '', 'use_amp': False, 'grad_clip_val': 1.0, 'loss_weights': {'blank_logprob': -1, 'ctc_loss_weight': 0.1, 'binarization_loss_weight': 1.0, 'dur_loss_weight': 1.0, 'f0_loss_weight': 1.0, 'energy_loss_weight': 1.0, 'vpred_loss_weight': 1.0}, 'binarization_start_iter': 6000, 'kl_loss_start_iter': 18000, 'unfreeze_modules': 'all'}, 'data_config': {'training_files': {'LJS': {'basedir': 'filelists/', 'audiodir': 'wavs', 'filelist': 'ljs_audiopath_text_speaker_train_filelist.txt', 'lmdbpath': ''}}, 'validation_files': {'LJS': {'basedir': 'filelists/', 'audiodir': 'wavs', 'filelist': 'ljs_audiopath_text_speaker_val_filelist.txt', 'lmdbpath': ''}}, 'dur_min': 0.1, 'dur_max': 10.2, 'sampling_rate': 22050, 'filter_length': 1024, 'hop_length': 256, 'win_length': 1024, 'n_mel_channels': 80, 'mel_fmin': 0.0, 'mel_fmax': 8000.0, 'f0_min': 80.0, 'f0_max': 640.0, 'max_wav_value': 32768.0, 'use_f0': True, 'use_log_f0': 0, 'use_energy_avg': True, 'use_scaled_energy': True, 'symbol_set': 'radtts', 'cleaner_names': ['radtts_cleaners'], 'heteronyms_path': 'tts_text_processing/heteronyms', 'phoneme_dict_path': 'tts_text_processing/cmudict-0.7b', 'p_phoneme': 1.0, 'handle_phoneme': 'word', 'handle_phoneme_ambiguous': 'ignore', 'include_speakers': None, 'n_frames': -1, 'betabinom_cache_path': 'data_cache/', 'lmdb_cache_path': '', 'use_attn_prior_masking': True, 'prepend_space_to_text': True, 'append_space_to_text': True, 'add_bos_eos_to_text': False, 
'betabinom_scaling_factor': 1.0, 'distance_tx_unvoiced': False, 'mel_noise_scale': 0.0}, 'dist_config': {'dist_backend': 'nccl', 'dist_url': 'tcp://localhost:54321'}, 'model_config': {'n_speakers': 1, 'n_speaker_dim': 16, 'n_text': 185, 'n_text_dim': 512, 'n_flows': 8, 'n_conv_layers_per_step': 4, 'n_mel_channels': 80, 'n_hidden': 1024, 'mel_encoder_n_hidden': 512, 'dummy_speaker_embedding': False, 'n_early_size': 2, 'n_early_every': 2, 'n_group_size': 2, 'affine_model': 'wavenet', 'include_modules': 'decatnvpred', 'scaling_fn': 'tanh', 'matrix_decomposition': 'LUS', 'learn_alignments': True, 'use_speaker_emb_for_alignment': False, 'attn_straight_through_estimator': True, 'use_context_lstm': True, 'context_lstm_norm': 'spectral', 'context_lstm_w_f0_and_energy': True, 'text_encoder_lstm_norm': 'spectral', 'n_f0_dims': 1, 'n_energy_avg_dims': 1, 'use_first_order_features': False, 'unvoiced_bias_activation': 'relu', 'decoder_use_partial_padding': True, 'decoder_use_unvoiced_bias': True, 'ap_pred_log_f0': True, 'ap_use_unvoiced_bias': True, 'ap_use_voiced_embeddings': True, 'dur_model_config': None, 'f0_model_config': None, 'energy_model_config': None, 'v_model_config': {'name': 'dap', 'hparams': {'n_speaker_dim': 16, 'take_log_of_input': False, 'bottleneck_hparams': {'in_dim': 512, 'reduction_factor': 16, 'norm': 'weightnorm', 'non_linearity': 'relu'}, 'arch_hparams': {'out_dim': 1, 'n_layers': 2, 'n_channels': 256, 'kernel_size': 3, 'p_dropout': 0.5, 'lstm_type': '', 'use_linear': 1}}}}}
> got rank 0 and world size 1 ...
./output
Using seed 1806
Applying spectral norm to text encoder LSTM
Applying spectral norm to context encoder LSTM
/home/ubuntu/radtts/common.py:391: UserWarning: torch.qr is deprecated in favor of torch.linalg.qr and will be removed in a future PyTorch release.
The boolean parameter 'some' has been replaced with a string parameter 'mode'.
Q, R = torch.qr(A, some)
should be replaced with
Q, R = torch.linalg.qr(A, 'reduced' if some else 'complete') (Triggered internally at ../aten/src/ATen/native/BatchLinearAlgebra.cpp:2497.)
W = torch.qr(torch.FloatTensor(c, c).normal_())[0]
Initializing RAdam optimizer
RADTTS(
(speaker_embedding): Embedding(1, 16)
(embedding): Embedding(185, 512)
(flows): ModuleList(
(0): FlowStep(
(invtbl_conv): Invertible1x1ConvLUS()
(affine_tfn): AffineTransformationLayer(
(affine_param_predictor): WN(
(in_layers): ModuleList(
(0): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,))
)
(1): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(4,), dilation=(2,))
)
(2): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(4,))
)
(3): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(8,))
)
)
(res_skip_layers): ModuleList(
(0): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
(start): Conv1d(1120, 1024, kernel_size=(1,), stride=(1,))
(softplus): Softplus(beta=1, threshold=20)
(end): Conv1d(1024, 160, kernel_size=(1,), stride=(1,))
)
)
)
(1): FlowStep(
(invtbl_conv): Invertible1x1ConvLUS()
(affine_tfn): AffineTransformationLayer(
(affine_param_predictor): WN(
(in_layers): ModuleList(
(0): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,))
)
(1): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(4,), dilation=(2,))
)
(2): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(4,))
)
(3): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(8,))
)
)
(res_skip_layers): ModuleList(
(0): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
(start): Conv1d(1120, 1024, kernel_size=(1,), stride=(1,))
(softplus): Softplus(beta=1, threshold=20)
(end): Conv1d(1024, 160, kernel_size=(1,), stride=(1,))
)
)
)
(2): FlowStep(
(invtbl_conv): Invertible1x1ConvLUS()
(affine_tfn): AffineTransformationLayer(
(affine_param_predictor): WN(
(in_layers): ModuleList(
(0): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,))
)
(1): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(4,), dilation=(2,))
)
(2): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(4,))
)
(3): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(8,))
)
)
(res_skip_layers): ModuleList(
(0): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
(start): Conv1d(1119, 1024, kernel_size=(1,), stride=(1,))
(softplus): Softplus(beta=1, threshold=20)
(end): Conv1d(1024, 158, kernel_size=(1,), stride=(1,))
)
)
)
(3): FlowStep(
(invtbl_conv): Invertible1x1ConvLUS()
(affine_tfn): AffineTransformationLayer(
(affine_param_predictor): WN(
(in_layers): ModuleList(
(0): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,))
)
(1): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(4,), dilation=(2,))
)
(2): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(4,))
)
(3): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(8,))
)
)
(res_skip_layers): ModuleList(
(0): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
(start): Conv1d(1119, 1024, kernel_size=(1,), stride=(1,))
(softplus): Softplus(beta=1, threshold=20)
(end): Conv1d(1024, 158, kernel_size=(1,), stride=(1,))
)
)
)
(4): FlowStep(
(invtbl_conv): Invertible1x1ConvLUS()
(affine_tfn): AffineTransformationLayer(
(affine_param_predictor): WN(
(in_layers): ModuleList(
(0): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,))
)
(1): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(4,), dilation=(2,))
)
(2): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(4,))
)
(3): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(8,))
)
)
(res_skip_layers): ModuleList(
(0): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
(start): Conv1d(1118, 1024, kernel_size=(1,), stride=(1,))
(softplus): Softplus(beta=1, threshold=20)
(end): Conv1d(1024, 156, kernel_size=(1,), stride=(1,))
)
)
)
(5): FlowStep(
(invtbl_conv): Invertible1x1ConvLUS()
(affine_tfn): AffineTransformationLayer(
(affine_param_predictor): WN(
(in_layers): ModuleList(
(0): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,))
)
(1): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(4,), dilation=(2,))
)
(2): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(4,))
)
(3): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(8,))
)
)
(res_skip_layers): ModuleList(
(0): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
(start): Conv1d(1118, 1024, kernel_size=(1,), stride=(1,))
(softplus): Softplus(beta=1, threshold=20)
(end): Conv1d(1024, 156, kernel_size=(1,), stride=(1,))
)
)
)
(6): FlowStep(
(invtbl_conv): Invertible1x1ConvLUS()
(affine_tfn): AffineTransformationLayer(
(affine_param_predictor): WN(
(in_layers): ModuleList(
(0): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,))
)
(1): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(4,), dilation=(2,))
)
(2): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(4,))
)
(3): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(8,))
)
)
(res_skip_layers): ModuleList(
(0): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
(start): Conv1d(1117, 1024, kernel_size=(1,), stride=(1,))
(softplus): Softplus(beta=1, threshold=20)
(end): Conv1d(1024, 154, kernel_size=(1,), stride=(1,))
)
)
)
(7): FlowStep(
(invtbl_conv): Invertible1x1ConvLUS()
(affine_tfn): AffineTransformationLayer(
(affine_param_predictor): WN(
(in_layers): ModuleList(
(0): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,))
)
(1): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(4,), dilation=(2,))
)
(2): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(4,))
)
(3): ConvNorm(
(conv): PartialConv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(8,))
)
)
(res_skip_layers): ModuleList(
(0): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
)
(start): Conv1d(1117, 1024, kernel_size=(1,), stride=(1,))
(softplus): Softplus(beta=1, threshold=20)
(end): Conv1d(1024, 154, kernel_size=(1,), stride=(1,))
)
)
)
)
(encoder): Encoder(
(convolutions): ModuleList(
(0): Sequential(
(0): ConvNorm(
(conv): PartialConv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
)
(1): InstanceNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
)
(1): Sequential(
(0): ConvNorm(
(conv): PartialConv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
)
(1): InstanceNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
)
(2): Sequential(
(0): ConvNorm(
(conv): PartialConv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
)
(1): InstanceNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
)
)
(lstm): LSTM(512, 256, batch_first=True, bidirectional=True)
)
(length_regulator): LengthRegulator()
(attention): ConvAttention(
(softmax): Softmax(dim=3)
(log_softmax): LogSoftmax(dim=3)
(key_proj): Sequential(
(0): ConvNorm(
(conv): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
)
(1): ReLU()
(2): ConvNorm(
(conv): Conv1d(1024, 80, kernel_size=(1,), stride=(1,))
)
)
(query_proj): Sequential(
(0): ConvNorm(
(conv): Conv1d(80, 160, kernel_size=(3,), stride=(1,), padding=(1,))
)
(1): ReLU()
(2): ConvNorm(
(conv): Conv1d(160, 80, kernel_size=(1,), stride=(1,))
)
(3): ReLU()
(4): ConvNorm(
(conv): Conv1d(80, 80, kernel_size=(1,), stride=(1,))
)
)
)
(context_lstm): LSTM(1044, 520, batch_first=True, bidirectional=True)
(unfold): Unfold(kernel_size=(2, 1), dilation=1, padding=0, stride=2)
(unvoiced_bias_module): Sequential(
(0): LinearNorm(
(linear_layer): Linear(in_features=512, out_features=1, bias=True)
)
(1): ReLU()
)
(v_pred_module): DAP(
(bottleneck_layer): BottleneckLayerLayer(
(projection_fn): ConvNorm(
(conv): Conv1d(512, 32, kernel_size=(3,), stride=(1,), padding=(1,))
)
(non_linearity): ReLU()
)
(feat_pred_fn): ConvLSTMLinear(
(dropout): Dropout(p=0.5, inplace=False)
(convolutions): ModuleList(
(0): Conv1d(48, 256, kernel_size=(3,), stride=(1,), padding=(1,))
(1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
)
(dense): Linear(in_features=256, out_features=1, bias=True)
)
)
(v_embeddings): Embedding(4, 512)
)
initializing training dataloader
Number of speakers: 1
Speaker IDS {'ljs': 0}
Number of files 12442
Number of files after duration filtering 12442
Dataloader initialized with no augmentations
initializing validation dataloader
Number of files 58
Number of files after duration filtering 58
Dataloader initialized with no augmentations
/home/ubuntu/.local/lib/python3.8/site-packages/torch/utils/data/dataloader.py:563: UserWarning: This DataLoader will create 8 worker processes in total. Our suggested max number of worker in current system is 4, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.
warnings.warn(_create_warning_msg(
saving current configuration in output dir
alignment.py
attribute_prediction_model.py
audio_processing.py
autoregressive_flow.py
common.py
data.py
distributed.py
hifigan_denoiser.py
hifigan_env.py
hifigan_models.py
hifigan_utils.py
inference.py
inference_voice_conversion.py
loss.py
partialconv1d.py
plotting_utils.py
radam.py
radtts.py
splines.py
train.py
transformer.py
setting up tboard log in ./output/logs
Training everything
Epoch: 0
Traceback (most recent call last):
File "train.py", line 498, in <module>
train(n_gpus, rank, **train_config)
File "train.py", line 382, in train
for batch in train_loader:
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 681, in __next__
data = self._next_data()
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1376, in _next_data
return self._process_data(data)
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1402, in _process_data
data.reraise()
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/_utils.py", line 461, in reraise
raise exception
FileNotFoundError: Caught FileNotFoundError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
data = fetcher.fetch(index)
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 49, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 49, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/home/ubuntu/radtts/data.py", line 318, in __getitem__
audio, sampling_rate = load_wav_to_torch(audiopath)
File "/home/ubuntu/radtts/data.py", line 74, in load_wav_to_torch
sampling_rate, data = read(full_path)
File "/home/ubuntu/.local/lib/python3.8/site-packages/scipy/io/wavfile.py", line 647, in read
fid = open(filename, 'rb')
FileNotFoundError: [Errno 2] No such file or directory: 'filelists/wavs/LJ048-0022.wav'
Apologies if these are dilettante questions — I'm a web dev by trade, definitely not an ML person. I'd appreciate any help you have to offer. I'm still going through a couple of Udemy courses to learn more.