......
......
......
......
......
......
Torch distributed is available.
Torch distributed is initialized.
Torch distributed is available.
Torch distributed is initialized.
Traceback (most recent call last):
Traceback (most recent call last):
File "/workspace/bert/run_pretraining.py", line 1592, in <module>
File "/workspace/bert/run_pretraining.py", line 1592, in <module>
args, final_loss, train_time_raw = main()
File "/workspace/bert/run_pretraining.py", line 1141, in main
args, final_loss, train_time_raw = main()
File "/workspace/bert/run_pretraining.py", line 1141, in main
model = fwd_loss_bwd_trainer.capture_bert_model_segment_graph(model, use_cuda_graph)
File "/workspace/bert/fwd_loss_bwd_trainer.py", line 43, in capture_bert_model_segment_graph
model = fwd_loss_bwd_trainer.capture_bert_model_segment_graph(model, use_cuda_graph)
bert_model_segment = graph(bert_model_segment,
File "/workspace/bert/fwd_loss_bwd_trainer.py", line 43, in capture_bert_model_segment_graph
File "/workspace/bert/function.py", line 66, in graph
bert_model_segment = graph(bert_model_segment,
File "/workspace/bert/function.py", line 66, in graph
outputs = func_or_module(*sample_args)
outputs = func_or_module(*sample_args)
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1015, in _call_impl
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1015, in _call_impl
return forward_call(*input, **kwargs)
File "/workspace/bert/modeling.py", line 1009, in forward
return forward_call(*input, **kwargs)
File "/workspace/bert/modeling.py", line 1009, in forward
sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1015, in _call_impl
sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1015, in _call_impl
return forward_call(*input, **kwargs)
File "/workspace/bert/modeling.py", line 901, in forward
return forward_call(*input, **kwargs)
File "/workspace/bert/modeling.py", line 901, in forward
encoded_layers = self.encoder(embedding_output,
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1015, in _call_impl
encoded_layers = self.encoder(embedding_output,
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1015, in _call_impl
return forward_call(*input, **kwargs)
File "/workspace/bert/modeling.py", line 577, in forward
return forward_call(*input, **kwargs)
File "/workspace/bert/modeling.py", line 577, in forward
hidden_states = layer_module(hidden_states, cu_seqlens, actual_seqlens, maxseqlen_in_batch)
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1015, in _call_impl
hidden_states = layer_module(hidden_states, cu_seqlens, actual_seqlens, maxseqlen_in_batch)
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1015, in _call_impl
return forward_call(*input, **kwargs)
File "/workspace/bert/modeling.py", line 500, in forward
return forward_call(*input, **kwargs)
File "/workspace/bert/modeling.py", line 500, in forward
attention_output = self.attention(hidden_states, attention_mask, seqlen, batch)
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1015, in _call_impl
attention_output = self.attention(hidden_states, attention_mask, seqlen, batch)
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1015, in _call_impl
return forward_call(*input, **kwargs)
File "/workspace/bert/modeling.py", line 424, in forward
return forward_call(*input, **kwargs)
File "/workspace/bert/modeling.py", line 424, in forward
self_output = self.self(input_tensor, attention_mask, seqlen, batch, is_training = self.training)
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1015, in _call_impl
self_output = self.self(input_tensor, attention_mask, seqlen, batch, is_training = self.training)
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1015, in _call_impl
return forward_call(*input, **kwargs)
File "/workspace/bert/fmha.py", line 161, in forward
return forward_call(*input, **kwargs)
File "/workspace/bert/fmha.py", line 161, in forward
ctx = FMHAFun.apply(qkv.view(-1, 3, self.h, self.d), cu_seqlens, seqlens, p_dropout, max_s, is_training)
ctx = FMHAFun.apply(qkv.view(-1, 3, self.h, self.d), cu_seqlens, seqlens, p_dropout, max_s, is_training)
File "/opt/conda/lib/python3.8/site-packages/apex/contrib/fmha/fmha.py", line 36, in forward
File "/opt/conda/lib/python3.8/site-packages/apex/contrib/fmha/fmha.py", line 36, in forward
context, S_dmask = mha.fwd(qkv, cu_seqlens, seqlens, p_dropout, max_s, is_training, None)
context, S_dmask = mha.fwd(qkv, cu_seqlens, seqlens, p_dropout, max_s, is_training, None)
RuntimeError: Expected dprops->major == 8 && dprops->minor == 0 to be true, but got false. (Could this error message be improved? If so, please report an enhancement request to PyTorch.)
RuntimeError: Expected dprops->major == 8 && dprops->minor == 0 to be true, but got false. (Could this error message be improved? If so, please report an enhancement request to PyTorch.)
......