I've been testing the pre-trained model on the CityPersons subset of the Cityscapes dataset.
The code runs through fine when input_size is left at the default (256, 256) used for Human3.6M, but the predicted root locations are obviously off because of the scaling mismatch. The native resolution of CityPersons images is 2048×1024, so I set input_size to (2048, 1024) instead (a sketch of how I change the setting is right after the error log below). With that size I get the following error:
06-15 11:35:20 Creating dataset...
loading annotations into memory...
Done (t=0.04s)
creating index...
index created!
06-15 11:35:21 Load checkpoint from /dump/algopre/c-szan/github/3DMPPE_ROOTNET_RELEASE/main/../output/model_dump/snapshot_18.pth.tar
06-15 11:35:21 Creating graph...
0%| | 0/33 [00:00<?, ?it/s]THCudaCheck FAIL file=/opt/conda/conda-bld/pytorch_1607370193460/work/aten/src/THC/THCCachingHostAllocator.cpp line=278 error=700 : an illegal memory access was encountered
0%| | 0/33 [02:02<?, ?it/s]
Traceback (most recent call last):
File "test.py", line 54, in <module>
main()
File "test.py", line 45, in main
coord_out = tester.model(input_img, cam_param)
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 161, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 171, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 86, in parallel_apply
output.reraise()
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/_utils.py", line 428, in reraise
raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/dump/algopre/c-szan/github/3DMPPE_ROOTNET_RELEASE/main/model.py", line 100, in forward
fm = self.backbone(input_img)
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/dump/algopre/c-szan/github/3DMPPE_ROOTNET_RELEASE/main/../common/nets/resnet.py", line 64, in forward
x = self.layer4(x)
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/container.py", line 117, in forward
input = module(input)
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torchvision/models/resnet.py", line 116, in forward
identity = self.downsample(x)
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/container.py", line 117, in forward
input = module(input)
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 423, in forward
return self._conv_forward(input, self.weight)
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 420, in _conv_forward
self.padding, self.dilation, self.groups)
RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR
You can try to repro this exception using the following code snippet. If that doesn't trigger the error, please include your original repro script when reporting this issue.
import torch
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.allow_tf32 = True
data = torch.randn([32, 1024, 128, 64], dtype=torch.float, device='cuda', requires_grad=True)
net = torch.nn.Conv2d(1024, 2048, kernel_size=[1, 1], padding=[0, 0], stride=[2, 2], dilation=[1, 1], groups=1)
net = net.cuda().float()
out = net(data)
out.backward(torch.randn_like(out))
torch.cuda.synchronize()
ConvolutionParams
data_type = CUDNN_DATA_FLOAT
padding = [0, 0, 0]
stride = [2, 2, 0]
dilation = [1, 1, 0]
groups = 1
deterministic = false
allow_tf32 = true
input: TensorDescriptor 0x7f260a187ca0
type = CUDNN_DATA_FLOAT
nbDims = 4
dimA = 32, 1024, 128, 64,
strideA = 8388608, 8192, 64, 1,
output: TensorDescriptor 0x7f2ed40a1160
type = CUDNN_DATA_FLOAT
nbDims = 4
dimA = 32, 2048, 64, 32,
strideA = 4194304, 2048, 32, 1,
weight: FilterDescriptor 0x7f2ed40a2b40
type = CUDNN_DATA_FLOAT
tensor_format = CUDNN_TENSOR_NCHW
nbDims = 4
dimA = 2048, 1024, 1, 1,
Pointer addresses:
input: 0x7f1abc000000
output: 0x7f1a7c000000
weight: 0x7f3006a00000
terminate called after throwing an instance of 'c10::Error'
what(): CUDA error: an illegal memory access was encountered
Exception raised from create_event_internal at /opt/conda/conda-bld/pytorch_1607370193460/work/c10/cuda/CUDACachingAllocator.cpp:687 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x7f305a5d18b2 in /dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #1: c10::cuda::CUDACachingAllocator::raw_delete(void*) + 0xad2 (0x7f305a823982 in /dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #2: c10::TensorImpl::release_resources() + 0x4d (0x7f305a5bcb7d in /dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #3: <unknown function> + 0x5fe1ea (0x7f309bd8e1ea in /dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #4: <unknown function> + 0x5fe296 (0x7f309bd8e296 in /dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
<omitting python frames>
frame #19: __libc_start_main + 0xf0 (0x7f30cd22f840 in /lib/x86_64-linux-gnu/libc.so.6)
Aborted (core dumped)
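For reference, this is roughly how I'm changing the size. Treat it as a sketch rather than a verbatim diff: the config variable in the repo may actually be named input_shape rather than input_size, and I'm not certain whether the tuple order is (height, width) or (width, height), which could matter for a non-square size like this one.

# main/config.py -- sketch of my edit (variable names and tuple order assumed, may differ)

# default used for Human3.6M:
# input_shape = (256, 256)
# output_shape = (input_shape[0] // 8, input_shape[1] // 8)

# what I set for CityPersons (frames are 2048 x 1024):
input_shape = (2048, 1024)
output_shape = (input_shape[0] // 8, input_shape[1] // 8)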
To debug this, I then set input_size to (1024, 1024). That ran through without errors, and the results make more sense than with (256, 256). I also tried (2048, 2048), which gave a different error (the only follow-up check I can think of is sketched right after this log):
06-15 11:45:01 Creating dataset...
loading annotations into memory...
Done (t=0.02s)
creating index...
index created!
06-15 11:45:02 Load checkpoint from /dump/algopre/c-szan/github/3DMPPE_ROOTNET_RELEASE/main/../output/model_dump/snapshot_18.pth.tar
06-15 11:45:02 Creating graph...
0%| | 0/33 [02:14<?, ?it/s]
Traceback (most recent call last):
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 872, in _try_get_data
data = self._data_queue.get(timeout=timeout)
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/queue.py", line 173, in get
self.not_empty.wait(remaining)
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/threading.py", line 299, in wait
gotit = waiter.acquire(True, timeout)
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/_utils/signal_handling.py", line 66, in handler
_error_if_any_worker_fails()
RuntimeError: DataLoader worker (pid 85820) is killed by signal: Killed.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "test.py", line 54, in <module>
main()
File "test.py", line 43, in main
for itr, (input_img, cam_param) in enumerate(tqdm(tester.batch_generator)):
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/tqdm/std.py", line 1178, in __iter__
for obj in iterable:
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 435, in __next__
data = self._next_data()
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1068, in _next_data
idx, data = self._get_data()
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1024, in _get_data
success, data = self._try_get_data()
File "/dump/algopre/c-szan/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 885, in _try_get_data
raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str)) from e
RuntimeError: DataLoader worker (pid(s) 85820) exited unexpectedly
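Since the worker died from "killed by signal: Killed", my best guess is that the loader processes are being terminated by the OS for running out of host memory at (2048, 2048), but I'm not sure. The only check I know how to do is to rebuild the test loader single-process with batch size 1 and watch the peak GPU memory, roughly like this (a sketch, assuming the tester exposes its Dataset as tester.testset; the attribute name may differ in the repo):

# sketch: run the test loop single-process so no worker can be silently killed,
# and print the peak GPU memory after each batch
import torch
from torch.utils.data import DataLoader

loader = DataLoader(tester.testset, batch_size=1, shuffle=False,
                    num_workers=0,     # no worker subprocesses
                    pin_memory=False)  # skip the extra pinned-host copy

with torch.no_grad():
    for itr, (input_img, cam_param) in enumerate(loader):
        coord_out = tester.model(input_img, cam_param)
        print(itr, torch.cuda.max_memory_allocated() / 1024**3, 'GiB peak')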
I don't understand why the behavior changes so much with different input_size values. Could you please take a look at the problem and give me some clues?