[2022-11-21 18:27:12,121 u22:6465][file_handler.py:79][INFO] No benchmark config provided, using config file /home/edison/.local/lib/python3.10/site-packages/superbench/config/default.yaml.
[2022-11-21 18:27:12,156 u22:6465][ansible.py:59][INFO] {'host_pattern': 'all', 'cmdline': '--forks 1 --inventory /home/edison/Downloads/superbenchmark/local.ini'}
[2022-11-21 18:27:12,163 u22:6465][runner.py:42][INFO] Runner uses config: {'superbench': {'benchmarks': {'bert_models': {'enable': True,
'frameworks': ['pytorch'],
'models': ['bert-base',
'bert-large'],
'modes': [{'name': 'torch.distributed',
'node_num': 1,
'proc_num': 8}],
'parameters': {'batch_size': 1,
'duration': 0,
'model_action': ['train'],
'num_steps': 128,
'num_warmup': 16,
'precision': ['float32',
'float16']}},
'computation-communication-overlap': {'enable': True,
'frameworks': ['pytorch'],
'modes': [{'name': 'torch.distributed',
'node_num': 1,
'proc_num': 8}]},
'cpu-memory-bw-latency': {'enable': False,
'modes': [{'name': 'local',
'parallel': False,
'proc_num': 1}],
'parameters': {'tests': ['bandwidth_matrix',
'latency_matrix',
'max_bandwidth']}},
'cublas-function': {'enable': True,
'modes': [{'name': 'local',
'parallel': True,
'prefix': 'CUDA_VISIBLE_DEVICES={proc_rank}',
'proc_num': 8}]},
'cudnn-function': {'enable': True,
'modes': [{'name': 'local',
'parallel': True,
'prefix': 'CUDA_VISIBLE_DEVICES={proc_rank}',
'proc_num': 8}]},
'densenet_models': {'enable': True,
'frameworks': ['pytorch'],
'models': ['densenet169',
'densenet201'],
'modes': [{'name': 'torch.distributed',
'node_num': 1,
'proc_num': 8}],
'parameters': {'batch_size': 1,
'duration': 0,
'model_action': ['train'],
'num_steps': 128,
'num_warmup': 16,
'precision': ['float32',
'float16']}},
'disk-benchmark': {'enable': False,
'modes': [{'name': 'local',
'parallel': False,
'proc_num': 1}],
'parameters': {'block_devices': ['/dev/nvme0n1']}},
'gemm-flops': {'enable': True,
'modes': [{'name': 'local',
'parallel': True,
'prefix': 'CUDA_VISIBLE_DEVICES={proc_rank}',
'proc_num': 8}]},
'gpcnet-network-load-test': {'enable': False,
'modes': [{'env': {'UCX_NET_DEVICES': 'mlx5_0:1'},
'mca': {'btl': '^uct',
'btl_tcp_if_include': 'eth0',
'pml': 'ucx'},
'name': 'mpi',
'proc_num': 1}]},
'gpcnet-network-test': {'enable': False,
'modes': [{'env': {'UCX_NET_DEVICES': 'mlx5_0:1'},
'mca': {'btl': '^uct',
'btl_tcp_if_include': 'eth0',
'pml': 'ucx'},
'name': 'mpi',
'proc_num': 1}]},
'gpt_models': {'enable': True,
'frameworks': ['pytorch'],
'models': ['gpt2-small',
'gpt2-large'],
'modes': [{'name': 'torch.distributed',
'node_num': 1,
'proc_num': 8}],
'parameters': {'batch_size': 1,
'duration': 0,
'model_action': ['train'],
'num_steps': 128,
'num_warmup': 16,
'precision': ['float32',
'float16']}},
'gpu-burn': {'enable': True,
'modes': [{'name': 'local',
'parallel': False,
'proc_num': 1}],
'parameters': {'doubles': True,
'tensor_core': True,
'time': 300}},
'gpu-copy-bw:correctness': {'enable': True,
'modes': [{'name': 'local',
'parallel': False}],
'parameters': {'check_data': True,
'copy_type': ['sm',
'dma'],
'mem_type': ['htod',
'dtoh',
'dtod'],
'num_loops': 1,
'num_warm_up': 0,
'size': 4096}},
'gpu-copy-bw:perf': {'enable': True,
'modes': [{'name': 'local',
'parallel': False}],
'parameters': {'copy_type': ['sm',
'dma'],
'mem_type': ['htod',
'dtoh',
'dtod']}},
'ib-loopback': {'enable': True,
'modes': [{'name': 'local',
'parallel': True,
'prefix': 'PROC_RANK={proc_rank} '
'IB_DEVICES=0,2,4,6 '
'NUMA_NODES=1,0,3,2',
'proc_num': 4},
{'name': 'local',
'parallel': True,
'prefix': 'PROC_RANK={proc_rank} '
'IB_DEVICES=1,3,5,7 '
'NUMA_NODES=1,0,3,2',
'proc_num': 4}]},
'ib-traffic': {'enable': False,
'modes': [{'name': 'mpi',
'proc_num': 8}],
'parameters': {'gpu_dev': '$LOCAL_RANK',
'ib_dev': 'mlx5_$LOCAL_RANK',
'msg_size': 8388608,
'numa_dev': '$((LOCAL_RANK/2))'}},
'kernel-launch': {'enable': True,
'modes': [{'name': 'local',
'parallel': True,
'prefix': 'CUDA_VISIBLE_DEVICES={proc_rank}',
'proc_num': 8}]},
'lstm_models': {'enable': True,
'frameworks': ['pytorch'],
'models': ['lstm'],
'modes': [{'name': 'torch.distributed',
'node_num': 1,
'proc_num': 8}],
'parameters': {'batch_size': 1,
'duration': 0,
'model_action': ['train'],
'num_steps': 128,
'num_warmup': 16,
'precision': ['float32',
'float16']}},
'matmul': {'enable': True,
'frameworks': ['pytorch'],
'modes': [{'name': 'local',
'parallel': True,
'prefix': 'CUDA_VISIBLE_DEVICES={proc_rank}',
'proc_num': 8}]},
'mem-bw': {'enable': True,
'modes': [{'name': 'local',
'parallel': False,
'prefix': 'CUDA_VISIBLE_DEVICES={proc_rank} '
'numactl -N '
'$(({proc_rank}/2))',
'proc_num': 8}]},
'nccl-bw:default': {'enable': True,
'modes': [{'name': 'local',
'parallel': False,
'proc_num': 1}],
'parameters': {'ngpus': 8}},
'nccl-bw:gdr-only': {'enable': True,
'modes': [{'env': {'NCCL_IB_DISABLE': '0',
'NCCL_IB_PCI_RELAXED_ORDERING': '1',
'NCCL_MIN_NCHANNELS': '16',
'NCCL_NET_GDR_LEVEL': '5',
'NCCL_P2P_DISABLE': '1',
'NCCL_SHM_DISABLE': '1'},
'name': 'local',
'parallel': False,
'proc_num': 1}],
'parameters': {'ngpus': 8}},
'ort-inference': {'enable': True,
'modes': [{'name': 'local',
'parallel': True,
'prefix': 'CUDA_VISIBLE_DEVICES={proc_rank}',
'proc_num': 8}],
'parameters': {'batch_size': 1}},
'resnet_models': {'enable': True,
'frameworks': ['pytorch'],
'models': ['resnet50',
'resnet101',
'resnet152'],
'modes': [{'name': 'torch.distributed',
'node_num': 1,
'proc_num': 8}],
'parameters': {'batch_size': 1,
'duration': 0,
'model_action': ['train'],
'num_steps': 128,
'num_warmup': 16,
'precision': ['float32',
'float16']}},
'sharding-matmul': {'enable': True,
'frameworks': ['pytorch'],
'modes': [{'name': 'torch.distributed',
'node_num': 1,
'proc_num': 8}]},
'tcp-connectivity': {'enable': False,
'modes': [{'name': 'local',
'parallel': False}],
'parameters': {'port': 22}},
'tensorrt-inference': {'enable': True,
'modes': [{'name': 'local',
'parallel': True,
'prefix': 'CUDA_VISIBLE_DEVICES={proc_rank}',
'proc_num': 8}],
'parameters': {'batch_size': 1,
'precision': 'int8',
'pytorch_models': ['resnet50',
'resnet101',
'resnet152',
'densenet169',
'densenet201',
'bert-base',
'bert-large'],
'seq_length': 224}},
'vgg_models': {'enable': True,
'frameworks': ['pytorch'],
'models': ['vgg11',
'vgg13',
'vgg16',
'vgg19'],
'modes': [{'name': 'torch.distributed',
'node_num': 1,
'proc_num': 8}],
'parameters': {'batch_size': 1,
'duration': 0,
'model_action': ['train'],
'num_steps': 128,
'num_warmup': 16,
'precision': ['float32',
'float16']}}},
'enable': None,
'monitor': {'enable': True,
'sample_duration': 1,
'sample_interval': 10},
'var': {'common_model_config': {'batch_size': 1,
'duration': 0,
'model_action': ['train'],
'num_steps': 128,
'num_warmup': 16,
'precision': ['float32',
'float16']},
'default_local_mode': {'enable': True,
'modes': [{'name': 'local',
'parallel': True,
'prefix': 'CUDA_VISIBLE_DEVICES={proc_rank}',
'proc_num': 8}]},
'default_pytorch_mode': {'enable': True,
'frameworks': ['pytorch'],
'modes': [{'name': 'torch.distributed',
'node_num': 1,
'proc_num': 8}]}}},
'version': 'v0.6'}.
[2022-11-21 18:27:12,163 u22:6465][runner.py:43][INFO] Runner writes to: /home/edison/Downloads/superbenchmark/outputs/2022-11-21_18-27-12.
[2022-11-21 18:27:12,179 u22:6465][runner.py:48][INFO] Runner will run: ['gpu-burn', 'nccl-bw:default', 'nccl-bw:gdr-only', 'ib-loopback', 'mem-bw', 'gpu-copy-bw:correctness', 'gpu-copy-bw:perf', 'kernel-launch', 'gemm-flops', 'cudnn-function', 'cublas-function', 'matmul', 'sharding-matmul', 'computation-communication-overlap', 'ort-inference', 'tensorrt-inference', 'gpt_models', 'bert_models', 'lstm_models', 'resnet_models', 'densenet_models', 'vgg_models']
[2022-11-21 18:27:12,179 u22:6465][runner.py:165][INFO] Preparing SuperBench environment.
[2022-11-21 18:27:12,179 u22:6465][ansible.py:125][INFO] Run playbook deploy.yaml ...
PLAY [Facts Gathering] *********************************************************
TASK [Gathering Facts] *********************************************************
ok: [localhost]
PLAY [Context Preparation] *****************************************************
TASK [Generating SSH Config] ***************************************************
changed: [localhost]
TASK [Generating SSH Key Pair] *************************************************
changed: [localhost]
PLAY [Check GPU Environment] ***************************************************
TASK [Checking NVIDIA GPU Environment] *****************************************
ok: [localhost] => (item=/dev/nvidiactl)
ok: [localhost] => (item=/dev/nvidia-uvm)
TASK [Checking AMD GPU Environment] ********************************************
ok: [localhost] => (item=/dev/kfd)
ok: [localhost] => (item=/dev/dri)
TASK [Set GPU Facts] ***********************************************************
ok: [localhost]
TASK [Print GPU Checking Result] ***********************************************
ok: [localhost] => {
"msg": [
"NVIDIA GPU detected",
"AMD GPU not operational, pls confirm amdgpu kernel module is loaded"
]
}
PLAY [Remote Deployment] *******************************************************
TASK [Creating Workspace] ******************************************************
ok: [localhost] => (item=/home/edison/sb-workspace)
ok: [localhost] => (item=/home/edison/sb-workspace/.ssh)
TASK [Copying Context] *********************************************************
fatal: [localhost]: FAILED! => {"msg": "Failed to get information on remote file (/home/edison/sb-workspace/.ssh/config): sudo: a password is required\n"}
PLAY RECAP *********************************************************************
localhost : ok=8 changed=2 unreachable=0 failed=1 skipped=0 rescued=0 ignored=0
[2022-11-21 18:27:14,383 u22:6465][ansible.py:80][WARNING] Run failed, return code 2.