TED achieves good results on car detection, but after adding the Pedestrian and Cyclist classes I found that its performance on those two classes is poor. The configuration file I use is shown below; I wrote it with reference to the configuration files of TED and Voxel RCNN. Is anything in it set inappropriately?
CLASS_NAMES: ['Car', 'Pedestrian', 'Cyclist']
DATA_CONFIG:
_BASE_CONFIG_: cfgs/dataset_configs/kitti_dataset.yaml
DATASET: 'KittiDataset'
ROT_NUM: 3
USE_VAN: True
DATA_SPLIT: {
'train': train,
'test': val
}
INFO_PATH: {
'train': [kitti_infos_train.pkl],
'test': [kitti_infos_val.pkl],
}
DATA_AUGMENTOR:
DISABLE_AUG_LIST: ['placeholder']
AUG_CONFIG_LIST:
- NAME: gt_sampling
USE_ROAD_PLANE: True
DB_INFO_PATH:
- kitti_dbinfos_train.pkl
PREPARE: {
filter_by_min_points: ['Car:5', 'Pedestrian:5', 'Cyclist:5'],
filter_by_difficulty: [-1],
}
SAMPLE_GROUPS: ['Car:10', 'Pedestrian:10', 'Cyclist:10']
NUM_POINT_FEATURES: 4
DATABASE_WITH_FAKELIDAR: False
REMOVE_EXTRA_WIDTH: [0.0, 0.0, -0.2]
LIMIT_WHOLE_SCENE: False
- NAME: da_sampling
USE_ROAD_PLANE: True
DB_INFO_PATH:
- kitti_dbinfos_train.pkl
PREPARE: {
filter_by_min_points: ['Car:5', 'Pedestrian:5', 'Cyclist:5'],
filter_by_difficulty: [-1],
}
SAMPLE_GROUPS: ['Car:10', 'Pedestrian:10', 'Cyclist:10']
MIN_SAMPLING_DIS: 0
MAX_SAMPLING_DIS: 20
OCCLUSION_NOISE: 0.2
OCCLUSION_OFFSET: 2.
SAMPLING_METHOD: 'LiDAR-aware'
VERT_RES: 0.006
HOR_RES: 0.003
NUM_POINT_FEATURES: 4
DATABASE_WITH_FAKELIDAR: False
REMOVE_EXTRA_WIDTH: [0.0, 0.0, -0.2]
LIMIT_WHOLE_SCENE: False
- NAME: random_local_noise
LOCAL_ROT_RANGE: [-0.78539816, 0.78539816]
TRANSLATION_STD: [1.0, 1.0, 0.5]
GLOBAL_ROT_RANGE: [0.0, 0.0]
EXTRA_WIDTH: [0.2, 0.2, 0.]
- NAME: random_world_rotation
WORLD_ROT_ANGLE: [-0.39269908, 0.39269908]
- NAME: random_world_scaling
WORLD_SCALE_RANGE: [0.95, 1.05]
- NAME: random_local_pyramid_aug
DROP_PROB: 0.25
SPARSIFY_PROB: 0.05
SPARSIFY_MAX_NUM: 50
SWAP_PROB: 0.1
SWAP_MAX_NUM: 50
X_TRANS:
AUG_CONFIG_LIST:
- NAME: world_rotation
WORLD_ROT_ANGLE: [0.39269908, 0, 0.39269908, -0.39269908, -0.39269908, 0]
- NAME: world_flip
ALONG_AXIS_LIST: [0, 1, 1, 0, 1, 0]
- NAME: world_scaling
WORLD_SCALE_RANGE: [ 0.98, 1.02, 1., 0.98, 1.02, 1.]
POINT_FEATURE_ENCODING: {
encoding_type: absolute_coordinates_encoding_mm,
used_feature_list: ['x', 'y', 'z', 'intensity'],
src_feature_list: ['x', 'y', 'z', 'intensity'],
num_features: 4
}
DATA_PROCESSOR:
- NAME: mask_points_and_boxes_outside_range
REMOVE_OUTSIDE_BOXES: True
- NAME: shuffle_points
SHUFFLE_ENABLED: {
'train': True,
'test': True
}
- NAME: transform_points_to_voxels
VOXEL_SIZE: [0.05, 0.05, 0.05]
MAX_POINTS_PER_VOXEL: 5
MAX_NUMBER_OF_VOXELS: {
'train': 16000,
'test': 40000
}
MODEL:
NAME: VoxelRCNN
VFE:
NAME: MeanVFE
MODEL: 'max'
BACKBONE_3D:
NAME: TeVoxelBackBone8x
NUM_FILTERS: [16, 32, 64, 64]
RETURN_NUM_FEATURES_AS_DICT: True
OUT_FEATURES: 64
MAP_TO_BEV:
NAME: BEVPool
NUM_BEV_FEATURES: 256
ALIGN_METHOD: 'max'
BACKBONE_2D:
NAME: BaseBEVBackbone
LAYER_NUMS: [4, 4]
LAYER_STRIDES: [1, 2]
NUM_FILTERS: [64, 128]
UPSAMPLE_STRIDES: [1, 2]
NUM_UPSAMPLE_FILTERS: [128, 128]
DENSE_HEAD:
NAME: AnchorHeadSingle
CLASS_AGNOSTIC: False
USE_DIRECTION_CLASSIFIER: True
DIR_OFFSET: 0.78539
DIR_LIMIT_OFFSET: 0.0
NUM_DIR_BINS: 2
ANCHOR_GENERATOR_CONFIG: [
{
'class_name': 'Car',
'anchor_sizes': [[3.9, 1.6, 1.56]],
'anchor_rotations': [0, 1.57],
'anchor_bottom_heights': [-1.78],
'align_center': False,
'feature_map_stride': 8,
'matched_threshold': 0.6,
'unmatched_threshold': 0.45
},
{
'class_name': 'Pedestrian',
'anchor_sizes': [[0.8, 0.6, 1.73]],
'anchor_rotations': [0, 1.57],
'anchor_bottom_heights': [-0.6],
'align_center': False,
'feature_map_stride': 8,
'matched_threshold': 0.5,
'unmatched_threshold': 0.35
},
{
'class_name': 'Cyclist',
'anchor_sizes': [[1.76, 0.6, 1.73]],
'anchor_rotations': [0, 1.57],
'anchor_bottom_heights': [-0.6],
'align_center': False,
'feature_map_stride': 8,
'matched_threshold': 0.5,
'unmatched_threshold': 0.35
}
]
TARGET_ASSIGNER_CONFIG:
NAME: AxisAlignedTargetAssigner
POS_FRACTION: -1.0
SAMPLE_SIZE: 512
NORM_BY_NUM_EXAMPLES: False
MATCH_HEIGHT: False
BOX_CODER: ResidualCoder
LOSS_CONFIG:
LOSS_WEIGHTS: {
'cls_weight': 1.0,
'loc_weight': 2.0,
'dir_weight': 0.2,
'code_weights': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
}
ROI_HEAD:
NAME: TEDSHead
CLASS_AGNOSTIC: True
SHARED_FC: [256, 256]
CLS_FC: [256, 256]
REG_FC: [256, 256]
DP_RATIO: 0.01
NMS_CONFIG:
TRAIN:
NMS_TYPE: nms_gpu
MULTI_CLASSES_NMS: False
NMS_PRE_MAXSIZE: 4000
NMS_POST_MAXSIZE: 512
NMS_THRESH: 0.8
TEST:
NMS_TYPE: nms_gpu
MULTI_CLASSES_NMS: False
USE_FAST_NMS: True
SCORE_THRESH: 0.0
NMS_PRE_MAXSIZE: 4000
NMS_POST_MAXSIZE: 50
NMS_THRESH: 0.75
ROI_GRID_POOL:
FEATURES_SOURCE: ['x_conv3','x_conv4']
PRE_MLP: True
GRID_SIZE: 6
POOL_LAYERS:
x_conv3:
MLPS: [[32, 32], [32, 32]]
QUERY_RANGES: [[2, 2, 2], [4, 4, 4]]
POOL_RADIUS: [0.4, 0.8]
NSAMPLE: [16, 16]
POOL_METHOD: max_pool
x_conv4:
MLPS: [[32, 32], [32, 32]]
QUERY_RANGES: [[2, 2, 2], [4, 4, 4]]
POOL_RADIUS: [0.8, 1.6]
NSAMPLE: [16, 16]
POOL_METHOD: max_pool
TARGET_CONFIG:
BOX_CODER: ResidualCoder
ROI_PER_IMAGE: 160
FG_RATIO: 0.5
SAMPLE_ROI_BY_EACH_CLASS: True
CLS_SCORE_TYPE: roi_iou_x
CLS_FG_THRESH: [0.75, 0.75, 0.75]
CLS_BG_THRESH: [0.25, 0.25, 0.25]
CLS_BG_THRESH_LO: 0.1
HARD_BG_RATIO: 0.8
REG_FG_THRESH: [0.55, 0.55, 0.55]
ENABLE_HARD_SAMPLING: True
HARD_SAMPLING_THRESH: [0.5, 0.5, 0.5]
HARD_SAMPLING_RATIO: [0.5, 0.5, 0.5]
LOSS_CONFIG:
CLS_LOSS: BinaryCrossEntropy
REG_LOSS: smooth-l1
CORNER_LOSS_REGULARIZATION: True
GRID_3D_IOU_LOSS: False
LOSS_WEIGHTS: {
'rcnn_cls_weight': 1.0,
'rcnn_reg_weight': 1.0,
'rcnn_corner_weight': 1.0,
'rcnn_iou3d_weight': 1.0,
'code_weights': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
}
POST_PROCESSING:
RECALL_THRESH_LIST: [0.3, 0.5, 0.7]
SCORE_THRESH: 0.25
OUTPUT_RAW_SCORE: False
EVAL_METRIC: kitti
NMS_CONFIG:
MULTI_CLASSES_NMS: False
NMS_TYPE: nms_gpu
NMS_THRESH: 0.1
NMS_PRE_MAXSIZE: 4096
NMS_POST_MAXSIZE: 500
OPTIMIZATION:
BATCH_SIZE_PER_GPU: 2
NUM_EPOCHS: 40
OPTIMIZER: adam_onecycle
LR: 0.01
WEIGHT_DECAY: 0.01
MOMENTUM: 0.9
MOMS: [0.95, 0.85]
PCT_START: 0.4
DIV_FACTOR: 10
DECAY_STEP_LIST: [35, 45]
LR_DECAY: 0.1
LR_CLIP: 0.0000001
LR_WARMUP: False
WARMUP_EPOCH: 1
GRAD_NORM_CLIP: 10