diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..320ad2af --- /dev/null +++ b/.gitignore @@ -0,0 +1,123 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/en/_build/ +docs/zh_cn/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +data/ +data +.vscode +.idea +.DS_Store + +# custom +*.pkl +*.pkl.json +*.log.json +docs/modelzoo_statistics.md +work_dirs/ + +# Pytorch +*.pth +*.py~ +*.sh~ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5fb9225a..44a13a9e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://gitlab.com/pycqa/flake8.git - rev: 3.7.9 + rev: 3.8.3 hooks: - id: flake8 - repo: https://github.com/asottile/seed-isort-config diff --git 
a/configs/mmdet/_base_/datasets/cityscapes_detection.py b/configs/mmdet/_base_/datasets/cityscapes_detection.py new file mode 100644 index 00000000..e341b59d --- /dev/null +++ b/configs/mmdet/_base_/datasets/cityscapes_detection.py @@ -0,0 +1,56 @@ +# dataset settings +dataset_type = 'CityscapesDataset' +data_root = 'data/cityscapes/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', img_scale=[(2048, 800), (2048, 1024)], keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(2048, 1024), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=1, + workers_per_gpu=2, + train=dict( + type='RepeatDataset', + times=8, + dataset=dict( + type=dataset_type, + ann_file=data_root + + 'annotations/instancesonly_filtered_gtFine_train.json', + img_prefix=data_root + 'leftImg8bit/train/', + pipeline=train_pipeline)), + val=dict( + type=dataset_type, + ann_file=data_root + + 'annotations/instancesonly_filtered_gtFine_val.json', + img_prefix=data_root + 'leftImg8bit/val/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + + 'annotations/instancesonly_filtered_gtFine_test.json', + img_prefix=data_root + 'leftImg8bit/test/', + pipeline=test_pipeline)) +evaluation = dict(interval=1, metric='bbox') diff --git 
a/configs/mmdet/_base_/datasets/cityscapes_instance.py b/configs/mmdet/_base_/datasets/cityscapes_instance.py new file mode 100644 index 00000000..4e3c34e2 --- /dev/null +++ b/configs/mmdet/_base_/datasets/cityscapes_instance.py @@ -0,0 +1,56 @@ +# dataset settings +dataset_type = 'CityscapesDataset' +data_root = 'data/cityscapes/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', img_scale=[(2048, 800), (2048, 1024)], keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(2048, 1024), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=1, + workers_per_gpu=2, + train=dict( + type='RepeatDataset', + times=8, + dataset=dict( + type=dataset_type, + ann_file=data_root + + 'annotations/instancesonly_filtered_gtFine_train.json', + img_prefix=data_root + 'leftImg8bit/train/', + pipeline=train_pipeline)), + val=dict( + type=dataset_type, + ann_file=data_root + + 'annotations/instancesonly_filtered_gtFine_val.json', + img_prefix=data_root + 'leftImg8bit/val/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + + 'annotations/instancesonly_filtered_gtFine_test.json', + img_prefix=data_root + 'leftImg8bit/test/', + pipeline=test_pipeline)) +evaluation = dict(metric=['bbox', 'segm']) diff --git 
a/configs/mmdet/_base_/datasets/coco_detection.py b/configs/mmdet/_base_/datasets/coco_detection.py new file mode 100644 index 00000000..149f590b --- /dev/null +++ b/configs/mmdet/_base_/datasets/coco_detection.py @@ -0,0 +1,49 @@ +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +evaluation = dict(interval=1, metric='bbox') diff --git a/configs/mmdet/_base_/datasets/coco_instance.py b/configs/mmdet/_base_/datasets/coco_instance.py new file mode 100644 index 00000000..9901a858 --- /dev/null +++ b/configs/mmdet/_base_/datasets/coco_instance.py @@ -0,0 +1,49 @@ +# 
dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +evaluation = dict(metric=['bbox', 'segm']) diff --git a/configs/mmdet/_base_/datasets/coco_instance_semantic.py b/configs/mmdet/_base_/datasets/coco_instance_semantic.py new file mode 100644 index 00000000..6c8bf07b --- /dev/null +++ b/configs/mmdet/_base_/datasets/coco_instance_semantic.py @@ -0,0 +1,54 @@ +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 
+train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='SegRescale', scale_factor=1 / 8), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + seg_prefix=data_root + 'stuffthingmaps/train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +evaluation = dict(metric=['bbox', 'segm']) diff --git a/configs/mmdet/_base_/datasets/coco_panoptic.py b/configs/mmdet/_base_/datasets/coco_panoptic.py new file mode 100644 index 00000000..dbade7c0 --- /dev/null +++ b/configs/mmdet/_base_/datasets/coco_panoptic.py @@ -0,0 +1,59 @@ +# dataset settings +dataset_type = 'CocoPanopticDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + 
dict(type='LoadImageFromFile'), + dict( + type='LoadPanopticAnnotations', + with_bbox=True, + with_mask=True, + with_seg=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='SegRescale', scale_factor=1 / 4), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/panoptic_train2017.json', + img_prefix=data_root + 'train2017/', + seg_prefix=data_root + 'annotations/panoptic_train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/panoptic_val2017.json', + img_prefix=data_root + 'val2017/', + seg_prefix=data_root + 'annotations/panoptic_val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/panoptic_val2017.json', + img_prefix=data_root + 'val2017/', + seg_prefix=data_root + 'annotations/panoptic_val2017/', + pipeline=test_pipeline)) +evaluation = dict(interval=1, metric=['PQ']) diff --git a/configs/mmdet/_base_/datasets/deepfashion.py b/configs/mmdet/_base_/datasets/deepfashion.py new file mode 100644 index 00000000..308b4b2a --- /dev/null +++ b/configs/mmdet/_base_/datasets/deepfashion.py @@ -0,0 +1,53 @@ +# dataset settings +dataset_type = 'DeepFashionDataset' +data_root = 'data/DeepFashion/In-shop/' +img_norm_cfg = dict( + 
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=(750, 1101), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(750, 1101), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=1, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/DeepFashion_segmentation_query.json', + img_prefix=data_root + 'Img/', + pipeline=train_pipeline, + data_root=data_root), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/DeepFashion_segmentation_query.json', + img_prefix=data_root + 'Img/', + pipeline=test_pipeline, + data_root=data_root), + test=dict( + type=dataset_type, + ann_file=data_root + + 'annotations/DeepFashion_segmentation_gallery.json', + img_prefix=data_root + 'Img/', + pipeline=test_pipeline, + data_root=data_root)) +evaluation = dict(interval=5, metric=['bbox', 'segm']) diff --git a/configs/mmdet/_base_/datasets/lvis_v0.5_instance.py b/configs/mmdet/_base_/datasets/lvis_v0.5_instance.py new file mode 100644 index 00000000..207e0053 --- /dev/null +++ b/configs/mmdet/_base_/datasets/lvis_v0.5_instance.py @@ -0,0 +1,24 @@ +# dataset settings +_base_ = 'coco_instance.py' +dataset_type = 'LVISV05Dataset' +data_root = 'data/lvis_v0.5/' +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + 
_delete_=True, + type='ClassBalancedDataset', + oversample_thr=1e-3, + dataset=dict( + type=dataset_type, + ann_file=data_root + 'annotations/lvis_v0.5_train.json', + img_prefix=data_root + 'train2017/')), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/lvis_v0.5_val.json', + img_prefix=data_root + 'val2017/'), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/lvis_v0.5_val.json', + img_prefix=data_root + 'val2017/')) +evaluation = dict(metric=['bbox', 'segm']) diff --git a/configs/mmdet/_base_/datasets/lvis_v1_instance.py b/configs/mmdet/_base_/datasets/lvis_v1_instance.py new file mode 100644 index 00000000..be791edd --- /dev/null +++ b/configs/mmdet/_base_/datasets/lvis_v1_instance.py @@ -0,0 +1,24 @@ +# dataset settings +_base_ = 'coco_instance.py' +dataset_type = 'LVISV1Dataset' +data_root = 'data/lvis_v1/' +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + _delete_=True, + type='ClassBalancedDataset', + oversample_thr=1e-3, + dataset=dict( + type=dataset_type, + ann_file=data_root + 'annotations/lvis_v1_train.json', + img_prefix=data_root)), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/lvis_v1_val.json', + img_prefix=data_root), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/lvis_v1_val.json', + img_prefix=data_root)) +evaluation = dict(metric=['bbox', 'segm']) diff --git a/configs/mmdet/_base_/datasets/openimages_detection.py b/configs/mmdet/_base_/datasets/openimages_detection.py new file mode 100644 index 00000000..a65d3063 --- /dev/null +++ b/configs/mmdet/_base_/datasets/openimages_detection.py @@ -0,0 +1,65 @@ +# dataset settings +dataset_type = 'OpenImagesDataset' +data_root = 'data/OpenImages/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, denorm_bbox=True), + dict(type='Resize', 
img_scale=(1024, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1024, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ], + ), +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=0, # workers_per_gpu > 0 may cause out-of-memory errors + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/oidv6-train-annotations-bbox.csv', + img_prefix=data_root + 'OpenImages/train/', + label_file=data_root + 'annotations/class-descriptions-boxable.csv', + hierarchy_file=data_root + + 'annotations/bbox_labels_600_hierarchy.json', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/validation-annotations-bbox.csv', + img_prefix=data_root + 'OpenImages/validation/', + label_file=data_root + 'annotations/class-descriptions-boxable.csv', + hierarchy_file=data_root + + 'annotations/bbox_labels_600_hierarchy.json', + meta_file=data_root + 'annotations/validation-image-metas.pkl', + image_level_ann_file=data_root + + 'annotations/validation-annotations-human-imagelabels-boxable.csv', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/validation-annotations-bbox.csv', + img_prefix=data_root + 'OpenImages/validation/', + label_file=data_root + 'annotations/class-descriptions-boxable.csv', + hierarchy_file=data_root + + 'annotations/bbox_labels_600_hierarchy.json', + meta_file=data_root + 'annotations/validation-image-metas.pkl', + image_level_ann_file=data_root + + 
'annotations/validation-annotations-human-imagelabels-boxable.csv', + pipeline=test_pipeline)) +evaluation = dict(interval=1, metric='mAP') diff --git a/configs/mmdet/_base_/datasets/voc0712.py b/configs/mmdet/_base_/datasets/voc0712.py new file mode 100644 index 00000000..ae09acdd --- /dev/null +++ b/configs/mmdet/_base_/datasets/voc0712.py @@ -0,0 +1,55 @@ +# dataset settings +dataset_type = 'VOCDataset' +data_root = 'data/VOCdevkit/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1000, 600), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1000, 600), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type='RepeatDataset', + times=3, + dataset=dict( + type=dataset_type, + ann_file=[ + data_root + 'VOC2007/ImageSets/Main/trainval.txt', + data_root + 'VOC2012/ImageSets/Main/trainval.txt' + ], + img_prefix=[data_root + 'VOC2007/', data_root + 'VOC2012/'], + pipeline=train_pipeline)), + val=dict( + type=dataset_type, + ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', + img_prefix=data_root + 'VOC2007/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', + img_prefix=data_root + 'VOC2007/', + pipeline=test_pipeline)) +evaluation = 
dict(interval=1, metric='mAP') diff --git a/configs/mmdet/_base_/datasets/wider_face.py b/configs/mmdet/_base_/datasets/wider_face.py new file mode 100644 index 00000000..d1d649be --- /dev/null +++ b/configs/mmdet/_base_/datasets/wider_face.py @@ -0,0 +1,63 @@ +# dataset settings +dataset_type = 'WIDERFaceDataset' +data_root = 'data/WIDERFace/' +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Expand', + mean=img_norm_cfg['mean'], + to_rgb=img_norm_cfg['to_rgb'], + ratio_range=(1, 4)), + dict( + type='MinIoURandomCrop', + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3), + dict(type='Resize', img_scale=(300, 300), keep_ratio=False), + dict(type='Normalize', **img_norm_cfg), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(300, 300), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=False), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=60, + workers_per_gpu=2, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + ann_file=data_root + 'train.txt', + img_prefix=data_root + 'WIDER_train/', + min_size=17, + pipeline=train_pipeline)), + val=dict( + type=dataset_type, + ann_file=data_root + 'val.txt', + img_prefix=data_root + 'WIDER_val/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'val.txt', + img_prefix=data_root + 'WIDER_val/', + pipeline=test_pipeline)) diff --git 
a/configs/mmdet/_base_/default_runtime.py b/configs/mmdet/_base_/default_runtime.py new file mode 100644 index 00000000..5b0b1452 --- /dev/null +++ b/configs/mmdet/_base_/default_runtime.py @@ -0,0 +1,27 @@ +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +custom_hooks = [dict(type='NumClassCheckHook')] + +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] + +# disable opencv multithreading to avoid system being overloaded +opencv_num_threads = 0 +# set multi-process start method as `fork` to speed up the training +mp_start_method = 'fork' + +# Default settings for automatic learning-rate (LR) scaling +# - `enable`: whether the LR is scaled automatically; +# disabled by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/configs/mmdet/_base_/models/cascade_mask_rcnn_r50_fpn.py b/configs/mmdet/_base_/models/cascade_mask_rcnn_r50_fpn.py new file mode 100644 index 00000000..2902ccae --- /dev/null +++ b/configs/mmdet/_base_/models/cascade_mask_rcnn_r50_fpn.py @@ -0,0 +1,196 @@ +# model settings +model = dict( + type='CascadeRCNN', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), 
+ loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type='CascadeRoIHead', + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + 
loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/configs/mmdet/_base_/models/cascade_rcnn_r50_fpn.py b/configs/mmdet/_base_/models/cascade_rcnn_r50_fpn.py new file mode 100644 index 00000000..42f74ae7 --- /dev/null +++ 
b/configs/mmdet/_base_/models/cascade_rcnn_r50_fpn.py @@ -0,0 +1,179 @@ +# model settings +model = dict( + type='CascadeRCNN', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type='CascadeRoIHead', + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + 
loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ]), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + 
nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) diff --git a/configs/mmdet/_base_/models/fast_rcnn_r50_fpn.py b/configs/mmdet/_base_/models/fast_rcnn_r50_fpn.py new file mode 100644 index 00000000..9982fe09 --- /dev/null +++ b/configs/mmdet/_base_/models/fast_rcnn_r50_fpn.py @@ -0,0 +1,62 @@ +# model settings +model = dict( + type='FastRCNN', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) diff --git 
a/configs/mmdet/_base_/models/faster_rcnn_r50_caffe_c4.py b/configs/mmdet/_base_/models/faster_rcnn_r50_caffe_c4.py new file mode 100644 index 00000000..dbf965af --- /dev/null +++ b/configs/mmdet/_base_/models/faster_rcnn_r50_caffe_c4.py @@ -0,0 +1,117 @@ +# model settings +norm_cfg = dict(type='BN', requires_grad=False) +model = dict( + type='FasterRCNN', + backbone=dict( + type='ResNet', + depth=50, + num_stages=3, + strides=(1, 2, 2), + dilations=(1, 1, 1), + out_indices=(2, ), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + rpn_head=dict( + type='RPNHead', + in_channels=1024, + feat_channels=1024, + anchor_generator=dict( + type='AnchorGenerator', + scales=[2, 4, 8, 16, 32], + ratios=[0.5, 1.0, 2.0], + strides=[16]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + shared_head=dict( + type='ResLayer', + depth=50, + stage=3, + stride=2, + dilation=1, + style='caffe', + norm_cfg=norm_cfg, + norm_eval=True, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=1024, + featmap_strides=[16]), + bbox_head=dict( + type='BBoxHead', + with_avg_pool=True, + roi_feat_size=7, + in_channels=2048, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + # model training and testing settings + 
train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=12000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=6000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) diff --git a/configs/mmdet/_base_/models/faster_rcnn_r50_caffe_dc5.py b/configs/mmdet/_base_/models/faster_rcnn_r50_caffe_dc5.py new file mode 100644 index 00000000..a377a6f0 --- /dev/null +++ b/configs/mmdet/_base_/models/faster_rcnn_r50_caffe_dc5.py @@ -0,0 +1,105 @@ +# model settings +norm_cfg = dict(type='BN', requires_grad=False) +model = dict( + type='FasterRCNN', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + strides=(1, 2, 2, 1), + dilations=(1, 1, 1, 2), + out_indices=(3, ), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + rpn_head=dict( + type='RPNHead', + in_channels=2048, + feat_channels=2048, + anchor_generator=dict( + type='AnchorGenerator', + scales=[2, 4, 8, 16, 32], + ratios=[0.5, 1.0, 2.0], + strides=[16]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + 
loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=2048, + featmap_strides=[16]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=2048, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=12000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms=dict(type='nms', iou_threshold=0.7), + nms_pre=6000, + max_per_img=1000, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) diff --git a/configs/mmdet/_base_/models/faster_rcnn_r50_fpn.py b/configs/mmdet/_base_/models/faster_rcnn_r50_fpn.py new file mode 100644 index 00000000..1ef8e7b2 --- /dev/null +++ 
b/configs/mmdet/_base_/models/faster_rcnn_r50_fpn.py @@ -0,0 +1,108 @@ +# model settings +model = dict( + type='FasterRCNN', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + 
max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100) + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05) + )) diff --git a/configs/mmdet/_base_/models/mask_rcnn_r50_caffe_c4.py b/configs/mmdet/_base_/models/mask_rcnn_r50_caffe_c4.py new file mode 100644 index 00000000..122202e1 --- /dev/null +++ b/configs/mmdet/_base_/models/mask_rcnn_r50_caffe_c4.py @@ -0,0 +1,125 @@ +# model settings +norm_cfg = dict(type='BN', requires_grad=False) +model = dict( + type='MaskRCNN', + backbone=dict( + type='ResNet', + depth=50, + num_stages=3, + strides=(1, 2, 2), + dilations=(1, 1, 1), + out_indices=(2, ), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + rpn_head=dict( + type='RPNHead', + in_channels=1024, + feat_channels=1024, + anchor_generator=dict( + type='AnchorGenerator', + scales=[2, 4, 8, 16, 32], + ratios=[0.5, 1.0, 2.0], + strides=[16]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + shared_head=dict( + type='ResLayer', + depth=50, + stage=3, + stride=2, + dilation=1, + style='caffe', + 
norm_cfg=norm_cfg, + norm_eval=True), + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=1024, + featmap_strides=[16]), + bbox_head=dict( + type='BBoxHead', + with_avg_pool=True, + roi_feat_size=7, + in_channels=2048, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + mask_roi_extractor=None, + mask_head=dict( + type='FCNMaskHead', + num_convs=0, + in_channels=2048, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=12000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=14, + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=6000, + nms=dict(type='nms', iou_threshold=0.7), + max_per_img=1000, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/configs/mmdet/_base_/models/mask_rcnn_r50_fpn.py 
b/configs/mmdet/_base_/models/mask_rcnn_r50_fpn.py new file mode 100644 index 00000000..d903e55e --- /dev/null +++ b/configs/mmdet/_base_/models/mask_rcnn_r50_fpn.py @@ -0,0 +1,120 @@ +# model settings +model = dict( + type='MaskRCNN', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + 
type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/configs/mmdet/_base_/models/retinanet_r50_fpn.py b/configs/mmdet/_base_/models/retinanet_r50_fpn.py new file mode 100644 index 00000000..56e43fa7 --- /dev/null +++ b/configs/mmdet/_base_/models/retinanet_r50_fpn.py @@ -0,0 +1,60 @@ +# model settings +model = dict( + type='RetinaNet', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_input', + num_outs=5), + bbox_head=dict( + type='RetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + 
type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100)) diff --git a/configs/mmdet/_base_/models/rpn_r50_caffe_c4.py b/configs/mmdet/_base_/models/rpn_r50_caffe_c4.py new file mode 100644 index 00000000..8b32ca99 --- /dev/null +++ b/configs/mmdet/_base_/models/rpn_r50_caffe_c4.py @@ -0,0 +1,58 @@ +# model settings +model = dict( + type='RPN', + backbone=dict( + type='ResNet', + depth=50, + num_stages=3, + strides=(1, 2, 2), + dilations=(1, 1, 1), + out_indices=(2, ), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + neck=None, + rpn_head=dict( + type='RPNHead', + in_channels=1024, + feat_channels=1024, + anchor_generator=dict( + type='AnchorGenerator', + scales=[2, 4, 8, 16, 32], + ratios=[0.5, 1.0, 2.0], + strides=[16]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + 
min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=12000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0))) diff --git a/configs/mmdet/_base_/models/rpn_r50_fpn.py b/configs/mmdet/_base_/models/rpn_r50_fpn.py new file mode 100644 index 00000000..edaf4d4b --- /dev/null +++ b/configs/mmdet/_base_/models/rpn_r50_fpn.py @@ -0,0 +1,58 @@ +# model settings +model = dict( + type='RPN', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0))) diff --git a/configs/mmdet/_base_/models/ssd300.py 
b/configs/mmdet/_base_/models/ssd300.py new file mode 100644 index 00000000..f17df010 --- /dev/null +++ b/configs/mmdet/_base_/models/ssd300.py @@ -0,0 +1,56 @@ +# model settings +input_size = 300 +model = dict( + type='SingleStageDetector', + backbone=dict( + type='SSDVGG', + depth=16, + with_last_pool=False, + ceil_mode=True, + out_indices=(3, 4), + out_feature_indices=(22, 34), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://vgg16_caffe')), + neck=dict( + type='SSDNeck', + in_channels=(512, 1024), + out_channels=(512, 1024, 512, 256, 256, 256), + level_strides=(2, 2, 1, 1), + level_paddings=(1, 1, 0, 0), + l2_norm_scale=20), + bbox_head=dict( + type='SSDHead', + in_channels=(512, 1024, 512, 256, 256, 256), + num_classes=80, + anchor_generator=dict( + type='SSDAnchorGenerator', + scale_major=False, + input_size=input_size, + basesize_ratio_range=(0.15, 0.9), + strides=[8, 16, 32, 64, 100, 300], + ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2])), + # model training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0., + ignore_iof_thr=-1, + gt_max_assign_all=False), + smoothl1_beta=1., + allowed_border=-1, + pos_weight=-1, + neg_pos_ratio=3, + debug=False), + test_cfg=dict( + nms_pre=1000, + nms=dict(type='nms', iou_threshold=0.45), + min_bbox_size=0, + score_thr=0.02, + max_per_img=200)) +cudnn_benchmark = True diff --git a/configs/mmdet/_base_/schedules/schedule_1x.py b/configs/mmdet/_base_/schedules/schedule_1x.py new file mode 100644 index 00000000..13b3783c --- /dev/null +++ b/configs/mmdet/_base_/schedules/schedule_1x.py @@ -0,0 +1,11 @@ +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + 
warmup_iters=500, + warmup_ratio=0.001, + step=[8, 11]) +runner = dict(type='EpochBasedRunner', max_epochs=12) diff --git a/configs/mmdet/_base_/schedules/schedule_20e.py b/configs/mmdet/_base_/schedules/schedule_20e.py new file mode 100644 index 00000000..00e85902 --- /dev/null +++ b/configs/mmdet/_base_/schedules/schedule_20e.py @@ -0,0 +1,11 @@ +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[16, 19]) +runner = dict(type='EpochBasedRunner', max_epochs=20) diff --git a/configs/mmdet/_base_/schedules/schedule_2x.py b/configs/mmdet/_base_/schedules/schedule_2x.py new file mode 100644 index 00000000..69dc9ee8 --- /dev/null +++ b/configs/mmdet/_base_/schedules/schedule_2x.py @@ -0,0 +1,11 @@ +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/albu_example/README.md b/configs/mmdet/albu_example/README.md new file mode 100644 index 00000000..49edbf3f --- /dev/null +++ b/configs/mmdet/albu_example/README.md @@ -0,0 +1,31 @@ +# Albu Example + +> [Albumentations: fast and flexible image augmentations](https://arxiv.org/abs/1809.06839) + + + +## Abstract + +Data augmentation is a commonly used technique for increasing both the size and the diversity of labeled training sets by leveraging input transformations that preserve output labels. In computer vision domain, image augmentations have become a common implicit regularization technique to combat overfitting in deep convolutional neural networks and are ubiquitously used to improve performance. 
While most deep learning frameworks implement basic image transformations, the list is typically limited to some variations and combinations of flipping, rotating, scaling, and cropping. Moreover, the image processing speed varies in existing tools for image augmentation. We present Albumentations, a fast and flexible library for image augmentations with many various image transform operations available, that is also an easy-to-use wrapper around other augmentation libraries. We provide examples of image augmentations for different computer vision tasks and show that Albumentations is faster than other commonly used image augmentation tools on the most of commonly used image transformations. + +
+ +
+ +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +|:---------:|:-------:|:-------:|:--------:|:--------------:|:------:|:-------:|:------:|:--------:| +| R-50 | pytorch | 1x | 4.4 | 16.6 | 38.0 | 34.5 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/albu_example/mask_rcnn_r50_fpn_albu_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/albu_example/mask_rcnn_r50_fpn_albu_1x_coco/mask_rcnn_r50_fpn_albu_1x_coco_20200208-ab203bcd.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/albu_example/mask_rcnn_r50_fpn_albu_1x_coco/mask_rcnn_r50_fpn_albu_1x_coco_20200208_225520.log.json) | + +## Citation + +```latex +@article{2018arXiv180906839B, + author = {A. Buslaev, A. Parinov, E. Khvedchenya, V.~I. Iglovikov and A.~A. Kalinin}, + title = "{Albumentations: fast and flexible image augmentations}", + journal = {ArXiv e-prints}, + eprint = {1809.06839}, + year = 2018 +} +``` diff --git a/configs/mmdet/albu_example/mask_rcnn_r50_fpn_albu_1x_coco.py b/configs/mmdet/albu_example/mask_rcnn_r50_fpn_albu_1x_coco.py new file mode 100644 index 00000000..b3f879a6 --- /dev/null +++ b/configs/mmdet/albu_example/mask_rcnn_r50_fpn_albu_1x_coco.py @@ -0,0 +1,73 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +albu_train_transforms = [ + dict( + type='ShiftScaleRotate', + shift_limit=0.0625, + scale_limit=0.0, + rotate_limit=0, + interpolation=1, + p=0.5), + dict( + type='RandomBrightnessContrast', + brightness_limit=[0.1, 0.3], + contrast_limit=[0.1, 0.3], + p=0.2), + dict( + type='OneOf', + transforms=[ + dict( + type='RGBShift', + r_shift_limit=10, + g_shift_limit=10, + b_shift_limit=10, + p=1.0), + dict( + type='HueSaturationValue', + hue_shift_limit=20, + sat_shift_limit=30, + val_shift_limit=20, + p=1.0) + ], + p=0.1), + dict(type='JpegCompression', 
quality_lower=85, quality_upper=95, p=0.2), + dict(type='ChannelShuffle', p=0.1), + dict( + type='OneOf', + transforms=[ + dict(type='Blur', blur_limit=3, p=1.0), + dict(type='MedianBlur', blur_limit=3, p=1.0) + ], + p=0.1), +] +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='Pad', size_divisor=32), + dict( + type='Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_labels'], + min_visibility=0.0, + filter_lost_elements=True), + keymap={ + 'img': 'image', + 'gt_masks': 'masks', + 'gt_bboxes': 'bboxes' + }, + update_pad_shape=False, + skip_img_without_anno=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'], + meta_keys=('filename', 'ori_shape', 'img_shape', 'img_norm_cfg', + 'pad_shape', 'scale_factor')) +] +data = dict(train=dict(pipeline=train_pipeline)) diff --git a/configs/mmdet/atss/README.md b/configs/mmdet/atss/README.md new file mode 100644 index 00000000..1bf69498 --- /dev/null +++ b/configs/mmdet/atss/README.md @@ -0,0 +1,31 @@ +# ATSS + +> [Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection](https://arxiv.org/abs/1912.02424) + + + +## Abstract + +Object detection has been dominated by anchor-based detectors for several years. Recently, anchor-free detectors have become popular due to the proposal of FPN and Focal Loss. In this paper, we first point out that the essential difference between anchor-based and anchor-free detection is actually how to define positive and negative training samples, which leads to the performance gap between them. 
If they adopt the same definition of positive and negative samples during training, there is no obvious difference in the final performance, no matter regressing from a box or a point. This shows that how to select positive and negative training samples is important for current object detectors. Then, we propose an Adaptive Training Sample Selection (ATSS) to automatically select positive and negative samples according to statistical characteristics of object. It significantly improves the performance of anchor-based and anchor-free detectors and bridges the gap between them. Finally, we discuss the necessity of tiling multiple anchors per location on the image to detect objects. Extensive experiments conducted on MS COCO support our aforementioned analysis and conclusions. With the newly introduced ATSS, we improve state-of-the-art detectors by a large margin to 50.7% AP without introducing any overhead. + +
+ +
+ +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +|:---------:|:-------:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:| +| R-50 | pytorch | 1x | 3.7 | 19.7 | 39.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/atss/atss_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r50_fpn_1x_coco/atss_r50_fpn_1x_coco_20200209-985f7bd0.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r50_fpn_1x_coco/atss_r50_fpn_1x_coco_20200209_102539.log.json) | +| R-101 | pytorch | 1x | 5.6 | 12.3 | 41.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/atss/atss_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r101_fpn_1x_coco/atss_r101_fpn_1x_20200825-dfcadd6f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r101_fpn_1x_coco/atss_r101_fpn_1x_20200825-dfcadd6f.log.json) | + +## Citation + +```latex +@article{zhang2019bridging, + title = {Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection}, + author = {Zhang, Shifeng and Chi, Cheng and Yao, Yongqiang and Lei, Zhen and Li, Stan Z.}, + journal = {arXiv preprint arXiv:1912.02424}, + year = {2019} +} +``` diff --git a/configs/mmdet/atss/atss_r101_fpn_1x_coco.py b/configs/mmdet/atss/atss_r101_fpn_1x_coco.py new file mode 100644 index 00000000..5225d2ab --- /dev/null +++ b/configs/mmdet/atss/atss_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './atss_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/atss/atss_r50_fpn_1x_coco.py b/configs/mmdet/atss/atss_r50_fpn_1x_coco.py new file mode 100644 index 00000000..42ff4c59 --- /dev/null +++ b/configs/mmdet/atss/atss_r50_fpn_1x_coco.py @@ -0,0 +1,62 @@ +_base_ = [ + 
'../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='ATSS', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + bbox_head=dict( + type='ATSSHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + # training and testing settings + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/mmdet/atss/metafile.yml b/configs/mmdet/atss/metafile.yml new file mode 100644 index 00000000..f4c567ef --- /dev/null +++ b/configs/mmdet/atss/metafile.yml @@ -0,0 +1,60 @@ +Collections: + - Name: ATSS + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ATSS + - FPN + - ResNet + Paper: + URL: 
https://arxiv.org/abs/1912.02424 + Title: 'Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection' + README: configs/atss/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/atss.py#L6 + Version: v2.0.0 + +Models: + - Name: atss_r50_fpn_1x_coco + In Collection: ATSS + Config: configs/atss/atss_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.7 + inference time (ms/im): + - value: 50.76 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r50_fpn_1x_coco/atss_r50_fpn_1x_coco_20200209-985f7bd0.pth + + - Name: atss_r101_fpn_1x_coco + In Collection: ATSS + Config: configs/atss/atss_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.6 + inference time (ms/im): + - value: 81.3 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r101_fpn_1x_coco/atss_r101_fpn_1x_20200825-dfcadd6f.pth diff --git a/configs/mmdet/autoassign/README.md b/configs/mmdet/autoassign/README.md new file mode 100644 index 00000000..8e8341a7 --- /dev/null +++ b/configs/mmdet/autoassign/README.md @@ -0,0 +1,35 @@ +# AutoAssign + +> [AutoAssign: Differentiable Label Assignment for Dense Object Detection](https://arxiv.org/abs/2007.03496) + + + +## Abstract + +Determining positive/negative samples for object detection is known as label assignment. Here we present an anchor-free detector named AutoAssign. It requires little human knowledge and achieves appearance-aware through a fully differentiable weighting mechanism. 
During training, to both satisfy the prior distribution of data and adapt to category characteristics, we present Center Weighting to adjust the category-specific prior distributions. To adapt to object appearances, Confidence Weighting is proposed to adjust the specific assign strategy of each instance. The two weighting modules are then combined to generate positive and negative weights to adjust each location's confidence. Extensive experiments on the MS COCO show that our method steadily surpasses other best sampling strategies by large margins with various backbones. Moreover, our best model achieves 52.1% AP, outperforming all existing one-stage detectors. Besides, experiments on other datasets, e.g., PASCAL VOC, Objects365, and WiderFace, demonstrate the broad applicability of AutoAssign. + +
+ +
+ +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | box AP | Config | Download | +|:---------:|:-------:|:-------:|:--------:|:------:|:------:|:--------:| +| R-50 | caffe | 1x | 4.08 | 40.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/autoassign/autoassign_r50_fpn_8x2_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/autoassign/auto_assign_r50_fpn_1x_coco/auto_assign_r50_fpn_1x_coco_20210413_115540-5e17991f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/autoassign/auto_assign_r50_fpn_1x_coco/auto_assign_r50_fpn_1x_coco_20210413_115540-5e17991f.log.json) | + +**Note**: + +1. We find that the performance is unstable with the 1x setting and may fluctuate by about 0.3 mAP. mAP 40.3 ~ 40.6 is acceptable. Such fluctuation can also be found in the original implementation. +2. You can get more stable results (~ mAP 40.6) with a schedule of 13 epochs in total, where the learning rate is divided by 10 at the 10th and 13th epochs. + +## Citation + +```latex +@article{zhu2020autoassign, + title={AutoAssign: Differentiable Label Assignment for Dense Object Detection}, + author={Zhu, Benjin and Wang, Jianfeng and Jiang, Zhengkai and Zong, Fuhang and Liu, Songtao and Li, Zeming and Sun, Jian}, + journal={arXiv preprint arXiv:2007.03496}, + year={2020} +} +``` diff --git a/configs/mmdet/autoassign/autoassign_r50_fpn_8x2_1x_coco.py b/configs/mmdet/autoassign/autoassign_r50_fpn_8x2_1x_coco.py new file mode 100644 index 00000000..db548dc3 --- /dev/null +++ b/configs/mmdet/autoassign/autoassign_r50_fpn_8x2_1x_coco.py @@ -0,0 +1,85 @@ +# We follow the original implementation which +# adopts the Caffe pre-trained backbone. 
+_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='AutoAssign', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5, + relu_before_extra_convs=True, + init_cfg=dict(type='Caffe2Xavier', layer='Conv2d')), + bbox_head=dict( + type='AutoAssignHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + loss_bbox=dict(type='GIoULoss', loss_weight=5.0)), + train_cfg=None, + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) +img_norm_cfg = dict( + mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + 
test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict(lr=0.01, paramwise_cfg=dict(norm_decay_mult=0.)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 1000, + step=[8, 11]) +total_epochs = 12 diff --git a/configs/mmdet/autoassign/metafile.yml b/configs/mmdet/autoassign/metafile.yml new file mode 100644 index 00000000..f1e90519 --- /dev/null +++ b/configs/mmdet/autoassign/metafile.yml @@ -0,0 +1,33 @@ +Collections: + - Name: AutoAssign + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - AutoAssign + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/2007.03496 + Title: 'AutoAssign: Differentiable Label Assignment for Dense Object Detection' + README: configs/autoassign/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.12.0/mmdet/models/detectors/autoassign.py#L6 + Version: v2.12.0 + +Models: + - Name: autoassign_r50_fpn_8x2_1x_coco + In Collection: AutoAssign + Config: configs/autoassign/autoassign_r50_fpn_8x2_1x_coco.py + Metadata: + Training Memory (GB): 4.08 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/autoassign/auto_assign_r50_fpn_1x_coco/auto_assign_r50_fpn_1x_coco_20210413_115540-5e17991f.pth diff --git a/configs/mmdet/carafe/README.md b/configs/mmdet/carafe/README.md new file mode 100644 index 00000000..983aafb4 --- /dev/null +++ b/configs/mmdet/carafe/README.md @@ -0,0 +1,42 @@ +# CARAFE + +> [CARAFE: Content-Aware ReAssembly of FEatures](https://arxiv.org/abs/1905.02188) + + + +## Abstract + +Feature upsampling is a key operation in a number of modern convolutional network architectures, e.g. feature pyramids. Its design is critical for dense prediction tasks such as object detection and semantic/instance segmentation. 
In this work, we propose Content-Aware ReAssembly of FEatures (CARAFE), a universal, lightweight and highly effective operator to fulfill this goal. CARAFE has several appealing properties: (1) Large field of view. Unlike previous works (e.g. bilinear interpolation) that only exploit sub-pixel neighborhood, CARAFE can aggregate contextual information within a large receptive field. (2) Content-aware handling. Instead of using a fixed kernel for all samples (e.g. deconvolution), CARAFE enables instance-specific content-aware handling, which generates adaptive kernels on-the-fly. (3) Lightweight and fast to compute. CARAFE introduces little computational overhead and can be readily integrated into modern network architectures. We conduct comprehensive evaluations on standard benchmarks in object detection, instance/semantic segmentation and inpainting. CARAFE shows consistent and substantial gains across all the tasks (1.2%, 1.3%, 1.8%, 1.1dB respectively) with negligible computational overhead. It has great potential to serve as a strong building block for future research. + +
+ +
+ +## Results and Models + +The results on COCO 2017 val are shown in the table below. + +| Method | Backbone | Style | Lr schd | Test Proposal Num | Inf time (fps) | Box AP | Mask AP | Config | Download | +|:--------------------:|:--------:|:-------:|:-------:|:-----------------:|:--------------:|:------:|:-------:|:------:|:--------:| +| Faster R-CNN w/ CARAFE | R-50-FPN | pytorch | 1x | 1000 | 16.5 | 38.6 | 38.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/carafe/faster_rcnn_r50_fpn_carafe_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/carafe/faster_rcnn_r50_fpn_carafe_1x_coco/faster_rcnn_r50_fpn_carafe_1x_coco_bbox_mAP-0.386_20200504_175733-385a75b7.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/carafe/faster_rcnn_r50_fpn_carafe_1x_coco/faster_rcnn_r50_fpn_carafe_1x_coco_20200504_175733.log.json) | +| - | - | - | - | 2000 | | | | | +| Mask R-CNN w/ CARAFE | R-50-FPN | pytorch | 1x | 1000 | 14.0 | 39.3 | 35.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/carafe/mask_rcnn_r50_fpn_carafe_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/carafe/mask_rcnn_r50_fpn_carafe_1x_coco/mask_rcnn_r50_fpn_carafe_1x_coco_bbox_mAP-0.393__segm_mAP-0.358_20200503_135957-8687f195.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/carafe/mask_rcnn_r50_fpn_carafe_1x_coco/mask_rcnn_r50_fpn_carafe_1x_coco_20200503_135957.log.json) | +| - | - | - | - | 2000 | | | | | + +## Implementation + +The CUDA implementation of CARAFE can be found at https://github.com/myownskyW7/CARAFE. + +## Citation + +We provide config files to reproduce the object detection & instance segmentation results in the ICCV 2019 Oral paper for [CARAFE: Content-Aware ReAssembly of FEatures](https://arxiv.org/abs/1905.02188). 
+ +```latex +@inproceedings{Wang_2019_ICCV, + title = {CARAFE: Content-Aware ReAssembly of FEatures}, + author = {Wang, Jiaqi and Chen, Kai and Xu, Rui and Liu, Ziwei and Loy, Chen Change and Lin, Dahua}, + booktitle = {The IEEE International Conference on Computer Vision (ICCV)}, + month = {October}, + year = {2019} +} +``` diff --git a/configs/mmdet/carafe/faster_rcnn_r50_fpn_carafe_1x_coco.py b/configs/mmdet/carafe/faster_rcnn_r50_fpn_carafe_1x_coco.py new file mode 100644 index 00000000..dedac3f4 --- /dev/null +++ b/configs/mmdet/carafe/faster_rcnn_r50_fpn_carafe_1x_coco.py @@ -0,0 +1,50 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + neck=dict( + type='FPN_CARAFE', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5, + start_level=0, + end_level=-1, + norm_cfg=None, + act_cfg=None, + order=('conv', 'norm', 'act'), + upsample_cfg=dict( + type='carafe', + up_kernel=5, + up_group=1, + encoder_kernel=3, + encoder_dilation=1, + compressed_channels=64))) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=64), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=64), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + 
test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/carafe/mask_rcnn_r50_fpn_carafe_1x_coco.py b/configs/mmdet/carafe/mask_rcnn_r50_fpn_carafe_1x_coco.py new file mode 100644 index 00000000..668c0239 --- /dev/null +++ b/configs/mmdet/carafe/mask_rcnn_r50_fpn_carafe_1x_coco.py @@ -0,0 +1,60 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + neck=dict( + type='FPN_CARAFE', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5, + start_level=0, + end_level=-1, + norm_cfg=None, + act_cfg=None, + order=('conv', 'norm', 'act'), + upsample_cfg=dict( + type='carafe', + up_kernel=5, + up_group=1, + encoder_kernel=3, + encoder_dilation=1, + compressed_channels=64)), + roi_head=dict( + mask_head=dict( + upsample_cfg=dict( + type='carafe', + scale_factor=2, + up_kernel=5, + up_group=1, + encoder_kernel=3, + encoder_dilation=1, + compressed_channels=64)))) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=64), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=64), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/carafe/metafile.yml 
b/configs/mmdet/carafe/metafile.yml new file mode 100644 index 00000000..b58a3f69 --- /dev/null +++ b/configs/mmdet/carafe/metafile.yml @@ -0,0 +1,55 @@ +Collections: + - Name: CARAFE + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RPN + - FPN_CARAFE + - ResNet + - RoIPool + Paper: + URL: https://arxiv.org/abs/1905.02188 + Title: 'CARAFE: Content-Aware ReAssembly of FEatures' + README: configs/carafe/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.12.0/mmdet/models/necks/fpn_carafe.py#L11 + Version: v2.12.0 + +Models: + - Name: faster_rcnn_r50_fpn_carafe_1x_coco + In Collection: CARAFE + Config: configs/carafe/faster_rcnn_r50_fpn_carafe_1x_coco.py + Metadata: + Training Memory (GB): 4.26 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/carafe/faster_rcnn_r50_fpn_carafe_1x_coco/faster_rcnn_r50_fpn_carafe_1x_coco_bbox_mAP-0.386_20200504_175733-385a75b7.pth + + - Name: mask_rcnn_r50_fpn_carafe_1x_coco + In Collection: CARAFE + Config: configs/carafe/mask_rcnn_r50_fpn_carafe_1x_coco.py + Metadata: + Training Memory (GB): 4.31 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 35.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/carafe/mask_rcnn_r50_fpn_carafe_1x_coco/mask_rcnn_r50_fpn_carafe_1x_coco_bbox_mAP-0.393__segm_mAP-0.358_20200503_135957-8687f195.pth diff --git a/configs/mmdet/cascade_rcnn/README.md b/configs/mmdet/cascade_rcnn/README.md new file mode 100644 index 00000000..109fd7c3 --- /dev/null +++ b/configs/mmdet/cascade_rcnn/README.md @@ -0,0 +1,79 @@ +# Cascade R-CNN + +> [Cascade R-CNN: High Quality Object Detection and Instance 
Segmentation](https://arxiv.org/abs/1906.09756) + + + +## Abstract + +In object detection, the intersection over union (IoU) threshold is frequently used to define positives/negatives. The threshold used to train a detector defines its quality. While the commonly used threshold of 0.5 leads to noisy (low-quality) detections, detection performance frequently degrades for larger thresholds. This paradox of high-quality detection has two causes: 1) overfitting, due to vanishing positive samples for large thresholds, and 2) inference-time quality mismatch between detector and test hypotheses. A multi-stage object detection architecture, the Cascade R-CNN, composed of a sequence of detectors trained with increasing IoU thresholds, is proposed to address these problems. The detectors are trained sequentially, using the output of a detector as training set for the next. This resampling progressively improves hypotheses quality, guaranteeing a positive training set of equivalent size for all detectors and minimizing overfitting. The same cascade is applied at inference, to eliminate quality mismatches between hypotheses and detectors. An implementation of the Cascade R-CNN without bells or whistles achieves state-of-the-art performance on the COCO dataset, and significantly improves high-quality detection on generic and specific object detection datasets, including VOC, KITTI, CityPerson, and WiderFace. Finally, the Cascade R-CNN is generalized to instance segmentation, with nontrivial improvements over the Mask R-CNN. + +
+ +
+ +## Results and Models + +### Cascade R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: |:------:|:--------:| +| R-50-FPN | caffe | 1x | 4.2 | | 40.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco/cascade_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.404_20200504_174853-b857be87.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco/cascade_rcnn_r50_caffe_fpn_1x_coco_20200504_174853.log.json) | +| R-50-FPN | pytorch | 1x | 4.4 | 16.1 | 40.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco/cascade_rcnn_r50_fpn_1x_coco_20200316-3dc56deb.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco/cascade_rcnn_r50_fpn_1x_coco_20200316_214748.log.json) | +| R-50-FPN | pytorch | 20e | - | - | 41.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco/cascade_rcnn_r50_fpn_20e_coco_bbox_mAP-0.41_20200504_175131-e9872a90.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco/cascade_rcnn_r50_fpn_20e_coco_20200504_175131.log.json) | +| R-101-FPN | caffe | 1x | 6.2 | | 42.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco/cascade_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.423_20200504_175649-cab8dbd5.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco/cascade_rcnn_r101_caffe_fpn_1x_coco_20200504_175649.log.json) | +| R-101-FPN | pytorch | 1x | 6.4 | 13.5 | 42.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco/cascade_rcnn_r101_fpn_1x_coco_20200317-0b6a2fbf.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco/cascade_rcnn_r101_fpn_1x_coco_20200317_101744.log.json) | +| R-101-FPN | pytorch | 20e | - | - | 42.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco/cascade_rcnn_r101_fpn_20e_coco_bbox_mAP-0.425_20200504_231812-5057dcc5.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco/cascade_rcnn_r101_fpn_20e_coco_20200504_231812.log.json) | +| X-101-32x4d-FPN | pytorch | 1x | 7.6 | 10.9 | 43.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco/cascade_rcnn_x101_32x4d_fpn_1x_coco_20200316-95c2deb6.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco/cascade_rcnn_x101_32x4d_fpn_1x_coco_20200316_055608.log.json) | +| X-101-32x4d-FPN | pytorch | 20e | 7.6 | | 43.7 | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco/cascade_rcnn_x101_32x4d_fpn_20e_coco_20200906_134608-9ae0a720.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco/cascade_rcnn_x101_32x4d_fpn_20e_coco_20200906_134608.log.json) | +| X-101-64x4d-FPN | pytorch | 1x | 10.7 | | 44.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco/cascade_rcnn_x101_64x4d_fpn_1x_coco_20200515_075702-43ce6a30.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco/cascade_rcnn_x101_64x4d_fpn_1x_coco_20200515_075702.log.json) | +| X-101-64x4d-FPN | pytorch | 20e | 10.7 | | 44.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco/cascade_rcnn_x101_64x4d_fpn_20e_coco_20200509_224357-051557b1.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco/cascade_rcnn_x101_64x4d_fpn_20e_coco_20200509_224357.log.json)| + +### Cascade Mask R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------: | :--------: | +| R-50-FPN | caffe | 1x | 5.9 | | 41.2 | 36.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco/cascade_mask_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.412__segm_mAP-0.36_20200504_174659-5004b251.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco/cascade_mask_rcnn_r50_caffe_fpn_1x_coco_20200504_174659.log.json) | +| R-50-FPN | pytorch | 1x | 6.0 | 11.2 | 41.2 | 35.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco/cascade_mask_rcnn_r50_fpn_1x_coco_20200203-9d4dcb24.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco/cascade_mask_rcnn_r50_fpn_1x_coco_20200203_170449.log.json) | +| R-50-FPN | pytorch | 20e | - | - | 41.9 | 36.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco/cascade_mask_rcnn_r50_fpn_20e_coco_bbox_mAP-0.419__segm_mAP-0.365_20200504_174711-4af8e66e.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco/cascade_mask_rcnn_r50_fpn_20e_coco_20200504_174711.log.json)| +| R-101-FPN | caffe | 1x | 7.8 | | 43.2 | 37.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco/cascade_mask_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.432__segm_mAP-0.376_20200504_174813-5c1e9599.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco/cascade_mask_rcnn_r101_caffe_fpn_1x_coco_20200504_174813.log.json)| +| R-101-FPN | pytorch | 1x | 7.9 | 9.8 | 42.9 | 37.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco/cascade_mask_rcnn_r101_fpn_1x_coco_20200203-befdf6ee.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco/cascade_mask_rcnn_r101_fpn_1x_coco_20200203_092521.log.json) | +| R-101-FPN | pytorch | 20e | - | - | 43.4 | 37.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco/cascade_mask_rcnn_r101_fpn_20e_coco_bbox_mAP-0.434__segm_mAP-0.378_20200504_174836-005947da.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco/cascade_mask_rcnn_r101_fpn_20e_coco_20200504_174836.log.json)| +| X-101-32x4d-FPN | pytorch | 1x | 9.2 | 8.6 | 44.3 | 38.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco_20200201-0f411b1f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco_20200201_052416.log.json) | +| X-101-32x4d-FPN | pytorch | 20e | 9.2 | - | 45.0 | 39.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco_20200528_083917-ed1f4751.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco_20200528_083917.log.json) | +| X-101-64x4d-FPN | pytorch | 1x | 12.2 | 6.7 | 45.3 | 39.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco_20200203-9a2db89d.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco_20200203_044059.log.json) | +| X-101-64x4d-FPN | pytorch | 20e | 12.2 | | 45.6 |39.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco_20200512_161033-bdb5126a.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco_20200512_161033.log.json)| + +**Notes:** + +- The `20e` schedule in Cascade (Mask) R-CNN indicates decreasing the lr at 16 and 19 epochs, with a total of 20 epochs. + +## Pre-trained Models + +We also train some models with longer schedules and multi-scale training for Cascade Mask R-CNN. The users could finetune them for downstream tasks. 
+ +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :----------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------: | :--------: | +| R-50-FPN | caffe | 3x | 5.7 | | 44.0 | 38.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco_20210707_002651-6e29b3a6.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco_20210707_002651.log.json) +| R-50-FPN | pytorch| 3x | 5.9 | | 44.3 | 38.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco_20210628_164719-5bdc3824.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco_20210628_164719.log.json) +| R-101-FPN | caffe | 3x | 7.7 | | 45.4 | 39.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco_20210707_002620-a5bd2389.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco_20210707_002620.log.json) +| R-101-FPN | pytorch| 3x | 7.8 | | 45.5 | 39.6 | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco_20210628_165236-51a2d363.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco_20210628_165236.log.json) +| X-101-32x4d-FPN | pytorch| 3x | 9.0 | | 46.3 | 40.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco_20210706_225234-40773067.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco_20210706_225234.log.json) +| X-101-32x8d-FPN | pytorch| 3x | 12.1 | | 46.1 | 39.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco_20210719_180640-9ff7e76f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco_20210719_180640.log.json) +| X-101-64x4d-FPN | pytorch| 3x | 12.0 | | 46.6 | 40.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco_20210719_210311-d3e64ba0.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco_20210719_210311.log.json) + +## Citation + +```latex +@article{Cai_2019, + title={Cascade R-CNN: High Quality Object Detection and Instance Segmentation}, + ISSN={1939-3539}, + url={http://dx.doi.org/10.1109/tpami.2019.2956516}, + DOI={10.1109/tpami.2019.2956516}, + journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, + publisher={Institute of Electrical and Electronics Engineers (IEEE)}, + author={Cai, Zhaowei and Vasconcelos, Nuno}, + year={2019}, + pages={1–1} +} +``` diff --git a/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco.py b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..5ee62310 --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './cascade_mask_rcnn_r50_caffe_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco.py b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..1df87fc6 --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco.py @@ -0,0 +1,7 @@ +_base_ = './cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco.py 
b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco.py new file mode 100644 index 00000000..f59c1558 --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './cascade_mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco.py b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco.py new file mode 100644 index 00000000..45ab7edf --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco.py @@ -0,0 +1,6 @@ +_base_ = './cascade_mask_rcnn_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco.py b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..1b20f167 --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco.py @@ -0,0 +1,6 @@ +_base_ = './cascade_mask_rcnn_r50_fpn_mstrain_3x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco.py b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..12d37efc --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco.py @@ -0,0 +1,41 @@ +_base_ = ['./cascade_mask_rcnn_r50_fpn_1x_coco.py'] + +model = dict( + backbone=dict( + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], 
to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco.py b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..9fb817e8 --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco.py @@ -0,0 +1,49 @@ +_base_ = ['./cascade_mask_rcnn_r50_fpn_mstrain_3x_coco.py'] +model = dict( + backbone=dict( + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) + +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)], +# multiscale_mode='range' +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', 
flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +data = dict( + train=dict(dataset=dict(pipeline=train_pipeline)), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 00000000..49ab539a --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = [ + '../_base_/models/cascade_mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] diff --git a/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco.py b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco.py new file mode 100644 index 00000000..1296dc45 --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco.py @@ -0,0 +1,5 @@ +_base_ = [ + '../_base_/models/cascade_mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_20e.py', '../_base_/default_runtime.py' +] diff --git a/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco.py b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..ed0c6d1a --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco.py @@ -0,0 +1,4 @@ +_base_ = [ + 
'../common/mstrain_3x_coco_instance.py', + '../_base_/models/cascade_mask_rcnn_r50_fpn.py' +] diff --git a/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py new file mode 100644 index 00000000..06cbbe70 --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './cascade_mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco.py b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco.py new file mode 100644 index 00000000..4e352362 --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco.py @@ -0,0 +1,14 @@ +_base_ = './cascade_mask_rcnn_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..7d37d17d --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py @@ -0,0 +1,14 @@ +_base_ = './cascade_mask_rcnn_r50_fpn_mstrain_3x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + 
norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..eeec1aa1 --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py @@ -0,0 +1,60 @@ +_base_ = './cascade_mask_rcnn_r50_fpn_mstrain_3x_coco.py' + +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=8, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + style='pytorch', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnext101_32x8d'))) + +# ResNeXt-101-32x8d model trained with Caffe2 at FB, +# so the mean and std need to be changed. +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], + std=[57.375, 57.120, 58.395], + to_rgb=False) + +# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)], +# multiscale_mode='range' +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), 
+ ]) +] + +data = dict( + train=dict(dataset=dict(pipeline=train_pipeline)), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco.py b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco.py new file mode 100644 index 00000000..7dbef5fa --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './cascade_mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco.py b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco.py new file mode 100644 index 00000000..579b1aca --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco.py @@ -0,0 +1,14 @@ +_base_ = './cascade_mask_rcnn_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..ed6cf4b5 --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py @@ -0,0 +1,14 @@ +_base_ = './cascade_mask_rcnn_r50_fpn_mstrain_3x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + 
out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco.py b/configs/mmdet/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..1e90f4bb --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './cascade_rcnn_r50_caffe_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/configs/mmdet/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco.py b/configs/mmdet/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco.py new file mode 100644 index 00000000..5c077760 --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './cascade_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco.py b/configs/mmdet/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco.py new file mode 100644 index 00000000..b1719c25 --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco.py @@ -0,0 +1,6 @@ +_base_ = './cascade_rcnn_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco.py b/configs/mmdet/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..696bcfb9 --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco.py @@ -0,0 +1,42 @@ +_base_ = './cascade_rcnn_r50_fpn_1x_coco.py' + +model = dict( + backbone=dict( + norm_cfg=dict(requires_grad=False), + 
style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) + +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py b/configs/mmdet/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 00000000..87e21fbf --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = [ + '../_base_/models/cascade_rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] diff --git a/configs/mmdet/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco.py b/configs/mmdet/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco.py new file mode 100644 index 00000000..6f886e1c --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco.py @@ -0,0 +1,4 @@ +_base_ = './cascade_rcnn_r50_fpn_1x_coco.py' +# learning policy +lr_config = dict(step=[16, 19]) +runner = dict(type='EpochBasedRunner', max_epochs=20) 
diff --git a/configs/mmdet/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco.py b/configs/mmdet/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco.py new file mode 100644 index 00000000..5ac02c10 --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './cascade_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco.py b/configs/mmdet/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco.py new file mode 100644 index 00000000..486e45ea --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco.py @@ -0,0 +1,14 @@ +_base_ = './cascade_rcnn_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco.py b/configs/mmdet/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco.py new file mode 100644 index 00000000..78229f0d --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco.py @@ -0,0 +1,15 @@ +_base_ = './cascade_rcnn_r50_fpn_1x_coco.py' +model = dict( + type='CascadeRCNN', + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git 
a/configs/mmdet/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco.py b/configs/mmdet/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco.py new file mode 100644 index 00000000..58812dec --- /dev/null +++ b/configs/mmdet/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco.py @@ -0,0 +1,15 @@ +_base_ = './cascade_rcnn_r50_fpn_20e_coco.py' +model = dict( + type='CascadeRCNN', + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/cascade_rcnn/metafile.yml b/configs/mmdet/cascade_rcnn/metafile.yml new file mode 100644 index 00000000..1007f2eb --- /dev/null +++ b/configs/mmdet/cascade_rcnn/metafile.yml @@ -0,0 +1,525 @@ +Collections: + - Name: Cascade R-CNN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Cascade R-CNN + - FPN + - RPN + - ResNet + - RoIAlign + Paper: + URL: http://dx.doi.org/10.1109/tpami.2019.2956516 + Title: 'Cascade R-CNN: Delving into High Quality Object Detection' + README: configs/cascade_rcnn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/cascade_rcnn.py#L6 + Version: v2.0.0 + +Models: + - Name: cascade_rcnn_r50_caffe_fpn_1x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.2 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco/cascade_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.404_20200504_174853-b857be87.pth + + - Name: cascade_rcnn_r50_fpn_1x_coco + In Collection: Cascade R-CNN + Config: 
configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.4 + inference time (ms/im): + - value: 62.11 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco/cascade_rcnn_r50_fpn_1x_coco_20200316-3dc56deb.pth + + - Name: cascade_rcnn_r50_fpn_20e_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco.py + Metadata: + Training Memory (GB): 4.4 + inference time (ms/im): + - value: 62.11 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco/cascade_rcnn_r50_fpn_20e_coco_bbox_mAP-0.41_20200504_175131-e9872a90.pth + + - Name: cascade_rcnn_r101_caffe_fpn_1x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.2 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco/cascade_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.423_20200504_175649-cab8dbd5.pth + + - Name: cascade_rcnn_r101_fpn_1x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.4 + inference time (ms/im): + - value: 74.07 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.0 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco/cascade_rcnn_r101_fpn_1x_coco_20200317-0b6a2fbf.pth + + - Name: cascade_rcnn_r101_fpn_20e_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco.py + Metadata: + Training Memory (GB): 6.4 + inference time (ms/im): + - value: 74.07 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco/cascade_rcnn_r101_fpn_20e_coco_bbox_mAP-0.425_20200504_231812-5057dcc5.pth + + - Name: cascade_rcnn_x101_32x4d_fpn_1x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.6 + inference time (ms/im): + - value: 91.74 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco/cascade_rcnn_x101_32x4d_fpn_1x_coco_20200316-95c2deb6.pth + + - Name: cascade_rcnn_x101_32x4d_fpn_20e_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco.py + Metadata: + Training Memory (GB): 7.6 + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco/cascade_rcnn_x101_32x4d_fpn_20e_coco_20200906_134608-9ae0a720.pth + + - Name: cascade_rcnn_x101_64x4d_fpn_1x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 10.7 + Epochs: 12 + Results: + - Task: Object 
Detection + Dataset: COCO + Metrics: + box AP: 44.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco/cascade_rcnn_x101_64x4d_fpn_1x_coco_20200515_075702-43ce6a30.pth + + - Name: cascade_rcnn_x101_64x4d_fpn_20e_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco.py + Metadata: + Training Memory (GB): 10.7 + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco/cascade_rcnn_x101_64x4d_fpn_20e_coco_20200509_224357-051557b1.pth + + - Name: cascade_mask_rcnn_r50_caffe_fpn_1x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.9 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco/cascade_mask_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.412__segm_mAP-0.36_20200504_174659-5004b251.pth + + - Name: cascade_mask_rcnn_r50_fpn_1x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.0 + inference time (ms/im): + - value: 89.29 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 35.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco/cascade_mask_rcnn_r50_fpn_1x_coco_20200203-9d4dcb24.pth + + - Name: cascade_mask_rcnn_r50_fpn_20e_coco + In Collection: Cascade R-CNN + Config: 
configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco.py + Metadata: + Training Memory (GB): 6.0 + inference time (ms/im): + - value: 89.29 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco/cascade_mask_rcnn_r50_fpn_20e_coco_bbox_mAP-0.419__segm_mAP-0.365_20200504_174711-4af8e66e.pth + + - Name: cascade_mask_rcnn_r101_caffe_fpn_1x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.8 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco/cascade_mask_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.432__segm_mAP-0.376_20200504_174813-5c1e9599.pth + + - Name: cascade_mask_rcnn_r101_fpn_1x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.9 + inference time (ms/im): + - value: 102.04 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco/cascade_mask_rcnn_r101_fpn_1x_coco_20200203-befdf6ee.pth + + - Name: cascade_mask_rcnn_r101_fpn_20e_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco.py + Metadata: 
+ Training Memory (GB): 7.9 + inference time (ms/im): + - value: 102.04 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco/cascade_mask_rcnn_r101_fpn_20e_coco_bbox_mAP-0.434__segm_mAP-0.378_20200504_174836-005947da.pth + + - Name: cascade_mask_rcnn_x101_32x4d_fpn_1x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 9.2 + inference time (ms/im): + - value: 116.28 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco_20200201-0f411b1f.pth + + - Name: cascade_mask_rcnn_x101_32x4d_fpn_20e_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco.py + Metadata: + Training Memory (GB): 9.2 + inference time (ms/im): + - value: 116.28 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco_20200528_083917-ed1f4751.pth + + - Name: cascade_mask_rcnn_x101_64x4d_fpn_1x_coco + In Collection: Cascade R-CNN + Config: 
configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 12.2 + inference time (ms/im): + - value: 149.25 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco_20200203-9a2db89d.pth + + - Name: cascade_mask_rcnn_x101_64x4d_fpn_20e_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco.py + Metadata: + Training Memory (GB): 12.2 + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco_20200512_161033-bdb5126a.pth + + - Name: cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 5.7 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco_20210707_002651-6e29b3a6.pth + + - Name: cascade_mask_rcnn_r50_fpn_mstrain_3x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 5.9 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + 
box AP: 44.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco_20210628_164719-5bdc3824.pth + + - Name: cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 7.7 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco_20210707_002620-a5bd2389.pth + + - Name: cascade_mask_rcnn_r101_fpn_mstrain_3x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 7.8 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco_20210628_165236-51a2d363.pth + + - Name: cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 9.0 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco_20210706_225234-40773067.pth + + 
- Name: cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 12.1 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco_20210719_180640-9ff7e76f.pth + + - Name: cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 12.0 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco_20210719_210311-d3e64ba0.pth diff --git a/configs/mmdet/cascade_rpn/README.md b/configs/mmdet/cascade_rpn/README.md new file mode 100644 index 00000000..900dc291 --- /dev/null +++ b/configs/mmdet/cascade_rpn/README.md @@ -0,0 +1,41 @@ +# Cascade RPN + +> [Cascade RPN: Delving into High-Quality Region Proposal Network with Adaptive Convolution](https://arxiv.org/abs/1909.06720) + + + +## Abstract + +This paper considers an architecture referred to as Cascade Region Proposal Network (Cascade RPN) for improving the region-proposal quality and detection performance by systematically addressing the limitation of the conventional RPN that heuristically defines the anchors and aligns the features to the anchors. 
First, instead of using multiple anchors with predefined scales and aspect ratios, Cascade RPN relies on a single anchor per location and performs multi-stage refinement. Each stage is progressively more stringent in defining positive samples by starting out with an anchor-free metric followed by anchor-based metrics in the ensuing stages. Second, to attain alignment between the features and the anchors throughout the stages, adaptive convolution is proposed that takes the anchors in addition to the image features as its input and learns the sampled features guided by the anchors. A simple implementation of a two-stage Cascade RPN achieves AR 13.4 points higher than that of the conventional RPN, surpassing any existing region proposal methods. When adopting to Fast R-CNN and Faster R-CNN, Cascade RPN can improve the detection mAP by 3.1 and 3.5 points, respectively. + +
+ +
+ +## Results and Models + +### Region proposal performance + +| Method | Backbone | Style | Mem (GB) | Train time (s/iter) | Inf time (fps) | AR 1000 | Config | Download | +|:------:|:--------:|:-----:|:--------:|:-------------------:|:--------------:|:-------:|:-------:|:--------------------------------------:| +| CRPN | R-50-FPN | caffe | - | - | - | 72.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rpn/crpn_r50_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rpn/crpn_r50_caffe_fpn_1x_coco/cascade_rpn_r50_caffe_fpn_1x_coco-7aa93cef.pth) | + +### Detection performance + +| Method | Proposal | Backbone | Style | Schedule | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Config | Download | +|:-------------:|:-----------:|:--------:|:-------:|:--------:|:--------:|:-------------------:|:--------------:|:------:|:-------:|:--------------------------------------------:| +| Fast R-CNN | Cascade RPN | R-50-FPN | caffe | 1x | - | - | - | 39.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rpn/crpn_fast_rcnn_r50_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rpn/crpn_fast_rcnn_r50_caffe_fpn_1x_coco/crpn_fast_rcnn_r50_caffe_fpn_1x_coco-cb486e66.pth) | +| Faster R-CNN | Cascade RPN | R-50-FPN | caffe | 1x | - | - | - | 40.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rpn/crpn_faster_rcnn_r50_caffe_fpn_1x_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rpn/crpn_faster_rcnn_r50_caffe_fpn_1x_coco/crpn_faster_rcnn_r50_caffe_fpn_1x_coco-c8283cca.pth) | + +## Citation + +We provide the code for reproducing experiment results of [Cascade RPN](https://arxiv.org/abs/1909.06720). 
+ +```latex +@inproceedings{vu2019cascade, + title={Cascade RPN: Delving into High-Quality Region Proposal Network with Adaptive Convolution}, + author={Vu, Thang and Jang, Hyunjun and Pham, Trung X and Yoo, Chang D}, + booktitle={Conference on Neural Information Processing Systems (NeurIPS)}, + year={2019} +} +``` diff --git a/configs/mmdet/cascade_rpn/crpn_fast_rcnn_r50_caffe_fpn_1x_coco.py b/configs/mmdet/cascade_rpn/crpn_fast_rcnn_r50_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..29f5d074 --- /dev/null +++ b/configs/mmdet/cascade_rpn/crpn_fast_rcnn_r50_caffe_fpn_1x_coco.py @@ -0,0 +1,77 @@ +_base_ = '../fast_rcnn/fast_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + roi_head=dict( + bbox_head=dict( + bbox_coder=dict(target_stds=[0.04, 0.04, 0.08, 0.08]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.5), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rcnn=dict( + assigner=dict( + pos_iou_thr=0.65, neg_iou_thr=0.65, min_pos_iou=0.65), + sampler=dict(num=256))), + test_cfg=dict(rcnn=dict(score_thr=1e-3))) +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadProposals', num_max_proposals=300), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 
'proposals', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadProposals', num_max_proposals=300), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['proposals']), + dict( + type='ToDataContainer', + fields=[dict(key='proposals', stack=False)]), + dict(type='Collect', keys=['img', 'proposals']), + ]) +] +data = dict( + train=dict( + proposal_file=data_root + + 'proposals/crpn_r50_caffe_fpn_1x_train2017.pkl', + pipeline=train_pipeline), + val=dict( + proposal_file=data_root + + 'proposals/crpn_r50_caffe_fpn_1x_val2017.pkl', + pipeline=test_pipeline), + test=dict( + proposal_file=data_root + + 'proposals/crpn_r50_caffe_fpn_1x_val2017.pkl', + pipeline=test_pipeline)) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git a/configs/mmdet/cascade_rpn/crpn_faster_rcnn_r50_caffe_fpn_1x_coco.py b/configs/mmdet/cascade_rpn/crpn_faster_rcnn_r50_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..bad86e6d --- /dev/null +++ b/configs/mmdet/cascade_rpn/crpn_faster_rcnn_r50_caffe_fpn_1x_coco.py @@ -0,0 +1,92 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco.py' +rpn_weight = 0.7 +model = dict( + rpn_head=dict( + _delete_=True, + type='CascadeRPNHead', + num_stages=2, + stages=[ + dict( + type='StageCascadeRPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[1.0], + strides=[4, 8, 16, 32, 64]), + adapt_cfg=dict(type='dilation', dilation=3), + bridged_feature=True, + sampling=False, + with_cls=False, + reg_decoded_bbox=True, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=(.0, .0, .0, .0), + target_stds=(0.1, 0.1, 0.5, 0.5)), + 
loss_bbox=dict( + type='IoULoss', linear=True, + loss_weight=10.0 * rpn_weight)), + dict( + type='StageCascadeRPNHead', + in_channels=256, + feat_channels=256, + adapt_cfg=dict(type='offset'), + bridged_feature=False, + sampling=True, + with_cls=True, + reg_decoded_bbox=True, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=(.0, .0, .0, .0), + target_stds=(0.05, 0.05, 0.1, 0.1)), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0 * rpn_weight), + loss_bbox=dict( + type='IoULoss', linear=True, + loss_weight=10.0 * rpn_weight)) + ]), + roi_head=dict( + bbox_head=dict( + bbox_coder=dict(target_stds=[0.04, 0.04, 0.08, 0.08]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.5), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=[ + dict( + assigner=dict( + type='RegionAssigner', center_ratio=0.2, ignore_ratio=0.5), + allowed_border=-1, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False) + ], + rpn_proposal=dict(max_per_img=300, nms=dict(iou_threshold=0.8)), + rcnn=dict( + assigner=dict( + pos_iou_thr=0.65, neg_iou_thr=0.65, min_pos_iou=0.65), + sampler=dict(type='RandomSampler', num=256))), + test_cfg=dict( + rpn=dict(max_per_img=300, nms=dict(iou_threshold=0.8)), + rcnn=dict(score_thr=1e-3))) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git a/configs/mmdet/cascade_rpn/crpn_r50_caffe_fpn_1x_coco.py b/configs/mmdet/cascade_rpn/crpn_r50_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..5562e696 --- /dev/null +++ b/configs/mmdet/cascade_rpn/crpn_r50_caffe_fpn_1x_coco.py @@ -0,0 +1,77 @@ 
+_base_ = '../rpn/rpn_r50_caffe_fpn_1x_coco.py' +model = dict( + rpn_head=dict( + _delete_=True, + type='CascadeRPNHead', + num_stages=2, + stages=[ + dict( + type='StageCascadeRPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[1.0], + strides=[4, 8, 16, 32, 64]), + adapt_cfg=dict(type='dilation', dilation=3), + bridged_feature=True, + sampling=False, + with_cls=False, + reg_decoded_bbox=True, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=(.0, .0, .0, .0), + target_stds=(0.1, 0.1, 0.5, 0.5)), + loss_bbox=dict(type='IoULoss', linear=True, loss_weight=10.0)), + dict( + type='StageCascadeRPNHead', + in_channels=256, + feat_channels=256, + adapt_cfg=dict(type='offset'), + bridged_feature=False, + sampling=True, + with_cls=True, + reg_decoded_bbox=True, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=(.0, .0, .0, .0), + target_stds=(0.05, 0.05, 0.1, 0.1)), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict(type='IoULoss', linear=True, loss_weight=10.0)) + ]), + train_cfg=dict(rpn=[ + dict( + assigner=dict( + type='RegionAssigner', center_ratio=0.2, ignore_ratio=0.5), + allowed_border=-1, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.3, + ignore_iof_thr=-1, + iou_calculator=dict(type='BboxOverlaps2D')), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.8), + min_bbox_size=0))) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git a/configs/mmdet/cascade_rpn/metafile.yml b/configs/mmdet/cascade_rpn/metafile.yml new file mode 100644 index 00000000..335b2bc7 --- 
/dev/null +++ b/configs/mmdet/cascade_rpn/metafile.yml @@ -0,0 +1,44 @@ +Collections: + - Name: Cascade RPN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Cascade RPN + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/1909.06720 + Title: 'Cascade RPN: Delving into High-Quality Region Proposal Network with Adaptive Convolution' + README: configs/cascade_rpn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.8.0/mmdet/models/dense_heads/cascade_rpn_head.py#L538 + Version: v2.8.0 + +Models: + - Name: crpn_fast_rcnn_r50_caffe_fpn_1x_coco + In Collection: Cascade RPN + Config: configs/cascade_rpn/crpn_fast_rcnn_r50_caffe_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rpn/crpn_fast_rcnn_r50_caffe_fpn_1x_coco/crpn_fast_rcnn_r50_caffe_fpn_1x_coco-cb486e66.pth + + - Name: crpn_faster_rcnn_r50_caffe_fpn_1x_coco + In Collection: Cascade RPN + Config: configs/cascade_rpn/crpn_faster_rcnn_r50_caffe_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rpn/crpn_faster_rcnn_r50_caffe_fpn_1x_coco/crpn_faster_rcnn_r50_caffe_fpn_1x_coco-c8283cca.pth diff --git a/configs/mmdet/centernet/README.md b/configs/mmdet/centernet/README.md new file mode 100644 index 00000000..ffc1d8c2 --- /dev/null +++ b/configs/mmdet/centernet/README.md @@ -0,0 +1,40 @@ +# CenterNet + +> [Objects as Points](https://arxiv.org/abs/1904.07850) + + + +## Abstract + +Detection identifies objects as axis-aligned boxes in an image. Most successful object detectors enumerate a nearly exhaustive list of potential object locations and classify each. 
This is wasteful, inefficient, and requires additional post-processing. In this paper, we take a different approach. We model an object as a single point --- the center point of its bounding box. Our detector uses keypoint estimation to find center points and regresses to all other object properties, such as size, 3D location, orientation, and even pose. Our center point based approach, CenterNet, is end-to-end differentiable, simpler, faster, and more accurate than corresponding bounding box based detectors. CenterNet achieves the best speed-accuracy trade-off on the MS COCO dataset, with 28.1% AP at 142 FPS, 37.4% AP at 52 FPS, and 45.1% AP with multi-scale testing at 1.4 FPS. We use the same approach to estimate 3D bounding box in the KITTI benchmark and human pose on the COCO keypoint dataset. Our method performs competitively with sophisticated multi-stage methods and runs in real-time. + +
+ +
+ +## Results and Models + +| Backbone | DCN | Mem (GB) | Box AP | Flip box AP | Config | Download | +| :-------------: | :--------: |:----------------: | :------: | :------------: | :----: | :----: | +| ResNet-18 | N | 3.45 | 25.9 | 27.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/centernet/centernet_resnet18_140e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/centernet/centernet_resnet18_140e_coco/centernet_resnet18_140e_coco_20210705_093630-bb5b3bf7.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/centernet/centernet_resnet18_140e_coco/centernet_resnet18_140e_coco_20210705_093630.log.json) | +| ResNet-18 | Y | 3.47 | 29.5 | 30.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/centernet/centernet_resnet18_dcnv2_140e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/centernet/centernet_resnet18_dcnv2_140e_coco/centernet_resnet18_dcnv2_140e_coco_20210702_155131-c8cd631f.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/centernet/centernet_resnet18_dcnv2_140e_coco/centernet_resnet18_dcnv2_140e_coco_20210702_155131.log.json) | + +Note: + +- Flip box AP setting is single-scale and `flip=True`. +- Due to complex data augmentation, we find that the performance is unstable and may fluctuate by about 0.4 mAP. mAP 29.4 ~ 29.8 is acceptable in ResNet-18-DCNv2. +- Compared to the source code, we refer to [CenterNet-Better](https://github.com/FateScript/CenterNet-better), and make the following changes: + - Fix the wrong image mean and variance in image normalization to be compatible with the pre-trained backbone. + - Use SGD rather than ADAM optimizer and add warmup and grad clip. + - Use DistributedDataParallel as other models in MMDetection rather than using DataParallel. 
+ +## Citation + +```latex +@article{zhou2019objects, + title={Objects as Points}, + author={Zhou, Xingyi and Wang, Dequan and Kr{\"a}henb{\"u}hl, Philipp}, + journal={arXiv preprint arXiv:1904.07850}, + year={2019} +} +``` diff --git a/configs/mmdet/centernet/centernet_resnet18_140e_coco.py b/configs/mmdet/centernet/centernet_resnet18_140e_coco.py new file mode 100644 index 00000000..52c86a5e --- /dev/null +++ b/configs/mmdet/centernet/centernet_resnet18_140e_coco.py @@ -0,0 +1,3 @@ +_base_ = './centernet_resnet18_dcnv2_140e_coco.py' + +model = dict(neck=dict(use_dcn=False)) diff --git a/configs/mmdet/centernet/centernet_resnet18_dcnv2_140e_coco.py b/configs/mmdet/centernet/centernet_resnet18_dcnv2_140e_coco.py new file mode 100644 index 00000000..b8a0bb10 --- /dev/null +++ b/configs/mmdet/centernet/centernet_resnet18_dcnv2_140e_coco.py @@ -0,0 +1,127 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + type='CenterNet', + backbone=dict( + type='ResNet', + depth=18, + norm_eval=False, + norm_cfg=dict(type='BN'), + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')), + neck=dict( + type='CTResNetNeck', + in_channel=512, + num_deconv_filters=(256, 128, 64), + num_deconv_kernels=(4, 4, 4), + use_dcn=True), + bbox_head=dict( + type='CenterNetHead', + num_classes=80, + in_channel=64, + feat_channel=64, + loss_center_heatmap=dict(type='GaussianFocalLoss', loss_weight=1.0), + loss_wh=dict(type='L1Loss', loss_weight=0.1), + loss_offset=dict(type='L1Loss', loss_weight=1.0)), + train_cfg=None, + test_cfg=dict(topk=100, local_maximum_kernel=3, max_per_img=100)) + +# We fixed the incorrect img_norm_cfg problem in the source code. 
+img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True, color_type='color'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='RandomCenterCropPad', + crop_size=(512, 512), + ratios=(0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3), + mean=[0, 0, 0], + std=[1, 1, 1], + to_rgb=True, + test_pad_mode=None), + dict(type='Resize', img_scale=(512, 512), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) +] +test_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict( + type='MultiScaleFlipAug', + scale_factor=1.0, + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict( + type='RandomCenterCropPad', + ratios=None, + border=None, + mean=[0, 0, 0], + std=[1, 1, 1], + to_rgb=True, + test_mode=True, + test_pad_mode=['logical_or', 31], + test_pad_add_pix=1), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + meta_keys=('filename', 'ori_filename', 'ori_shape', + 'img_shape', 'pad_shape', 'scale_factor', 'flip', + 'flip_direction', 'img_norm_cfg', 'border'), + keys=['img']) + ]) +] + +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +# Use RepeatDataset to speed up training +data = dict( + samples_per_gpu=16, + workers_per_gpu=4, + train=dict( + _delete_=True, + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline)), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) + +# optimizer +# 
With the default settings of modern detectors, SGD performs better +# than the Adam optimizer used in the source code, so we use the default SGD +# settings. With Adam (lr=5e-4), the mAP is 29.1. +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) + +# learning policy +# Based on the default settings of modern detectors, we added warmup settings. +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 1000, + step=[18, 24]) # the real step is [18*5, 24*5] +runner = dict(max_epochs=28) # the real epoch is 28*5=140 + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (16 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/configs/mmdet/centernet/metafile.yml b/configs/mmdet/centernet/metafile.yml new file mode 100644 index 00000000..e86e57b5 --- /dev/null +++ b/configs/mmdet/centernet/metafile.yml @@ -0,0 +1,46 @@ +Collections: + - Name: CenterNet + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x TITANXP GPUs + Architecture: + - ResNet + Paper: + URL: https://arxiv.org/abs/1904.07850 + Title: 'Objects as Points' + README: configs/centernet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.13.0/mmdet/models/detectors/centernet.py#L10 + Version: v2.13.0 + +Models: + - Name: centernet_resnet18_dcnv2_140e_coco + In Collection: CenterNet + Config: configs/centernet/centernet_resnet18_dcnv2_140e_coco.py + Metadata: + Batch Size: 128 + Training Memory (GB): 3.47 + Epochs: 140 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 29.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/centernet/centernet_resnet18_dcnv2_140e_coco/centernet_resnet18_dcnv2_140e_coco_20210702_155131-c8cd631f.pth + + - Name: centernet_resnet18_140e_coco + In Collection: CenterNet + Config: 
configs/centernet/centernet_resnet18_140e_coco.py + Metadata: + Batch Size: 128 + Training Memory (GB): 3.45 + Epochs: 140 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 25.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/centernet/centernet_resnet18_140e_coco/centernet_resnet18_140e_coco_20210705_093630-bb5b3bf7.pth diff --git a/configs/mmdet/centripetalnet/README.md b/configs/mmdet/centripetalnet/README.md new file mode 100644 index 00000000..1a5a346b --- /dev/null +++ b/configs/mmdet/centripetalnet/README.md @@ -0,0 +1,36 @@ +# CentripetalNet + +> [CentripetalNet: Pursuing High-quality Keypoint Pairs for Object Detection](https://arxiv.org/abs/2003.09119) + + + +## Abstract + +Keypoint-based detectors have achieved pretty-well performance. However, incorrect keypoint matching is still widespread and greatly affects the performance of the detector. In this paper, we propose CentripetalNet which uses centripetal shift to pair corner keypoints from the same instance. CentripetalNet predicts the position and the centripetal shift of the corner points and matches corners whose shifted results are aligned. Combining position information, our approach matches corner points more accurately than the conventional embedding approaches do. Corner pooling extracts information inside the bounding boxes onto the border. To make this information more aware at the corners, we design a cross-star deformable convolution network to conduct feature adaption. Furthermore, we explore instance segmentation on anchor-free detectors by equipping our CentripetalNet with a mask prediction module. On MS-COCO test-dev, our CentripetalNet not only outperforms all existing anchor-free detectors with an AP of 48.0% but also achieves comparable performance to the state-of-the-art instance segmentation approaches with a 40.2% MaskAP. + +
+ +
+ +## Results and Models + +| Backbone | Batch Size | Step/Total Epochs | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :--------: |:----------------: | :------: | :------------: | :----: | :------: | :--------: | +| HourglassNet-104 | [16 x 6](./centripetalnet_hourglass104_mstest_16x6_210e_coco.py) | 190/210 | 16.7 | 3.7 | 44.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/centripetalnet/centripetalnet_hourglass104_mstest_16x6_210e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/centripetalnet/centripetalnet_hourglass104_mstest_16x6_210e_coco/centripetalnet_hourglass104_mstest_16x6_210e_coco_20200915_204804-3ccc61e5.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/centripetalnet/centripetalnet_hourglass104_mstest_16x6_210e_coco/centripetalnet_hourglass104_mstest_16x6_210e_coco_20200915_204804.log.json) | + +Note: + +- TTA setting is single-scale and `flip=True`. +- The model we released is the best checkpoint rather than the latest checkpoint (box AP 44.8 vs 44.6 in our experiment). 
+ +## Citation + +```latex +@InProceedings{Dong_2020_CVPR, +author = {Dong, Zhiwei and Li, Guoxuan and Liao, Yue and Wang, Fei and Ren, Pengju and Qian, Chen}, +title = {CentripetalNet: Pursuing High-Quality Keypoint Pairs for Object Detection}, +booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, +month = {June}, +year = {2020} +} +``` diff --git a/configs/mmdet/centripetalnet/centripetalnet_hourglass104_mstest_16x6_210e_coco.py b/configs/mmdet/centripetalnet/centripetalnet_hourglass104_mstest_16x6_210e_coco.py new file mode 100644 index 00000000..5281c5bf --- /dev/null +++ b/configs/mmdet/centripetalnet/centripetalnet_hourglass104_mstest_16x6_210e_coco.py @@ -0,0 +1,110 @@ +_base_ = [ + '../_base_/default_runtime.py', '../_base_/datasets/coco_detection.py' +] + +# model settings +model = dict( + type='CornerNet', + backbone=dict( + type='HourglassNet', + downsample_times=5, + num_stacks=2, + stage_channels=[256, 256, 384, 384, 384, 512], + stage_blocks=[2, 2, 2, 2, 2, 4], + norm_cfg=dict(type='BN', requires_grad=True)), + neck=None, + bbox_head=dict( + type='CentripetalHead', + num_classes=80, + in_channels=256, + num_feat_levels=2, + corner_emb_channels=0, + loss_heatmap=dict( + type='GaussianFocalLoss', alpha=2.0, gamma=4.0, loss_weight=1), + loss_offset=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1), + loss_guiding_shift=dict( + type='SmoothL1Loss', beta=1.0, loss_weight=0.05), + loss_centripetal_shift=dict( + type='SmoothL1Loss', beta=1.0, loss_weight=1)), + # training and testing settings + train_cfg=None, + test_cfg=dict( + corner_topk=100, + local_maximum_kernel=3, + distance_threshold=0.5, + score_thr=0.05, + max_per_img=100, + nms=dict(type='soft_nms', iou_threshold=0.5, method='gaussian'))) +# data settings +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + 
dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='RandomCenterCropPad', + crop_size=(511, 511), + ratios=(0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3), + test_mode=False, + test_pad_mode=None, + **img_norm_cfg), + dict(type='Resize', img_scale=(511, 511), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict( + type='MultiScaleFlipAug', + scale_factor=1.0, + flip=True, + transforms=[ + dict(type='Resize'), + dict( + type='RandomCenterCropPad', + crop_size=None, + ratios=None, + border=None, + test_mode=True, + test_pad_mode=['logical_or', 127], + **img_norm_cfg), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict( + type='Collect', + keys=['img'], + meta_keys=('filename', 'ori_shape', 'img_shape', 'pad_shape', + 'scale_factor', 'flip', 'img_norm_cfg', 'border')), + ]) +] +data = dict( + samples_per_gpu=6, + workers_per_gpu=3, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='Adam', lr=0.0005) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[190]) +runner = dict(type='EpochBasedRunner', max_epochs=210) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (16 GPUs) x (6 samples per GPU) +auto_scale_lr = dict(base_batch_size=96) diff --git a/configs/mmdet/centripetalnet/metafile.yml b/configs/mmdet/centripetalnet/metafile.yml new file mode 100644 index 00000000..61aed3e5 --- /dev/null +++ b/configs/mmdet/centripetalnet/metafile.yml @@ -0,0 +1,39 @@ +Collections: + - Name: CentripetalNet + Metadata: + Training Data: COCO + Training Techniques: + - Adam + Training Resources: 16x V100 GPUs + Architecture: + - Corner Pooling + - Stacked Hourglass Network + Paper: + URL: https://arxiv.org/abs/2003.09119 + Title: 'CentripetalNet: Pursuing High-quality Keypoint Pairs for Object Detection' + README: configs/centripetalnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.5.0/mmdet/models/detectors/cornernet.py#L9 + Version: v2.5.0 + +Models: + - Name: centripetalnet_hourglass104_mstest_16x6_210e_coco + In Collection: CentripetalNet + Config: configs/centripetalnet/centripetalnet_hourglass104_mstest_16x6_210e_coco.py + Metadata: + Batch Size: 96 + Training Memory (GB): 16.7 + inference time (ms/im): + - value: 270.27 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 210 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/centripetalnet/centripetalnet_hourglass104_mstest_16x6_210e_coco/centripetalnet_hourglass104_mstest_16x6_210e_coco_20200915_204804-3ccc61e5.pth diff --git a/configs/mmdet/cityscapes/README.md b/configs/mmdet/cityscapes/README.md new file mode 100644 index 00000000..7522ffe4 --- /dev/null +++ b/configs/mmdet/cityscapes/README.md @@ -0,0 +1,46 @@ +# Cityscapes + +> [The Cityscapes Dataset for Semantic Urban Scene Understanding](https://arxiv.org/abs/1604.01685) + + + +## Abstract + +Visual understanding of complex urban street scenes is an enabling factor for a wide range of applications. 
Object detection has benefited enormously from large-scale datasets, especially in the context of deep learning. For semantic urban scene understanding, however, no current dataset adequately captures the complexity of real-world urban scenes. +To address this, we introduce Cityscapes, a benchmark suite and large-scale dataset to train and test approaches for pixel-level and instance-level semantic labeling. Cityscapes is comprised of a large, diverse set of stereo video sequences recorded in streets from 50 different cities. 5000 of these images have high quality pixel-level annotations; 20000 additional images have coarse annotations to enable methods that leverage large volumes of weakly-labeled data. Crucially, our effort exceeds previous attempts in terms of dataset size, annotation richness, scene variability, and complexity. Our accompanying empirical study provides an in-depth analysis of the dataset characteristics, as well as a performance evaluation of several state-of-the-art approaches based on our benchmark. + +
+ +
+ +## Common settings + +- All baselines were trained using 8 GPUs with a batch size of 8 (1 image per GPU) using the [linear scaling rule](https://arxiv.org/abs/1706.02677) to scale the learning rate. +- All models were trained on `cityscapes_train`, and tested on `cityscapes_val`. +- The 1x training schedule indicates 64 epochs, which corresponds to slightly less than the 24k iterations reported in the original schedule from the [Mask R-CNN paper](https://arxiv.org/abs/1703.06870). +- COCO pre-trained weights are used for initialization. +- A conversion [script](../../tools/dataset_converters/cityscapes.py) is provided to convert Cityscapes into COCO format. Please refer to [1_exist_data_model.md](../../docs/1_exist_data_model.md#prepare-datasets) for details. +- `CityscapesDataset` implements three evaluation methods. `bbox` and `segm` are standard COCO bbox/mask AP. `cityscapes` is the official Cityscapes dataset evaluation, whose results may be slightly higher than COCO's. + +### Faster R-CNN + +| Backbone | Style | Lr schd | Scale | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :---: | :------: | :------------: | :----: | :------: | :--------: | +| R-50-FPN | pytorch | 1x | 800-1024 | 5.2 | - | 40.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cityscapes/faster_rcnn_r50_fpn_1x_cityscapes.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cityscapes/faster_rcnn_r50_fpn_1x_cityscapes_20200502-829424c0.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cityscapes/faster_rcnn_r50_fpn_1x_cityscapes_20200502_114915.log.json) | + +### Mask R-CNN + +| Backbone | Style | Lr schd | Scale | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------: | :------------: | :----: | :-----: | :------: | :------: | +| R-50-FPN | pytorch | 1x | 800-1024 | 5.3 | - | 40.9 | 36.4 | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cityscapes/mask_rcnn_r50_fpn_1x_cityscapes.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cityscapes/mask_rcnn_r50_fpn_1x_cityscapes/mask_rcnn_r50_fpn_1x_cityscapes_20201211_133733-d2858245.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cityscapes/mask_rcnn_r50_fpn_1x_cityscapes/mask_rcnn_r50_fpn_1x_cityscapes_20201211_133733.log.json) | + +## Citation + +```latex +@inproceedings{Cordts2016Cityscapes, + title={The Cityscapes Dataset for Semantic Urban Scene Understanding}, + author={Cordts, Marius and Omran, Mohamed and Ramos, Sebastian and Rehfeld, Timo and Enzweiler, Markus and Benenson, Rodrigo and Franke, Uwe and Roth, Stefan and Schiele, Bernt}, + booktitle={Proc. of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2016} +} +``` diff --git a/configs/mmdet/cityscapes/faster_rcnn_r50_fpn_1x_cityscapes.py b/configs/mmdet/cityscapes/faster_rcnn_r50_fpn_1x_cityscapes.py new file mode 100644 index 00000000..ca636bda --- /dev/null +++ b/configs/mmdet/cityscapes/faster_rcnn_r50_fpn_1x_cityscapes.py @@ -0,0 +1,44 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_fpn.py', + '../_base_/datasets/cityscapes_detection.py', + '../_base_/default_runtime.py' +] +model = dict( + backbone=dict(init_cfg=None), + roi_head=dict( + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=8, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)))) +# optimizer +# lr is set for a batch size of 8 +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + 
policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + # [7] yields higher performance than [6] + step=[7]) +runner = dict( + type='EpochBasedRunner', max_epochs=8) # actual epoch = 8 * 8 = 64 +log_config = dict(interval=100) +# For better, more stable performance initialize from COCO +load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth' # noqa + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (1 samples per GPU) +auto_scale_lr = dict(base_batch_size=8) diff --git a/configs/mmdet/cityscapes/mask_rcnn_r50_fpn_1x_cityscapes.py b/configs/mmdet/cityscapes/mask_rcnn_r50_fpn_1x_cityscapes.py new file mode 100644 index 00000000..83ea058d --- /dev/null +++ b/configs/mmdet/cityscapes/mask_rcnn_r50_fpn_1x_cityscapes.py @@ -0,0 +1,51 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/cityscapes_instance.py', '../_base_/default_runtime.py' +] +model = dict( + backbone=dict(init_cfg=None), + roi_head=dict( + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=8, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=8, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)))) +# optimizer +# lr is set for a batch size of 8 +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + 
warmup_iters=500, + warmup_ratio=0.001, + # [7] yields higher performance than [6] + step=[7]) +runner = dict( + type='EpochBasedRunner', max_epochs=8) # actual epoch = 8 * 8 = 64 +log_config = dict(interval=100) +# For better, more stable performance initialize from COCO +load_from = 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth' # noqa + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (1 samples per GPU) +auto_scale_lr = dict(base_batch_size=8) diff --git a/configs/mmdet/common/lsj_100e_coco_instance.py b/configs/mmdet/common/lsj_100e_coco_instance.py new file mode 100644 index 00000000..cacf23d7 --- /dev/null +++ b/configs/mmdet/common/lsj_100e_coco_instance.py @@ -0,0 +1,90 @@ +_base_ = '../_base_/default_runtime.py' +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +image_size = (1024, 1024) + +file_client_args = dict(backend='disk') +# uncomment the code below to use a different file client +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) + +train_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=image_size, + ratio_range=(0.1, 2.0), + multiscale_mode='range', + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size, + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=image_size), # padding to 
image_size leads 0.5+ mAP + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +# Use RepeatDataset to speed up training +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type='RepeatDataset', + times=4, # simply change this from 2 to 16 for 50e - 400e training. + dataset=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline)), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +evaluation = dict(interval=5, metric=['bbox', 'segm']) + +# optimizer assumes bs=64 +optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00004) +optimizer_config = dict(grad_clip=None) + +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.067, + step=[22, 24]) +runner = dict(type='EpochBasedRunner', max_epochs=25) diff --git a/configs/mmdet/common/mstrain-poly_3x_coco_instance.py b/configs/mmdet/common/mstrain-poly_3x_coco_instance.py new file mode 100644 index 00000000..c22ed945 --- /dev/null +++ b/configs/mmdet/common/mstrain-poly_3x_coco_instance.py @@ -0,0 +1,80 @@ +_base_ = '../_base_/default_runtime.py' +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' 
+img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)], +# multiscale_mode='range' +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +# Use RepeatDataset to speed up training +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type='RepeatDataset', + times=3, + dataset=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline)), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +evaluation = dict(interval=1, metric=['bbox', 'segm']) + +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) + +# learning policy +# Experiments show that using step=[9, 11] has higher performance +lr_config = dict( + 
policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[9, 11]) +runner = dict(type='EpochBasedRunner', max_epochs=12) diff --git a/configs/mmdet/common/mstrain_3x_coco.py b/configs/mmdet/common/mstrain_3x_coco.py new file mode 100644 index 00000000..80ec8b8d --- /dev/null +++ b/configs/mmdet/common/mstrain_3x_coco.py @@ -0,0 +1,76 @@ +_base_ = '../_base_/default_runtime.py' +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)], +# multiscale_mode='range' +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +# Use RepeatDataset to speed up training +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type='RepeatDataset', + times=3, + dataset=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline)), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, 
+ ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +evaluation = dict(interval=1, metric='bbox') + +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) + +# learning policy +# Experiments show that using step=[9, 11] has higher performance +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[9, 11]) +runner = dict(type='EpochBasedRunner', max_epochs=12) diff --git a/configs/mmdet/common/mstrain_3x_coco_instance.py b/configs/mmdet/common/mstrain_3x_coco_instance.py new file mode 100644 index 00000000..50f39bef --- /dev/null +++ b/configs/mmdet/common/mstrain_3x_coco_instance.py @@ -0,0 +1,76 @@ +_base_ = '../_base_/default_runtime.py' +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)], +# multiscale_mode='range' +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +# Use RepeatDataset to speed up training 
+data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type='RepeatDataset', + times=3, + dataset=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline)), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +evaluation = dict(interval=1, metric=['bbox', 'segm']) + +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) + +# learning policy +# Experiments show that using step=[9, 11] has higher performance +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[9, 11]) +runner = dict(type='EpochBasedRunner', max_epochs=12) diff --git a/configs/mmdet/common/ssj_270k_coco_instance.py b/configs/mmdet/common/ssj_270k_coco_instance.py new file mode 100644 index 00000000..851098f8 --- /dev/null +++ b/configs/mmdet/common/ssj_270k_coco_instance.py @@ -0,0 +1,91 @@ +_base_ = '../_base_/default_runtime.py' +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +image_size = (1024, 1024) + +file_client_args = dict(backend='disk') + +# Standard Scale Jittering (SSJ) resizes and crops an image +# with a resize range of 0.8 to 1.25 of the original image size. 
+train_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=image_size, + ratio_range=(0.8, 1.25), + multiscale_mode='range', + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size, + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=image_size), # padding to image_size leads 0.5+ mAP + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) + +evaluation = dict(interval=6000, metric=['bbox', 'segm']) + +# optimizer assumes batch_size = (32 GPUs) x (2 samples per GPU) +optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00004) +optimizer_config = dict(grad_clip=None) + +# lr steps at [0.9, 0.95, 0.975] of the maximum 
iterations +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[243000, 256500, 263250]) +checkpoint_config = dict(interval=6000) +# The model is trained for 270k iterations with batch_size 64, +# which is roughly equivalent to 144 epochs. +runner = dict(type='IterBasedRunner', max_iters=270000) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/mmdet/common/ssj_scp_270k_coco_instance.py b/configs/mmdet/common/ssj_scp_270k_coco_instance.py new file mode 100644 index 00000000..540839ff --- /dev/null +++ b/configs/mmdet/common/ssj_scp_270k_coco_instance.py @@ -0,0 +1,97 @@ +_base_ = '../_base_/default_runtime.py' +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +image_size = (1024, 1024) + +file_client_args = dict(backend='disk') + +# Standard Scale Jittering (SSJ) resizes and crops an image +# with a resize range of 0.8 to 1.25 of the original image size. 
+load_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=image_size, + ratio_range=(0.8, 1.25), + multiscale_mode='range', + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size, + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Pad', size=image_size), +] +train_pipeline = [ + dict(type='CopyPaste', max_num_pasted=100), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type='MultiImageMixDataset', + dataset=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=load_pipeline), + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) + +evaluation = dict(interval=6000, metric=['bbox', 'segm']) + +# optimizer assumes batch_size = (32 GPUs) x (2 samples per GPU) +optimizer = dict(type='SGD', lr=0.1, momentum=0.9, 
weight_decay=0.00004) +optimizer_config = dict(grad_clip=None) + +# lr steps at [0.9, 0.95, 0.975] of the maximum iterations +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[243000, 256500, 263250]) +checkpoint_config = dict(interval=6000) +# The model is trained for 270k iterations with batch_size 64, +# which is roughly equivalent to 144 epochs. +runner = dict(type='IterBasedRunner', max_iters=270000) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/mmdet/cornernet/README.md b/configs/mmdet/cornernet/README.md new file mode 100644 index 00000000..55877c4c --- /dev/null +++ b/configs/mmdet/cornernet/README.md @@ -0,0 +1,43 @@ +# CornerNet + +> [Cornernet: Detecting objects as paired keypoints](https://arxiv.org/abs/1808.01244) + + + +## Abstract + +We propose CornerNet, a new approach to object detection where we detect an object bounding box as a pair of keypoints, the top-left corner and the bottom-right corner, using a single convolution neural network. By detecting objects as paired keypoints, we eliminate the need for designing a set of anchor boxes commonly used in prior single-stage detectors. In addition to our novel formulation, we introduce corner pooling, a new type of pooling layer that helps the network better localize corners. Experiments show that CornerNet achieves a 42.2% AP on MS COCO, outperforming all existing one-stage detectors. + +
+ +
+ +## Results and Models + +| Backbone | Batch Size | Step/Total Epochs | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :--------: |:----------------: | :------: | :------------: | :----: | :------: | :--------: | +| HourglassNet-104 | [10 x 5](./cornernet_hourglass104_mstest_10x5_210e_coco.py) | 180/210 | 13.9 | 4.2 | 41.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco/cornernet_hourglass104_mstest_10x5_210e_coco_20200824_185720-5fefbf1c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco/cornernet_hourglass104_mstest_10x5_210e_coco_20200824_185720.log.json) | +| HourglassNet-104 | [8 x 6](./cornernet_hourglass104_mstest_8x6_210e_coco.py) | 180/210 | 15.9 | 4.2 | 41.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cornernet/cornernet_hourglass104_mstest_8x6_210e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_8x6_210e_coco/cornernet_hourglass104_mstest_8x6_210e_coco_20200825_150618-79b44c30.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_8x6_210e_coco/cornernet_hourglass104_mstest_8x6_210e_coco_20200825_150618.log.json) | +| HourglassNet-104 | [32 x 3](./cornernet_hourglass104_mstest_32x3_210e_coco.py) | 180/210 | 9.5 | 3.9 | 40.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cornernet/cornernet_hourglass104_mstest_32x3_210e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_32x3_210e_coco/cornernet_hourglass104_mstest_32x3_210e_coco_20200819_203110-1efaea91.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_32x3_210e_coco/cornernet_hourglass104_mstest_32x3_210e_coco_20200819_203110.log.json) | + +Note: + +- TTA setting is single-scale and `flip=True`. +- Experiments with `images_per_gpu=6` are conducted on Tesla V100-SXM2-32GB; those with `images_per_gpu=3` are conducted on GeForce GTX 1080 Ti. +- Here are the descriptions of each experiment setting: + - 10 x 5: 10 GPUs with 5 images per GPU. This is the same setting as that reported in the original paper. + - 8 x 6: 8 GPUs with 6 images per GPU. The total batch size is similar to that of the paper, and only 1 node is needed to train. + - 32 x 3: 32 GPUs with 3 images per GPU. This is the default setting for the 1080 Ti and needs 4 nodes to train. + +## Citation + +```latex +@inproceedings{law2018cornernet, + title={Cornernet: Detecting objects as paired keypoints}, + author={Law, Hei and Deng, Jia}, + booktitle={15th European Conference on Computer Vision, ECCV 2018}, + pages={765--781}, + year={2018}, + organization={Springer Verlag} +} +``` diff --git a/configs/mmdet/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco.py b/configs/mmdet/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco.py new file mode 100644 index 00000000..6cb05a78 --- /dev/null +++ b/configs/mmdet/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco.py @@ -0,0 +1,110 @@ +_base_ = [ + '../_base_/default_runtime.py', '../_base_/datasets/coco_detection.py' +] + +# model settings +model = dict( + type='CornerNet', + backbone=dict( + type='HourglassNet', + downsample_times=5, + num_stacks=2, + stage_channels=[256, 256, 384, 384, 384, 512], + stage_blocks=[2, 2, 2, 2, 2, 4], + norm_cfg=dict(type='BN', requires_grad=True)), + neck=None, + bbox_head=dict( + type='CornerHead', + num_classes=80, + in_channels=256, + num_feat_levels=2, + corner_emb_channels=1, + loss_heatmap=dict( + type='GaussianFocalLoss', alpha=2.0, gamma=4.0, loss_weight=1), + loss_embedding=dict( + 
type='AssociativeEmbeddingLoss', + pull_weight=0.10, + push_weight=0.10), + loss_offset=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1)), + # training and testing settings + train_cfg=None, + test_cfg=dict( + corner_topk=100, + local_maximum_kernel=3, + distance_threshold=0.5, + score_thr=0.05, + max_per_img=100, + nms=dict(type='soft_nms', iou_threshold=0.5, method='gaussian'))) +# data settings +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='RandomCenterCropPad', + crop_size=(511, 511), + ratios=(0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3), + test_mode=False, + test_pad_mode=None, + **img_norm_cfg), + dict(type='Resize', img_scale=(511, 511), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict( + type='MultiScaleFlipAug', + scale_factor=1.0, + flip=True, + transforms=[ + dict(type='Resize'), + dict( + type='RandomCenterCropPad', + crop_size=None, + ratios=None, + border=None, + test_mode=True, + test_pad_mode=['logical_or', 127], + **img_norm_cfg), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict( + type='Collect', + keys=['img'], + meta_keys=('filename', 'ori_shape', 'img_shape', 'pad_shape', + 'scale_factor', 'flip', 'img_norm_cfg', 'border')), + ]) +] +data = dict( + samples_per_gpu=5, + workers_per_gpu=3, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = 
dict(type='Adam', lr=0.0005) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[180]) +runner = dict(type='EpochBasedRunner', max_epochs=210) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (10 GPUs) x (5 samples per GPU) +auto_scale_lr = dict(base_batch_size=50) diff --git a/configs/mmdet/cornernet/cornernet_hourglass104_mstest_32x3_210e_coco.py b/configs/mmdet/cornernet/cornernet_hourglass104_mstest_32x3_210e_coco.py new file mode 100644 index 00000000..f539cdb8 --- /dev/null +++ b/configs/mmdet/cornernet/cornernet_hourglass104_mstest_32x3_210e_coco.py @@ -0,0 +1,110 @@ +_base_ = [ + '../_base_/default_runtime.py', '../_base_/datasets/coco_detection.py' +] + +# model settings +model = dict( + type='CornerNet', + backbone=dict( + type='HourglassNet', + downsample_times=5, + num_stacks=2, + stage_channels=[256, 256, 384, 384, 384, 512], + stage_blocks=[2, 2, 2, 2, 2, 4], + norm_cfg=dict(type='BN', requires_grad=True)), + neck=None, + bbox_head=dict( + type='CornerHead', + num_classes=80, + in_channels=256, + num_feat_levels=2, + corner_emb_channels=1, + loss_heatmap=dict( + type='GaussianFocalLoss', alpha=2.0, gamma=4.0, loss_weight=1), + loss_embedding=dict( + type='AssociativeEmbeddingLoss', + pull_weight=0.10, + push_weight=0.10), + loss_offset=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1)), + # training and testing settings + train_cfg=None, + test_cfg=dict( + corner_topk=100, + local_maximum_kernel=3, + distance_threshold=0.5, + score_thr=0.05, + max_per_img=100, + nms=dict(type='soft_nms', iou_threshold=0.5, method='gaussian'))) +# data settings +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', 
with_bbox=True), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='RandomCenterCropPad', + crop_size=(511, 511), + ratios=(0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3), + test_mode=False, + test_pad_mode=None, + **img_norm_cfg), + dict(type='Resize', img_scale=(511, 511), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict( + type='MultiScaleFlipAug', + scale_factor=1.0, + flip=True, + transforms=[ + dict(type='Resize'), + dict( + type='RandomCenterCropPad', + crop_size=None, + ratios=None, + border=None, + test_mode=True, + test_pad_mode=['logical_or', 127], + **img_norm_cfg), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict( + type='Collect', + keys=['img'], + meta_keys=('filename', 'ori_shape', 'img_shape', 'pad_shape', + 'scale_factor', 'flip', 'img_norm_cfg', 'border')), + ]) +] +data = dict( + samples_per_gpu=3, + workers_per_gpu=3, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='Adam', lr=0.0005) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[180]) +runner = dict(type='EpochBasedRunner', max_epochs=210) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (32 GPUs) x (3 samples per GPU) +auto_scale_lr = dict(base_batch_size=96) diff --git a/configs/mmdet/cornernet/cornernet_hourglass104_mstest_8x6_210e_coco.py b/configs/mmdet/cornernet/cornernet_hourglass104_mstest_8x6_210e_coco.py new file mode 100644 index 00000000..9b115d78 --- /dev/null +++ b/configs/mmdet/cornernet/cornernet_hourglass104_mstest_8x6_210e_coco.py @@ -0,0 +1,110 @@ +_base_ = [ + '../_base_/default_runtime.py', '../_base_/datasets/coco_detection.py' +] + +# model settings +model = dict( + type='CornerNet', + backbone=dict( + type='HourglassNet', + downsample_times=5, + num_stacks=2, + stage_channels=[256, 256, 384, 384, 384, 512], + stage_blocks=[2, 2, 2, 2, 2, 4], + norm_cfg=dict(type='BN', requires_grad=True)), + neck=None, + bbox_head=dict( + type='CornerHead', + num_classes=80, + in_channels=256, + num_feat_levels=2, + corner_emb_channels=1, + loss_heatmap=dict( + type='GaussianFocalLoss', alpha=2.0, gamma=4.0, loss_weight=1), + loss_embedding=dict( + type='AssociativeEmbeddingLoss', + pull_weight=0.10, + push_weight=0.10), + loss_offset=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1)), + # training and testing settings + train_cfg=None, + test_cfg=dict( + corner_topk=100, + local_maximum_kernel=3, + distance_threshold=0.5, + score_thr=0.05, + max_per_img=100, + nms=dict(type='soft_nms', iou_threshold=0.5, method='gaussian'))) +# data settings +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='RandomCenterCropPad', + crop_size=(511, 511), + ratios=(0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3), + test_mode=False, + test_pad_mode=None, + **img_norm_cfg), + dict(type='Resize', img_scale=(511, 511), 
keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict( + type='MultiScaleFlipAug', + scale_factor=1.0, + flip=True, + transforms=[ + dict(type='Resize'), + dict( + type='RandomCenterCropPad', + crop_size=None, + ratios=None, + border=None, + test_mode=True, + test_pad_mode=['logical_or', 127], + **img_norm_cfg), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict( + type='Collect', + keys=['img'], + meta_keys=('filename', 'ori_shape', 'img_shape', 'pad_shape', + 'scale_factor', 'flip', 'img_norm_cfg', 'border')), + ]) +] +data = dict( + samples_per_gpu=6, + workers_per_gpu=3, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='Adam', lr=0.0005) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[180]) +runner = dict(type='EpochBasedRunner', max_epochs=210) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (6 samples per GPU) +auto_scale_lr = dict(base_batch_size=48) diff --git a/configs/mmdet/cornernet/metafile.yml b/configs/mmdet/cornernet/metafile.yml new file mode 100644 index 00000000..c2f6143a --- /dev/null +++ b/configs/mmdet/cornernet/metafile.yml @@ -0,0 +1,83 @@ +Collections: + - Name: CornerNet + Metadata: + Training Data: COCO + Training Techniques: + - Adam + Training Resources: 8x V100 GPUs + Architecture: + - Corner Pooling + - Stacked Hourglass Network + Paper: + URL: https://arxiv.org/abs/1808.01244 + Title: 'CornerNet: Detecting Objects as Paired Keypoints' + README: configs/cornernet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.3.0/mmdet/models/detectors/cornernet.py#L9 + Version: v2.3.0 + +Models: + - Name: cornernet_hourglass104_mstest_10x5_210e_coco + In Collection: CornerNet + Config: configs/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco.py + Metadata: + Training Resources: 10x V100 GPUs + Batch Size: 50 + Training Memory (GB): 13.9 + inference time (ms/im): + - value: 238.1 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 210 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco/cornernet_hourglass104_mstest_10x5_210e_coco_20200824_185720-5fefbf1c.pth + + - Name: cornernet_hourglass104_mstest_8x6_210e_coco + In Collection: CornerNet + Config: configs/cornernet/cornernet_hourglass104_mstest_8x6_210e_coco.py + Metadata: + Batch Size: 48 + Training Memory (GB): 15.9 + inference time (ms/im): + - value: 238.1 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 210 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_8x6_210e_coco/cornernet_hourglass104_mstest_8x6_210e_coco_20200825_150618-79b44c30.pth + + - Name: cornernet_hourglass104_mstest_32x3_210e_coco + In Collection: CornerNet + Config: configs/cornernet/cornernet_hourglass104_mstest_32x3_210e_coco.py + Metadata: + Training Resources: 32x V100 GPUs + Batch Size: 96 + Training Memory (GB): 9.5 + inference time (ms/im): + - value: 256.41 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 210 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_32x3_210e_coco/cornernet_hourglass104_mstest_32x3_210e_coco_20200819_203110-1efaea91.pth diff --git a/configs/mmdet/dcn/README.md b/configs/mmdet/dcn/README.md new file mode 100644 index 00000000..7866078a --- /dev/null +++ b/configs/mmdet/dcn/README.md @@ -0,0 +1,48 @@ +# DCN + +> [Deformable Convolutional Networks](https://arxiv.org/abs/1703.06211) + + + +## Abstract + +Convolutional neural networks (CNNs) are inherently limited to model geometric transformations due to the fixed geometric structures in its building modules. In this work, we introduce two new modules to enhance the transformation modeling capacity of CNNs, namely, deformable convolution and deformable RoI pooling. Both are based on the idea of augmenting the spatial sampling locations in the modules with additional offsets and learning the offsets from target tasks, without additional supervision. The new modules can readily replace their plain counterparts in existing CNNs and can be easily trained end-to-end by standard back-propagation, giving rise to deformable convolutional networks. Extensive experiments validate the effectiveness of our approach on sophisticated vision tasks of object detection and semantic segmentation. + +
+ +
+ +## Results and Models + +| Backbone | Model | Style | Conv | Pool | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +|:----------------:|:------------:|:-------:|:-------------:|:------:|:-------:|:--------:|:--------------:|:------:|:-------:|:------:|:--------:| +| R-50-FPN | Faster | pytorch | dconv(c3-c5) | - | 1x | 4.0 | 17.8 | 41.3 | | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130-d68aed1e.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130_212941.log.json) | +| R-50-FPN | Faster | pytorch | - | dpool | 1x | 5.0 | 17.2 | 38.9 | | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/faster_rcnn_r50_fpn_dpool_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_dpool_1x_coco/faster_rcnn_r50_fpn_dpool_1x_coco_20200307-90d3c01d.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_dpool_1x_coco/faster_rcnn_r50_fpn_dpool_1x_coco_20200307_203250.log.json) | +| R-101-FPN | Faster | pytorch | dconv(c3-c5) | - | 1x | 6.0 | 12.5 | 42.7 | | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200203-1377f13d.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200203_230019.log.json) | +| X-101-32x4d-FPN | Faster | pytorch | dconv(c3-c5) | - | 1x | 7.3 | 10.0 | 44.5 | | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco_20200203-4f85c69c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco_20200203_001325.log.json) | +| R-50-FPN | Mask | pytorch | dconv(c3-c5) | - | 1x | 4.5 | 15.4 | 41.8 | 37.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200203-4d9ad43b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200203_061339.log.json) | +| R-101-FPN | Mask | pytorch | dconv(c3-c5) | - | 1x | 6.5 | 11.7 | 43.5 | 38.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200216-a71f5bce.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200216_191601.log.json) | +| R-50-FPN | Cascade | pytorch | dconv(c3-c5) | - | 1x | 4.5 | 14.6 | 43.8 | | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130-2f1fca44.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130_220843.log.json) | +| R-101-FPN | Cascade | pytorch | dconv(c3-c5) | - | 1x | 6.4 | 11.0 | 45.0 | | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200203-3b2f0594.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200203_224829.log.json) | +| R-50-FPN | Cascade Mask | pytorch | dconv(c3-c5) | - | 1x | 6.0 | 10.0 | 44.4 | 38.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200202-42e767a2.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200202_010309.log.json) | +| R-101-FPN | Cascade Mask | pytorch | dconv(c3-c5) | - | 1x | 8.0 | 8.6 | 45.8 | 39.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200204-df0c5f10.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200204_134006.log.json) | +| X-101-32x4d-FPN | Cascade Mask | pytorch | dconv(c3-c5) | - | 1x | 9.2 | | 47.3 | 41.1 | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco-e75f90c8.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco-20200606_183737.log.json) | +| R-50-FPN (FP16) | Mask | pytorch | dconv(c3-c5) | - | 1x | 3.0 | | 41.9 | 37.5 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fp16/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco_20210520_180247-c06429d2.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco_20210520_180247.log.json) | + +**Notes:** + +- `dconv` denotes deformable convolution, `c3-c5` means adding dconv in resnet stage 3 to 5. `dpool` denotes deformable roi pooling. +- The dcn ops are modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch, which should be more memory efficient and slightly faster. +- (*) For R-50-FPN (dg=4), dg is short for deformable_group. This model is trained and tested on Amazon EC2 p3dn.24xlarge instance. 
+- **Memory and Train/Inf time are outdated.** + +## Citation + +```latex +@inproceedings{dai2017deformable, + title={Deformable Convolutional Networks}, + author={Dai, Jifeng and Qi, Haozhi and Xiong, Yuwen and Li, Yi and Zhang, Guodong and Hu, Han and Wei, Yichen}, + booktitle={Proceedings of the IEEE international conference on computer vision}, + year={2017} +} +``` diff --git a/configs/mmdet/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py b/configs/mmdet/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py new file mode 100644 index 00000000..081b998f --- /dev/null +++ b/configs/mmdet/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/configs/mmdet/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py b/configs/mmdet/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py new file mode 100644 index 00000000..3b3683af --- /dev/null +++ b/configs/mmdet/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/configs/mmdet/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py b/configs/mmdet/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py new file mode 100644 index 00000000..daaa4729 --- /dev/null +++ b/configs/mmdet/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/configs/mmdet/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py 
b/configs/mmdet/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py new file mode 100644 index 00000000..a01df33c --- /dev/null +++ b/configs/mmdet/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/configs/mmdet/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py b/configs/mmdet/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py new file mode 100644 index 00000000..aa664bd6 --- /dev/null +++ b/configs/mmdet/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/configs/mmdet/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py b/configs/mmdet/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py new file mode 100644 index 00000000..f5fee7e1 --- /dev/null +++ b/configs/mmdet/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../faster_rcnn/faster_rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/configs/mmdet/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py b/configs/mmdet/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py new file mode 100644 index 00000000..8787088f --- /dev/null +++ b/configs/mmdet/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/configs/mmdet/dcn/faster_rcnn_r50_fpn_dpool_1x_coco.py b/configs/mmdet/dcn/faster_rcnn_r50_fpn_dpool_1x_coco.py new 
file mode 100644 index 00000000..1b695f0e --- /dev/null +++ b/configs/mmdet/dcn/faster_rcnn_r50_fpn_dpool_1x_coco.py @@ -0,0 +1,12 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + roi_head=dict( + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + _delete_=True, + type='DeformRoIPoolPack', + output_size=7, + output_channels=256), + out_channels=256, + featmap_strides=[4, 8, 16, 32]))) diff --git a/configs/mmdet/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py b/configs/mmdet/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py new file mode 100644 index 00000000..e3bea195 --- /dev/null +++ b/configs/mmdet/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py @@ -0,0 +1,16 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py b/configs/mmdet/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py new file mode 100644 index 00000000..cb340022 --- /dev/null +++ b/configs/mmdet/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/configs/mmdet/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py b/configs/mmdet/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py new file mode 100644 index 00000000..ababe58d --- /dev/null +++ b/configs/mmdet/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = 
'../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/configs/mmdet/dcn/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco.py b/configs/mmdet/dcn/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco.py new file mode 100644 index 00000000..ee5cca7d --- /dev/null +++ b/configs/mmdet/dcn/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) + +fp16 = dict(loss_scale=512.) diff --git a/configs/mmdet/dcn/metafile.yml b/configs/mmdet/dcn/metafile.yml new file mode 100644 index 00000000..36f38871 --- /dev/null +++ b/configs/mmdet/dcn/metafile.yml @@ -0,0 +1,272 @@ +Collections: + - Name: Deformable Convolutional Networks + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Deformable Convolution + Paper: + URL: https://arxiv.org/abs/1703.06211 + Title: "Deformable Convolutional Networks" + README: configs/dcn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/ops/dcn/deform_conv.py#L15 + Version: v2.0.0 + +Models: + - Name: faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 4.0 + inference time (ms/im): + - value: 56.18 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130-d68aed1e.pth + + - Name: 
faster_rcnn_r50_fpn_dpool_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/faster_rcnn_r50_fpn_dpool_1x_coco.py + Metadata: + Training Memory (GB): 5.0 + inference time (ms/im): + - value: 58.14 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_dpool_1x_coco/faster_rcnn_r50_fpn_dpool_1x_coco_20200307-90d3c01d.pth + + - Name: faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 6.0 + inference time (ms/im): + - value: 80 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200203-1377f13d.pth + + - Name: faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 7.3 + inference time (ms/im): + - value: 100 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco_20200203-4f85c69c.pth + + - Name: mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 4.5 + 
inference time (ms/im): + - value: 64.94 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200203-4d9ad43b.pth + + - Name: mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco.py + Metadata: + Training Techniques: + - SGD with Momentum + - Weight Decay + - Mixed Precision Training + Training Memory (GB): 3.0 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco_20210520_180247-c06429d2.pth + + - Name: mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 6.5 + inference time (ms/im): + - value: 85.47 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200216-a71f5bce.pth + + - Name: cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 4.5 + 
inference time (ms/im): + - value: 68.49 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130-2f1fca44.pth + + - Name: cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 6.4 + inference time (ms/im): + - value: 90.91 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200203-3b2f0594.pth + + - Name: cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 6.0 + inference time (ms/im): + - value: 100 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200202-42e767a2.pth + + - Name: cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 8.0 + inference time (ms/im): + - value: 116.28 + hardware: V100 + backend: PyTorch 
+ batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200204-df0c5f10.pth + + - Name: cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 9.2 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco-e75f90c8.pth diff --git a/configs/mmdet/dcnv2/README.md b/configs/mmdet/dcnv2/README.md new file mode 100644 index 00000000..1e7e3201 --- /dev/null +++ b/configs/mmdet/dcnv2/README.md @@ -0,0 +1,37 @@ +# DCNv2 + +> [Deformable ConvNets v2: More Deformable, Better Results](https://arxiv.org/abs/1811.11168) + + + +## Abstract + +The superior performance of Deformable Convolutional Networks arises from its ability to adapt to the geometric variations of objects. Through an examination of its adaptive behavior, we observe that while the spatial support for its neural features conforms more closely than regular ConvNets to object structure, this support may nevertheless extend well beyond the region of interest, causing features to be influenced by irrelevant image content. To address this problem, we present a reformulation of Deformable ConvNets that improves its ability to focus on pertinent image regions, through increased modeling power and stronger training. 
The modeling power is enhanced through a more comprehensive integration of deformable convolution within the network, and by introducing a modulation mechanism that expands the scope of deformation modeling. To effectively harness this enriched modeling capability, we guide network training via a proposed feature mimicking scheme that helps the network to learn features that reflect the object focus and classification power of RCNN features. With the proposed contributions, this new version of Deformable ConvNets yields significant performance gains over the original model and produces leading results on the COCO benchmark for object detection and instance segmentation. + +## Results and Models + +| Backbone | Model | Style | Conv | Pool | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +|:----------------:|:------------:|:-------:|:-------------:|:------:|:-------:|:--------:|:--------------:|:------:|:-------:|:------:|:--------:| +| R-50-FPN | Faster | pytorch | mdconv(c3-c5) | - | 1x | 4.1 | 17.6 | 41.4 | | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcnv2/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco_20200130-d099253b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco_20200130_222144.log.json) | +| *R-50-FPN (dg=4) | Faster | pytorch | mdconv(c3-c5) | - | 1x | 4.2 | 17.4 | 41.5 | | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcnv2/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco_20200130-01262257.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco_20200130_222058.log.json) | +| R-50-FPN | Faster | pytorch | - | mdpool | 1x | 5.8 | 16.6 | 38.7 | | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcnv2/faster_rcnn_r50_fpn_mdpool_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdpool_1x_coco/faster_rcnn_r50_fpn_mdpool_1x_coco_20200307-c0df27ff.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdpool_1x_coco/faster_rcnn_r50_fpn_mdpool_1x_coco_20200307_203304.log.json) | +| R-50-FPN | Mask | pytorch | mdconv(c3-c5) | - | 1x | 4.5 | 15.1 | 41.5 | 37.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcnv2/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco_20200203-ad97591f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco_20200203_063443.log.json) | +| R-50-FPN (FP16) | Mask | pytorch | mdconv(c3-c5)| - | 1x | 3.1 | | 42.0 | 37.6 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fp16/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco_20210520_180434-cf8fefa5.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco_20210520_180434.log.json) | + +**Notes:** + +- `mdconv` denotes modulated deformable convolution, `c3-c5` means adding dconv in resnet stage 3 to 5. `mdpool` denotes modulated deformable roi pooling. 
+- The dcn ops are modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch, which should be more memory efficient and slightly faster. +- (*) For R-50-FPN (dg=4), dg is short for deformable_group. This model is trained and tested on Amazon EC2 p3dn.24xlarge instance. +- **Memory, Train/Inf time is outdated.** + +## Citation + +```latex +@article{zhu2018deformable, + title={Deformable ConvNets v2: More Deformable, Better Results}, + author={Zhu, Xizhou and Hu, Han and Lin, Stephen and Dai, Jifeng}, + journal={arXiv preprint arXiv:1811.11168}, + year={2018} +} +``` diff --git a/configs/mmdet/dcnv2/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py b/configs/mmdet/dcnv2/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py new file mode 100644 index 00000000..d1bcf3c1 --- /dev/null +++ b/configs/mmdet/dcnv2/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/configs/mmdet/dcnv2/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco.py b/configs/mmdet/dcnv2/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco.py new file mode 100644 index 00000000..d0ab89c2 --- /dev/null +++ b/configs/mmdet/dcnv2/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCNv2', deform_groups=4, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/configs/mmdet/dcnv2/faster_rcnn_r50_fpn_mdpool_1x_coco.py b/configs/mmdet/dcnv2/faster_rcnn_r50_fpn_mdpool_1x_coco.py new file mode 100644 index 00000000..ad7b0346 --- /dev/null +++ b/configs/mmdet/dcnv2/faster_rcnn_r50_fpn_mdpool_1x_coco.py @@ -0,0 +1,12 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + roi_head=dict( + bbox_roi_extractor=dict( + 
type='SingleRoIExtractor', + roi_layer=dict( + _delete_=True, + type='ModulatedDeformRoIPoolPack', + output_size=7, + output_channels=256), + out_channels=256, + featmap_strides=[4, 8, 16, 32]))) diff --git a/configs/mmdet/dcnv2/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco.py b/configs/mmdet/dcnv2/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco.py new file mode 100644 index 00000000..7e21454b --- /dev/null +++ b/configs/mmdet/dcnv2/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) + +fp16 = dict(loss_scale=512.) diff --git a/configs/mmdet/dcnv2/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py b/configs/mmdet/dcnv2/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py new file mode 100644 index 00000000..5ca2a67c --- /dev/null +++ b/configs/mmdet/dcnv2/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/configs/mmdet/dcnv2/metafile.yml b/configs/mmdet/dcnv2/metafile.yml new file mode 100644 index 00000000..90494215 --- /dev/null +++ b/configs/mmdet/dcnv2/metafile.yml @@ -0,0 +1,123 @@ +Collections: + - Name: Deformable Convolutional Networks v2 + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Deformable Convolution + Paper: + URL: https://arxiv.org/abs/1811.11168 + Title: "Deformable ConvNets v2: More Deformable, Better Results" + README: configs/dcnv2/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/ops/dcn/deform_conv.py#L15 + Version: v2.0.0 + +Models: + - Name: faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco + In Collection: 
Deformable Convolutional Networks v2 + Config: configs/dcnv2/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 4.1 + inference time (ms/im): + - value: 56.82 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco_20200130-d099253b.pth + + - Name: faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco + In Collection: Deformable Convolutional Networks v2 + Config: configs/dcnv2/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco.py + Metadata: + Training Memory (GB): 4.2 + inference time (ms/im): + - value: 57.47 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco_20200130-01262257.pth + + - Name: faster_rcnn_r50_fpn_mdpool_1x_coco + In Collection: Deformable Convolutional Networks v2 + Config: configs/dcnv2/faster_rcnn_r50_fpn_mdpool_1x_coco.py + Metadata: + Training Memory (GB): 5.8 + inference time (ms/im): + - value: 60.24 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdpool_1x_coco/faster_rcnn_r50_fpn_mdpool_1x_coco_20200307-c0df27ff.pth + + - Name: mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco + In Collection: Deformable Convolutional Networks v2 + Config: configs/dcnv2/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 4.5 + inference time (ms/im): + -
value: 66.23 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco_20200203-ad97591f.pth + + - Name: mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco + In Collection: Deformable Convolutional Networks v2 + Config: configs/dcnv2/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 3.1 + Training Techniques: + - SGD with Momentum + - Weight Decay + - Mixed Precision Training + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco_20210520_180434-cf8fefa5.pth diff --git a/configs/mmdet/deepfashion/README.md b/configs/mmdet/deepfashion/README.md new file mode 100644 index 00000000..dd4f012b --- /dev/null +++ b/configs/mmdet/deepfashion/README.md @@ -0,0 +1,70 @@ +# DeepFashion + +> [DeepFashion: Powering Robust Clothes Recognition and Retrieval With Rich Annotations](https://openaccess.thecvf.com/content_cvpr_2016/html/Liu_DeepFashion_Powering_Robust_CVPR_2016_paper.html) + + + +## Abstract + +Recent advances in clothes recognition have been driven by the construction of clothes datasets. Existing datasets are limited in the amount of annotations and are difficult to cope with the various challenges in real-world applications. In this work, we introduce DeepFashion, a large-scale clothes dataset with comprehensive annotations.
It contains over 800,000 images, which are richly annotated with massive attributes, clothing landmarks, and correspondence of images taken under different scenarios including store, street snapshot, and consumer. Such rich annotations enable the development of powerful algorithms in clothes recognition and facilitating future researches. To demonstrate the advantages of DeepFashion, we propose a new deep model, namely FashionNet, which learns clothing features by jointly predicting clothing attributes and landmarks. The estimated landmarks are then employed to pool or gate the learned features. It is optimized in an iterative manner. Extensive experiments demonstrate the effectiveness of FashionNet and the usefulness of DeepFashion. + +
+ +
+ +## Introduction + +[MMFashion](https://github.com/open-mmlab/mmfashion) develops "fashion parsing and segmentation" module +based on the dataset +[DeepFashion-Inshop](https://drive.google.com/drive/folders/0B7EVK8r0v71pVDZFQXRsMDZCX1E?usp=sharing). +Its annotation follows COCO style. +To use it, you need to first download the data. Note that we only use "img_highres" in this task. +The file tree should be like this: + +```sh +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── DeepFashion +│ │ ├── In-shop +│ │ ├── Anno +│ │ │   ├── segmentation +│ │ │   | ├── DeepFashion_segmentation_train.json +│ │ │   | ├── DeepFashion_segmentation_query.json +│ │ │   | ├── DeepFashion_segmentation_gallery.json +│ │ │   ├── list_bbox_inshop.txt +│ │ │   ├── list_description_inshop.json +│ │ │   ├── list_item_inshop.txt +│ │ │   └── list_landmarks_inshop.txt +│ │ ├── Eval +│ │ │ └── list_eval_partition.txt +│ │ ├── Img +│ │ │ ├── img +│ │ │ │ ├──XXX.jpg +│ │ │ ├── img_highres +│ │ │ └── ├──XXX.jpg + +``` + +After that you can train the Mask RCNN r50 on DeepFashion-In-shop dataset by launching training with the `mask_rcnn_r50_fpn_1x.py` config +or creating your own config file. 
+ +## Results and Models + +| Backbone | Model type | Dataset | bbox detection Average Precision | segmentation Average Precision | Config | Download (Google) | +| :---------: | :----------: | :-----------------: | :--------------------------------: | :----------------------------: | :---------:| :-------------------------: | +| ResNet50 | Mask RCNN | DeepFashion-In-shop | 0.599 | 0.584 |[config](https://github.com/open-mmlab/mmdetection/blob/master/configs/deepfashion/mask_rcnn_r50_fpn_15e_deepfashion.py)| [model](https://download.openmmlab.com/mmdetection/v2.0/deepfashion/mask_rcnn_r50_fpn_15e_deepfashion/mask_rcnn_r50_fpn_15e_deepfashion_20200329_192752.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/deepfashion/mask_rcnn_r50_fpn_15e_deepfashion/20200329_192752.log.json) | + +## Citation + +```latex +@inproceedings{liuLQWTcvpr16DeepFashion, + author = {Liu, Ziwei and Luo, Ping and Qiu, Shi and Wang, Xiaogang and Tang, Xiaoou}, + title = {DeepFashion: Powering Robust Clothes Recognition and Retrieval with Rich Annotations}, + booktitle = {Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2016} +} +``` diff --git a/configs/mmdet/deepfashion/mask_rcnn_r50_fpn_15e_deepfashion.py b/configs/mmdet/deepfashion/mask_rcnn_r50_fpn_15e_deepfashion.py new file mode 100644 index 00000000..c4e86387 --- /dev/null +++ b/configs/mmdet/deepfashion/mask_rcnn_r50_fpn_15e_deepfashion.py @@ -0,0 +1,10 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/deepfashion.py', '../_base_/schedules/schedule_1x.py', + '../_base_/default_runtime.py' +] +model = dict( + roi_head=dict( + bbox_head=dict(num_classes=15), mask_head=dict(num_classes=15))) +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=15) diff --git a/configs/mmdet/deformable_detr/README.md b/configs/mmdet/deformable_detr/README.md new file mode 100644 index 00000000..f415be35 --- /dev/null +++ 
b/configs/mmdet/deformable_detr/README.md @@ -0,0 +1,41 @@ +# Deformable DETR + +> [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) + + + +## Abstract + +DETR has been recently proposed to eliminate the need for many hand-designed components in object detection while demonstrating good performance. However, it suffers from slow convergence and limited feature spatial resolution, due to the limitation of Transformer attention modules in processing image feature maps. To mitigate these issues, we proposed Deformable DETR, whose attention modules only attend to a small set of key sampling points around a reference. Deformable DETR can achieve better performance than DETR (especially on small objects) with 10 times less training epochs. Extensive experiments on the COCO benchmark demonstrate the effectiveness of our approach. + +
+ +
+ +## Results and Models + +| Backbone | Model | Lr schd | box AP | Config | Download | +|:------:|:--------:|:--------------:|:------:|:------:|:--------:| +| R-50 | Deformable DETR |50e | 44.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/deformable_detr/deformable_detr_r50_16x2_50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/deformable_detr/deformable_detr_r50_16x2_50e_coco/deformable_detr_r50_16x2_50e_coco_20210419_220030-a12b9512.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/deformable_detr/deformable_detr_r50_16x2_50e_coco/deformable_detr_r50_16x2_50e_coco_20210419_220030-a12b9512.log.json) | +| R-50 | + iterative bounding box refinement |50e | 46.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/deformable_detr/deformable_detr_refine_r50_16x2_50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/deformable_detr/deformable_detr_refine_r50_16x2_50e_coco/deformable_detr_refine_r50_16x2_50e_coco_20210419_220503-5f5dff21.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/deformable_detr/deformable_detr_refine_r50_16x2_50e_coco/deformable_detr_refine_r50_16x2_50e_coco_20210419_220503-5f5dff21.log.json) | +| R-50 | ++ two-stage Deformable DETR |50e | 46.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/deformable_detr/deformable_detr_twostage_refine_r50_16x2_50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/deformable_detr/deformable_detr_twostage_refine_r50_16x2_50e_coco/deformable_detr_twostage_refine_r50_16x2_50e_coco_20210419_220613-9d28ab72.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/deformable_detr/deformable_detr_twostage_refine_r50_16x2_50e_coco/deformable_detr_twostage_refine_r50_16x2_50e_coco_20210419_220613-9d28ab72.log.json) | + +# NOTE + +1. All models are trained with batch size 32. +2. The performance is unstable. 
`Deformable DETR` and `iterative bounding box refinement` may fluctuate about 0.3 mAP. `two-stage Deformable DETR` may fluctuate about 0.2 mAP. + +## Citation + +We provide the config files for Deformable DETR: [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159). + +```latex +@inproceedings{ +zhu2021deformable, +title={Deformable DETR: Deformable Transformers for End-to-End Object Detection}, +author={Xizhou Zhu and Weijie Su and Lewei Lu and Bin Li and Xiaogang Wang and Jifeng Dai}, +booktitle={International Conference on Learning Representations}, +year={2021}, +url={https://openreview.net/forum?id=gZ9hCDWe6ke} +} +``` diff --git a/configs/mmdet/deformable_detr/deformable_detr_r50_16x2_50e_coco.py b/configs/mmdet/deformable_detr/deformable_detr_r50_16x2_50e_coco.py new file mode 100644 index 00000000..c64d09fe --- /dev/null +++ b/configs/mmdet/deformable_detr/deformable_detr_r50_16x2_50e_coco.py @@ -0,0 +1,177 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py' +] +model = dict( + type='DeformableDETR', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='ChannelMapper', + in_channels=[512, 1024, 2048], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=dict(type='GN', num_groups=32), + num_outs=4), + bbox_head=dict( + type='DeformableDETRHead', + num_query=300, + num_classes=80, + in_channels=2048, + sync_cls_avg_factor=True, + as_two_stage=False, + transformer=dict( + type='DeformableDetrTransformer', + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiScaleDeformableAttention', embed_dims=256), + feedforward_channels=1024, + 
ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'ffn', 'norm'))), + decoder=dict( + type='DeformableDetrTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + dropout=0.1), + dict( + type='MultiScaleDeformableAttention', + embed_dims=256) + ], + feedforward_channels=1024, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=128, + normalize=True, + offset=-0.5), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))), + test_cfg=dict(max_per_img=100)) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different +# from the default setting in mmdet. 
+train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='AutoAugment', + policies=[ + [ + dict( + type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + multiscale_mode='value', + keep_ratio=True) + ], + [ + dict( + type='Resize', + # The ratio of all images in the train dataset is < 7; + # follow the original impl + img_scale=[(400, 4200), (500, 4200), (600, 4200)], + multiscale_mode='value', + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + multiscale_mode='value', + override=True, + keep_ratio=True) + ] + ]), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=1), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) +] +# test_pipeline, NOTE the Pad's size_divisor is different from the default +# setting (size_divisor=32). While there is little effect on the performance +# whether we use the default setting or use size_divisor=1.
+test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=1), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict(filter_empty_gt=False, pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict( + type='AdamW', + lr=2e-4, + weight_decay=0.0001, + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1), + 'sampling_offsets': dict(lr_mult=0.1), + 'reference_points': dict(lr_mult=0.1) + })) +optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) +# learning policy +lr_config = dict(policy='step', step=[40]) +runner = dict(type='EpochBasedRunner', max_epochs=50) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (16 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=32) diff --git a/configs/mmdet/deformable_detr/deformable_detr_refine_r50_16x2_50e_coco.py b/configs/mmdet/deformable_detr/deformable_detr_refine_r50_16x2_50e_coco.py new file mode 100644 index 00000000..01f13df4 --- /dev/null +++ b/configs/mmdet/deformable_detr/deformable_detr_refine_r50_16x2_50e_coco.py @@ -0,0 +1,2 @@ +_base_ = 'deformable_detr_r50_16x2_50e_coco.py' +model = dict(bbox_head=dict(with_box_refine=True)) diff --git a/configs/mmdet/deformable_detr/deformable_detr_twostage_refine_r50_16x2_50e_coco.py b/configs/mmdet/deformable_detr/deformable_detr_twostage_refine_r50_16x2_50e_coco.py new file mode 100644 index 00000000..2aa840d9 --- /dev/null +++ b/configs/mmdet/deformable_detr/deformable_detr_twostage_refine_r50_16x2_50e_coco.py @@ -0,0 +1,2 @@ +_base_ = 'deformable_detr_refine_r50_16x2_50e_coco.py' +model = dict(bbox_head=dict(as_two_stage=True)) diff --git a/configs/mmdet/deformable_detr/metafile.yml b/configs/mmdet/deformable_detr/metafile.yml new file mode 100644 index 00000000..873292db --- /dev/null +++ b/configs/mmdet/deformable_detr/metafile.yml @@ -0,0 +1,56 @@ +Collections: + - Name: Deformable DETR + Metadata: + Training Data: COCO + Training Techniques: + - AdamW + - Multi Scale Train + - Gradient Clip + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + - Transformer + Paper: + URL: https://openreview.net/forum?id=gZ9hCDWe6ke + Title: 'Deformable DETR: Deformable Transformers for End-to-End Object Detection' + README: configs/deformable_detr/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.12.0/mmdet/models/detectors/deformable_detr.py#L6 + Version: v2.12.0 + +Models: + - Name: deformable_detr_r50_16x2_50e_coco + In Collection: Deformable DETR + Config: configs/deformable_detr/deformable_detr_r50_16x2_50e_coco.py + Metadata: + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 
44.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/deformable_detr/deformable_detr_r50_16x2_50e_coco/deformable_detr_r50_16x2_50e_coco_20210419_220030-a12b9512.pth + + - Name: deformable_detr_refine_r50_16x2_50e_coco + In Collection: Deformable DETR + Config: configs/deformable_detr/deformable_detr_refine_r50_16x2_50e_coco.py + Metadata: + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/deformable_detr/deformable_detr_refine_r50_16x2_50e_coco/deformable_detr_refine_r50_16x2_50e_coco_20210419_220503-5f5dff21.pth + + - Name: deformable_detr_twostage_refine_r50_16x2_50e_coco + In Collection: Deformable DETR + Config: configs/deformable_detr/deformable_detr_twostage_refine_r50_16x2_50e_coco.py + Metadata: + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/deformable_detr/deformable_detr_twostage_refine_r50_16x2_50e_coco/deformable_detr_twostage_refine_r50_16x2_50e_coco_20210419_220613-9d28ab72.pth diff --git a/configs/mmdet/detectors/README.md b/configs/mmdet/detectors/README.md new file mode 100644 index 00000000..3504ee27 --- /dev/null +++ b/configs/mmdet/detectors/README.md @@ -0,0 +1,69 @@ +# DetectoRS + +> [DetectoRS: Detecting Objects with Recursive Feature Pyramid and Switchable Atrous Convolution](https://arxiv.org/abs/2006.02334) + + + +## Abstract + +Many modern object detectors demonstrate outstanding performances by using the mechanism of looking and thinking twice. In this paper, we explore this mechanism in the backbone design for object detection. At the macro level, we propose Recursive Feature Pyramid, which incorporates extra feedback connections from Feature Pyramid Networks into the bottom-up backbone layers. 
At the micro level, we propose Switchable Atrous Convolution, which convolves the features with different atrous rates and gathers the results using switch functions. Combining them results in DetectoRS, which significantly improves the performances of object detection. On COCO test-dev, DetectoRS achieves state-of-the-art 55.7% box AP for object detection, 48.5% mask AP for instance segmentation, and 50.0% PQ for panoptic segmentation. + +
+ +
+ +## Introduction + +DetectoRS requires the COCO and [COCO-stuff](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip) datasets for training. You need to download COCO-stuff and extract it in the COCO dataset path. +The directory structure should look like this. + +```none +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +│ │ ├── stuffthingmaps +``` + +## Results and Models + +DetectoRS includes two major components: + +- Recursive Feature Pyramid (RFP). +- Switchable Atrous Convolution (SAC). + +They can be used independently. +Combining them together results in DetectoRS. +The results on COCO 2017 val are shown in the table below. + +| Method | Detector | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +|:------:|:--------:|:-------:|:--------:|:--------------:|:------:|:-------:|:------:|:--------:| +| RFP | Cascade + ResNet-50 | 1x | 7.5 | - | 44.8 | | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/detectors/cascade_rcnn_r50_rfp_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/detectors/cascade_rcnn_r50_rfp_1x_coco/cascade_rcnn_r50_rfp_1x_coco-8cf51bfd.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/detectors/cascade_rcnn_r50_rfp_1x_coco/cascade_rcnn_r50_rfp_1x_coco_20200624_104126.log.json) | +| SAC | Cascade + ResNet-50 | 1x | 5.6 | - | 45.0| | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/detectors/cascade_rcnn_r50_sac_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/detectors/cascade_rcnn_r50_sac_1x_coco/cascade_rcnn_r50_sac_1x_coco-24bfda62.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/detectors/cascade_rcnn_r50_sac_1x_coco/cascade_rcnn_r50_sac_1x_coco_20200624_104402.log.json) | +| DetectoRS | Cascade + ResNet-50 | 1x | 9.9 | - | 47.4 | | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/detectors/detectors_cascade_rcnn_r50_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_cascade_rcnn_r50_1x_coco/detectors_cascade_rcnn_r50_1x_coco-32a10ba0.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_cascade_rcnn_r50_1x_coco/detectors_cascade_rcnn_r50_1x_coco_20200706_001203.log.json) | +| RFP | HTC + ResNet-50 | 1x | 11.2 | - | 46.6 | 40.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/detectors/htc_r50_rfp_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/detectors/htc_r50_rfp_1x_coco/htc_r50_rfp_1x_coco-8ff87c51.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/detectors/htc_r50_rfp_1x_coco/htc_r50_rfp_1x_coco_20200624_103053.log.json) | +| SAC | HTC + ResNet-50 | 1x | 9.3 | - | 46.4 | 40.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/detectors/htc_r50_sac_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/detectors/htc_r50_sac_1x_coco/htc_r50_sac_1x_coco-bfa60c54.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/detectors/htc_r50_sac_1x_coco/htc_r50_sac_1x_coco_20200624_103111.log.json) | +| DetectoRS | HTC + ResNet-50 | 1x | 13.6 | - | 49.1 | 42.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/detectors/detectors_htc_r50_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_htc_r50_1x_coco/detectors_htc_r50_1x_coco-329b1453.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_htc_r50_1x_coco/detectors_htc_r50_1x_coco_20200624_103659.log.json) | +| DetectoRS | HTC + ResNet-101 | 20e | 19.6 | | 50.5 | 43.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/detectors/detectors_htc_r101_20e_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_htc_r101_20e_coco/detectors_htc_r101_20e_coco_20210419_203638-348d533b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_htc_r101_20e_coco/detectors_htc_r101_20e_coco_20210419_203638.log.json) | + +*Note*: This is a re-implementation based on MMDetection-V2. +The original implementation is based on MMDetection-V1. + +## Citation + +We provide the config files for [DetectoRS: Detecting Objects with Recursive Feature Pyramid and Switchable Atrous Convolution](https://arxiv.org/pdf/2006.02334.pdf). + +```latex +@article{qiao2020detectors, + title={DetectoRS: Detecting Objects with Recursive Feature Pyramid and Switchable Atrous Convolution}, + author={Qiao, Siyuan and Chen, Liang-Chieh and Yuille, Alan}, + journal={arXiv preprint arXiv:2006.02334}, + year={2020} +} +``` diff --git a/configs/mmdet/detectors/cascade_rcnn_r50_rfp_1x_coco.py b/configs/mmdet/detectors/cascade_rcnn_r50_rfp_1x_coco.py new file mode 100644 index 00000000..4430d8a6 --- /dev/null +++ b/configs/mmdet/detectors/cascade_rcnn_r50_rfp_1x_coco.py @@ -0,0 +1,28 @@ +_base_ = [ + '../_base_/models/cascade_rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + type='DetectoRS_ResNet', + conv_cfg=dict(type='ConvAWS'), + output_img=True), + neck=dict( + type='RFP', + rfp_steps=2, + aspp_out_channels=64, + aspp_dilations=(1, 3, 6, 1), + rfp_backbone=dict( + rfp_inplanes=256, + type='DetectoRS_ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + conv_cfg=dict(type='ConvAWS'), + pretrained='torchvision://resnet50', + style='pytorch'))) diff --git a/configs/mmdet/detectors/cascade_rcnn_r50_sac_1x_coco.py b/configs/mmdet/detectors/cascade_rcnn_r50_sac_1x_coco.py new file mode 100644 index 
00000000..ccd9319b --- /dev/null +++ b/configs/mmdet/detectors/cascade_rcnn_r50_sac_1x_coco.py @@ -0,0 +1,12 @@ +_base_ = [ + '../_base_/models/cascade_rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + type='DetectoRS_ResNet', + conv_cfg=dict(type='ConvAWS'), + sac=dict(type='SAC', use_deform=True), + stage_with_sac=(False, True, True, True))) diff --git a/configs/mmdet/detectors/detectors_cascade_rcnn_r50_1x_coco.py b/configs/mmdet/detectors/detectors_cascade_rcnn_r50_1x_coco.py new file mode 100644 index 00000000..f7604043 --- /dev/null +++ b/configs/mmdet/detectors/detectors_cascade_rcnn_r50_1x_coco.py @@ -0,0 +1,32 @@ +_base_ = [ + '../_base_/models/cascade_rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + type='DetectoRS_ResNet', + conv_cfg=dict(type='ConvAWS'), + sac=dict(type='SAC', use_deform=True), + stage_with_sac=(False, True, True, True), + output_img=True), + neck=dict( + type='RFP', + rfp_steps=2, + aspp_out_channels=64, + aspp_dilations=(1, 3, 6, 1), + rfp_backbone=dict( + rfp_inplanes=256, + type='DetectoRS_ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + conv_cfg=dict(type='ConvAWS'), + sac=dict(type='SAC', use_deform=True), + stage_with_sac=(False, True, True, True), + pretrained='torchvision://resnet50', + style='pytorch'))) diff --git a/configs/mmdet/detectors/detectors_htc_r101_20e_coco.py b/configs/mmdet/detectors/detectors_htc_r101_20e_coco.py new file mode 100644 index 00000000..93d7d2b1 --- /dev/null +++ b/configs/mmdet/detectors/detectors_htc_r101_20e_coco.py @@ -0,0 +1,28 @@ +_base_ = '../htc/htc_r101_fpn_20e_coco.py' + +model = dict( + backbone=dict( + type='DetectoRS_ResNet', + 
conv_cfg=dict(type='ConvAWS'), + sac=dict(type='SAC', use_deform=True), + stage_with_sac=(False, True, True, True), + output_img=True), + neck=dict( + type='RFP', + rfp_steps=2, + aspp_out_channels=64, + aspp_dilations=(1, 3, 6, 1), + rfp_backbone=dict( + rfp_inplanes=256, + type='DetectoRS_ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + conv_cfg=dict(type='ConvAWS'), + sac=dict(type='SAC', use_deform=True), + stage_with_sac=(False, True, True, True), + pretrained='torchvision://resnet101', + style='pytorch'))) diff --git a/configs/mmdet/detectors/detectors_htc_r50_1x_coco.py b/configs/mmdet/detectors/detectors_htc_r50_1x_coco.py new file mode 100644 index 00000000..0d2fc4f7 --- /dev/null +++ b/configs/mmdet/detectors/detectors_htc_r50_1x_coco.py @@ -0,0 +1,28 @@ +_base_ = '../htc/htc_r50_fpn_1x_coco.py' + +model = dict( + backbone=dict( + type='DetectoRS_ResNet', + conv_cfg=dict(type='ConvAWS'), + sac=dict(type='SAC', use_deform=True), + stage_with_sac=(False, True, True, True), + output_img=True), + neck=dict( + type='RFP', + rfp_steps=2, + aspp_out_channels=64, + aspp_dilations=(1, 3, 6, 1), + rfp_backbone=dict( + rfp_inplanes=256, + type='DetectoRS_ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + conv_cfg=dict(type='ConvAWS'), + sac=dict(type='SAC', use_deform=True), + stage_with_sac=(False, True, True, True), + pretrained='torchvision://resnet50', + style='pytorch'))) diff --git a/configs/mmdet/detectors/htc_r50_rfp_1x_coco.py b/configs/mmdet/detectors/htc_r50_rfp_1x_coco.py new file mode 100644 index 00000000..496104e1 --- /dev/null +++ b/configs/mmdet/detectors/htc_r50_rfp_1x_coco.py @@ -0,0 +1,24 @@ +_base_ = '../htc/htc_r50_fpn_1x_coco.py' + +model = dict( + backbone=dict( + type='DetectoRS_ResNet', + conv_cfg=dict(type='ConvAWS'), + output_img=True), 
+ neck=dict( + type='RFP', + rfp_steps=2, + aspp_out_channels=64, + aspp_dilations=(1, 3, 6, 1), + rfp_backbone=dict( + rfp_inplanes=256, + type='DetectoRS_ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + conv_cfg=dict(type='ConvAWS'), + pretrained='torchvision://resnet50', + style='pytorch'))) diff --git a/configs/mmdet/detectors/htc_r50_sac_1x_coco.py b/configs/mmdet/detectors/htc_r50_sac_1x_coco.py new file mode 100644 index 00000000..72d4db96 --- /dev/null +++ b/configs/mmdet/detectors/htc_r50_sac_1x_coco.py @@ -0,0 +1,8 @@ +_base_ = '../htc/htc_r50_fpn_1x_coco.py' + +model = dict( + backbone=dict( + type='DetectoRS_ResNet', + conv_cfg=dict(type='ConvAWS'), + sac=dict(type='SAC', use_deform=True), + stage_with_sac=(False, True, True, True))) diff --git a/configs/mmdet/detectors/metafile.yml b/configs/mmdet/detectors/metafile.yml new file mode 100644 index 00000000..4bed5694 --- /dev/null +++ b/configs/mmdet/detectors/metafile.yml @@ -0,0 +1,114 @@ +Collections: + - Name: DetectoRS + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ASPP + - FPN + - RFP + - RPN + - ResNet + - RoIAlign + - SAC + Paper: + URL: https://arxiv.org/abs/2006.02334 + Title: 'DetectoRS: Detecting Objects with Recursive Feature Pyramid and Switchable Atrous Convolution' + README: configs/detectors/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.2.0/mmdet/models/backbones/detectors_resnet.py#L205 + Version: v2.2.0 + +Models: + - Name: cascade_rcnn_r50_rfp_1x_coco + In Collection: DetectoRS + Config: configs/detectors/cascade_rcnn_r50_rfp_1x_coco.py + Metadata: + Training Memory (GB): 7.5 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.8 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/detectors/cascade_rcnn_r50_rfp_1x_coco/cascade_rcnn_r50_rfp_1x_coco-8cf51bfd.pth + + - Name: cascade_rcnn_r50_sac_1x_coco + In Collection: DetectoRS + Config: configs/detectors/cascade_rcnn_r50_sac_1x_coco.py + Metadata: + Training Memory (GB): 5.6 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/cascade_rcnn_r50_sac_1x_coco/cascade_rcnn_r50_sac_1x_coco-24bfda62.pth + + - Name: detectors_cascade_rcnn_r50_1x_coco + In Collection: DetectoRS + Config: configs/detectors/detectors_cascade_rcnn_r50_1x_coco.py + Metadata: + Training Memory (GB): 9.9 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_cascade_rcnn_r50_1x_coco/detectors_cascade_rcnn_r50_1x_coco-32a10ba0.pth + + - Name: htc_r50_rfp_1x_coco + In Collection: DetectoRS + Config: configs/detectors/htc_r50_rfp_1x_coco.py + Metadata: + Training Memory (GB): 11.2 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/htc_r50_rfp_1x_coco/htc_r50_rfp_1x_coco-8ff87c51.pth + + - Name: htc_r50_sac_1x_coco + In Collection: DetectoRS + Config: configs/detectors/htc_r50_sac_1x_coco.py + Metadata: + Training Memory (GB): 9.3 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/htc_r50_sac_1x_coco/htc_r50_sac_1x_coco-bfa60c54.pth + + - Name: detectors_htc_r50_1x_coco + In Collection: DetectoRS + Config: configs/detectors/detectors_htc_r50_1x_coco.py + Metadata: + Training Memory (GB): 13.6 
+ Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 49.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 42.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_htc_r50_1x_coco/detectors_htc_r50_1x_coco-329b1453.pth diff --git a/configs/mmdet/detr/README.md b/configs/mmdet/detr/README.md new file mode 100644 index 00000000..5f25357a --- /dev/null +++ b/configs/mmdet/detr/README.md @@ -0,0 +1,37 @@ +# DETR + +> [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) + + + +## Abstract + +We present a new method that views object detection as a direct set prediction problem. Our approach streamlines the detection pipeline, effectively removing the need for many hand-designed components like a non-maximum suppression procedure or anchor generation that explicitly encode our prior knowledge about the task. The main ingredients of the new framework, called DEtection TRansformer or DETR, are a set-based global loss that forces unique predictions via bipartite matching, and a transformer encoder-decoder architecture. Given a fixed small set of learned object queries, DETR reasons about the relations of the objects and the global image context to directly output the final set of predictions in parallel. The new model is conceptually simple and does not require a specialized library, unlike many other modern detectors. DETR demonstrates accuracy and run-time performance on par with the well-established and highly-optimized Faster RCNN baseline on the challenging COCO object detection dataset. Moreover, DETR can be easily generalized to produce panoptic segmentation in a unified manner. We show that it significantly outperforms competitive baselines. + +
+ +
+ +## Results and Models + +| Backbone | Model | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +|:------:|:--------:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:| +| R-50 | DETR |150e |7.9| | 40.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/detr/detr_r50_8x2_150e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/detr/detr_r50_8x2_150e_coco/detr_r50_8x2_150e_coco_20201130_194835-2c4b8974.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/detr/detr_r50_8x2_150e_coco/detr_r50_8x2_150e_coco_20201130_194835.log.json) | + +## Citation + +We provide the config files for DETR: [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872). + +```latex +@inproceedings{detr, + author = {Nicolas Carion and + Francisco Massa and + Gabriel Synnaeve and + Nicolas Usunier and + Alexander Kirillov and + Sergey Zagoruyko}, + title = {End-to-End Object Detection with Transformers}, + booktitle = {ECCV}, + year = {2020} +} +``` diff --git a/configs/mmdet/detr/detr_r50_8x2_150e_coco.py b/configs/mmdet/detr/detr_r50_8x2_150e_coco.py new file mode 100644 index 00000000..892447de --- /dev/null +++ b/configs/mmdet/detr/detr_r50_8x2_150e_coco.py @@ -0,0 +1,150 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py' +] +model = dict( + type='DETR', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3, ), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + bbox_head=dict( + type='DETRHead', + num_classes=80, + in_channels=2048, + transformer=dict( + type='Transformer', + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + dropout=0.1) + ], + 
feedforward_channels=2048, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'ffn', 'norm'))), + decoder=dict( + type='DetrTransformerDecoder', + return_intermediate=True, + num_layers=6, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + dropout=0.1), + feedforward_channels=2048, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')), + )), + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + loss_cls=dict( + type='CrossEntropyLoss', + bg_cls_weight=0.1, + use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + cls_cost=dict(type='ClassificationCost', weight=1.), + reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))), + test_cfg=dict(max_per_img=100)) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different +# from the default setting in mmdet. 
+train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='AutoAugment', + policies=[[ + dict( + type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + multiscale_mode='value', + keep_ratio=True) + ], + [ + dict( + type='Resize', + img_scale=[(400, 1333), (500, 1333), (600, 1333)], + multiscale_mode='value', + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + multiscale_mode='value', + override=True, + keep_ratio=True) + ]]), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=1), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) +] +# test_pipeline, NOTE the Pad's size_divisor is different from the default +# setting (size_divisor=32). While there is little effect on the performance +# whether we use the default setting or use size_divisor=1. 
+test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=1), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.0001, + paramwise_cfg=dict( + custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)})) +optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) +# learning policy +lr_config = dict(policy='step', step=[100]) +runner = dict(type='EpochBasedRunner', max_epochs=150) diff --git a/configs/mmdet/detr/metafile.yml b/configs/mmdet/detr/metafile.yml new file mode 100644 index 00000000..45622cf9 --- /dev/null +++ b/configs/mmdet/detr/metafile.yml @@ -0,0 +1,33 @@ +Collections: + - Name: DETR + Metadata: + Training Data: COCO + Training Techniques: + - AdamW + - Multi Scale Train + - Gradient Clip + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + - Transformer + Paper: + URL: https://arxiv.org/abs/2005.12872 + Title: 'End-to-End Object Detection with Transformers' + README: configs/detr/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/detectors/detr.py#L7 + Version: v2.7.0 + +Models: + - Name: detr_r50_8x2_150e_coco + In Collection: DETR + Config: configs/detr/detr_r50_8x2_150e_coco.py + Metadata: + Training Memory (GB): 7.9 + Epochs: 150 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/detr/detr_r50_8x2_150e_coco/detr_r50_8x2_150e_coco_20201130_194835-2c4b8974.pth diff --git 
a/configs/mmdet/double_heads/README.md b/configs/mmdet/double_heads/README.md new file mode 100644 index 00000000..c7507e86 --- /dev/null +++ b/configs/mmdet/double_heads/README.md @@ -0,0 +1,32 @@ +# Double Heads + +> [Rethinking Classification and Localization for Object Detection](https://arxiv.org/abs/1904.06493) + + + +## Abstract + +Two head structures (i.e. fully connected head and convolution head) have been widely used in R-CNN based detectors for classification and localization tasks. However, there is a lack of understanding of how does these two head structures work for these two tasks. To address this issue, we perform a thorough analysis and find an interesting fact that the two head structures have opposite preferences towards the two tasks. Specifically, the fully connected head (fc-head) is more suitable for the classification task, while the convolution head (conv-head) is more suitable for the localization task. Furthermore, we examine the output feature maps of both heads and find that fc-head has more spatial sensitivity than conv-head. Thus, fc-head has more capability to distinguish a complete object from part of an object, but is not robust to regress the whole object. Based upon these findings, we propose a Double-Head method, which has a fully connected head focusing on classification and a convolution head for bounding box regression. Without bells and whistles, our method gains +3.5 and +2.8 AP on MS COCO dataset from Feature Pyramid Network (FPN) baselines with ResNet-50 and ResNet-101 backbones, respectively. + +
+ +
+ +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :------: | :--------: | +| R-50-FPN | pytorch | 1x | 6.8 | 9.5 | 40.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/double_heads/dh_faster_rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/double_heads/dh_faster_rcnn_r50_fpn_1x_coco/dh_faster_rcnn_r50_fpn_1x_coco_20200130-586b67df.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/double_heads/dh_faster_rcnn_r50_fpn_1x_coco/dh_faster_rcnn_r50_fpn_1x_coco_20200130_220238.log.json) | + +## Citation + +```latex +@article{wu2019rethinking, + title={Rethinking Classification and Localization for Object Detection}, + author={Yue Wu and Yinpeng Chen and Lu Yuan and Zicheng Liu and Lijuan Wang and Hongzhi Li and Yun Fu}, + year={2019}, + eprint={1904.06493}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/configs/mmdet/double_heads/dh_faster_rcnn_r50_fpn_1x_coco.py b/configs/mmdet/double_heads/dh_faster_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 00000000..9b8118b4 --- /dev/null +++ b/configs/mmdet/double_heads/dh_faster_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,23 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + roi_head=dict( + type='DoubleHeadRoIHead', + reg_roi_scale_factor=1.3, + bbox_head=dict( + _delete_=True, + type='DoubleConvFCBBoxHead', + num_convs=4, + num_fcs=2, + in_channels=256, + conv_out_channels=1024, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=2.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=2.0)))) diff --git 
a/configs/mmdet/double_heads/metafile.yml b/configs/mmdet/double_heads/metafile.yml new file mode 100644 index 00000000..6fe9b7af --- /dev/null +++ b/configs/mmdet/double_heads/metafile.yml @@ -0,0 +1,41 @@ +Collections: + - Name: Rethinking Classification and Localization for Object Detection + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - RPN + - ResNet + - RoIAlign + Paper: + URL: https://arxiv.org/pdf/1904.06493 + Title: 'Rethinking Classification and Localization for Object Detection' + README: configs/double_heads/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/roi_heads/double_roi_head.py#L6 + Version: v2.0.0 + +Models: + - Name: dh_faster_rcnn_r50_fpn_1x_coco + In Collection: Rethinking Classification and Localization for Object Detection + Config: configs/double_heads/dh_faster_rcnn_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.8 + inference time (ms/im): + - value: 105.26 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/double_heads/dh_faster_rcnn_r50_fpn_1x_coco/dh_faster_rcnn_r50_fpn_1x_coco_20200130-586b67df.pth diff --git a/configs/mmdet/dyhead/README.md b/configs/mmdet/dyhead/README.md new file mode 100644 index 00000000..068a35b1 --- /dev/null +++ b/configs/mmdet/dyhead/README.md @@ -0,0 +1,46 @@ +# DyHead + +> [Dynamic Head: Unifying Object Detection Heads with Attentions](https://arxiv.org/abs/2106.08322) + + + +## Abstract + +The complex nature of combining localization and classification in object detection has resulted in the flourished development of methods. Previous works tried to improve the performance in various object detection heads but failed to present a unified view. 
In this paper, we present a novel dynamic head framework to unify object detection heads with attentions. By coherently combining multiple self-attention mechanisms between feature levels for scale-awareness, among spatial locations for spatial-awareness, and within output channels for task-awareness, the proposed approach significantly improves the representation ability of object detection heads without any computational overhead. Further experiments demonstrate that the effectiveness and efficiency of the proposed dynamic head on the COCO benchmark. With a standard ResNeXt-101-DCN backbone, we largely improve the performance over popular object detectors and achieve a new state-of-the-art at 54.0 AP. Furthermore, with latest transformer backbone and extra data, we can push current best COCO result to a new record at 60.6 AP. + +
+ +
+ +## Results and Models + +| Method | Backbone | Style | Setting | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +|:------:|:--------:|:-------:|:------------:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:| +| ATSS | R-50 | caffe | reproduction | 1x | 5.4 | 13.2 | 42.5 | [config](./atss_r50_caffe_fpn_dyhead_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_for_reproduction_1x_coco/atss_r50_fpn_dyhead_for_reproduction_4x4_1x_coco_20220107_213939-162888e6.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_for_reproduction_1x_coco/atss_r50_fpn_dyhead_for_reproduction_4x4_1x_coco_20220107_213939.log.json) | +| ATSS | R-50 | pytorch | simple | 1x | 4.9 | 13.7 | 43.3 | [config](./atss_r50_fpn_dyhead_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_4x4_1x_coco/atss_r50_fpn_dyhead_4x4_1x_coco_20211219_023314-eaa620c6.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_4x4_1x_coco/atss_r50_fpn_dyhead_4x4_1x_coco_20211219_023314.log.json) | + +- We trained the above models with 4 GPUs and 4 `samples_per_gpu`. +- The `reproduction` setting aims to reproduce the official implementation based on Detectron2. +- The `simple` setting serves as a minimal example of using DyHead in MMDetection. Specifically, + - it adds `DyHead` to `neck` after `FPN` + - it sets `stacked_convs=0` in `bbox_head` +- The `simple` setting achieves higher AP than the original implementation. + We have not conducted an ablation study between the two settings. + `dict(type='Pad', size_divisor=128)` may further improve AP by preferring spatial alignment across pyramid levels, although large padding reduces efficiency. 
+ +## Relation to Other Methods + +- DyHead can be regarded as an improved [SEPC](https://arxiv.org/abs/2005.03101) with [DyReLU modules](https://arxiv.org/abs/2003.10027) and simplified [SE blocks](https://arxiv.org/abs/1709.01507). +- Xiyang Dai et al., the author team of DyHead, adopt it for [Dynamic DETR](https://openaccess.thecvf.com/content/ICCV2021/html/Dai_Dynamic_DETR_End-to-End_Object_Detection_With_Dynamic_Attention_ICCV_2021_paper.html). + The description of Dynamic Encoder in Sec. 3.2 will help you understand DyHead. + +## Citation + +```latex +@inproceedings{DyHead_CVPR2021, + author = {Dai, Xiyang and Chen, Yinpeng and Xiao, Bin and Chen, Dongdong and Liu, Mengchen and Yuan, Lu and Zhang, Lei}, + title = {Dynamic Head: Unifying Object Detection Heads With Attentions}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2021} +} +``` diff --git a/configs/mmdet/dyhead/atss_r50_caffe_fpn_dyhead_1x_coco.py b/configs/mmdet/dyhead/atss_r50_caffe_fpn_dyhead_1x_coco.py new file mode 100644 index 00000000..223b6532 --- /dev/null +++ b/configs/mmdet/dyhead/atss_r50_caffe_fpn_dyhead_1x_coco.py @@ -0,0 +1,112 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='ATSS', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + neck=[ + dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + dict( + type='DyHead', + in_channels=256, + out_channels=256, + num_blocks=6, + # disable zero_init_offset to follow official implementation + zero_init_offset=False) + ], + bbox_head=dict( + 
type='ATSSHead', + num_classes=80, + in_channels=256, + pred_kernel_size=1, # follow DyHead official implementation + stacked_convs=0, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128], + center_offset=0.5), # follow DyHead official implementation + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + # training and testing settings + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) + +# use caffe img_norm, size_divisor=128, pillow resize +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=(1333, 800), + keep_ratio=True, + backend='pillow'), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=128), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True, backend='pillow'), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=128), + 
dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/dyhead/atss_r50_fpn_dyhead_1x_coco.py b/configs/mmdet/dyhead/atss_r50_fpn_dyhead_1x_coco.py new file mode 100644 index 00000000..8c5109d0 --- /dev/null +++ b/configs/mmdet/dyhead/atss_r50_fpn_dyhead_1x_coco.py @@ -0,0 +1,65 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='ATSS', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=[ + dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + dict(type='DyHead', in_channels=256, out_channels=256, num_blocks=6) + ], + bbox_head=dict( + type='ATSSHead', + num_classes=80, + in_channels=256, + stacked_convs=0, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + # training and testing settings + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', 
iou_threshold=0.6), + max_per_img=100)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/mmdet/dyhead/metafile.yml b/configs/mmdet/dyhead/metafile.yml new file mode 100644 index 00000000..a2e9504e --- /dev/null +++ b/configs/mmdet/dyhead/metafile.yml @@ -0,0 +1,63 @@ +Collections: + - Name: DyHead + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 4x T4 GPUs + Architecture: + - ATSS + - DyHead + - FPN + - ResNet + - Deformable Convolution + - Pyramid Convolution + Paper: + URL: https://arxiv.org/abs/2106.08322 + Title: 'Dynamic Head: Unifying Object Detection Heads with Attentions' + README: configs/dyhead/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/mmdet/models/necks/dyhead.py#L130 + Version: v2.22.0 + +Models: + - Name: atss_r50_caffe_fpn_dyhead_1x_coco + In Collection: DyHead + Config: configs/dyhead/atss_r50_caffe_fpn_dyhead_1x_coco.py + Metadata: + Training Memory (GB): 5.4 + inference time (ms/im): + - value: 75.7 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_for_reproduction_1x_coco/atss_r50_fpn_dyhead_for_reproduction_4x4_1x_coco_20220107_213939-162888e6.pth + + - Name: atss_r50_fpn_dyhead_1x_coco + In Collection: DyHead + Config: configs/dyhead/atss_r50_fpn_dyhead_1x_coco.py + Metadata: + Training Memory (GB): 4.9 + inference time (ms/im): + - value: 73.1 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.3 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_4x4_1x_coco/atss_r50_fpn_dyhead_4x4_1x_coco_20211219_023314-eaa620c6.pth diff --git a/configs/mmdet/dynamic_rcnn/README.md b/configs/mmdet/dynamic_rcnn/README.md new file mode 100644 index 00000000..a22138f5 --- /dev/null +++ b/configs/mmdet/dynamic_rcnn/README.md @@ -0,0 +1,30 @@ +# Dynamic R-CNN + +> [Dynamic R-CNN: Towards High Quality Object Detection via Dynamic Training](https://arxiv.org/abs/2004.06002) + + + +## Abstract + +Although two-stage object detectors have continuously advanced the state-of-the-art performance in recent years, the training process itself is far from crystal. In this work, we first point out the inconsistency problem between the fixed network settings and the dynamic training procedure, which greatly affects the performance. For example, the fixed label assignment strategy and regression loss function cannot fit the distribution change of proposals and thus are harmful to training high quality detectors. Consequently, we propose Dynamic R-CNN to adjust the label assignment criteria (IoU threshold) and the shape of regression loss function (parameters of SmoothL1 Loss) automatically based on the statistics of proposals during training. This dynamic design makes better use of the training samples and pushes the detector to fit more high quality samples. Specifically, our method improves upon ResNet-50-FPN baseline with 1.9% AP and 5.5% AP90 on the MS COCO dataset with no extra overhead. + +
+ +
+ +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +|:---------:|:-------:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:| +| R-50 | pytorch | 1x | 3.8 | | 38.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x/dynamic_rcnn_r50_fpn_1x-62a3f276.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x/dynamic_rcnn_r50_fpn_1x_20200618_095048.log.json) | + +## Citation + +```latex +@article{DynamicRCNN, + author = {Hongkai Zhang and Hong Chang and Bingpeng Ma and Naiyan Wang and Xilin Chen}, + title = {Dynamic {R-CNN}: Towards High Quality Object Detection via Dynamic Training}, + journal = {arXiv preprint arXiv:2004.06002}, + year = {2020} +} +``` diff --git a/configs/mmdet/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x_coco.py b/configs/mmdet/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 00000000..f2deb99e --- /dev/null +++ b/configs/mmdet/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,28 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + roi_head=dict( + type='DynamicRoIHead', + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))), + train_cfg=dict( + rpn_proposal=dict(nms=dict(iou_threshold=0.85)), + rcnn=dict( + dynamic_rcnn=dict( + iou_topk=75, + beta_topk=10, + update_iter_interval=100, + initial_iou=0.4, + initial_beta=1.0))), + 
test_cfg=dict(rpn=dict(nms=dict(iou_threshold=0.85)))) diff --git a/configs/mmdet/dynamic_rcnn/metafile.yml b/configs/mmdet/dynamic_rcnn/metafile.yml new file mode 100644 index 00000000..fec43db4 --- /dev/null +++ b/configs/mmdet/dynamic_rcnn/metafile.yml @@ -0,0 +1,35 @@ +Collections: + - Name: Dynamic R-CNN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Dynamic R-CNN + - FPN + - RPN + - ResNet + - RoIAlign + Paper: + URL: https://arxiv.org/pdf/2004.06002 + Title: 'Dynamic R-CNN: Towards High Quality Object Detection via Dynamic Training' + README: configs/dynamic_rcnn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.2.0/mmdet/models/roi_heads/dynamic_roi_head.py#L11 + Version: v2.2.0 + +Models: + - Name: dynamic_rcnn_r50_fpn_1x_coco + In Collection: Dynamic R-CNN + Config: configs/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.8 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x/dynamic_rcnn_r50_fpn_1x-62a3f276.pth diff --git a/configs/mmdet/efficientnet/README.md b/configs/mmdet/efficientnet/README.md new file mode 100644 index 00000000..44f6df29 --- /dev/null +++ b/configs/mmdet/efficientnet/README.md @@ -0,0 +1,30 @@ +# EfficientNet + +> [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946v5) + + + +## Introduction + +Convolutional Neural Networks (ConvNets) are commonly developed at a fixed resource budget, and then scaled up for better accuracy if more resources are available. In this paper, we systematically study model scaling and identify that carefully balancing network depth, width, and resolution can lead to better performance. 
Based on this observation, we propose a new scaling method that uniformly scales all dimensions of depth/width/resolution using a simple yet highly effective compound coefficient. We demonstrate the effectiveness of this method on scaling up MobileNets and ResNet. + +To go even further, we use neural architecture search to design a new baseline network and scale it up to obtain a family of models, called EfficientNets, which achieve much better accuracy and efficiency than previous ConvNets. In particular, our EfficientNet-B7 achieves state-of-the-art 84.3% top-1 accuracy on ImageNet, while being 8.4x smaller and 6.1x faster on inference than the best existing ConvNet. Our EfficientNets also transfer well and achieve state-of-the-art accuracy on CIFAR-100 (91.7%), Flowers (98.8%), and 3 other transfer learning datasets, with an order of magnitude fewer parameters. + +## Results and Models + +### RetinaNet + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :------: | :--------: | +|Efficientnet-b3 | pytorch | 1x | - | - | 40.5 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco/retinanet_effb3_fpn_crop896_8x4_1x_coco_20220322_234806-615a0dda.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco/retinanet_effb3_fpn_crop896_8x4_1x_coco_20220322_234806.log.json) | + +## Citation + +```latex +@article{tan2019efficientnet, + title={Efficientnet: Rethinking model scaling for convolutional neural networks}, + author={Tan, Mingxing and Le, Quoc V}, + journal={arXiv preprint arXiv:1905.11946}, + year={2019} +} +``` diff --git a/configs/mmdet/efficientnet/metafile.yml b/configs/mmdet/efficientnet/metafile.yml new 
file mode 100644 index 00000000..de40b953 --- /dev/null +++ b/configs/mmdet/efficientnet/metafile.yml @@ -0,0 +1,19 @@ +Models: + - Name: retinanet_effb3_fpn_crop896_8x4_1x_coco + In Collection: RetinaNet + Config: configs/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco/retinanet_effb3_fpn_crop896_8x4_1x_coco_20220322_234806-615a0dda.pth + Paper: + URL: https://arxiv.org/abs/1905.11946v5 + Title: 'EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks' + README: configs/efficientnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.23.0/mmdet/models/backbones/efficientnet.py#L159 + Version: v2.23.0 diff --git a/configs/mmdet/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco.py b/configs/mmdet/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco.py new file mode 100644 index 00000000..c90bc167 --- /dev/null +++ b/configs/mmdet/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco.py @@ -0,0 +1,94 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py' +] + +cudnn_benchmark = True +norm_cfg = dict(type='BN', requires_grad=True) +checkpoint = 'https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32-aa_in1k_20220119-5b4887a0.pth' # noqa +model = dict( + backbone=dict( + _delete_=True, + type='EfficientNet', + arch='b3', + drop_path_rate=0.2, + out_indices=(3, 4, 5), + frozen_stages=0, + norm_cfg=dict( + type='SyncBN', requires_grad=True, eps=1e-3, momentum=0.01), + norm_eval=False, + init_cfg=dict( + type='Pretrained', prefix='backbone', checkpoint=checkpoint)), + neck=dict( + in_channels=[48, 136, 384], + start_level=0, + out_channels=256, + relu_before_extra_convs=True, + 
no_norm_on_lateral=True, + norm_cfg=norm_cfg), + bbox_head=dict(type='RetinaSepBNHead', num_ins=5, norm_cfg=norm_cfg), + # training and testing settings + train_cfg=dict(assigner=dict(neg_iou_thr=0.5))) + +# dataset settings +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +img_size = (896, 896) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=img_size, + ratio_range=(0.8, 1.2), + keep_ratio=True), + dict(type='RandomCrop', crop_size=img_size), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=img_size), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=img_size, + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=img_size), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer_config = dict(grad_clip=None) +optimizer = dict( + type='SGD', + lr=0.04, + momentum=0.9, + weight_decay=0.0001, + paramwise_cfg=dict(norm_decay_mult=0, bypass_duplicate=True)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.1, + step=[8, 11]) +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=12) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (4 samples per GPU) +auto_scale_lr = dict(base_batch_size=32) diff --git a/configs/mmdet/empirical_attention/README.md b/configs/mmdet/empirical_attention/README.md new file mode 100644 index 00000000..ddf8194b --- /dev/null +++ b/configs/mmdet/empirical_attention/README.md @@ -0,0 +1,33 @@ +# Empirical Attention + +> [An Empirical Study of Spatial Attention Mechanisms in Deep Networks](https://arxiv.org/abs/1904.05873) + + + +## Abstract + +Attention mechanisms have become a popular component in deep neural networks, yet there has been little examination of how different influencing factors and methods for computing attention from these factors affect performance. Toward a better general understanding of attention mechanisms, we present an empirical study that ablates various spatial attention elements within a generalized attention formulation, encompassing the dominant Transformer attention as well as the prevalent deformable convolution and dynamic convolution modules. Conducted on a variety of applications, the study yields significant findings about spatial attention in deep networks, some of which run counter to conventional understanding. For example, we find that the query and key content comparison in Transformer attention is negligible for self-attention, but vital for encoder-decoder attention. A proper combination of deformable convolution with key content only saliency achieves the best accuracy-efficiency tradeoff in self-attention. Our results suggest that there exists much room for improvement in the design of attention mechanisms. + +
+ +
+ +## Results and Models + +| Backbone | Attention Component | DCN | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +|:---------:|:-------------------:|:----:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:| +| R-50 | 1111 | N | 1x | 8.0 | 13.8 | 40.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/empirical_attention/faster_rcnn_r50_fpn_attention_1111_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_1111_1x_coco/faster_rcnn_r50_fpn_attention_1111_1x_coco_20200130-403cccba.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_1111_1x_coco/faster_rcnn_r50_fpn_attention_1111_1x_coco_20200130_210344.log.json) | +| R-50 | 0010 | N | 1x | 4.2 | 18.4 | 39.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/empirical_attention/faster_rcnn_r50_fpn_attention_0010_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_0010_1x_coco/faster_rcnn_r50_fpn_attention_0010_1x_coco_20200130-7cb0c14d.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_0010_1x_coco/faster_rcnn_r50_fpn_attention_0010_1x_coco_20200130_210125.log.json) | +| R-50 | 1111 | Y | 1x | 8.0 | 12.7 | 42.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco_20200130-8b2523a6.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco_20200130_204442.log.json) | +| R-50 | 0010 | Y | 1x | 4.2 | 17.1 | 42.0 | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco_20200130-1a2e831d.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco_20200130_210410.log.json) | + +## Citation + +```latex +@article{zhu2019empirical, + title={An Empirical Study of Spatial Attention Mechanisms in Deep Networks}, + author={Zhu, Xizhou and Cheng, Dazhi and Zhang, Zheng and Lin, Stephen and Dai, Jifeng}, + journal={arXiv preprint arXiv:1904.05873}, + year={2019} +} +``` diff --git a/configs/mmdet/empirical_attention/faster_rcnn_r50_fpn_attention_0010_1x_coco.py b/configs/mmdet/empirical_attention/faster_rcnn_r50_fpn_attention_0010_1x_coco.py new file mode 100644 index 00000000..a544e3ab --- /dev/null +++ b/configs/mmdet/empirical_attention/faster_rcnn_r50_fpn_attention_0010_1x_coco.py @@ -0,0 +1,13 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict(plugins=[ + dict( + cfg=dict( + type='GeneralizedAttention', + spatial_range=-1, + num_heads=8, + attention_type='0010', + kv_stride=2), + stages=(False, False, True, True), + position='after_conv2') + ])) diff --git a/configs/mmdet/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco.py b/configs/mmdet/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco.py new file mode 100644 index 00000000..bbefd27a --- /dev/null +++ b/configs/mmdet/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco.py @@ -0,0 +1,16 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + plugins=[ + dict( + cfg=dict( + type='GeneralizedAttention', + spatial_range=-1, + num_heads=8, + 
attention_type='0010', + kv_stride=2), + stages=(False, False, True, True), + position='after_conv2') + ], + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/configs/mmdet/empirical_attention/faster_rcnn_r50_fpn_attention_1111_1x_coco.py b/configs/mmdet/empirical_attention/faster_rcnn_r50_fpn_attention_1111_1x_coco.py new file mode 100644 index 00000000..13a4645b --- /dev/null +++ b/configs/mmdet/empirical_attention/faster_rcnn_r50_fpn_attention_1111_1x_coco.py @@ -0,0 +1,13 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict(plugins=[ + dict( + cfg=dict( + type='GeneralizedAttention', + spatial_range=-1, + num_heads=8, + attention_type='1111', + kv_stride=2), + stages=(False, False, True, True), + position='after_conv2') + ])) diff --git a/configs/mmdet/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco.py b/configs/mmdet/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco.py new file mode 100644 index 00000000..b1f26c08 --- /dev/null +++ b/configs/mmdet/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco.py @@ -0,0 +1,16 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + plugins=[ + dict( + cfg=dict( + type='GeneralizedAttention', + spatial_range=-1, + num_heads=8, + attention_type='1111', + kv_stride=2), + stages=(False, False, True, True), + position='after_conv2') + ], + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/configs/mmdet/empirical_attention/metafile.yml b/configs/mmdet/empirical_attention/metafile.yml new file mode 100644 index 00000000..923bcb20 --- /dev/null +++ b/configs/mmdet/empirical_attention/metafile.yml @@ -0,0 +1,103 @@ +Collections: + - Name: Empirical Attention + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + 
Training Resources: 8x V100 GPUs + Architecture: + - Deformable Convolution + - FPN + - RPN + - ResNet + - RoIAlign + - Spatial Attention + Paper: + URL: https://arxiv.org/pdf/1904.05873 + Title: 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks' + README: configs/empirical_attention/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/ops/generalized_attention.py#L10 + Version: v2.0.0 + +Models: + - Name: faster_rcnn_r50_fpn_attention_1111_1x_coco + In Collection: Empirical Attention + Config: configs/empirical_attention/faster_rcnn_r50_fpn_attention_1111_1x_coco.py + Metadata: + Training Memory (GB): 8.0 + inference time (ms/im): + - value: 72.46 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_1111_1x_coco/faster_rcnn_r50_fpn_attention_1111_1x_coco_20200130-403cccba.pth + + - Name: faster_rcnn_r50_fpn_attention_0010_1x_coco + In Collection: Empirical Attention + Config: configs/empirical_attention/faster_rcnn_r50_fpn_attention_0010_1x_coco.py + Metadata: + Training Memory (GB): 4.2 + inference time (ms/im): + - value: 54.35 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_0010_1x_coco/faster_rcnn_r50_fpn_attention_0010_1x_coco_20200130-7cb0c14d.pth + + - Name: faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco + In Collection: Empirical Attention + Config: configs/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco.py + Metadata: + Training Memory (GB): 8.0 + inference time (ms/im): + - value: 78.74 + hardware: V100 + 
backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco_20200130-8b2523a6.pth + + - Name: faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco + In Collection: Empirical Attention + Config: configs/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco.py + Metadata: + Training Memory (GB): 4.2 + inference time (ms/im): + - value: 58.48 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco_20200130-1a2e831d.pth diff --git a/configs/mmdet/fast_rcnn/README.md b/configs/mmdet/fast_rcnn/README.md new file mode 100644 index 00000000..dbe926d6 --- /dev/null +++ b/configs/mmdet/fast_rcnn/README.md @@ -0,0 +1,72 @@ +# Fast R-CNN + +> [Fast R-CNN](https://arxiv.org/abs/1504.08083) + + + +## Abstract + +This paper proposes a Fast Region-based Convolutional Network method (Fast R-CNN) for object detection. Fast R-CNN builds on previous work to efficiently classify object proposals using deep convolutional networks. Compared to previous work, Fast R-CNN employs several innovations to improve training and testing speed while also increasing detection accuracy. Fast R-CNN trains the very deep VGG16 network 9x faster than R-CNN, is 213x faster at test-time, and achieves a higher mAP on PASCAL VOC 2012. Compared to SPPnet, Fast R-CNN trains VGG16 3x faster, tests 10x faster, and is more accurate. + +
+ +
+ +## Introduction + +Before training the Fast R-CNN, users should first train an [RPN](../rpn/README.md), and use the RPN to extract the region proposals. + +- First, extract the region proposals of the val set with the command below: +```bash +./tools/dist_test.sh \ + configs/rpn_r50_fpn_1x_coco.py \ + checkpoints/rpn_r50_fpn_1x_coco_20200218-5525fa2e.pth \ + 8 \ + --out proposals/rpn_r50_fpn_1x_val2017.pkl +``` + +- Then, change the `ann_file` and `img_prefix` of `data.test` in the RPN config to point to the train set, as shown below: + +```python +data = dict( + test=dict( + ann_file='data/coco/annotations/instances_train2017.json', + img_prefix='data/coco/train2017/')) +``` + +- Extract the region proposals of the train set with the command below: + +```bash +./tools/dist_test.sh \ + configs/rpn_r50_fpn_1x_coco.py \ + checkpoints/rpn_r50_fpn_1x_coco_20200218-5525fa2e.pth \ + 8 \ + --out proposals/rpn_r50_fpn_1x_train2017.pkl +``` + +- Modify the path of `proposal_file` in the Fast R-CNN config as shown below: + +```python +data = dict( + train=dict( + proposal_file='proposals/rpn_r50_fpn_1x_train2017.pkl'), + val=dict( + proposal_file='proposals/rpn_r50_fpn_1x_val2017.pkl'), + test=dict( + proposal_file='proposals/rpn_r50_fpn_1x_val2017.pkl')) +``` + +Finally, users can start training the Fast R-CNN. 
+ +## Results and Models + +## Citation + +```latex +@inproceedings{girshick2015fast, + title={Fast r-cnn}, + author={Girshick, Ross}, + booktitle={Proceedings of the IEEE international conference on computer vision}, + year={2015} +} +``` diff --git a/configs/mmdet/fast_rcnn/fast_rcnn_r101_caffe_fpn_1x_coco.py b/configs/mmdet/fast_rcnn/fast_rcnn_r101_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..3ab8e981 --- /dev/null +++ b/configs/mmdet/fast_rcnn/fast_rcnn_r101_caffe_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './fast_rcnn_r50_caffe_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/configs/mmdet/fast_rcnn/fast_rcnn_r101_fpn_1x_coco.py b/configs/mmdet/fast_rcnn/fast_rcnn_r101_fpn_1x_coco.py new file mode 100644 index 00000000..83852b24 --- /dev/null +++ b/configs/mmdet/fast_rcnn/fast_rcnn_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './fast_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/fast_rcnn/fast_rcnn_r101_fpn_2x_coco.py b/configs/mmdet/fast_rcnn/fast_rcnn_r101_fpn_2x_coco.py new file mode 100644 index 00000000..c2208857 --- /dev/null +++ b/configs/mmdet/fast_rcnn/fast_rcnn_r101_fpn_2x_coco.py @@ -0,0 +1,6 @@ +_base_ = './fast_rcnn_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/fast_rcnn/fast_rcnn_r50_caffe_fpn_1x_coco.py b/configs/mmdet/fast_rcnn/fast_rcnn_r50_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..f1b29ef3 --- /dev/null +++ b/configs/mmdet/fast_rcnn/fast_rcnn_r50_caffe_fpn_1x_coco.py @@ -0,0 +1,48 @@ +_base_ = './fast_rcnn_r50_fpn_1x_coco.py' + +model = dict( + backbone=dict( + norm_cfg=dict(type='BN', requires_grad=False), + style='caffe', + init_cfg=dict( + 
type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) + +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadProposals', num_max_proposals=2000), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadProposals', num_max_proposals=None), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['proposals']), + dict( + type='ToDataContainer', + fields=[dict(key='proposals', stack=False)]), + dict(type='Collect', keys=['img', 'proposals']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/fast_rcnn/fast_rcnn_r50_fpn_1x_coco.py b/configs/mmdet/fast_rcnn/fast_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 00000000..d2f080e9 --- /dev/null +++ b/configs/mmdet/fast_rcnn/fast_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,52 @@ +_base_ = [ + '../_base_/models/fast_rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + 
dict(type='LoadProposals', num_max_proposals=2000), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadProposals', num_max_proposals=None), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['proposals']), + dict( + type='ToDataContainer', + fields=[dict(key='proposals', stack=False)]), + dict(type='Collect', keys=['img', 'proposals']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_train2017.pkl', + pipeline=train_pipeline), + val=dict( + proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl', + pipeline=test_pipeline), + test=dict( + proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl', + pipeline=test_pipeline)) diff --git a/configs/mmdet/fast_rcnn/fast_rcnn_r50_fpn_2x_coco.py b/configs/mmdet/fast_rcnn/fast_rcnn_r50_fpn_2x_coco.py new file mode 100644 index 00000000..228e8564 --- /dev/null +++ b/configs/mmdet/fast_rcnn/fast_rcnn_r50_fpn_2x_coco.py @@ -0,0 +1,5 @@ +_base_ = './fast_rcnn_r50_fpn_1x_coco.py' + +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/faster_rcnn/README.md b/configs/mmdet/faster_rcnn/README.md new file mode 100644 index 00000000..359d0ce6 --- /dev/null +++ b/configs/mmdet/faster_rcnn/README.md @@ -0,0 +1,88 @@ +# Faster R-CNN + +> [Faster 
R-CNN: Towards Real-Time Object Detection with Region Proposal Networks](https://arxiv.org/abs/1506.01497) + + + +## Abstract + +State-of-the-art object detection networks depend on region proposal algorithms to hypothesize object locations. Advances like SPPnet and Fast R-CNN have reduced the running time of these detection networks, exposing region proposal computation as a bottleneck. In this work, we introduce a Region Proposal Network (RPN) that shares full-image convolutional features with the detection network, thus enabling nearly cost-free region proposals. An RPN is a fully convolutional network that simultaneously predicts object bounds and objectness scores at each position. The RPN is trained end-to-end to generate high-quality region proposals, which are used by Fast R-CNN for detection. We further merge RPN and Fast R-CNN into a single network by sharing their convolutional features---using the recently popular terminology of neural networks with 'attention' mechanisms, the RPN component tells the unified network where to look. For the very deep VGG-16 model, our detection system has a frame rate of 5fps (including all steps) on a GPU, while achieving state-of-the-art object detection accuracy on PASCAL VOC 2007, 2012, and MS COCO datasets with only 300 proposals per image. In ILSVRC and COCO 2015 competitions, Faster R-CNN and RPN are the foundations of the 1st-place winning entries in several tracks. + +
+ +
+ +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :------: | :--------: | +| R-50-C4 | caffe | 1x | - | - | 35.6 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_c4_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_c4_1x_coco/faster_rcnn_r50_caffe_c4_1x_coco_20220316_150152-3f885b85.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_c4_1x_coco/faster_rcnn_r50_caffe_c4_1x_coco_20220316_150152.log.json) | +| R-50-DC5 | caffe | 1x | - | - | 37.2 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_1x_coco/faster_rcnn_r50_caffe_dc5_1x_coco_20201030_151909-531f0f43.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_1x_coco/faster_rcnn_r50_caffe_dc5_1x_coco_20201030_151909.log.json) | +| R-50-FPN | caffe | 1x | 3.8 | | 37.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco/faster_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.378_20200504_180032-c5925ee5.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco/faster_rcnn_r50_caffe_fpn_1x_coco_20200504_180032.log.json) | +| R-50-FPN | pytorch | 1x | 4.0 | 21.4 | 37.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130_204655.log.json) | +| R-50-FPN (FP16) | pytorch | 1x | 3.4 | 28.8 | 37.5 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fp16/faster_rcnn_r50_fpn_fp16_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fp16/faster_rcnn_r50_fpn_fp16_1x_coco/faster_rcnn_r50_fpn_fp16_1x_coco_20200204-d4dc1471.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fp16/faster_rcnn_r50_fpn_fp16_1x_coco/faster_rcnn_r50_fpn_fp16_1x_coco_20200204_143530.log.json) | +| R-50-FPN | pytorch | 2x | - | - | 38.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_20200504_210434.log.json) | +| R-101-FPN | caffe | 1x | 5.7 | | 39.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r101_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_caffe_fpn_1x_coco/faster_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.398_20200504_180057-b269e9dd.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_caffe_fpn_1x_coco/faster_rcnn_r101_caffe_fpn_1x_coco_20200504_180057.log.json) | +| R-101-FPN | pytorch | 1x | 6.0 | 15.6 | 39.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r101_fpn_1x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_1x_coco/faster_rcnn_r101_fpn_1x_coco_20200130-f513f705.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_1x_coco/faster_rcnn_r101_fpn_1x_coco_20200130_204655.log.json) | +| R-101-FPN | pytorch | 2x | - | - | 39.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r101_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_2x_coco/faster_rcnn_r101_fpn_2x_coco_bbox_mAP-0.398_20200504_210455-1d2dac9c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_2x_coco/faster_rcnn_r101_fpn_2x_coco_20200504_210455.log.json) | +| X-101-32x4d-FPN | pytorch | 1x | 7.2 | 13.8 | 41.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco/faster_rcnn_x101_32x4d_fpn_1x_coco_20200203-cff10310.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco/faster_rcnn_x101_32x4d_fpn_1x_coco_20200203_000520.log.json) | +| X-101-32x4d-FPN | pytorch | 2x | - | - | 41.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_x101_32x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_2x_coco/faster_rcnn_x101_32x4d_fpn_2x_coco_bbox_mAP-0.412_20200506_041400-64a12c0b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_2x_coco/faster_rcnn_x101_32x4d_fpn_2x_coco_20200506_041400.log.json) | +| X-101-64x4d-FPN | pytorch | 1x | 10.3 | 9.4 | 42.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_x101_64x4d_fpn_1x_coco.py) 
| [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_1x_coco/faster_rcnn_x101_64x4d_fpn_1x_coco_20200204-833ee192.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_1x_coco/faster_rcnn_x101_64x4d_fpn_1x_coco_20200204_134340.log.json) | +| X-101-64x4d-FPN | pytorch | 2x | - | - | 41.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_x101_64x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_2x_coco/faster_rcnn_x101_64x4d_fpn_2x_coco_20200512_161033-5961fa95.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_2x_coco/faster_rcnn_x101_64x4d_fpn_2x_coco_20200512_161033.log.json) | + +## Different regression loss + +We trained with R-50-FPN pytorch style backbone for 1x schedule. + +| Backbone | Loss type | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-------: | :------: | :------------: | :----: | :------: | :--------: | +| R-50-FPN | L1Loss | 4.0 | 21.4 | 37.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130_204655.log.json) | +| R-50-FPN | IoULoss | | | 37.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_iou_1x_coco-fdd207f3.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_iou_1x_coco_20200506_095954.log.json) | +| R-50-FPN | GIoULoss | | | 37.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_giou_1x_coco-0eada910.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_giou_1x_coco_20200505_161120.log.json) | +| R-50-FPN | BoundedIoULoss | | | 37.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_bounded_iou_1x_coco-98ad993b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_bounded_iou_1x_coco_20200505_160738.log.json) | + +## Pre-trained Models + +We also train some models with longer schedules and multi-scale training. The users could finetune them for downstream tasks. 
+ +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :------: | :--------: | +| [R-50-C4](./faster_rcnn_r50_caffe_c4_mstrain_1x_coco.py) | caffe | 1x | - | | 35.9 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_c4_mstrain_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_c4_mstrain_1x_coco/faster_rcnn_r50_caffe_c4_mstrain_1x_coco_20220316_150527-db276fed.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_c4_mstrain_1x_coco/faster_rcnn_r50_caffe_c4_mstrain_1x_coco_20220316_150527.log.json) | +| [R-50-DC5](./faster_rcnn_r50_caffe_dc5_mstrain_1x_coco.py) | caffe | 1x | - | | 37.4 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco_20201028_233851-b33d21b9.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco_20201028_233851.log.json) | +| [R-50-DC5](./faster_rcnn_r50_caffe_dc5_mstrain_3x_coco.py) | caffe | 3x | - | | 38.7 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco_20201028_002107-34a53b2c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco_20201028_002107.log.json) | +| [R-50-FPN](./faster_rcnn_r50_caffe_fpn_mstrain_2x_coco.py) | 
caffe | 2x | 3.7 | | 39.7 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco_bbox_mAP-0.397_20200504_231813-10b2de58.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco_20200504_231813.log.json) | +| [R-50-FPN](./faster_rcnn_r50_caffe_fpn_mstrain_3x_coco.py) | caffe | 3x | 3.7 | | 39.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco_20210526_095054-1f77628b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco_20210526_095054.log.json) | +| [R-50-FPN](./faster_rcnn_r50_fpn_mstrain_3x_coco.py) | pytorch | 3x | 3.9 | | 40.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_mstrain_3x_coco/faster_rcnn_r50_fpn_mstrain_3x_coco_20210524_110822-e10bd31c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_mstrain_3x_coco/faster_rcnn_r50_fpn_mstrain_3x_coco_20210524_110822.log.json) | +| [R-101-FPN](./faster_rcnn_r101_caffe_fpn_mstrain_3x_coco.py) | caffe | 3x | 5.6 | | 42.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco_20210526_095742-a7ae426d.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco_20210526_095742.log.json) | +| [R-101-FPN](./faster_rcnn_r101_fpn_mstrain_3x_coco.py) | pytorch | 3x | 5.8 | | 41.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r101_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_mstrain_3x_coco/faster_rcnn_r101_fpn_mstrain_3x_coco_20210524_110822-4d4d2ca8.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_mstrain_3x_coco/faster_rcnn_r101_fpn_mstrain_3x_coco_20210524_110822.log.json) | +| [X-101-32x4d-FPN](./faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py) | pytorch | 3x | 7.0 | | 42.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco_20210524_124151-16b9b260.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco_20210524_124151.log.json) | +| [X-101-32x8d-FPN](./faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py) | pytorch | 3x | 10.1 | | 42.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco_20210604_182954-002e082a.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco_20210604_182954.log.json) | +| [X-101-64x4d-FPN](./faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py) | pytorch | 3x | 10.0 | | 43.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco_20210524_124528-26c63de6.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco_20210524_124528.log.json) | + +We further finetune some pre-trained models on the COCO subsets, which contain only a few of the 80 categories. + +| Backbone | Style | Class name | Pre-trained model | Mem (GB) | box AP | Config | Download | +| ------------------------------------------------------------ | ----- | ------------------ | ------------------------------------------------------------ | -------- | ------ | ------------------------------------------------------------ | ------------------------------------------------------------ | +| [R-50-FPN](./faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py) | caffe | person | [R-50-FPN-Caffe-3x](./faster_rcnn_r50_caffe_fpn_mstrain_3x_coco.py) | 3.7 | 55.8 | [config](./faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929.log.json) | +| [R-50-FPN](./faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person-bicycle-car.py) | caffe | person-bicycle-car | 
[R-50-FPN-Caffe-3x](./faster_rcnn_r50_caffe_fpn_mstrain_3x_coco.py) | 3.7 | 44.1 | [config](./faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person-bicycle-car.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person-bicycle-car/faster_rcnn_r50_fpn_1x_coco-person-bicycle-car_20201216_173117-6eda6d92.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person-bicycle-car/faster_rcnn_r50_fpn_1x_coco-person-bicycle-car_20201216_173117.log.json) | + +## Torchvision New Recipe (TNR) + +Torchvision released its high-precision ResNet models. The training details can be found on the [PyTorch website](https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/). Here, we have done grid searches on learning rate and weight decay and found the optimal hyper-parameters on the detection task. + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :------: | :--------: | +| [R-50-TNR](./faster_rcnn_r50_fpn_tnr-pretrain_1x_coco.py) | pytorch | 1x | - | | 40.2 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco_20220320_085147-efedfda4.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco_20220320_085147.log.json) | + +## Citation + +```latex +@article{Ren_2017, + title={Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks}, + journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, + publisher={Institute of Electrical and Electronics Engineers (IEEE)}, + author={Ren, 
Shaoqing and He, Kaiming and Girshick, Ross and Sun, Jian}, + year={2017}, + month={Jun}, +} +``` diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r101_caffe_fpn_1x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r101_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..c6f078c7 --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r101_caffe_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './faster_rcnn_r50_caffe_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..6a13fe9f --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco.py @@ -0,0 +1,49 @@ +_base_ = 'faster_rcnn_r50_fpn_mstrain_3x_coco.py' + +model = dict( + backbone=dict( + depth=101, + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) + +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', 
size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +data = dict( + train=dict(dataset=dict(pipeline=train_pipeline)), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r101_fpn_1x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r101_fpn_1x_coco.py new file mode 100644 index 00000000..1de53a6c --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r101_fpn_2x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r101_fpn_2x_coco.py new file mode 100644 index 00000000..0d415994 --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r101_fpn_2x_coco.py @@ -0,0 +1,6 @@ +_base_ = './faster_rcnn_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r101_fpn_mstrain_3x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r101_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..0b498bb6 --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r101_fpn_mstrain_3x_coco.py @@ -0,0 +1,7 @@ +_base_ = 'faster_rcnn_r50_fpn_mstrain_3x_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_c4_1x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_c4_1x_coco.py new file mode 100644 index 00000000..b071962e --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_c4_1x_coco.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_caffe_c4.py', + '../_base_/datasets/coco_detection.py', + 
'../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_c4_mstrain_1x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_c4_mstrain_1x_coco.py new file mode 100644 index 00000000..f4d83e6b --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_c4_mstrain_1x_coco.py @@ -0,0 +1,38 @@ +_base_ = './faster_rcnn_r50_caffe_c4_1x_coco.py' +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + 
dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_dc5_1x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_dc5_1x_coco.py new file mode 100644 index 00000000..ee2010c6 --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_dc5_1x_coco.py @@ -0,0 +1,37 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_caffe_dc5.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data 
= dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco.py new file mode 100644 index 00000000..14eaef2d --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco.py @@ -0,0 +1,42 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_caffe_dc5.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco.py new file mode 100644 index 00000000..403747f1 --- /dev/null +++ 
b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco.py @@ -0,0 +1,4 @@ +_base_ = './faster_rcnn_r50_caffe_dc5_mstrain_1x_coco.py' +# learning policy +lr_config = dict(step=[28, 34]) +runner = dict(type='EpochBasedRunner', max_epochs=36) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..56c01bdc --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco.py @@ -0,0 +1,41 @@ +_base_ = './faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_90k_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_90k_coco.py new file mode 100644 index 00000000..b5aea6a7 --- /dev/null 
+++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_90k_coco.py @@ -0,0 +1,15 @@ +_base_ = 'faster_rcnn_r50_caffe_fpn_1x_coco.py' + +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[60000, 80000]) + +# Runner type +runner = dict(_delete_=True, type='IterBasedRunner', max_iters=90000) + +checkpoint_config = dict(interval=10000) +evaluation = dict(interval=10000, metric='bbox') diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person-bicycle-car.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person-bicycle-car.py new file mode 100644 index 00000000..4f1f376c --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person-bicycle-car.py @@ -0,0 +1,9 @@ +_base_ = './faster_rcnn_r50_caffe_fpn_mstrain_1x_coco.py' +model = dict(roi_head=dict(bbox_head=dict(num_classes=3))) +classes = ('person', 'bicycle', 'car') +data = dict( + train=dict(classes=classes), + val=dict(classes=classes), + test=dict(classes=classes)) + +load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco_bbox_mAP-0.398_20200504_163323-30042637.pth' # noqa diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py new file mode 100644 index 00000000..b5dfb4fe --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py @@ -0,0 +1,9 @@ +_base_ = './faster_rcnn_r50_caffe_fpn_mstrain_1x_coco.py' +model = dict(roi_head=dict(bbox_head=dict(num_classes=1))) +classes = ('person', ) +data = dict( + train=dict(classes=classes), + val=dict(classes=classes), + test=dict(classes=classes)) + +load_from = 
'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco_bbox_mAP-0.398_20200504_163323-30042637.pth' # noqa diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco.py new file mode 100644 index 00000000..f807a19a --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco.py @@ -0,0 +1,46 @@ +_base_ = './faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco.py 
b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco.py new file mode 100644 index 00000000..df58973f --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco.py @@ -0,0 +1,4 @@ +_base_ = './faster_rcnn_r50_caffe_fpn_mstrain_1x_coco.py' +# learning policy +lr_config = dict(step=[16, 23]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..9eeaacea --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco.py @@ -0,0 +1,47 @@ +_base_ = 'faster_rcnn_r50_fpn_mstrain_3x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) + +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +data = dict( + train=dict(dataset=dict(pipeline=train_pipeline)), + val=dict(pipeline=test_pipeline), + 
test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_90k_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_90k_coco.py new file mode 100644 index 00000000..74dca24f --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_90k_coco.py @@ -0,0 +1,15 @@ +_base_ = 'faster_rcnn_r50_caffe_fpn_mstrain_1x_coco.py' + +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[60000, 80000]) + +# Runner type +runner = dict(_delete_=True, type='IterBasedRunner', max_iters=90000) + +checkpoint_config = dict(interval=10000) +evaluation = dict(interval=10000, metric='bbox') diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 00000000..009bd93d --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_2x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_2x_coco.py new file mode 100644 index 00000000..e77a7fa8 --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_2x_coco.py @@ -0,0 +1,5 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_bounded_iou_1x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_bounded_iou_1x_coco.py new file mode 100644 index 00000000..648081f1 --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_bounded_iou_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + roi_head=dict( + 
bbox_head=dict( + reg_decoded_bbox=True, + loss_bbox=dict(type='BoundedIoULoss', loss_weight=10.0)))) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_ciou_1x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_ciou_1x_coco.py new file mode 100644 index 00000000..886d5668 --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_ciou_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + roi_head=dict( + bbox_head=dict( + reg_decoded_bbox=True, + loss_bbox=dict(type='CIoULoss', loss_weight=12.0)))) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_fp16_1x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_fp16_1x_coco.py new file mode 100644 index 00000000..acd4040c --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_fp16_1x_coco.py @@ -0,0 +1,3 @@ +_base_ = './faster_rcnn_r50_fpn_1x_coco.py' +# fp16 settings +fp16 = dict(loss_scale=512.) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_giou_1x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_giou_1x_coco.py new file mode 100644 index 00000000..5556c497 --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_giou_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + roi_head=dict( + bbox_head=dict( + reg_decoded_bbox=True, + loss_bbox=dict(type='GIoULoss', loss_weight=10.0)))) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_iou_1x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_iou_1x_coco.py new file mode 100644 index 00000000..ddf663e4 --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_iou_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + roi_head=dict( + bbox_head=dict( + reg_decoded_bbox=True, + loss_bbox=dict(type='IoULoss', loss_weight=10.0)))) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_mstrain_3x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_mstrain_3x_coco.py new 
file mode 100644 index 00000000..faf8f924 --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_mstrain_3x_coco.py @@ -0,0 +1,3 @@ +_base_ = [ + '../common/mstrain_3x_coco.py', '../_base_/models/faster_rcnn_r50_fpn.py' +] diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_ohem_1x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_ohem_1x_coco.py new file mode 100644 index 00000000..f897e7c5 --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_ohem_1x_coco.py @@ -0,0 +1,2 @@ +_base_ = './faster_rcnn_r50_fpn_1x_coco.py' +model = dict(train_cfg=dict(rcnn=dict(sampler=dict(type='OHEMSampler')))) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_soft_nms_1x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_soft_nms_1x_coco.py new file mode 100644 index 00000000..759ae3a7 --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_soft_nms_1x_coco.py @@ -0,0 +1,12 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + test_cfg=dict( + rcnn=dict( + score_thr=0.05, + nms=dict(type='soft_nms', iou_threshold=0.5), + max_per_img=100))) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco.py new file mode 100644 index 00000000..ecbfb928 --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco.py @@ -0,0 +1,17 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +checkpoint = 'https://download.pytorch.org/models/resnet50-11ad3fa6.pth' +model = dict( + backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=checkpoint))) + +# `lr` and `weight_decay` have been searched to be optimal. 
+optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.0001, + weight_decay=0.1, + paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True)) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco.py new file mode 100644 index 00000000..3808c9f2 --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_x101_32x4d_fpn_2x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_x101_32x4d_fpn_2x_coco.py new file mode 100644 index 00000000..e93f5d81 --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_x101_32x4d_fpn_2x_coco.py @@ -0,0 +1,14 @@ +_base_ = './faster_rcnn_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..f55985d6 --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py @@ -0,0 +1,16 @@ +_base_ = [ + '../common/mstrain_3x_coco.py', '../_base_/models/faster_rcnn_r50_fpn.py' +] +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + 
norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..a5d5aebb --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py @@ -0,0 +1,62 @@ +_base_ = [ + '../common/mstrain_3x_coco.py', '../_base_/models/faster_rcnn_r50_fpn.py' +] +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=8, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + style='pytorch', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnext101_32x8d'))) + +# ResNeXt-101-32x8d model trained with Caffe2 at FB, +# so the mean and std need to be changed. +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], + std=[57.375, 57.120, 58.395], + to_rgb=False) + +# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)], +# multiscale_mode='range' +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +# Use 
RepeatDataset to speed up training +data = dict( + train=dict(dataset=dict(pipeline=train_pipeline)), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_x101_64x4d_fpn_1x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_x101_64x4d_fpn_1x_coco.py new file mode 100644 index 00000000..8bf2b65a --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_x101_64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_x101_64x4d_fpn_2x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_x101_64x4d_fpn_2x_coco.py new file mode 100644 index 00000000..7ea9b2da --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_x101_64x4d_fpn_2x_coco.py @@ -0,0 +1,14 @@ +_base_ = './faster_rcnn_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/faster_rcnn/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py b/configs/mmdet/faster_rcnn/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..80397f4b --- /dev/null +++ b/configs/mmdet/faster_rcnn/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py @@ -0,0 +1,16 @@ +_base_ = [ + '../common/mstrain_3x_coco.py', '../_base_/models/faster_rcnn_r50_fpn.py' +] +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + 
frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/faster_rcnn/metafile.yml b/configs/mmdet/faster_rcnn/metafile.yml new file mode 100644 index 00000000..91d6751b --- /dev/null +++ b/configs/mmdet/faster_rcnn/metafile.yml @@ -0,0 +1,451 @@ +Collections: + - Name: Faster R-CNN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - RPN + - ResNet + - RoIPool + Paper: + URL: https://arxiv.org/abs/1506.01497 + Title: "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks" + README: configs/faster_rcnn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/faster_rcnn.py#L6 + Version: v2.0.0 + +Models: + - Name: faster_rcnn_r50_caffe_c4_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r50_caffe_c4_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 35.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_c4_1x_coco/faster_rcnn_r50_caffe_c4_1x_coco_20220316_150152-3f885b85.pth + + - Name: faster_rcnn_r50_caffe_c4_mstrain_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r50_caffe_c4_mstrain_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 35.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_c4_mstrain_1x_coco/faster_rcnn_r50_caffe_c4_mstrain_1x_coco_20220316_150527-db276fed.pth + + - Name: faster_rcnn_r50_caffe_dc5_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: 
COCO + Metrics: + box AP: 37.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_1x_coco/faster_rcnn_r50_caffe_dc5_1x_coco_20201030_151909-531f0f43.pth + + - Name: faster_rcnn_r50_caffe_fpn_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.8 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco/faster_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.378_20200504_180032-c5925ee5.pth + + - Name: faster_rcnn_r50_fpn_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.0 + inference time (ms/im): + - value: 46.73 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth + + - Name: faster_rcnn_r50_fpn_fp16_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r50_fpn_fp16_1x_coco.py + Metadata: + Training Memory (GB): 3.4 + Training Techniques: + - SGD with Momentum + - Weight Decay + - Mixed Precision Training + inference time (ms/im): + - value: 34.72 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP16 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fp16/faster_rcnn_r50_fpn_fp16_1x_coco/faster_rcnn_r50_fpn_fp16_1x_coco_20200204-d4dc1471.pth + + - Name: faster_rcnn_r50_fpn_2x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r50_fpn_2x_coco.py + Metadata: 
+ Training Memory (GB): 4.0 + inference time (ms/im): + - value: 46.73 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth + + - Name: faster_rcnn_r101_caffe_fpn_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r101_caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.7 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_caffe_fpn_1x_coco/faster_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.398_20200504_180057-b269e9dd.pth + + - Name: faster_rcnn_r101_fpn_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.0 + inference time (ms/im): + - value: 64.1 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_1x_coco/faster_rcnn_r101_fpn_1x_coco_20200130-f513f705.pth + + - Name: faster_rcnn_r101_fpn_2x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r101_fpn_2x_coco.py + Metadata: + Training Memory (GB): 6.0 + inference time (ms/im): + - value: 64.1 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.8 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_2x_coco/faster_rcnn_r101_fpn_2x_coco_bbox_mAP-0.398_20200504_210455-1d2dac9c.pth + + - Name: faster_rcnn_x101_32x4d_fpn_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.2 + inference time (ms/im): + - value: 72.46 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco/faster_rcnn_x101_32x4d_fpn_1x_coco_20200203-cff10310.pth + + - Name: faster_rcnn_x101_32x4d_fpn_2x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_x101_32x4d_fpn_2x_coco.py + Metadata: + Training Memory (GB): 7.2 + inference time (ms/im): + - value: 72.46 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_2x_coco/faster_rcnn_x101_32x4d_fpn_2x_coco_bbox_mAP-0.412_20200506_041400-64a12c0b.pth + + - Name: faster_rcnn_x101_64x4d_fpn_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_x101_64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 10.3 + inference time (ms/im): + - value: 106.38 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_1x_coco/faster_rcnn_x101_64x4d_fpn_1x_coco_20200204-833ee192.pth + + - Name: faster_rcnn_x101_64x4d_fpn_2x_coco + In Collection: Faster R-CNN + Config: 
configs/faster_rcnn/faster_rcnn_x101_64x4d_fpn_2x_coco.py + Metadata: + Training Memory (GB): 10.3 + inference time (ms/im): + - value: 106.38 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_2x_coco/faster_rcnn_x101_64x4d_fpn_2x_coco_20200512_161033-5961fa95.pth + + - Name: faster_rcnn_r50_fpn_iou_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r50_fpn_iou_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_iou_1x_coco-fdd207f3.pth + + - Name: faster_rcnn_r50_fpn_giou_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r50_fpn_giou_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_giou_1x_coco-0eada910.pth + + - Name: faster_rcnn_r50_fpn_bounded_iou_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r50_fpn_bounded_iou_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_bounded_iou_1x_coco-98ad993b.pth + + - Name: faster_rcnn_r50_caffe_dc5_mstrain_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.4 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco_20201028_233851-b33d21b9.pth + + - Name: faster_rcnn_r50_caffe_dc5_mstrain_3x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco_20201028_002107-34a53b2c.pth + + - Name: faster_rcnn_r50_caffe_fpn_mstrain_2x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco.py + Metadata: + Training Memory (GB): 4.3 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco_bbox_mAP-0.397_20200504_231813-10b2de58.pth + + - Name: faster_rcnn_r50_caffe_fpn_mstrain_3x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 3.7 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco_20210526_095054-1f77628b.pth + + - Name: faster_rcnn_r50_fpn_mstrain_3x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r50_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 3.9 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.3 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_mstrain_3x_coco/faster_rcnn_r50_fpn_mstrain_3x_coco_20210524_110822-e10bd31c.pth + + - Name: faster_rcnn_r101_caffe_fpn_mstrain_3x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 5.6 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco_20210526_095742-a7ae426d.pth + + - Name: faster_rcnn_r101_fpn_mstrain_3x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r101_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 5.8 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_mstrain_3x_coco/faster_rcnn_r101_fpn_mstrain_3x_coco_20210524_110822-4d4d2ca8.pth + + - Name: faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 7.0 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco_20210524_124151-16b9b260.pth + + - Name: faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 10.1 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.4 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco_20210604_182954-002e082a.pth + + - Name: faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 10.0 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco_20210524_124528-26c63de6.pth + + - Name: faster_rcnn_r50_fpn_tnr-pretrain_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco.py + Metadata: + Training Memory (GB): 4.0 + inference time (ms/im): + - value: 46.73 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco_20220320_085147-efedfda4.pth diff --git a/configs/mmdet/fcos/README.md b/configs/mmdet/fcos/README.md new file mode 100644 index 00000000..706fad56 --- /dev/null +++ b/configs/mmdet/fcos/README.md @@ -0,0 +1,45 @@ +# FCOS + +> [FCOS: Fully Convolutional One-Stage Object Detection](https://arxiv.org/abs/1904.01355) + + + +## Abstract + +We propose a fully convolutional one-stage object detector (FCOS) to solve object detection in a per-pixel prediction fashion, analogue to semantic segmentation. Almost all state-of-the-art object detectors such as RetinaNet, SSD, YOLOv3, and Faster R-CNN rely on pre-defined anchor boxes. In contrast, our proposed detector FCOS is anchor box free, as well as proposal free. 
By eliminating the predefined set of anchor boxes, FCOS completely avoids the complicated computation related to anchor boxes such as calculating overlapping during training. More importantly, we also avoid all hyper-parameters related to anchor boxes, which are often very sensitive to the final detection performance. With the only post-processing non-maximum suppression (NMS), FCOS with ResNeXt-64x4d-101 achieves 44.7% in AP with single-model and single-scale testing, surpassing previous one-stage detectors with the advantage of being much simpler. For the first time, we demonstrate a much simpler and flexible detection framework achieving improved detection accuracy. We hope that the proposed FCOS framework can serve as a simple and strong alternative for many other instance-level tasks. + +
+ +
+ +## Results and Models + +| Backbone | Style | GN | MS train | Tricks | DCN | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +|:---------:|:-------:|:-------:|:--------:|:-------:|:-------:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:| +| R-50 | caffe | Y | N | N | N | 1x | 3.6 | 22.7 | 36.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco/fcos_r50_caffe_fpn_gn-head_1x_coco-821213aa.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco/20201227_180009.log.json) | +| R-50 | caffe | Y | N | Y | N | 1x | 3.7 | - | 38.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco-0a0d75a8.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco/20210105_135818.log.json)| +| R-50 | caffe | Y | N | Y | Y | 1x | 3.8 | - | 42.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco-ae4d8b3d.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco/20210105_224556.log.json)| +| R-101 | caffe | Y | N | N | N | 1x | 5.5 | 17.3 | 39.1 | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fcos/fcos_r101_caffe_fpn_gn-head_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r101_caffe_fpn_gn-head_1x_coco/fcos_r101_caffe_fpn_gn-head_1x_coco-0e37b982.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r101_caffe_fpn_gn-head_1x_coco/20210103_155046.log.json) | + +| Backbone | Style | GN | MS train | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +|:---------:|:-------:|:-------:|:--------:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:| +| R-50 | caffe | Y | Y | 2x | 2.6 | 22.9 | 38.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fcos/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco-d92ceeea.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco/20201227_161900.log.json) | +| R-101 | caffe | Y | Y | 2x | 5.5 | 17.3 | 40.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco-511424d6.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco/20210103_155046.log.json) | +| X-101 | pytorch | Y | Y | 2x | 10.0 | 9.7 | 42.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco-ede514a8.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco/20210114_133041.log.json) | + +**Notes:** + +- The X-101 backbone is X-101-64x4d. +- Tricks means setting `norm_on_bbox`, `centerness_on_reg`, `center_sampling` as `True`. +- DCN means using `DCNv2` in both backbone and head. + +## Citation + +```latex +@article{tian2019fcos, + title={FCOS: Fully Convolutional One-Stage Object Detection}, + author={Tian, Zhi and Shen, Chunhua and Chen, Hao and He, Tong}, + journal={arXiv preprint arXiv:1904.01355}, + year={2019} +} +``` diff --git a/configs/mmdet/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py b/configs/mmdet/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py new file mode 100644 index 00000000..2699bdb9 --- /dev/null +++ b/configs/mmdet/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py @@ -0,0 +1,54 @@ +_base_ = 'fcos_r50_caffe_fpn_gn-head_1x_coco.py' + +model = dict( + backbone=dict( + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + bbox_head=dict( + norm_on_bbox=True, + centerness_on_reg=True, + dcn_on_last_conv=False, + center_sampling=True, + conv_bias=True, + loss_bbox=dict(type='GIoULoss', loss_weight=1.0)), + # training and testing settings + test_cfg=dict(nms=dict(type='nms', iou_threshold=0.6))) + +# dataset settings +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + 
type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +optimizer_config = dict(_delete_=True, grad_clip=None) + +lr_config = dict(warmup='linear') diff --git a/configs/mmdet/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco.py b/configs/mmdet/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco.py new file mode 100644 index 00000000..cf93c91e --- /dev/null +++ b/configs/mmdet/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco.py @@ -0,0 +1,56 @@ +_base_ = 'fcos_r50_caffe_fpn_gn-head_1x_coco.py' + +model = dict( + backbone=dict( + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + bbox_head=dict( + norm_on_bbox=True, + centerness_on_reg=True, + dcn_on_last_conv=True, + center_sampling=True, + conv_bias=True, + loss_bbox=dict(type='GIoULoss', loss_weight=1.0)), + # training and testing settings + test_cfg=dict(nms=dict(type='nms', iou_threshold=0.6))) + +# dataset settings +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 
'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +optimizer_config = dict(_delete_=True, grad_clip=None) + +lr_config = dict(warmup='linear') diff --git a/configs/mmdet/fcos/fcos_center_r50_caffe_fpn_gn-head_1x_coco.py b/configs/mmdet/fcos/fcos_center_r50_caffe_fpn_gn-head_1x_coco.py new file mode 100644 index 00000000..9f502e7b --- /dev/null +++ b/configs/mmdet/fcos/fcos_center_r50_caffe_fpn_gn-head_1x_coco.py @@ -0,0 +1,2 @@ +_base_ = './fcos_r50_caffe_fpn_gn-head_1x_coco.py' +model = dict(bbox_head=dict(center_sampling=True, center_sample_radius=1.5)) diff --git a/configs/mmdet/fcos/fcos_r101_caffe_fpn_gn-head_1x_coco.py b/configs/mmdet/fcos/fcos_r101_caffe_fpn_gn-head_1x_coco.py new file mode 100644 index 00000000..45bea48d --- /dev/null +++ b/configs/mmdet/fcos/fcos_r101_caffe_fpn_gn-head_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './fcos_r50_caffe_fpn_gn-head_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron/resnet101_caffe'))) diff --git a/configs/mmdet/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py b/configs/mmdet/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py new file mode 100644 index 00000000..f4d36f1e --- /dev/null +++ b/configs/mmdet/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py @@ -0,0 +1,47 @@ +_base_ = './fcos_r50_caffe_fpn_gn-head_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + 
type='Pretrained', + checkpoint='open-mmlab://detectron/resnet101_caffe'))) +img_norm_cfg = dict( + mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco.py b/configs/mmdet/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco.py new file mode 100644 index 00000000..955787ba --- /dev/null +++ b/configs/mmdet/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco.py @@ -0,0 +1,106 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + type='FCOS', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron/resnet50_caffe')), + 
neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', # use P5 + num_outs=5, + relu_before_extra_convs=True), + bbox_head=dict( + type='FCOSHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='IoULoss', loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100)) +img_norm_cfg = dict( + mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict( 
+ lr=0.01, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.)) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='constant', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +runner = dict(type='EpochBasedRunner', max_epochs=12) diff --git a/configs/mmdet/fcos/fcos_r50_caffe_fpn_gn-head_4x4_1x_coco.py b/configs/mmdet/fcos/fcos_r50_caffe_fpn_gn-head_4x4_1x_coco.py new file mode 100644 index 00000000..2816b16f --- /dev/null +++ b/configs/mmdet/fcos/fcos_r50_caffe_fpn_gn-head_4x4_1x_coco.py @@ -0,0 +1,4 @@ +# TODO: Remove this config after benchmarking all related configs +_base_ = 'fcos_r50_caffe_fpn_gn-head_1x_coco.py' + +data = dict(samples_per_gpu=4, workers_per_gpu=4) diff --git a/configs/mmdet/fcos/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py b/configs/mmdet/fcos/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py new file mode 100644 index 00000000..497d03f6 --- /dev/null +++ b/configs/mmdet/fcos/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py @@ -0,0 +1,39 @@ +_base_ = './fcos_r50_caffe_fpn_gn-head_1x_coco.py' +img_norm_cfg = dict( + mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + 
dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco.py b/configs/mmdet/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco.py new file mode 100644 index 00000000..e70e4651 --- /dev/null +++ b/configs/mmdet/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco.py @@ -0,0 +1,60 @@ +_base_ = './fcos_r50_caffe_fpn_gn-head_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + 
train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict( + lr=0.01, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.)) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/fcos/metafile.yml b/configs/mmdet/fcos/metafile.yml new file mode 100644 index 00000000..ae922eb9 --- /dev/null +++ b/configs/mmdet/fcos/metafile.yml @@ -0,0 +1,146 @@ +Collections: + - Name: FCOS + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - Group Normalization + - ResNet + Paper: + URL: https://arxiv.org/abs/1904.01355 + Title: 'FCOS: Fully Convolutional One-Stage Object Detection' + README: configs/fcos/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/fcos.py#L6 + Version: v2.0.0 + +Models: + - Name: fcos_r50_caffe_fpn_gn-head_1x_coco + In Collection: FCOS + Config: configs/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco.py + Metadata: + Training Memory (GB): 3.6 + inference time (ms/im): + - value: 44.05 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco/fcos_r50_caffe_fpn_gn-head_1x_coco-821213aa.pth + + - Name: fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco + In Collection: FCOS + Config: configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py + Metadata: + Training Memory (GB): 3.7 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.7 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco-0a0d75a8.pth + + - Name: fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco + In Collection: FCOS + Config: configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco.py + Metadata: + Training Memory (GB): 3.8 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco-ae4d8b3d.pth + + - Name: fcos_r101_caffe_fpn_gn-head_1x_coco + In Collection: FCOS + Config: configs/fcos/fcos_r101_caffe_fpn_gn-head_1x_coco.py + Metadata: + Training Memory (GB): 5.5 + inference time (ms/im): + - value: 57.8 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r101_caffe_fpn_gn-head_1x_coco/fcos_r101_caffe_fpn_gn-head_1x_coco-0e37b982.pth + + - Name: fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco + In Collection: FCOS + Config: configs/fcos/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py + Metadata: + Training Memory (GB): 2.6 + inference time (ms/im): + - value: 43.67 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco-d92ceeea.pth + + - Name: fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco + In Collection: 
FCOS + Config: configs/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py + Metadata: + Training Memory (GB): 5.5 + inference time (ms/im): + - value: 57.8 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco-511424d6.pth + + - Name: fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco + In Collection: FCOS + Config: configs/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco.py + Metadata: + Training Memory (GB): 10.0 + inference time (ms/im): + - value: 103.09 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco-ede514a8.pth diff --git a/configs/mmdet/foveabox/README.md b/configs/mmdet/foveabox/README.md new file mode 100644 index 00000000..7c82820e --- /dev/null +++ b/configs/mmdet/foveabox/README.md @@ -0,0 +1,53 @@ +# FoveaBox + +> [FoveaBox: Beyond Anchor-based Object Detector](https://arxiv.org/abs/1904.03797) + + + +## Abstract + +We present FoveaBox, an accurate, flexible, and completely anchor-free framework for object detection. While almost all state-of-the-art object detectors utilize predefined anchors to enumerate possible locations, scales and aspect ratios for the search of the objects, their performance and generalization ability are also limited to the design of anchors. Instead, FoveaBox directly learns the object existing possibility and the bounding box coordinates without anchor reference. 
This is achieved by: (a) predicting category-sensitive semantic maps for the object existing possibility, and (b) producing category-agnostic bounding box for each position that potentially contains an object. The scales of target boxes are naturally associated with feature pyramid representations. In FoveaBox, an instance is assigned to adjacent feature levels to make the model more accurate. We demonstrate its effectiveness on standard benchmarks and report extensive experimental analysis. Without bells and whistles, FoveaBox achieves state-of-the-art single model performance on the standard COCO and Pascal VOC object detection benchmark. More importantly, FoveaBox avoids all computation and hyper-parameters related to anchor boxes, which are often sensitive to the final detection performance. We believe the simple and effective approach will serve as a solid baseline and help ease future research for object detection. + +
+ +
+ +## Introduction + +FoveaBox is an accurate, flexible and completely anchor-free object detection framework, as presented in our paper [https://arxiv.org/abs/1904.03797](https://arxiv.org/abs/1904.03797): +Different from previous anchor-based methods, FoveaBox directly learns the object existing possibility and the bounding box coordinates without anchor reference. This is achieved by: (a) predicting category-sensitive semantic maps for the object existing possibility, and (b) producing category-agnostic bounding box for each position that potentially contains an object. + +## Results and Models + +### Results on R50/101-FPN + +| Backbone | Style | align | ms-train| Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +|:---------:|:-------:|:-------:|:-------:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:| +| R-50 | pytorch | N | N | 1x | 5.6 | 24.1 | 36.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/foveabox/fovea_r50_fpn_4x4_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r50_fpn_4x4_1x_coco/fovea_r50_fpn_4x4_1x_coco_20200219-ee4d5303.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r50_fpn_4x4_1x_coco/fovea_r50_fpn_4x4_1x_coco_20200219_223025.log.json) | +| R-50 | pytorch | N | N | 2x | 5.6 | - | 37.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/foveabox/fovea_r50_fpn_4x4_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r50_fpn_4x4_2x_coco/fovea_r50_fpn_4x4_2x_coco_20200203-2df792b1.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r50_fpn_4x4_2x_coco/fovea_r50_fpn_4x4_2x_coco_20200203_112043.log.json) | +| R-50 | pytorch | Y | N | 2x | 8.1 | 19.4 | 37.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco/fovea_align_r50_fpn_gn-head_4x4_2x_coco_20200203-8987880d.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco/fovea_align_r50_fpn_gn-head_4x4_2x_coco_20200203_134252.log.json) | +| R-50 | pytorch | Y | Y | 2x | 8.1 | 18.3 | 40.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/foveabox/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco_20200205-85ce26cb.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco_20200205_112557.log.json) | +| R-101 | pytorch | N | N | 1x | 9.2 | 17.4 | 38.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/foveabox/fovea_r101_fpn_4x4_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r101_fpn_4x4_1x_coco/fovea_r101_fpn_4x4_1x_coco_20200219-05e38f1c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r101_fpn_4x4_1x_coco/fovea_r101_fpn_4x4_1x_coco_20200219_011740.log.json) | +| R-101 | pytorch | N | N | 2x | 11.7 | - | 40.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/foveabox/fovea_r101_fpn_4x4_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r101_fpn_4x4_2x_coco/fovea_r101_fpn_4x4_2x_coco_20200208-02320ea4.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r101_fpn_4x4_2x_coco/fovea_r101_fpn_4x4_2x_coco_20200208_202059.log.json) | +| R-101 | pytorch | Y | N | 2x | 11.7 | 14.7 | 40.0 | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/foveabox/fovea_align_r101_fpn_gn-head_4x4_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r101_fpn_gn-head_4x4_2x_coco/fovea_align_r101_fpn_gn-head_4x4_2x_coco_20200208-c39a027a.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r101_fpn_gn-head_4x4_2x_coco/fovea_align_r101_fpn_gn-head_4x4_2x_coco_20200208_203337.log.json) | +| R-101 | pytorch | Y | Y | 2x | 11.7 | 14.7 | 42.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/foveabox/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco_20200208-649c5eb6.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco_20200208_202124.log.json) | + +[1] *1x and 2x mean the model is trained for 12 and 24 epochs, respectively.* \ +[2] *Align means utilizing deformable convolution to align the cls branch.* \ +[3] *All results are obtained with a single model and without any test time data augmentation.*\ +[4] *We use 4 GPUs for training.* + +Any pull requests or issues are welcome. + +## Citation + +Please consider citing our paper in your publications if the project helps your research. BibTeX reference is as follows. 
+ +```latex +@article{kong2019foveabox, + title={FoveaBox: Beyond Anchor-based Object Detector}, + author={Kong, Tao and Sun, Fuchun and Liu, Huaping and Jiang, Yuning and Shi, Jianbo}, + journal={arXiv preprint arXiv:1904.03797}, + year={2019} +} +``` diff --git a/configs/mmdet/foveabox/fovea_align_r101_fpn_gn-head_4x4_2x_coco.py b/configs/mmdet/foveabox/fovea_align_r101_fpn_gn-head_4x4_2x_coco.py new file mode 100644 index 00000000..c5d17849 --- /dev/null +++ b/configs/mmdet/foveabox/fovea_align_r101_fpn_gn-head_4x4_2x_coco.py @@ -0,0 +1,12 @@ +_base_ = './fovea_r50_fpn_4x4_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + bbox_head=dict( + with_deform=True, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True))) +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/foveabox/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco.py b/configs/mmdet/foveabox/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco.py new file mode 100644 index 00000000..cc5affef --- /dev/null +++ b/configs/mmdet/foveabox/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco.py @@ -0,0 +1,29 @@ +_base_ = './fovea_r50_fpn_4x4_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + bbox_head=dict( + with_deform=True, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True))) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + 
dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +data = dict(train=dict(pipeline=train_pipeline)) +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco.py b/configs/mmdet/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco.py new file mode 100644 index 00000000..e7265bcd --- /dev/null +++ b/configs/mmdet/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco.py @@ -0,0 +1,10 @@ +_base_ = './fovea_r50_fpn_4x4_1x_coco.py' +model = dict( + bbox_head=dict( + with_deform=True, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True))) +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git a/configs/mmdet/foveabox/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco.py b/configs/mmdet/foveabox/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco.py new file mode 100644 index 00000000..8fc39bea --- /dev/null +++ b/configs/mmdet/foveabox/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco.py @@ -0,0 +1,25 @@ +_base_ = './fovea_r50_fpn_4x4_1x_coco.py' +model = dict( + bbox_head=dict( + with_deform=True, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True))) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +data = dict(train=dict(pipeline=train_pipeline)) +# 
learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/foveabox/fovea_r101_fpn_4x4_1x_coco.py b/configs/mmdet/foveabox/fovea_r101_fpn_4x4_1x_coco.py new file mode 100644 index 00000000..9201af11 --- /dev/null +++ b/configs/mmdet/foveabox/fovea_r101_fpn_4x4_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './fovea_r50_fpn_4x4_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/foveabox/fovea_r101_fpn_4x4_2x_coco.py b/configs/mmdet/foveabox/fovea_r101_fpn_4x4_2x_coco.py new file mode 100644 index 00000000..1ef5243f --- /dev/null +++ b/configs/mmdet/foveabox/fovea_r101_fpn_4x4_2x_coco.py @@ -0,0 +1,6 @@ +_base_ = './fovea_r50_fpn_4x4_2x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/foveabox/fovea_r50_fpn_4x4_1x_coco.py b/configs/mmdet/foveabox/fovea_r50_fpn_4x4_1x_coco.py new file mode 100644 index 00000000..7e986ebc --- /dev/null +++ b/configs/mmdet/foveabox/fovea_r50_fpn_4x4_1x_coco.py @@ -0,0 +1,52 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + type='FOVEA', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + num_outs=5, + add_extra_convs='on_input'), + bbox_head=dict( + type='FoveaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + base_edge_list=[16, 32, 64, 128, 256], + scale_ranges=((1, 
64), (32, 128), (64, 256), (128, 512), (256, 2048)), + sigma=0.4, + with_deform=False, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=1.50, + alpha=0.4, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0)), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + nms_pre=1000, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100)) +data = dict(samples_per_gpu=4, workers_per_gpu=4) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/mmdet/foveabox/fovea_r50_fpn_4x4_2x_coco.py b/configs/mmdet/foveabox/fovea_r50_fpn_4x4_2x_coco.py new file mode 100644 index 00000000..68ce4d25 --- /dev/null +++ b/configs/mmdet/foveabox/fovea_r50_fpn_4x4_2x_coco.py @@ -0,0 +1,4 @@ +_base_ = './fovea_r50_fpn_4x4_1x_coco.py' +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/foveabox/metafile.yml b/configs/mmdet/foveabox/metafile.yml new file mode 100644 index 00000000..fe9a2834 --- /dev/null +++ b/configs/mmdet/foveabox/metafile.yml @@ -0,0 +1,172 @@ +Collections: + - Name: FoveaBox + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 4x V100 GPUs + Architecture: + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/1904.03797 + Title: 'FoveaBox: Beyond Anchor-based Object Detector' + README: configs/foveabox/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/fovea.py#L6 + Version: v2.0.0 + +Models: + - Name: fovea_r50_fpn_4x4_1x_coco + In Collection: FoveaBox + Config: configs/foveabox/fovea_r50_fpn_4x4_1x_coco.py + Metadata: + Training Memory (GB): 5.6 + inference time (ms/im): + - value: 41.49 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + 
Dataset: COCO + Metrics: + box AP: 36.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r50_fpn_4x4_1x_coco/fovea_r50_fpn_4x4_1x_coco_20200219-ee4d5303.pth + + - Name: fovea_r50_fpn_4x4_2x_coco + In Collection: FoveaBox + Config: configs/foveabox/fovea_r50_fpn_4x4_2x_coco.py + Metadata: + Training Memory (GB): 5.6 + inference time (ms/im): + - value: 41.49 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r50_fpn_4x4_2x_coco/fovea_r50_fpn_4x4_2x_coco_20200203-2df792b1.pth + + - Name: fovea_align_r50_fpn_gn-head_4x4_2x_coco + In Collection: FoveaBox + Config: configs/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco.py + Metadata: + Training Memory (GB): 8.1 + inference time (ms/im): + - value: 51.55 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco/fovea_align_r50_fpn_gn-head_4x4_2x_coco_20200203-8987880d.pth + + - Name: fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco + In Collection: FoveaBox + Config: configs/foveabox/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco.py + Metadata: + Training Memory (GB): 8.1 + inference time (ms/im): + - value: 54.64 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco_20200205-85ce26cb.pth + + - Name: fovea_r101_fpn_4x4_1x_coco + In 
Collection: FoveaBox + Config: configs/foveabox/fovea_r101_fpn_4x4_1x_coco.py + Metadata: + Training Memory (GB): 9.2 + inference time (ms/im): + - value: 57.47 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r101_fpn_4x4_1x_coco/fovea_r101_fpn_4x4_1x_coco_20200219-05e38f1c.pth + + - Name: fovea_r101_fpn_4x4_2x_coco + In Collection: FoveaBox + Config: configs/foveabox/fovea_r101_fpn_4x4_2x_coco.py + Metadata: + Training Memory (GB): 11.7 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r101_fpn_4x4_2x_coco/fovea_r101_fpn_4x4_2x_coco_20200208-02320ea4.pth + + - Name: fovea_align_r101_fpn_gn-head_4x4_2x_coco + In Collection: FoveaBox + Config: configs/foveabox/fovea_align_r101_fpn_gn-head_4x4_2x_coco.py + Metadata: + Training Memory (GB): 11.7 + inference time (ms/im): + - value: 68.03 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r101_fpn_gn-head_4x4_2x_coco/fovea_align_r101_fpn_gn-head_4x4_2x_coco_20200208-c39a027a.pth + + - Name: fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco + In Collection: FoveaBox + Config: configs/foveabox/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco.py + Metadata: + Training Memory (GB): 11.7 + inference time (ms/im): + - value: 68.03 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.0 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco_20200208-649c5eb6.pth diff --git a/configs/mmdet/fpg/README.md b/configs/mmdet/fpg/README.md new file mode 100644 index 00000000..9d89510f --- /dev/null +++ b/configs/mmdet/fpg/README.md @@ -0,0 +1,43 @@ +# FPG + +> [Feature Pyramid Grids](https://arxiv.org/abs/2004.03580) + + + +## Abstract + +Feature pyramid networks have been widely adopted in the object detection literature to improve feature representations for better handling of variations in scale. In this paper, we present Feature Pyramid Grids (FPG), a deep multi-pathway feature pyramid, that represents the feature scale-space as a regular grid of parallel bottom-up pathways which are fused by multi-directional lateral connections. FPG can improve single-pathway feature pyramid networks by significantly increasing its performance at similar computation cost, highlighting importance of deep pyramid representations. In addition to its general and uniform structure, over complicated structures that have been found with neural architecture search, it also compares favorably against such approaches without relying on search. We hope that FPG with its uniform and effective nature can serve as a strong component for future work in object recognition. + +
+ +
+ +## Results and Models + +We benchmark the new training schedule (crop training, large batch, unfrozen BN, 50 epochs) introduced in NAS-FPN. +All backbones are Resnet-50 in pytorch style. + +| Method | Neck | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +|:------------:|:-----------:|:-------:|:--------:|:--------------:|:------:|:-------:|:-------:|:--------:| +| Faster R-CNN | FPG | 50e | 20.0 | - | 42.3 | - |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/faster_rcnn_r50_fpg_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg_crop640_50e_coco/faster_rcnn_r50_fpg_crop640_50e_coco_20220311_011856-74109f42.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg_crop640_50e_coco/faster_rcnn_r50_fpg_crop640_50e_coco_20220311_011856.log.json) | +| Faster R-CNN | FPG-chn128 | 50e | 11.9 | - | 41.2 | - |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco/faster_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011857-9376aa9d.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco/faster_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011857.log.json) | +| Faster R-CNN | FPN | 50e | 20.0 | - | 38.9 | - |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/faster_rcnn_r50_fpn_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpn_crop640_50e_coco/faster_rcnn_r50_fpn_crop640_50e_coco_20220311_011857-be7c9f42.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpn_crop640_50e_coco/faster_rcnn_r50_fpn_crop640_50e_coco_20220311_011857.log.json) | +| Mask R-CNN | FPG | 50e | 23.2 | - | 43.0 | 38.1 
|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/mask_rcnn_r50_fpg_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg_crop640_50e_coco/mask_rcnn_r50_fpg_crop640_50e_coco_20220311_011857-233b8334.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg_crop640_50e_coco/mask_rcnn_r50_fpg_crop640_50e_coco_20220311_011857.log.json) | +| Mask R-CNN | FPG-chn128 | 50e | 15.3 | - | 41.7 | 37.1 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco/mask_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011859-043c9b4e.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco/mask_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011859.log.json) | +| Mask R-CNN | FPN | 50e | 23.2 | - | 49.6 | 35.6 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/mask_rcnn_r50_fpn_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpn_crop640_50e_coco/mask_rcnn_r50_fpn_crop640_50e_coco_20220311_011855-a756664a.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpn_crop640_50e_coco/mask_rcnn_r50_fpn_crop640_50e_coco_20220311_011855.log.json) | +| RetinaNet | FPG | 50e | 20.8 | - | 40.5 | - |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/retinanet_r50_fpg_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg_crop640_50e_coco/retinanet_r50_fpg_crop640_50e_coco_20220311_110809-b0bcf5f4.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg_crop640_50e_coco/retinanet_r50_fpg_crop640_50e_coco_20220311_110809.log.json) | +| RetinaNet | FPG-chn128 | 50e | 19.9 | - | 39.9 | - 
|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco/retinanet_r50_fpg-chn128_crop640_50e_coco_20220313_104829-ee99a686.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco/retinanet_r50_fpg-chn128_crop640_50e_coco_20220313_104829.log.json) | + +**Note**: Chn128 means to decrease the number of channels of features and convs from 256 (default) to 128 in +Neck and BBox Head, which can greatly decrease memory consumption without sacrificing much precision. + +## Citation + +```latex +@article{chen2020feature, + title={Feature pyramid grids}, + author={Chen, Kai and Cao, Yuhang and Loy, Chen Change and Lin, Dahua and Feichtenhofer, Christoph}, + journal={arXiv preprint arXiv:2004.03580}, + year={2020} +} +``` diff --git a/configs/mmdet/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco.py b/configs/mmdet/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco.py new file mode 100644 index 00000000..4535034e --- /dev/null +++ b/configs/mmdet/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco.py @@ -0,0 +1,9 @@ +_base_ = 'faster_rcnn_r50_fpg_crop640_50e_coco.py' + +norm_cfg = dict(type='BN', requires_grad=True) +model = dict( + neck=dict(out_channels=128, inter_channels=128), + rpn_head=dict(in_channels=128), + roi_head=dict( + bbox_roi_extractor=dict(out_channels=128), + bbox_head=dict(in_channels=128))) diff --git a/configs/mmdet/fpg/faster_rcnn_r50_fpg_crop640_50e_coco.py b/configs/mmdet/fpg/faster_rcnn_r50_fpg_crop640_50e_coco.py new file mode 100644 index 00000000..3ab2a2c5 --- /dev/null +++ b/configs/mmdet/fpg/faster_rcnn_r50_fpg_crop640_50e_coco.py @@ -0,0 +1,48 @@ +_base_ = 'faster_rcnn_r50_fpn_crop640_50e_coco.py' + +norm_cfg = dict(type='BN', requires_grad=True) +model = dict( + neck=dict( + type='FPG', + in_channels=[256, 512, 1024, 2048], + 
out_channels=256, + inter_channels=256, + num_outs=5, + stack_times=9, + paths=['bu'] * 9, + same_down_trans=None, + same_up_trans=dict( + type='conv', + kernel_size=3, + stride=2, + padding=1, + norm_cfg=norm_cfg, + inplace=False, + order=('act', 'conv', 'norm')), + across_lateral_trans=dict( + type='conv', + kernel_size=1, + norm_cfg=norm_cfg, + inplace=False, + order=('act', 'conv', 'norm')), + across_down_trans=dict( + type='interpolation_conv', + mode='nearest', + kernel_size=3, + norm_cfg=norm_cfg, + order=('act', 'conv', 'norm'), + inplace=False), + across_up_trans=None, + across_skip_trans=dict( + type='conv', + kernel_size=1, + norm_cfg=norm_cfg, + inplace=False, + order=('act', 'conv', 'norm')), + output_trans=dict( + type='last_conv', + kernel_size=3, + order=('act', 'conv', 'norm'), + inplace=False), + norm_cfg=norm_cfg, + skip_inds=[(0, 1, 2, 3), (0, 1, 2), (0, 1), (0, ), ()])) diff --git a/configs/mmdet/fpg/faster_rcnn_r50_fpn_crop640_50e_coco.py b/configs/mmdet/fpg/faster_rcnn_r50_fpn_crop640_50e_coco.py new file mode 100644 index 00000000..e4ec940a --- /dev/null +++ b/configs/mmdet/fpg/faster_rcnn_r50_fpn_crop640_50e_coco.py @@ -0,0 +1,73 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +norm_cfg = dict(type='BN', requires_grad=True) +model = dict( + backbone=dict(norm_cfg=norm_cfg, norm_eval=False), + neck=dict(norm_cfg=norm_cfg), + roi_head=dict(bbox_head=dict(norm_cfg=norm_cfg))) +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=(640, 640), + ratio_range=(0.8, 1.2), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(640, 640)), + dict(type='RandomFlip', 
flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=(640, 640)), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=64), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict( + type='SGD', + lr=0.08, + momentum=0.9, + weight_decay=0.0001, + paramwise_cfg=dict(norm_decay_mult=0, bypass_duplicate=True)) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.1, + step=[30, 40]) +# runtime settings +runner = dict(max_epochs=50) +evaluation = dict(interval=2) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/mmdet/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco.py b/configs/mmdet/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco.py new file mode 100644 index 00000000..baa4a5af --- /dev/null +++ b/configs/mmdet/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco.py @@ -0,0 +1,10 @@ +_base_ = 'mask_rcnn_r50_fpg_crop640_50e_coco.py' + +model = dict( + neck=dict(out_channels=128, inter_channels=128), + rpn_head=dict(in_channels=128), + roi_head=dict( + bbox_roi_extractor=dict(out_channels=128), + bbox_head=dict(in_channels=128), + mask_roi_extractor=dict(out_channels=128), + mask_head=dict(in_channels=128))) diff --git a/configs/mmdet/fpg/mask_rcnn_r50_fpg_crop640_50e_coco.py b/configs/mmdet/fpg/mask_rcnn_r50_fpg_crop640_50e_coco.py new file mode 100644 index 00000000..3c9ea276 --- /dev/null +++ b/configs/mmdet/fpg/mask_rcnn_r50_fpg_crop640_50e_coco.py @@ -0,0 +1,48 @@ +_base_ = 'mask_rcnn_r50_fpn_crop640_50e_coco.py' + +norm_cfg = dict(type='BN', requires_grad=True) +model = dict( + neck=dict( + type='FPG', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + inter_channels=256, + num_outs=5, + stack_times=9, + paths=['bu'] * 9, + same_down_trans=None, + same_up_trans=dict( + type='conv', + kernel_size=3, + stride=2, + padding=1, + norm_cfg=norm_cfg, + inplace=False, + order=('act', 'conv', 'norm')), + across_lateral_trans=dict( + type='conv', + kernel_size=1, + norm_cfg=norm_cfg, + inplace=False, + order=('act', 'conv', 'norm')), + across_down_trans=dict( + type='interpolation_conv', + mode='nearest', + kernel_size=3, + norm_cfg=norm_cfg, + order=('act', 'conv', 'norm'), + inplace=False), + across_up_trans=None, + across_skip_trans=dict( + type='conv', + kernel_size=1, + norm_cfg=norm_cfg, + inplace=False, + order=('act', 'conv', 'norm')), + output_trans=dict( + type='last_conv', + kernel_size=3, + order=('act', 'conv', 'norm'), + inplace=False), + 
norm_cfg=norm_cfg, + skip_inds=[(0, 1, 2, 3), (0, 1, 2), (0, 1), (0, ), ()])) diff --git a/configs/mmdet/fpg/mask_rcnn_r50_fpn_crop640_50e_coco.py b/configs/mmdet/fpg/mask_rcnn_r50_fpn_crop640_50e_coco.py new file mode 100644 index 00000000..c6bcc242 --- /dev/null +++ b/configs/mmdet/fpg/mask_rcnn_r50_fpn_crop640_50e_coco.py @@ -0,0 +1,79 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +norm_cfg = dict(type='BN', requires_grad=True) +model = dict( + backbone=dict(norm_cfg=norm_cfg, norm_eval=False), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + norm_cfg=norm_cfg, + num_outs=5), + roi_head=dict( + bbox_head=dict(norm_cfg=norm_cfg), mask_head=dict(norm_cfg=norm_cfg))) +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=(640, 640), + ratio_range=(0.8, 1.2), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(640, 640)), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=(640, 640)), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=64), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict(pipeline=train_pipeline), + 
val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict( + type='SGD', + lr=0.08, + momentum=0.9, + weight_decay=0.0001, + paramwise_cfg=dict(norm_decay_mult=0, bypass_duplicate=True)) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.1, + step=[30, 40]) +# runtime settings +runner = dict(max_epochs=50) +evaluation = dict(interval=2) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/mmdet/fpg/metafile.yml b/configs/mmdet/fpg/metafile.yml new file mode 100644 index 00000000..6b0a6a79 --- /dev/null +++ b/configs/mmdet/fpg/metafile.yml @@ -0,0 +1,104 @@ +Collections: + - Name: Feature Pyramid Grids + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Feature Pyramid Grids + Paper: + URL: https://arxiv.org/abs/2004.03580 + Title: 'Feature Pyramid Grids' + README: configs/fpg/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.10.0/mmdet/models/necks/fpg.py#L101 + Version: v2.10.0 + +Models: + - Name: faster_rcnn_r50_fpg_crop640_50e_coco + In Collection: Feature Pyramid Grids + Config: configs/fpg/faster_rcnn_r50_fpg_crop640_50e_coco.py + Metadata: + Training Memory (GB): 20.0 + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg_crop640_50e_coco/faster_rcnn_r50_fpg_crop640_50e_coco_20220311_011856-74109f42.pth + + - Name: faster_rcnn_r50_fpg-chn128_crop640_50e_coco + In Collection: Feature Pyramid Grids + Config: configs/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco.py + Metadata: + Training Memory (GB): 11.9 +
Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco/faster_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011857-9376aa9d.pth + + - Name: mask_rcnn_r50_fpg_crop640_50e_coco + In Collection: Feature Pyramid Grids + Config: configs/fpg/mask_rcnn_r50_fpg_crop640_50e_coco.py + Metadata: + Training Memory (GB): 23.2 + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg_crop640_50e_coco/mask_rcnn_r50_fpg_crop640_50e_coco_20220311_011857-233b8334.pth + + - Name: mask_rcnn_r50_fpg-chn128_crop640_50e_coco + In Collection: Feature Pyramid Grids + Config: configs/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco.py + Metadata: + Training Memory (GB): 15.3 + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco/mask_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011859-043c9b4e.pth + + - Name: retinanet_r50_fpg_crop640_50e_coco + In Collection: Feature Pyramid Grids + Config: configs/fpg/retinanet_r50_fpg_crop640_50e_coco.py + Metadata: + Training Memory (GB): 20.8 + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg_crop640_50e_coco/retinanet_r50_fpg_crop640_50e_coco_20220311_110809-b0bcf5f4.pth + + - Name: retinanet_r50_fpg-chn128_crop640_50e_coco + In Collection: Feature Pyramid Grids + Config: configs/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco.py + Metadata: + Training Memory (GB): 19.9 + Epochs: 50 + Results: 
+ - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco/retinanet_r50_fpg-chn128_crop640_50e_coco_20220313_104829-ee99a686.pth diff --git a/configs/mmdet/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco.py b/configs/mmdet/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco.py new file mode 100644 index 00000000..9a6cf7e5 --- /dev/null +++ b/configs/mmdet/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco.py @@ -0,0 +1,5 @@ +_base_ = 'retinanet_r50_fpg_crop640_50e_coco.py' + +model = dict( + neck=dict(out_channels=128, inter_channels=128), + bbox_head=dict(in_channels=128)) diff --git a/configs/mmdet/fpg/retinanet_r50_fpg_crop640_50e_coco.py b/configs/mmdet/fpg/retinanet_r50_fpg_crop640_50e_coco.py new file mode 100644 index 00000000..504ed5ec --- /dev/null +++ b/configs/mmdet/fpg/retinanet_r50_fpg_crop640_50e_coco.py @@ -0,0 +1,53 @@ +_base_ = '../nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco.py' + +norm_cfg = dict(type='BN', requires_grad=True) +model = dict( + neck=dict( + _delete_=True, + type='FPG', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + inter_channels=256, + num_outs=5, + add_extra_convs=True, + start_level=1, + stack_times=9, + paths=['bu'] * 9, + same_down_trans=None, + same_up_trans=dict( + type='conv', + kernel_size=3, + stride=2, + padding=1, + norm_cfg=norm_cfg, + inplace=False, + order=('act', 'conv', 'norm')), + across_lateral_trans=dict( + type='conv', + kernel_size=1, + norm_cfg=norm_cfg, + inplace=False, + order=('act', 'conv', 'norm')), + across_down_trans=dict( + type='interpolation_conv', + mode='nearest', + kernel_size=3, + norm_cfg=norm_cfg, + order=('act', 'conv', 'norm'), + inplace=False), + across_up_trans=None, + across_skip_trans=dict( + type='conv', + kernel_size=1, + norm_cfg=norm_cfg, + inplace=False, + order=('act', 'conv', 'norm')), + output_trans=dict( + type='last_conv', + kernel_size=3, + order=('act', 
'conv', 'norm'), + inplace=False), + norm_cfg=norm_cfg, + skip_inds=[(0, 1, 2, 3), (0, 1, 2), (0, 1), (0, ), ()])) + +evaluation = dict(interval=2) diff --git a/configs/mmdet/free_anchor/README.md b/configs/mmdet/free_anchor/README.md new file mode 100644 index 00000000..e232f370 --- /dev/null +++ b/configs/mmdet/free_anchor/README.md @@ -0,0 +1,37 @@ +# FreeAnchor + +> [FreeAnchor: Learning to Match Anchors for Visual Object Detection](https://arxiv.org/abs/1909.02466) + + + +## Abstract + +Modern CNN-based object detectors assign anchors for ground-truth objects under the restriction of object-anchor Intersection-over-Union (IoU). In this study, we propose a learning-to-match approach to break IoU restriction, allowing objects to match anchors in a flexible manner. Our approach, referred to as FreeAnchor, updates hand-crafted anchor assignment to "free" anchor matching by formulating detector training as a maximum likelihood estimation (MLE) procedure. FreeAnchor targets at learning features which best explain a class of objects in terms of both classification and localization. FreeAnchor is implemented by optimizing detection customized likelihood and can be fused with CNN-based detectors in a plug-and-play manner. Experiments on COCO demonstrate that FreeAnchor consistently outperforms its counterparts with significant margins. + +
+ +
+ +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +|:--------:|:-------:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:| +| R-50 | pytorch | 1x | 4.9 | 18.4 | 38.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco/retinanet_free_anchor_r50_fpn_1x_coco_20200130-0f67375f.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco/retinanet_free_anchor_r50_fpn_1x_coco_20200130_095625.log.json) | +| R-101 | pytorch | 1x | 6.8 | 14.9 | 40.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/free_anchor/retinanet_free_anchor_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_r101_fpn_1x_coco/retinanet_free_anchor_r101_fpn_1x_coco_20200130-358324e6.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_r101_fpn_1x_coco/retinanet_free_anchor_r101_fpn_1x_coco_20200130_100723.log.json) | +| X-101-32x4d | pytorch | 1x | 8.1 | 11.1 | 41.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/free_anchor/retinanet_free_anchor_x101_32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_x101_32x4d_fpn_1x_coco/retinanet_free_anchor_x101_32x4d_fpn_1x_coco_20200130-d4846968.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_x101_32x4d_fpn_1x_coco/retinanet_free_anchor_x101_32x4d_fpn_1x_coco_20200130_095627.log.json) | + +**Notes:** + +- We use 8 GPUs with 2 images/GPU. +- For more settings and models, please refer to the [official repo](https://github.com/zhangxiaosong18/FreeAnchor).
+ +## Citation + +```latex +@inproceedings{zhang2019freeanchor, + title = {{FreeAnchor}: Learning to Match Anchors for Visual Object Detection}, + author = {Zhang, Xiaosong and Wan, Fang and Liu, Chang and Ji, Rongrong and Ye, Qixiang}, + booktitle = {Neural Information Processing Systems}, + year = {2019} +} +``` diff --git a/configs/mmdet/free_anchor/metafile.yml b/configs/mmdet/free_anchor/metafile.yml new file mode 100644 index 00000000..170fb5c0 --- /dev/null +++ b/configs/mmdet/free_anchor/metafile.yml @@ -0,0 +1,79 @@ +Collections: + - Name: FreeAnchor + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FreeAnchor + - ResNet + Paper: + URL: https://arxiv.org/abs/1909.02466 + Title: 'FreeAnchor: Learning to Match Anchors for Visual Object Detection' + README: configs/free_anchor/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/dense_heads/free_anchor_retina_head.py#L10 + Version: v2.0.0 + +Models: + - Name: retinanet_free_anchor_r50_fpn_1x_coco + In Collection: FreeAnchor + Config: configs/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.9 + inference time (ms/im): + - value: 54.35 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco/retinanet_free_anchor_r50_fpn_1x_coco_20200130-0f67375f.pth + + - Name: retinanet_free_anchor_r101_fpn_1x_coco + In Collection: FreeAnchor + Config: configs/free_anchor/retinanet_free_anchor_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.8 + inference time (ms/im): + - value: 67.11 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - 
Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_r101_fpn_1x_coco/retinanet_free_anchor_r101_fpn_1x_coco_20200130-358324e6.pth + + - Name: retinanet_free_anchor_x101_32x4d_fpn_1x_coco + In Collection: FreeAnchor + Config: configs/free_anchor/retinanet_free_anchor_x101_32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 8.1 + inference time (ms/im): + - value: 90.09 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_x101_32x4d_fpn_1x_coco/retinanet_free_anchor_x101_32x4d_fpn_1x_coco_20200130-d4846968.pth diff --git a/configs/mmdet/free_anchor/retinanet_free_anchor_r101_fpn_1x_coco.py b/configs/mmdet/free_anchor/retinanet_free_anchor_r101_fpn_1x_coco.py new file mode 100644 index 00000000..f4aea53c --- /dev/null +++ b/configs/mmdet/free_anchor/retinanet_free_anchor_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './retinanet_free_anchor_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco.py b/configs/mmdet/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco.py new file mode 100644 index 00000000..28f983c2 --- /dev/null +++ b/configs/mmdet/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco.py @@ -0,0 +1,22 @@ +_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py' +model = dict( + bbox_head=dict( + _delete_=True, + type='FreeAnchorRetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + 
bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=0.75))) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git a/configs/mmdet/free_anchor/retinanet_free_anchor_x101_32x4d_fpn_1x_coco.py b/configs/mmdet/free_anchor/retinanet_free_anchor_x101_32x4d_fpn_1x_coco.py new file mode 100644 index 00000000..65f8a9e2 --- /dev/null +++ b/configs/mmdet/free_anchor/retinanet_free_anchor_x101_32x4d_fpn_1x_coco.py @@ -0,0 +1,13 @@ +_base_ = './retinanet_free_anchor_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/fsaf/README.md b/configs/mmdet/fsaf/README.md new file mode 100644 index 00000000..64976c57 --- /dev/null +++ b/configs/mmdet/fsaf/README.md @@ -0,0 +1,57 @@ +# FSAF + +> [Feature Selective Anchor-Free Module for Single-Shot Object Detection](https://arxiv.org/abs/1903.00621) + + + +## Abstract + +We motivate and present feature selective anchor-free (FSAF) module, a simple and effective building block for single-shot object detectors. It can be plugged into single-shot detectors with feature pyramid structure. The FSAF module addresses two limitations brought up by the conventional anchor-based detection: 1) heuristic-guided feature selection; 2) overlap-based anchor sampling. The general concept of the FSAF module is online feature selection applied to the training of multi-level anchor-free branches. Specifically, an anchor-free branch is attached to each level of the feature pyramid, allowing box encoding and decoding in the anchor-free manner at an arbitrary level. 
During training, we dynamically assign each instance to the most suitable feature level. At the time of inference, the FSAF module can work jointly with anchor-based branches by outputting predictions in parallel. We instantiate this concept with simple implementations of anchor-free branches and online feature selection strategy. Experimental results on the COCO detection track show that our FSAF module performs better than anchor-based counterparts while being faster. When working jointly with anchor-based branches, the FSAF module robustly improves the baseline RetinaNet by a large margin under various settings, while introducing nearly free inference overhead. And the resulting best model can achieve a state-of-the-art 44.6% mAP, outperforming all existing single-shot detectors on COCO. + +
+ +
+ +## Introduction + +FSAF is an anchor-free method published in CVPR2019 ([https://arxiv.org/pdf/1903.00621.pdf](https://arxiv.org/pdf/1903.00621.pdf)). +Actually it is equivalent to the anchor-based method with only one anchor at each feature map position in each FPN level. +And this is how we implemented it. +Only the anchor-free branch is released for its better compatibility with the current framework and a lower computational budget. + +In the original paper, feature maps within the central 0.2-0.5 area of a gt box are tagged as ignored. However, +it is empirically found that a hard threshold (0.2-0.2) gives a further gain on the performance. (see the table below) + +## Results and Models + +### Results on R50/R101/X101-FPN + +| Backbone | ignore range | ms-train| Lr schd |Train Mem (GB)| Train time (s/iter) | Inf time (fps) | box AP | Config | Download | +|:----------:| :-------: |:-------:|:-------:|:------------:|:---------------:|:--------------:|:-------------:|:------:|:--------:| +| R-50 | 0.2-0.5 | N | 1x | 3.15 | 0.43 | 12.3 | 36.0 (35.9) | | [model](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_pscale0.2_nscale0.5_r50_fpn_1x_coco/fsaf_pscale0.2_nscale0.5_r50_fpn_1x_coco_20200715-b555b0e0.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_pscale0.2_nscale0.5_r50_fpn_1x_coco/fsaf_pscale0.2_nscale0.5_r50_fpn_1x_coco_20200715_094657.log.json) | +| R-50 | 0.2-0.2 | N | 1x | 3.15 | 0.43 | 13.0 | 37.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fsaf/fsaf_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r50_fpn_1x_coco/fsaf_r50_fpn_1x_coco-94ccc51f.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r50_fpn_1x_coco/fsaf_r50_fpn_1x_coco_20200428_072327.log.json)| +| R-101 | 0.2-0.2 | N | 1x | 5.08 | 0.58 | 10.8 | 39.3 (37.9) | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fsaf/fsaf_r101_fpn_1x_coco.py) |
[model](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r101_fpn_1x_coco/fsaf_r101_fpn_1x_coco-9e71098f.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r101_fpn_1x_coco/fsaf_r101_fpn_1x_coco_20200428_160348.log.json)| +| X-101 | 0.2-0.2 | N | 1x | 9.38 | 1.23 | 5.6 | 42.4 (41.0) | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fsaf/fsaf_x101_64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_x101_64x4d_fpn_1x_coco/fsaf_x101_64x4d_fpn_1x_coco-e3f6e6fd.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_x101_64x4d_fpn_1x_coco/fsaf_x101_64x4d_fpn_1x_coco_20200428_160424.log.json)| + +**Notes:** + +- *1x means the model is trained for 12 epochs.* +- *AP values in the brackets represent those reported in the original paper.* +- *All results are obtained with a single model and single-scale test.* +- *X-101 backbone represents ResNext-101-64x4d.* +- *All pretrained backbones use pytorch style.* +- *All models are trained on 8 Titan-XP gpus and tested on a single gpu.* + +## Citation + +BibTeX reference is as follows.
+ +```latex +@inproceedings{zhu2019feature, + title={Feature Selective Anchor-Free Module for Single-Shot Object Detection}, + author={Zhu, Chenchen and He, Yihui and Savvides, Marios}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + pages={840--849}, + year={2019} +} +``` diff --git a/configs/mmdet/fsaf/fsaf_r101_fpn_1x_coco.py b/configs/mmdet/fsaf/fsaf_r101_fpn_1x_coco.py new file mode 100644 index 00000000..12b49fed --- /dev/null +++ b/configs/mmdet/fsaf/fsaf_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './fsaf_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/fsaf/fsaf_r50_fpn_1x_coco.py b/configs/mmdet/fsaf/fsaf_r50_fpn_1x_coco.py new file mode 100644 index 00000000..67f3ec1c --- /dev/null +++ b/configs/mmdet/fsaf/fsaf_r50_fpn_1x_coco.py @@ -0,0 +1,48 @@ +_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py' +# model settings +model = dict( + type='FSAF', + bbox_head=dict( + type='FSAFHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + reg_decoded_bbox=True, + # Only anchor-free branch is implemented. The anchor generator only + # generates 1 anchor at each feature point, as a substitute of the + # grid of features. 
+ anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=1, + scales_per_octave=1, + ratios=[1.0], + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict(_delete_=True, type='TBLRBBoxCoder', normalizer=4.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0, + reduction='none'), + loss_bbox=dict( + _delete_=True, + type='IoULoss', + eps=1e-6, + loss_weight=1.0, + reduction='none')), + # training and testing settings + train_cfg=dict( + assigner=dict( + _delete_=True, + type='CenterRegionAssigner', + pos_scale=0.2, + neg_scale=0.2, + min_pos_iof=0.01), + allowed_border=-1, + pos_weight=-1, + debug=False)) +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=10, norm_type=2)) diff --git a/configs/mmdet/fsaf/fsaf_x101_64x4d_fpn_1x_coco.py b/configs/mmdet/fsaf/fsaf_x101_64x4d_fpn_1x_coco.py new file mode 100644 index 00000000..89c0c634 --- /dev/null +++ b/configs/mmdet/fsaf/fsaf_x101_64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './fsaf_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/fsaf/metafile.yml b/configs/mmdet/fsaf/metafile.yml new file mode 100644 index 00000000..5434e9ad --- /dev/null +++ b/configs/mmdet/fsaf/metafile.yml @@ -0,0 +1,80 @@ +Collections: + - Name: FSAF + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x Titan-XP GPUs + Architecture: + - FPN + - FSAF + - ResNet + Paper: + URL: https://arxiv.org/abs/1903.00621 + Title: 'Feature Selective Anchor-Free Module for Single-Shot Object Detection' + README: configs/fsaf/README.md + Code: + 
URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/detectors/fsaf.py#L6 + Version: v2.1.0 + +Models: + - Name: fsaf_r50_fpn_1x_coco + In Collection: FSAF + Config: configs/fsaf/fsaf_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.15 + inference time (ms/im): + - value: 76.92 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r50_fpn_1x_coco/fsaf_r50_fpn_1x_coco-94ccc51f.pth + + - Name: fsaf_r101_fpn_1x_coco + In Collection: FSAF + Config: configs/fsaf/fsaf_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.08 + inference time (ms/im): + - value: 92.59 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.3 (37.9) + Weights: https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r101_fpn_1x_coco/fsaf_r101_fpn_1x_coco-9e71098f.pth + + - Name: fsaf_x101_64x4d_fpn_1x_coco + In Collection: FSAF + Config: configs/fsaf/fsaf_x101_64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 9.38 + inference time (ms/im): + - value: 178.57 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.4 (41.0) + Weights: https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_x101_64x4d_fpn_1x_coco/fsaf_x101_64x4d_fpn_1x_coco-e3f6e6fd.pth diff --git a/configs/mmdet/gcnet/README.md b/configs/mmdet/gcnet/README.md new file mode 100644 index 00000000..4d167831 --- /dev/null +++ b/configs/mmdet/gcnet/README.md @@ -0,0 +1,69 @@ +# GCNet + +> [GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond](https://arxiv.org/abs/1904.11492) + + + +## Abstract + +The Non-Local Network (NLNet) 
presents a pioneering approach for capturing long-range dependencies, via aggregating query-specific global context to each query position. However, through a rigorous empirical analysis, we have found that the global contexts modeled by non-local network are almost the same for different query positions within an image. In this paper, we take advantage of this finding to create a simplified network based on a query-independent formulation, which maintains the accuracy of NLNet but with significantly less computation. We further observe that this simplified design shares similar structure with Squeeze-Excitation Network (SENet). Hence we unify them into a three-step general framework for global context modeling. Within the general framework, we design a better instantiation, called the global context (GC) block, which is lightweight and can effectively model the global context. The lightweight property allows us to apply it for multiple layers in a backbone network to construct a global context network (GCNet), which generally outperforms both simplified NLNet and SENet on major benchmarks for various recognition tasks. + +
+ +
+ +## Introduction + +By [Yue Cao](http://yue-cao.me), [Jiarui Xu](http://jerryxu.net), [Stephen Lin](https://scholar.google.com/citations?user=c3PYmxUAAAAJ&hl=en), Fangyun Wei, [Han Hu](https://sites.google.com/site/hanhushomepage/). + +We provide config files to reproduce the results in the paper for +["GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond"](https://arxiv.org/abs/1904.11492) on COCO object detection. + +**GCNet** is initially described in [arxiv](https://arxiv.org/abs/1904.11492). By absorbing the advantages of Non-Local Networks (NLNet) and Squeeze-Excitation Networks (SENet), GCNet provides a simple, fast and effective approach for global context modeling, which generally outperforms both NLNet and SENet on major benchmarks for various recognition tasks. + +## Results and Models + +The results on COCO 2017val are shown in the table below. + +| Backbone | Model | Context | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------: | :--------------: | :------------: | :-----: | :------: | :------------: | :----: | :-----: | :------: | :--------: | +| R-50-FPN | Mask | GC(c3-c5, r16) | 1x | 5.0 | | 39.7 | 35.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco_20200515_211915-187da160.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco_20200515_211915.log.json) | +| R-50-FPN | Mask | GC(c3-c5, r4) | 1x | 5.1 | 15.0 | 39.9 | 36.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco.py) |
[model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco_20200204-17235656.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco_20200204_024626.log.json) | +| R-101-FPN | Mask | GC(c3-c5, r16) | 1x | 7.6 | 11.4 | 41.3 | 37.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco_20200205-e58ae947.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco_20200205_192835.log.json) | +| R-101-FPN | Mask | GC(c3-c5, r4) | 1x | 7.8 | 11.6 | 42.2 | 37.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco_20200206-af22dc9d.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco_20200206_112128.log.json) | + +| Backbone | Model | Context | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------: | :--------------: | :------------: | :-----: | :------: | :------------: | :----: | :-----: | :------: | :-------: | +| R-50-FPN | Mask | - | 1x | 4.4 | 16.6 | 38.4 | 34.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco_20200202-bb3eb55c.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco_20200202_214122.log.json) | +| R-50-FPN | Mask | GC(c3-c5, r16) | 1x | 5.0 | 15.5 | 40.4 | 36.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200202-587b99aa.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200202_174907.log.json) | +| R-50-FPN | Mask | GC(c3-c5, r4) | 1x | 5.1 | 15.1 | 40.7 | 36.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200202-50b90e5c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200202_085547.log.json) | +| R-101-FPN | Mask | - | 1x | 6.4 | 13.3 | 40.5 | 36.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco_20200210-81658c8a.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco_20200210_220422.log.json) | +| R-101-FPN | Mask | GC(c3-c5, r16) | 1x | 7.6 | 12.0 | 42.2 | 37.8 | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200207-945e77ca.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200207_015330.log.json) | +| R-101-FPN | Mask | GC(c3-c5, r4) | 1x | 7.8 | 11.8 | 42.2 | 37.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200206-8407a3f0.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200206_142508.log.json) | +| X-101-FPN | Mask | - | 1x | 7.6 | 11.3 | 42.4 | 37.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco_20200211-7584841c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco_20200211_054326.log.json) | +| X-101-FPN | Mask | GC(c3-c5, r16) | 1x | 8.8 | 9.8 | 43.5 | 38.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200211-cbed3d2c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200211_164715.log.json) | +| X-101-FPN | Mask | GC(c3-c5, r4) | 1x | 9.0 | 9.7 | 43.9 | 39.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200212-68164964.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200212_070942.log.json) | +| X-101-FPN | Cascade Mask | - | 1x | 9.2 | 8.4 | 44.7 | 38.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco_20200310-d5ad2a5e.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco_20200310_115217.log.json) | +| X-101-FPN | Cascade Mask | GC(c3-c5, r16) | 1x | 10.3 | 7.7 | 46.2 | 39.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200211-10bf2463.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200211_184154.log.json) | +| X-101-FPN | Cascade Mask | GC(c3-c5, r4) | 1x | 10.6 | | 46.4 | 40.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200703_180653-ed035291.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200703_180653.log.json) | +| X-101-FPN | DCN Cascade Mask | - | 1x | | | 47.5 | 40.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco_20210615_211019-abbc39ea.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco_20210615_211019.log.json)| +| X-101-FPN | DCN Cascade Mask | GC(c3-c5, r16) | 1x | | | 48.0 | 41.3 | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco_20210615_215648-44aa598a.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco_20210615_215648.log.json) | +| X-101-FPN | DCN Cascade Mask | GC(c3-c5, r4) | 1x | | | 47.9 | 41.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco_20210615_161851-720338ec.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco_20210615_161851.log.json) | + +**Notes:** + +- `SyncBN` is added in the backbone for all models in **Table 2**. +- `GC` denotes that a Global Context (GC) block is inserted after the 1x1 conv of the backbone. +- `DCN` denotes replacing the 3x3 conv with a 3x3 Deformable Convolution in the `c3-c5` stages of the backbone. +- `r4` and `r16` denote ratio 4 and ratio 16 in the GC block, respectively.
+ +## Citation + +```latex +@article{cao2019GCNet, + title={GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond}, + author={Cao, Yue and Xu, Jiarui and Lin, Stephen and Wei, Fangyun and Hu, Han}, + journal={arXiv preprint arXiv:1904.11492}, + year={2019} +} +``` diff --git a/configs/mmdet/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco.py b/configs/mmdet/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco.py new file mode 100644 index 00000000..5118895f --- /dev/null +++ b/configs/mmdet/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco.py @@ -0,0 +1,4 @@ +_base_ = '../cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False)) diff --git a/configs/mmdet/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco.py b/configs/mmdet/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco.py new file mode 100644 index 00000000..413499dd --- /dev/null +++ b/configs/mmdet/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco.py @@ -0,0 +1,4 @@ +_base_ = '../dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False)) diff --git a/configs/mmdet/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco.py b/configs/mmdet/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco.py new file mode 100644 index 00000000..50689aad --- /dev/null +++ b/configs/mmdet/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = '../dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. 
/ 16), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/configs/mmdet/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco.py b/configs/mmdet/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco.py new file mode 100644 index 00000000..13672312 --- /dev/null +++ b/configs/mmdet/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = '../dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. / 4), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/configs/mmdet/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py b/configs/mmdet/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py new file mode 100644 index 00000000..50883ffe --- /dev/null +++ b/configs/mmdet/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = '../cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. 
/ 16), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/configs/mmdet/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py b/configs/mmdet/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py new file mode 100644 index 00000000..31fdd070 --- /dev/null +++ b/configs/mmdet/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = '../cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. / 4), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/configs/mmdet/gcnet/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco.py b/configs/mmdet/gcnet/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco.py new file mode 100644 index 00000000..ad6ad476 --- /dev/null +++ b/configs/mmdet/gcnet/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco.py @@ -0,0 +1,8 @@ +_base_ = '../mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict(plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. / 16), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/configs/mmdet/gcnet/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco.py b/configs/mmdet/gcnet/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco.py new file mode 100644 index 00000000..29f91674 --- /dev/null +++ b/configs/mmdet/gcnet/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco.py @@ -0,0 +1,8 @@ +_base_ = '../mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict(plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. 
/ 4), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/configs/mmdet/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco.py b/configs/mmdet/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco.py new file mode 100644 index 00000000..6e1c5d0c --- /dev/null +++ b/configs/mmdet/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco.py @@ -0,0 +1,4 @@ +_base_ = '../mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False)) diff --git a/configs/mmdet/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py b/configs/mmdet/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py new file mode 100644 index 00000000..781dba78 --- /dev/null +++ b/configs/mmdet/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = '../mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. / 16), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/configs/mmdet/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py b/configs/mmdet/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py new file mode 100644 index 00000000..32972de8 --- /dev/null +++ b/configs/mmdet/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = '../mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. 
/ 4), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/configs/mmdet/gcnet/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco.py b/configs/mmdet/gcnet/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco.py new file mode 100644 index 00000000..d299b69f --- /dev/null +++ b/configs/mmdet/gcnet/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco.py @@ -0,0 +1,8 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict(plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. / 16), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/configs/mmdet/gcnet/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco.py b/configs/mmdet/gcnet/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco.py new file mode 100644 index 00000000..5ac908e6 --- /dev/null +++ b/configs/mmdet/gcnet/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco.py @@ -0,0 +1,8 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict(plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. 
/ 4), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/configs/mmdet/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco.py b/configs/mmdet/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco.py new file mode 100644 index 00000000..0308a567 --- /dev/null +++ b/configs/mmdet/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco.py @@ -0,0 +1,4 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False)) diff --git a/configs/mmdet/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py b/configs/mmdet/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py new file mode 100644 index 00000000..e04780c5 --- /dev/null +++ b/configs/mmdet/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. / 16), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/configs/mmdet/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py b/configs/mmdet/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py new file mode 100644 index 00000000..980f8191 --- /dev/null +++ b/configs/mmdet/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. 
/ 4), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/configs/mmdet/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco.py b/configs/mmdet/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco.py new file mode 100644 index 00000000..f0c96e58 --- /dev/null +++ b/configs/mmdet/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco.py @@ -0,0 +1,4 @@ +_base_ = '../mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False)) diff --git a/configs/mmdet/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py b/configs/mmdet/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py new file mode 100644 index 00000000..7fb8e82e --- /dev/null +++ b/configs/mmdet/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = '../mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. / 16), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/configs/mmdet/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py b/configs/mmdet/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py new file mode 100644 index 00000000..b1ddbee3 --- /dev/null +++ b/configs/mmdet/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = '../mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. 
/ 4), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/configs/mmdet/gcnet/metafile.yml b/configs/mmdet/gcnet/metafile.yml new file mode 100644 index 00000000..1281122a --- /dev/null +++ b/configs/mmdet/gcnet/metafile.yml @@ -0,0 +1,440 @@ +Collections: + - Name: GCNet + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Global Context Block + - FPN + - RPN + - ResNet + - ResNeXt + Paper: + URL: https://arxiv.org/abs/1904.11492 + Title: 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' + README: configs/gcnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/ops/context_block.py#L13 + Version: v2.0.0 + +Models: + - Name: mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 5.0 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 35.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco_20200515_211915-187da160.pth + + - Name: mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 5.1 + inference time (ms/im): + - value: 66.67 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco_20200204-17235656.pth + + - Name: 
mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 7.6 + inference time (ms/im): + - value: 87.72 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco_20200205-e58ae947.pth + + - Name: mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 7.8 + inference time (ms/im): + - value: 86.21 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco_20200206-af22dc9d.pth + + - Name: mask_rcnn_r50_fpn_syncbn-backbone_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco.py + Metadata: + Training Memory (GB): 4.4 + inference time (ms/im): + - value: 60.24 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 34.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco_20200202-bb3eb55c.pth + + - Name: 
mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 5.0 + inference time (ms/im): + - value: 64.52 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200202-587b99aa.pth + + - Name: mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 5.1 + inference time (ms/im): + - value: 66.23 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200202-50b90e5c.pth + + - Name: mask_rcnn_r101_fpn_syncbn-backbone_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco.py + Metadata: + Training Memory (GB): 6.4 + inference time (ms/im): + - value: 75.19 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.3 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco_20200210-81658c8a.pth + + - Name: mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 7.6 + inference time (ms/im): + - value: 83.33 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200207-945e77ca.pth + + - Name: mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 7.8 + inference time (ms/im): + - value: 84.75 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200206-8407a3f0.pth + + - Name: mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco.py + Metadata: + Training Memory (GB): 7.6 + inference time (ms/im): + - value: 88.5 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + 
Metrics: + box AP: 42.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco_20200211-7584841c.pth + + - Name: mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 8.8 + inference time (ms/im): + - value: 102.04 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200211-cbed3d2c.pth + + - Name: mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 9.0 + inference time (ms/im): + - value: 103.09 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200212-68164964.pth + + - Name: cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco + In Collection: GCNet + Config: configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco.py + Metadata: + Training Memory (GB): 9.2 + inference 
time (ms/im): + - value: 119.05 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco_20200310-d5ad2a5e.pth + + - Name: cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco + In Collection: GCNet + Config: configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 10.3 + inference time (ms/im): + - value: 129.87 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200211-10bf2463.pth + + - Name: cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco + In Collection: GCNet + Config: configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 10.6 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200703_180653-ed035291.pth + + - Name: cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco + In 
Collection: GCNet + Config: configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco_20210615_211019-abbc39ea.pth + + - Name: cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco + In Collection: GCNet + Config: configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 48.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco_20210615_215648-44aa598a.pth + + - Name: cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco + In Collection: GCNet + Config: configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco_20210615_161851-720338ec.pth diff --git a/configs/mmdet/gfl/README.md b/configs/mmdet/gfl/README.md new file mode 100644 index 00000000..2a8e60a6 --- /dev/null +++ 
b/configs/mmdet/gfl/README.md @@ -0,0 +1,42 @@ +# GFL + +> [Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection](https://arxiv.org/abs/2006.04388) + + + +## Abstract + +One-stage detector basically formulates object detection as dense classification and localization. The classification is usually optimized by Focal Loss and the box location is commonly learned under Dirac delta distribution. A recent trend for one-stage detectors is to introduce an individual prediction branch to estimate the quality of localization, where the predicted quality facilitates the classification to improve detection performance. This paper delves into the representations of the above three fundamental elements: quality estimation, classification and localization. Two problems are discovered in existing practices, including (1) the inconsistent usage of the quality estimation and classification between training and inference and (2) the inflexible Dirac delta distribution for localization when there is ambiguity and uncertainty in complex scenes. To address the problems, we design new representations for these elements. Specifically, we merge the quality estimation into the class prediction vector to form a joint representation of localization quality and classification, and use a vector to represent arbitrary distribution of box locations. The improved representations eliminate the inconsistency risk and accurately depict the flexible distribution in real data, but contain continuous labels, which is beyond the scope of Focal Loss. We then propose Generalized Focal Loss (GFL) that generalizes Focal Loss from its discrete form to the continuous version for successful optimization. On COCO test-dev, GFL achieves 45.0\% AP using ResNet-101 backbone, surpassing state-of-the-art SAPD (43.5\%) and ATSS (43.6\%) with higher or comparable inference speed, under the same backbone and training settings. 
Notably, our best model can achieve a single-model single-scale AP of 48.2\%, at 10 FPS on a single 2080Ti GPU. + +
+ +
+ +## Results and Models + +| Backbone | Style | Lr schd | Multi-scale Training| Inf time (fps) | box AP | Config | Download | +|:-----------------:|:-------:|:-------:|:-------------------:|:--------------:|:------:|:------:|:--------:| +| R-50 | pytorch | 1x | No | 19.5 | 40.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gfl/gfl_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r50_fpn_1x_coco/gfl_r50_fpn_1x_coco_20200629_121244-25944287.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r50_fpn_1x_coco/gfl_r50_fpn_1x_coco_20200629_121244.log.json) | +| R-50 | pytorch | 2x | Yes | 19.5 | 42.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gfl/gfl_r50_fpn_mstrain_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r50_fpn_mstrain_2x_coco/gfl_r50_fpn_mstrain_2x_coco_20200629_213802-37bb1edc.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r50_fpn_mstrain_2x_coco/gfl_r50_fpn_mstrain_2x_coco_20200629_213802.log.json) | +| R-101 | pytorch | 2x | Yes | 14.7 | 44.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gfl/gfl_r101_fpn_mstrain_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_mstrain_2x_coco/gfl_r101_fpn_mstrain_2x_coco_20200629_200126-dd12f847.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_mstrain_2x_coco/gfl_r101_fpn_mstrain_2x_coco_20200629_200126.log.json) | +| R-101-dcnv2 | pytorch | 2x | Yes | 12.9 | 47.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20200630_102002-134b07df.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20200630_102002.log.json) | +| X-101-32x4d | pytorch | 2x | Yes | 12.1 | 45.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gfl/gfl_x101_32x4d_fpn_mstrain_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_x101_32x4d_fpn_mstrain_2x_coco/gfl_x101_32x4d_fpn_mstrain_2x_coco_20200630_102002-50c1ffdb.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_x101_32x4d_fpn_mstrain_2x_coco/gfl_x101_32x4d_fpn_mstrain_2x_coco_20200630_102002.log.json) | +| X-101-32x4d-dcnv2 | pytorch | 2x | Yes | 10.7 | 48.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco_20200630_102002-14a2bf25.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco_20200630_102002.log.json) | + +[1] *1x and 2x mean the model is trained for 90K and 180K iterations, respectively.* \ +[2] *All results are obtained with a single model and without any test-time data augmentation such as multi-scale testing or flipping.* \ +[3] *`dcnv2` denotes deformable convolutional networks v2.* \ +[4] *FPS is tested with a single GeForce RTX 2080Ti GPU, using a batch size of 1.* + +## Citation + +We provide config files to reproduce the object detection results in the paper [Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection](https://arxiv.org/abs/2006.04388). + +```latex +@article{li2020generalized, + title={Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection}, + author={Li, Xiang and Wang, 
Wenhai and Wu, Lijun and Chen, Shuo and Hu, Xiaolin and Li, Jun and Tang, Jinhui and Yang, Jian}, + journal={arXiv preprint arXiv:2006.04388}, + year={2020} +} +``` diff --git a/configs/mmdet/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py b/configs/mmdet/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py new file mode 100644 index 00000000..b72c2b6e --- /dev/null +++ b/configs/mmdet/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py @@ -0,0 +1,15 @@ +_base_ = './gfl_r50_fpn_mstrain_2x_coco.py' +model = dict( + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/gfl/gfl_r101_fpn_mstrain_2x_coco.py b/configs/mmdet/gfl/gfl_r101_fpn_mstrain_2x_coco.py new file mode 100644 index 00000000..e33b5c0d --- /dev/null +++ b/configs/mmdet/gfl/gfl_r101_fpn_mstrain_2x_coco.py @@ -0,0 +1,13 @@ +_base_ = './gfl_r50_fpn_mstrain_2x_coco.py' +model = dict( + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/gfl/gfl_r50_fpn_1x_coco.py b/configs/mmdet/gfl/gfl_r50_fpn_1x_coco.py new file mode 100644 index 00000000..cfd4b023 --- /dev/null +++ b/configs/mmdet/gfl/gfl_r50_fpn_1x_coco.py @@ -0,0 +1,57 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='GFL', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', 
requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + bbox_head=dict( + type='GFLHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_dfl=dict(type='DistributionFocalLoss', loss_weight=0.25), + reg_max=16, + loss_bbox=dict(type='GIoULoss', loss_weight=2.0)), + # training and testing settings + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/mmdet/gfl/gfl_r50_fpn_mstrain_2x_coco.py b/configs/mmdet/gfl/gfl_r50_fpn_mstrain_2x_coco.py new file mode 100644 index 00000000..b8be6014 --- /dev/null +++ b/configs/mmdet/gfl/gfl_r50_fpn_mstrain_2x_coco.py @@ -0,0 +1,22 @@ +_base_ = './gfl_r50_fpn_1x_coco.py' +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) +# multi-scale training +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 480), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + 
dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +data = dict(train=dict(pipeline=train_pipeline)) diff --git a/configs/mmdet/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py b/configs/mmdet/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py new file mode 100644 index 00000000..25398075 --- /dev/null +++ b/configs/mmdet/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py @@ -0,0 +1,18 @@ +_base_ = './gfl_r50_fpn_mstrain_2x_coco.py' +model = dict( + type='GFL', + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, False, True, True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/gfl/gfl_x101_32x4d_fpn_mstrain_2x_coco.py b/configs/mmdet/gfl/gfl_x101_32x4d_fpn_mstrain_2x_coco.py new file mode 100644 index 00000000..effda195 --- /dev/null +++ b/configs/mmdet/gfl/gfl_x101_32x4d_fpn_mstrain_2x_coco.py @@ -0,0 +1,16 @@ +_base_ = './gfl_r50_fpn_mstrain_2x_coco.py' +model = dict( + type='GFL', + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/gfl/metafile.yml b/configs/mmdet/gfl/metafile.yml new file mode 100644 index 00000000..8f049c6b --- /dev/null +++ b/configs/mmdet/gfl/metafile.yml @@ -0,0 +1,134 @@ +Collections: + - Name: Generalized Focal Loss + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Generalized 
Focal Loss + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/2006.04388 + Title: 'Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection' + README: configs/gfl/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.2.0/mmdet/models/detectors/gfl.py#L6 + Version: v2.2.0 + +Models: + - Name: gfl_r50_fpn_1x_coco + In Collection: Generalized Focal Loss + Config: configs/gfl/gfl_r50_fpn_1x_coco.py + Metadata: + inference time (ms/im): + - value: 51.28 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r50_fpn_1x_coco/gfl_r50_fpn_1x_coco_20200629_121244-25944287.pth + + - Name: gfl_r50_fpn_mstrain_2x_coco + In Collection: Generalized Focal Loss + Config: configs/gfl/gfl_r50_fpn_mstrain_2x_coco.py + Metadata: + inference time (ms/im): + - value: 51.28 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r50_fpn_mstrain_2x_coco/gfl_r50_fpn_mstrain_2x_coco_20200629_213802-37bb1edc.pth + + - Name: gfl_r101_fpn_mstrain_2x_coco + In Collection: Generalized Focal Loss + Config: configs/gfl/gfl_r101_fpn_mstrain_2x_coco.py + Metadata: + inference time (ms/im): + - value: 68.03 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_mstrain_2x_coco/gfl_r101_fpn_mstrain_2x_coco_20200629_200126-dd12f847.pth + + - Name: gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco + In Collection: Generalized Focal Loss + Config: 
configs/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py + Metadata: + inference time (ms/im): + - value: 77.52 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20200630_102002-134b07df.pth + + - Name: gfl_x101_32x4d_fpn_mstrain_2x_coco + In Collection: Generalized Focal Loss + Config: configs/gfl/gfl_x101_32x4d_fpn_mstrain_2x_coco.py + Metadata: + inference time (ms/im): + - value: 82.64 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_x101_32x4d_fpn_mstrain_2x_coco/gfl_x101_32x4d_fpn_mstrain_2x_coco_20200630_102002-50c1ffdb.pth + + - Name: gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco + In Collection: Generalized Focal Loss + Config: configs/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py + Metadata: + inference time (ms/im): + - value: 93.46 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 48.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco_20200630_102002-14a2bf25.pth diff --git a/configs/mmdet/ghm/README.md b/configs/mmdet/ghm/README.md new file mode 100644 index 00000000..6a8e99e5 --- /dev/null +++ b/configs/mmdet/ghm/README.md @@ -0,0 +1,33 @@ +# GHM + +> [Gradient Harmonized Single-stage Detector](https://arxiv.org/abs/1811.05181) + + + +## Abstract + +Despite the great success of two-stage detectors, single-stage detector is still a more 
elegant and efficient way, yet suffers from the two well-known disharmonies during training, i.e. the huge difference in quantity between positive and negative examples as well as between easy and hard examples. In this work, we first point out that the essential effect of the two disharmonies can be summarized in term of the gradient. Further, we propose a novel gradient harmonizing mechanism (GHM) to be a hedging for the disharmonies. The philosophy behind GHM can be easily embedded into both classification loss function like cross-entropy (CE) and regression loss function like smooth-L1 (SL1) loss. To this end, two novel loss functions called GHM-C and GHM-R are designed to balancing the gradient flow for anchor classification and bounding box refinement, respectively. Ablation study on MS COCO demonstrates that without laborious hyper-parameter tuning, both GHM-C and GHM-R can bring substantial improvement for single-stage detector. Without any whistles and bells, our model achieves 41.6 mAP on COCO test-dev set which surpasses the state-of-the-art method, Focal Loss (FL) + SL1, by 0.8. + +
+ +
+ +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :------: | :--------: | +| R-50-FPN | pytorch | 1x | 4.0 | 3.3 | 37.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ghm/retinanet_ghm_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_r50_fpn_1x_coco/retinanet_ghm_r50_fpn_1x_coco_20200130-a437fda3.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_r50_fpn_1x_coco/retinanet_ghm_r50_fpn_1x_coco_20200130_004213.log.json) | +| R-101-FPN | pytorch | 1x | 6.0 | 4.4 | 39.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ghm/retinanet_ghm_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_r101_fpn_1x_coco/retinanet_ghm_r101_fpn_1x_coco_20200130-c148ee8f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_r101_fpn_1x_coco/retinanet_ghm_r101_fpn_1x_coco_20200130_145259.log.json) | +| X-101-32x4d-FPN | pytorch | 1x | 7.2 | 5.1 | 40.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ghm/retinanet_ghm_x101_32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_x101_32x4d_fpn_1x_coco/retinanet_ghm_x101_32x4d_fpn_1x_coco_20200131-e4333bd0.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_x101_32x4d_fpn_1x_coco/retinanet_ghm_x101_32x4d_fpn_1x_coco_20200131_113653.log.json) | +| X-101-64x4d-FPN | pytorch | 1x | 10.3 | 5.2 | 41.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ghm/retinanet_ghm_x101_64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_x101_64x4d_fpn_1x_coco/retinanet_ghm_x101_64x4d_fpn_1x_coco_20200131-dd381cef.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_x101_64x4d_fpn_1x_coco/retinanet_ghm_x101_64x4d_fpn_1x_coco_20200131_113723.log.json) | + +## Citation + +```latex +@inproceedings{li2019gradient, + title={Gradient Harmonized Single-stage Detector}, + author={Li, Buyu and Liu, Yu and Wang, Xiaogang}, + booktitle={AAAI Conference on Artificial Intelligence}, + year={2019} +} +``` diff --git a/configs/mmdet/ghm/metafile.yml b/configs/mmdet/ghm/metafile.yml new file mode 100644 index 00000000..b4f488c4 --- /dev/null +++ b/configs/mmdet/ghm/metafile.yml @@ -0,0 +1,101 @@ +Collections: + - Name: GHM + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - GHM-C + - GHM-R + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/1811.05181 + Title: 'Gradient Harmonized Single-stage Detector' + README: configs/ghm/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/losses/ghm_loss.py#L21 + Version: v2.0.0 + +Models: + - Name: retinanet_ghm_r50_fpn_1x_coco + In Collection: GHM + Config: configs/ghm/retinanet_ghm_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.0 + inference time (ms/im): + - value: 303.03 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_r50_fpn_1x_coco/retinanet_ghm_r50_fpn_1x_coco_20200130-a437fda3.pth + + - Name: retinanet_ghm_r101_fpn_1x_coco + In Collection: GHM + Config: configs/ghm/retinanet_ghm_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.0 + inference time (ms/im): + - value: 227.27 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.1 + 
Weights: https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_r101_fpn_1x_coco/retinanet_ghm_r101_fpn_1x_coco_20200130-c148ee8f.pth + + - Name: retinanet_ghm_x101_32x4d_fpn_1x_coco + In Collection: GHM + Config: configs/ghm/retinanet_ghm_x101_32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.2 + inference time (ms/im): + - value: 196.08 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_x101_32x4d_fpn_1x_coco/retinanet_ghm_x101_32x4d_fpn_1x_coco_20200131-e4333bd0.pth + + - Name: retinanet_ghm_x101_64x4d_fpn_1x_coco + In Collection: GHM + Config: configs/ghm/retinanet_ghm_x101_64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 10.3 + inference time (ms/im): + - value: 192.31 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_x101_64x4d_fpn_1x_coco/retinanet_ghm_x101_64x4d_fpn_1x_coco_20200131-dd381cef.pth diff --git a/configs/mmdet/ghm/retinanet_ghm_r101_fpn_1x_coco.py b/configs/mmdet/ghm/retinanet_ghm_r101_fpn_1x_coco.py new file mode 100644 index 00000000..aaf6fc26 --- /dev/null +++ b/configs/mmdet/ghm/retinanet_ghm_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './retinanet_ghm_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/ghm/retinanet_ghm_r50_fpn_1x_coco.py b/configs/mmdet/ghm/retinanet_ghm_r50_fpn_1x_coco.py new file mode 100644 index 00000000..61b97510 --- /dev/null +++ b/configs/mmdet/ghm/retinanet_ghm_r50_fpn_1x_coco.py @@ -0,0 +1,19 @@ +_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py' +model = 
dict( + bbox_head=dict( + loss_cls=dict( + _delete_=True, + type='GHMC', + bins=30, + momentum=0.75, + use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict( + _delete_=True, + type='GHMR', + mu=0.02, + bins=10, + momentum=0.7, + loss_weight=10.0))) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git a/configs/mmdet/ghm/retinanet_ghm_x101_32x4d_fpn_1x_coco.py b/configs/mmdet/ghm/retinanet_ghm_x101_32x4d_fpn_1x_coco.py new file mode 100644 index 00000000..cd2e4cc3 --- /dev/null +++ b/configs/mmdet/ghm/retinanet_ghm_x101_32x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './retinanet_ghm_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/ghm/retinanet_ghm_x101_64x4d_fpn_1x_coco.py b/configs/mmdet/ghm/retinanet_ghm_x101_64x4d_fpn_1x_coco.py new file mode 100644 index 00000000..b6107d8c --- /dev/null +++ b/configs/mmdet/ghm/retinanet_ghm_x101_64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './retinanet_ghm_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/gn+ws/README.md b/configs/mmdet/gn+ws/README.md new file mode 100644 index 00000000..7f1dec11 --- /dev/null +++ b/configs/mmdet/gn+ws/README.md @@ -0,0 +1,54 @@ +# GN + WS + +> [Weight Standardization](https://arxiv.org/abs/1903.10520) + + + +## Abstract + +Batch Normalization (BN) has become an out-of-box technique to improve deep network training. 
However, its effectiveness is limited for micro-batch training, i.e., each GPU typically has only 1-2 images for training, which is inevitable for many computer vision tasks, e.g., object detection and semantic segmentation, constrained by memory consumption. To address this issue, we propose Weight Standardization (WS) and Batch-Channel Normalization (BCN) to bring two success factors of BN into micro-batch training: 1) the smoothing effects on the loss landscape and 2) the ability to avoid harmful elimination singularities along the training trajectory. WS standardizes the weights in convolutional layers to smooth the loss landscape by reducing the Lipschitz constants of the loss and the gradients; BCN combines batch and channel normalizations and leverages estimated statistics of the activations in convolutional layers to keep networks away from elimination singularities. We validate WS and BCN on comprehensive computer vision tasks, including image classification, object detection, instance segmentation, video recognition and semantic segmentation. All experimental results consistently show that WS and BCN improve micro-batch training significantly. Moreover, using WS and BCN with micro-batch training is even able to match or outperform the performances of BN with large-batch training. + +
+ +
+ +## Results and Models + +Faster R-CNN + +| Backbone | Style | Normalization | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +|:---------:|:-------:|:-------------:|:-------:|:--------:|:--------------:|:------:|:-------:|:------:|:--------:| +| R-50-FPN | pytorch | GN+WS | 1x | 5.9 | 11.7 | 39.7 | - | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/faster_rcnn_r50_fpn_gn_ws-all_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_r50_fpn_gn_ws-all_1x_coco/faster_rcnn_r50_fpn_gn_ws-all_1x_coco_20200130-613d9fe2.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_r50_fpn_gn_ws-all_1x_coco/faster_rcnn_r50_fpn_gn_ws-all_1x_coco_20200130_210936.log.json) | +| R-101-FPN | pytorch | GN+WS | 1x | 8.9 | 9.0 | 41.7 | - | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/faster_rcnn_r101_fpn_gn_ws-all_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_r101_fpn_gn_ws-all_1x_coco/faster_rcnn_r101_fpn_gn_ws-all_1x_coco_20200205-a93b0d75.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_r101_fpn_gn_ws-all_1x_coco/faster_rcnn_r101_fpn_gn_ws-all_1x_coco_20200205_232146.log.json) | +| X-50-32x4d-FPN | pytorch | GN+WS | 1x | 7.0 | 10.3 | 40.7 | - | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco_20200203-839c5d9d.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco_20200203_220113.log.json) | +| X-101-32x4d-FPN | pytorch | GN+WS | 1x | 10.8 | 7.6 | 42.1 | - | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco_20200212-27da1bc2.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco_20200212_195302.log.json) | + +Mask R-CNN + +| Backbone | Style | Normalization | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +|:---------:|:-------:|:-------------:|:---------:|:--------:|:--------------:|:------:|:-------:|:------:|:--------:| +| R-50-FPN | pytorch | GN+WS | 2x | 7.3 | 10.5 | 40.6 | 36.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco/mask_rcnn_r50_fpn_gn_ws-all_2x_coco_20200226-16acb762.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco/mask_rcnn_r50_fpn_gn_ws-all_2x_coco_20200226_062128.log.json) | +| R-101-FPN | pytorch | GN+WS | 2x | 10.3 | 8.6 | 42.0 | 37.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_2x_coco/mask_rcnn_r101_fpn_gn_ws-all_2x_coco_20200212-ea357cd9.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_2x_coco/mask_rcnn_r101_fpn_gn_ws-all_2x_coco_20200212_213627.log.json) | +| X-50-32x4d-FPN | pytorch | GN+WS | 2x | 8.4 | 9.3 | 41.1 | 37.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco_20200216-649fdb6f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco_20200216_201500.log.json) | +| X-101-32x4d-FPN | pytorch | GN+WS | 2x | 12.2 | 7.1 | 42.1 | 37.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco_20200319-33fb95b5.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco_20200319_104101.log.json) | +| R-50-FPN | pytorch | GN+WS | 20-23-24e | 7.3 | - | 41.1 | 37.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco_20200213-487d1283.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco_20200213_035123.log.json) | +| R-101-FPN | pytorch | GN+WS | 20-23-24e | 10.3 | - | 43.1 | 38.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco_20200213-57b5a50f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco_20200213_130142.log.json) | 
+| X-50-32x4d-FPN | pytorch | GN+WS | 20-23-24e | 8.4 | - | 42.1 | 38.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco_20200226-969bcb2c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco_20200226_093732.log.json) | +| X-101-32x4d-FPN | pytorch | GN+WS | 20-23-24e | 12.2 | - | 42.7 | 38.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco_20200316-e6cd35ef.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco_20200316_013741.log.json) | + +Note: + +- GN+WS requires about 5% more memory than GN, and it is only 5% slower than GN. +- In the paper, a 20-23-24e lr schedule is used instead of 2x. +- The X-50-GN and X-101-GN pretrained models are also shared by the authors. 
+ +## Citation + +```latex +@article{weightstandardization, + author = {Siyuan Qiao and Huiyu Wang and Chenxi Liu and Wei Shen and Alan Yuille}, + title = {Weight Standardization}, + journal = {arXiv preprint arXiv:1903.10520}, + year = {2019}, +} +``` diff --git a/configs/mmdet/gn+ws/faster_rcnn_r101_fpn_gn_ws-all_1x_coco.py b/configs/mmdet/gn+ws/faster_rcnn_r101_fpn_gn_ws-all_1x_coco.py new file mode 100644 index 00000000..cd2cb2b6 --- /dev/null +++ b/configs/mmdet/gn+ws/faster_rcnn_r101_fpn_gn_ws-all_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './faster_rcnn_r50_fpn_gn_ws-all_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://jhu/resnet101_gn_ws'))) diff --git a/configs/mmdet/gn+ws/faster_rcnn_r50_fpn_gn_ws-all_1x_coco.py b/configs/mmdet/gn+ws/faster_rcnn_r50_fpn_gn_ws-all_1x_coco.py new file mode 100644 index 00000000..1b326b88 --- /dev/null +++ b/configs/mmdet/gn+ws/faster_rcnn_r50_fpn_gn_ws-all_1x_coco.py @@ -0,0 +1,16 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' +conv_cfg = dict(type='ConvWS') +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://jhu/resnet50_gn_ws')), + neck=dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg), + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg))) diff --git a/configs/mmdet/gn+ws/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco.py b/configs/mmdet/gn+ws/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco.py new file mode 100644 index 00000000..f64ae891 --- /dev/null +++ b/configs/mmdet/gn+ws/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco.py @@ -0,0 +1,18 @@ +_base_ = './faster_rcnn_r50_fpn_gn_ws-all_1x_coco.py' +conv_cfg = dict(type='ConvWS') +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + 
type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://jhu/resnext101_32x4d_gn_ws'))) diff --git a/configs/mmdet/gn+ws/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco.py b/configs/mmdet/gn+ws/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco.py new file mode 100644 index 00000000..246851b9 --- /dev/null +++ b/configs/mmdet/gn+ws/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco.py @@ -0,0 +1,18 @@ +_base_ = './faster_rcnn_r50_fpn_gn_ws-all_1x_coco.py' +conv_cfg = dict(type='ConvWS') +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + type='ResNeXt', + depth=50, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://jhu/resnext50_32x4d_gn_ws'))) diff --git a/configs/mmdet/gn+ws/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco.py b/configs/mmdet/gn+ws/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco.py new file mode 100644 index 00000000..a790d932 --- /dev/null +++ b/configs/mmdet/gn+ws/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco.py @@ -0,0 +1,4 @@ +_base_ = './mask_rcnn_r101_fpn_gn_ws-all_2x_coco.py' +# learning policy +lr_config = dict(step=[20, 23]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/gn+ws/mask_rcnn_r101_fpn_gn_ws-all_2x_coco.py b/configs/mmdet/gn+ws/mask_rcnn_r101_fpn_gn_ws-all_2x_coco.py new file mode 100644 index 00000000..a9fa6a24 --- /dev/null +++ b/configs/mmdet/gn+ws/mask_rcnn_r101_fpn_gn_ws-all_2x_coco.py @@ -0,0 +1,6 @@ +_base_ = './mask_rcnn_r50_fpn_gn_ws-all_2x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://jhu/resnet101_gn_ws'))) diff --git 
a/configs/mmdet/gn+ws/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco.py b/configs/mmdet/gn+ws/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco.py new file mode 100644 index 00000000..55168085 --- /dev/null +++ b/configs/mmdet/gn+ws/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco.py @@ -0,0 +1,4 @@ +_base_ = './mask_rcnn_r50_fpn_gn_ws-all_2x_coco.py' +# learning policy +lr_config = dict(step=[20, 23]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/gn+ws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco.py b/configs/mmdet/gn+ws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco.py new file mode 100644 index 00000000..63be60ff --- /dev/null +++ b/configs/mmdet/gn+ws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco.py @@ -0,0 +1,20 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' +conv_cfg = dict(type='ConvWS') +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://jhu/resnet50_gn_ws')), + neck=dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg), + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg), + mask_head=dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg))) +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/gn+ws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco.py b/configs/mmdet/gn+ws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco.py new file mode 100644 index 00000000..cfa14c99 --- /dev/null +++ b/configs/mmdet/gn+ws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco.py @@ -0,0 +1,4 @@ +_base_ = './mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco.py' +# learning policy +lr_config = dict(step=[20, 23]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/gn+ws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco.py 
b/configs/mmdet/gn+ws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco.py new file mode 100644 index 00000000..6498b03f --- /dev/null +++ b/configs/mmdet/gn+ws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco.py @@ -0,0 +1,19 @@ +_base_ = './mask_rcnn_r50_fpn_gn_ws-all_2x_coco.py' +# model settings +conv_cfg = dict(type='ConvWS') +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://jhu/resnext101_32x4d_gn_ws'))) diff --git a/configs/mmdet/gn+ws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco.py b/configs/mmdet/gn+ws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco.py new file mode 100644 index 00000000..79ce0adf --- /dev/null +++ b/configs/mmdet/gn+ws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco.py @@ -0,0 +1,4 @@ +_base_ = './mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco.py' +# learning policy +lr_config = dict(step=[20, 23]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/gn+ws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco.py b/configs/mmdet/gn+ws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco.py new file mode 100644 index 00000000..7fac3175 --- /dev/null +++ b/configs/mmdet/gn+ws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco.py @@ -0,0 +1,19 @@ +_base_ = './mask_rcnn_r50_fpn_gn_ws-all_2x_coco.py' +# model settings +conv_cfg = dict(type='ConvWS') +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + type='ResNeXt', + depth=50, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://jhu/resnext50_32x4d_gn_ws'))) diff --git a/configs/mmdet/gn+ws/metafile.yml 
b/configs/mmdet/gn+ws/metafile.yml new file mode 100644 index 00000000..bc89359c --- /dev/null +++ b/configs/mmdet/gn+ws/metafile.yml @@ -0,0 +1,263 @@ +Collections: + - Name: Weight Standardization + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Group Normalization + - Weight Standardization + Paper: + URL: https://arxiv.org/abs/1903.10520 + Title: 'Weight Standardization' + README: configs/gn+ws/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/configs/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco.py + Version: v2.0.0 + +Models: + - Name: faster_rcnn_r50_fpn_gn_ws-all_1x_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/faster_rcnn_r50_fpn_gn_ws-all_1x_coco.py + Metadata: + Training Memory (GB): 5.9 + inference time (ms/im): + - value: 85.47 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_r50_fpn_gn_ws-all_1x_coco/faster_rcnn_r50_fpn_gn_ws-all_1x_coco_20200130-613d9fe2.pth + + - Name: faster_rcnn_r101_fpn_gn_ws-all_1x_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/faster_rcnn_r101_fpn_gn_ws-all_1x_coco.py + Metadata: + Training Memory (GB): 8.9 + inference time (ms/im): + - value: 111.11 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_r101_fpn_gn_ws-all_1x_coco/faster_rcnn_r101_fpn_gn_ws-all_1x_coco_20200205-a93b0d75.pth + + - Name: faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco + In Collection: Weight Standardization + Config: 
configs/gn%2Bws/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco.py + Metadata: + Training Memory (GB): 7.0 + inference time (ms/im): + - value: 97.09 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco_20200203-839c5d9d.pth + + - Name: faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco.py + Metadata: + Training Memory (GB): 10.8 + inference time (ms/im): + - value: 131.58 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco_20200212-27da1bc2.pth + + - Name: mask_rcnn_r50_fpn_gn_ws-all_2x_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco.py + Metadata: + Training Memory (GB): 7.3 + inference time (ms/im): + - value: 95.24 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco/mask_rcnn_r50_fpn_gn_ws-all_2x_coco_20200226-16acb762.pth + + - Name: mask_rcnn_r101_fpn_gn_ws-all_2x_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_2x_coco.py + Metadata: + Training Memory (GB): 10.3 + inference time 
(ms/im): + - value: 116.28 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_2x_coco/mask_rcnn_r101_fpn_gn_ws-all_2x_coco_20200212-ea357cd9.pth + + - Name: mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco.py + Metadata: + Training Memory (GB): 8.4 + inference time (ms/im): + - value: 107.53 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco_20200216-649fdb6f.pth + + - Name: mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco.py + Metadata: + Training Memory (GB): 12.2 + inference time (ms/im): + - value: 140.85 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco_20200319-33fb95b5.pth + + - Name: mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco.py + Metadata: + 
Training Memory (GB): 7.3 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco_20200213-487d1283.pth + + - Name: mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco.py + Metadata: + Training Memory (GB): 10.3 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco_20200213-57b5a50f.pth + + - Name: mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco.py + Metadata: + Training Memory (GB): 8.4 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco_20200226-969bcb2c.pth + + - Name: mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco.py + Metadata: + Training Memory (GB): 12.2 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.5 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco_20200316-e6cd35ef.pth diff --git a/configs/mmdet/gn/README.md b/configs/mmdet/gn/README.md new file mode 100644 index 00000000..36602faf --- /dev/null +++ b/configs/mmdet/gn/README.md @@ -0,0 +1,41 @@ +# GN + +> [Group Normalization](https://arxiv.org/abs/1803.08494) + + + +## Abstract + +Batch Normalization (BN) is a milestone technique in the development of deep learning, enabling various networks to train. However, normalizing along the batch dimension introduces problems --- BN's error increases rapidly when the batch size becomes smaller, caused by inaccurate batch statistics estimation. This limits BN's usage for training larger models and transferring features to computer vision tasks including detection, segmentation, and video, which require small batches constrained by memory consumption. In this paper, we present Group Normalization (GN) as a simple alternative to BN. GN divides the channels into groups and computes within each group the mean and variance for normalization. GN's computation is independent of batch sizes, and its accuracy is stable in a wide range of batch sizes. On ResNet-50 trained in ImageNet, GN has 10.6% lower error than its BN counterpart when using a batch size of 2; when using typical batch sizes, GN is comparably good with BN and outperforms other normalization variants. Moreover, GN can be naturally transferred from pre-training to fine-tuning. GN can outperform its BN-based counterparts for object detection and segmentation in COCO, and for video classification in Kinetics, showing that GN can effectively replace the powerful BN in a variety of tasks. GN can be easily implemented by a few lines of code in modern libraries. + +
+ +
+ +## Results and Models + +| Backbone | model | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +|:-------------:|:----------:|:-------:|:--------:|:--------------:|:------:|:-------:|:------:|:--------:| +| R-50-FPN (d) | Mask R-CNN | 2x | 7.1 | 11.0 | 40.2 | 36.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn/mask_rcnn_r50_fpn_gn-all_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_2x_coco/mask_rcnn_r50_fpn_gn-all_2x_coco_20200206-8eee02a6.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_2x_coco/mask_rcnn_r50_fpn_gn-all_2x_coco_20200206_050355.log.json) | +| R-50-FPN (d) | Mask R-CNN | 3x | 7.1 | - | 40.5 | 36.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn/mask_rcnn_r50_fpn_gn-all_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_3x_coco/mask_rcnn_r50_fpn_gn-all_3x_coco_20200214-8b23b1e5.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_3x_coco/mask_rcnn_r50_fpn_gn-all_3x_coco_20200214_063512.log.json) | +| R-101-FPN (d) | Mask R-CNN | 2x | 9.9 | 9.0 | 41.9 | 37.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn/mask_rcnn_r101_fpn_gn-all_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r101_fpn_gn-all_2x_coco/mask_rcnn_r101_fpn_gn-all_2x_coco_20200205-d96b1b50.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r101_fpn_gn-all_2x_coco/mask_rcnn_r101_fpn_gn-all_2x_coco_20200205_234402.log.json) | +| R-101-FPN (d) | Mask R-CNN | 3x | 9.9 | - | 42.1 | 38.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn/mask_rcnn_r101_fpn_gn-all_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r101_fpn_gn-all_3x_coco/mask_rcnn_r101_fpn_gn-all_3x_coco_20200513_181609-0df864f4.pth) &#124; 
[log](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r101_fpn_gn-all_3x_coco/mask_rcnn_r101_fpn_gn-all_3x_coco_20200513_181609.log.json) | +| R-50-FPN (c) | Mask R-CNN | 2x | 7.1 | 10.9 | 40.0 | 36.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco_20200207-20d3e849.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco_20200207_225832.log.json) | +| R-50-FPN (c) | Mask R-CNN | 3x | 7.1 | - | 40.1 | 36.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco_20200225-542aefbc.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco_20200225_235135.log.json) | + +**Notes:** + +- (d) means pretrained model converted from Detectron, and (c) means the contributed model pretrained by [@thangvubk](https://github.com/thangvubk). +- The `3x` schedule is epoch [28, 34, 36]. 
+- **Memory, Train/Inf time is outdated.** + +## Citation + +```latex +@inproceedings{wu2018group, + title={Group Normalization}, + author={Wu, Yuxin and He, Kaiming}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2018} +} +``` diff --git a/configs/mmdet/gn/mask_rcnn_r101_fpn_gn-all_2x_coco.py b/configs/mmdet/gn/mask_rcnn_r101_fpn_gn-all_2x_coco.py new file mode 100644 index 00000000..a505ba0e --- /dev/null +++ b/configs/mmdet/gn/mask_rcnn_r101_fpn_gn-all_2x_coco.py @@ -0,0 +1,7 @@ +_base_ = './mask_rcnn_r50_fpn_gn-all_2x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron/resnet101_gn'))) diff --git a/configs/mmdet/gn/mask_rcnn_r101_fpn_gn-all_3x_coco.py b/configs/mmdet/gn/mask_rcnn_r101_fpn_gn-all_3x_coco.py new file mode 100644 index 00000000..12a9d17e --- /dev/null +++ b/configs/mmdet/gn/mask_rcnn_r101_fpn_gn-all_3x_coco.py @@ -0,0 +1,5 @@ +_base_ = './mask_rcnn_r101_fpn_gn-all_2x_coco.py' + +# learning policy +lr_config = dict(step=[28, 34]) +runner = dict(type='EpochBasedRunner', max_epochs=36) diff --git a/configs/mmdet/gn/mask_rcnn_r50_fpn_gn-all_2x_coco.py b/configs/mmdet/gn/mask_rcnn_r50_fpn_gn-all_2x_coco.py new file mode 100644 index 00000000..1de7d98e --- /dev/null +++ b/configs/mmdet/gn/mask_rcnn_r50_fpn_gn-all_2x_coco.py @@ -0,0 +1,49 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + norm_cfg=norm_cfg, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron/resnet50_gn')), + neck=dict(norm_cfg=norm_cfg), + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=norm_cfg), + mask_head=dict(norm_cfg=norm_cfg))) +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/gn/mask_rcnn_r50_fpn_gn-all_3x_coco.py b/configs/mmdet/gn/mask_rcnn_r50_fpn_gn-all_3x_coco.py new file mode 100644 index 00000000..f9177196 --- /dev/null +++ b/configs/mmdet/gn/mask_rcnn_r50_fpn_gn-all_3x_coco.py @@ -0,0 +1,5 @@ +_base_ = './mask_rcnn_r50_fpn_gn-all_2x_coco.py' + +# learning policy +lr_config = dict(step=[28, 34]) +runner = dict(type='EpochBasedRunner', max_epochs=36) diff --git a/configs/mmdet/gn/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco.py b/configs/mmdet/gn/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco.py new file mode 100644 index 00000000..2f430fda --- /dev/null +++ b/configs/mmdet/gn/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco.py @@ -0,0 +1,17 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + norm_cfg=norm_cfg, + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://contrib/resnet50_gn')), + neck=dict(norm_cfg=norm_cfg), + roi_head=dict( + bbox_head=dict( + 
type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=norm_cfg), + mask_head=dict(norm_cfg=norm_cfg))) +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/gn/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco.py b/configs/mmdet/gn/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco.py new file mode 100644 index 00000000..66834f08 --- /dev/null +++ b/configs/mmdet/gn/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco.py @@ -0,0 +1,5 @@ +_base_ = './mask_rcnn_r50_fpn_gn-all_contrib_2x_coco.py' + +# learning policy +lr_config = dict(step=[28, 34]) +runner = dict(type='EpochBasedRunner', max_epochs=36) diff --git a/configs/mmdet/gn/metafile.yml b/configs/mmdet/gn/metafile.yml new file mode 100644 index 00000000..4a1ecae0 --- /dev/null +++ b/configs/mmdet/gn/metafile.yml @@ -0,0 +1,162 @@ +Collections: + - Name: Group Normalization + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Group Normalization + Paper: + URL: https://arxiv.org/abs/1803.08494 + Title: 'Group Normalization' + README: configs/gn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/configs/gn/mask_rcnn_r50_fpn_gn-all_2x_coco.py + Version: v2.0.0 + +Models: + - Name: mask_rcnn_r50_fpn_gn-all_2x_coco + In Collection: Group Normalization + Config: configs/gn/mask_rcnn_r50_fpn_gn-all_2x_coco.py + Metadata: + Training Memory (GB): 7.1 + inference time (ms/im): + - value: 90.91 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_2x_coco/mask_rcnn_r50_fpn_gn-all_2x_coco_20200206-8eee02a6.pth + + - Name: 
mask_rcnn_r50_fpn_gn-all_3x_coco + In Collection: Group Normalization + Config: configs/gn/mask_rcnn_r50_fpn_gn-all_3x_coco.py + Metadata: + Training Memory (GB): 7.1 + inference time (ms/im): + - value: 90.91 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_3x_coco/mask_rcnn_r50_fpn_gn-all_3x_coco_20200214-8b23b1e5.pth + + - Name: mask_rcnn_r101_fpn_gn-all_2x_coco + In Collection: Group Normalization + Config: configs/gn/mask_rcnn_r101_fpn_gn-all_2x_coco.py + Metadata: + Training Memory (GB): 9.9 + inference time (ms/im): + - value: 111.11 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r101_fpn_gn-all_2x_coco/mask_rcnn_r101_fpn_gn-all_2x_coco_20200205-d96b1b50.pth + + - Name: mask_rcnn_r101_fpn_gn-all_3x_coco + In Collection: Group Normalization + Config: configs/gn/mask_rcnn_r101_fpn_gn-all_3x_coco.py + Metadata: + Training Memory (GB): 9.9 + inference time (ms/im): + - value: 111.11 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r101_fpn_gn-all_3x_coco/mask_rcnn_r101_fpn_gn-all_3x_coco_20200513_181609-0df864f4.pth + + - Name: mask_rcnn_r50_fpn_gn-all_contrib_2x_coco + In Collection: Group Normalization + Config: 
configs/gn/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco.py + Metadata: + Training Memory (GB): 7.1 + inference time (ms/im): + - value: 91.74 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco_20200207-20d3e849.pth + + - Name: mask_rcnn_r50_fpn_gn-all_contrib_3x_coco + In Collection: Group Normalization + Config: configs/gn/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco.py + Metadata: + Training Memory (GB): 7.1 + inference time (ms/im): + - value: 91.74 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco_20200225-542aefbc.pth diff --git a/configs/mmdet/grid_rcnn/README.md b/configs/mmdet/grid_rcnn/README.md new file mode 100644 index 00000000..9b27c96e --- /dev/null +++ b/configs/mmdet/grid_rcnn/README.md @@ -0,0 +1,47 @@ +# Grid R-CNN + +> [Grid R-CNN](https://arxiv.org/abs/1811.12030) + + + +## Abstract + +This paper proposes a novel object detection framework named Grid R-CNN, which adopts a grid guided localization mechanism for accurate object detection. Different from the traditional regression based methods, the Grid R-CNN captures the spatial information explicitly and enjoys the position sensitive property of fully convolutional architecture. 
Instead of using only two independent points, we design a multi-point supervision formulation to encode more clues in order to reduce the impact of inaccurate prediction of specific points. To take the full advantage of the correlation of points in a grid, we propose a two-stage information fusion strategy to fuse feature maps of neighbor grid points. The grid guided localization approach is easy to be extended to different state-of-the-art detection frameworks. Grid R-CNN leads to high quality object localization, and experiments demonstrate that it achieves a 4.1% AP gain at IoU=0.8 and a 10.0% AP gain at IoU=0.9 on COCO benchmark compared to Faster R-CNN with Res50 backbone and FPN architecture. + +Grid R-CNN is a well-performed object detection framework. It transforms the traditional box offset regression problem into a grid point estimation problem. With the guidance of the grid points, it can obtain high-quality localization results. However, the speed of Grid R-CNN is not so satisfactory. In this technical report we present Grid R-CNN Plus, a better and faster version of Grid R-CNN. We have made several updates that significantly speed up the framework and simultaneously improve the accuracy. On COCO dataset, the Res50-FPN based Grid R-CNN Plus detector achieves an mAP of 40.4%, outperforming the baseline on the same model by 3.0 points with similar inference time. + +
+ +
+ +## Results and Models + +| Backbone | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +|:-----------:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:| +| R-50 | 2x | 5.1 | 15.0 | 40.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco/grid_rcnn_r50_fpn_gn-head_2x_coco_20200130-6cca8223.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco/grid_rcnn_r50_fpn_gn-head_2x_coco_20200130_221140.log.json) | +| R-101 | 2x | 7.0 | 12.6 | 41.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/grid_rcnn/grid_rcnn_r101_fpn_gn-head_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_r101_fpn_gn-head_2x_coco/grid_rcnn_r101_fpn_gn-head_2x_coco_20200309-d6eca030.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_r101_fpn_gn-head_2x_coco/grid_rcnn_r101_fpn_gn-head_2x_coco_20200309_164224.log.json) | +| X-101-32x4d | 2x | 8.3 | 10.8 | 42.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/grid_rcnn/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco_20200130-d8f0e3ff.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco_20200130_215413.log.json) | +| X-101-64x4d | 2x | 11.3 | 7.7 | 43.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/grid_rcnn/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco_20200204-ec76a754.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco_20200204_080641.log.json) | + +**Notes:** + +- All models are trained with 8 GPUs instead of 32 GPUs in the original paper. +- The warming up lasts for 1 epoch and `2x` here indicates 25 epochs. + +## Citation + +```latex +@inproceedings{lu2019grid, + title={Grid r-cnn}, + author={Lu, Xin and Li, Buyu and Yue, Yuxin and Li, Quanquan and Yan, Junjie}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + year={2019} +} + +@article{lu2019gridplus, + title={Grid R-CNN Plus: Faster and Better}, + author={Lu, Xin and Li, Buyu and Yue, Yuxin and Li, Quanquan and Yan, Junjie}, + journal={arXiv preprint arXiv:1906.05688}, + year={2019} +} +``` diff --git a/configs/mmdet/grid_rcnn/grid_rcnn_r101_fpn_gn-head_2x_coco.py b/configs/mmdet/grid_rcnn/grid_rcnn_r101_fpn_gn-head_2x_coco.py new file mode 100644 index 00000000..1bb5889b --- /dev/null +++ b/configs/mmdet/grid_rcnn/grid_rcnn_r101_fpn_gn-head_2x_coco.py @@ -0,0 +1,7 @@ +_base_ = './grid_rcnn_r50_fpn_gn-head_2x_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/grid_rcnn/grid_rcnn_r50_fpn_gn-head_1x_coco.py b/configs/mmdet/grid_rcnn/grid_rcnn_r50_fpn_gn-head_1x_coco.py new file mode 100644 index 00000000..4aa00ece --- /dev/null +++ b/configs/mmdet/grid_rcnn/grid_rcnn_r50_fpn_gn-head_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = ['grid_rcnn_r50_fpn_gn-head_2x_coco.py'] +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# runtime settings +runner = 
dict(type='EpochBasedRunner', max_epochs=12) diff --git a/configs/mmdet/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco.py b/configs/mmdet/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco.py new file mode 100644 index 00000000..df63cd5d --- /dev/null +++ b/configs/mmdet/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco.py @@ -0,0 +1,131 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + type='GridRCNN', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type='GridRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + with_reg=False, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False), + grid_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + 
grid_head=dict( + type='GridHead', + grid_points=9, + num_convs=8, + in_channels=256, + point_feat_channels=64, + norm_cfg=dict(type='GN', num_groups=36), + loss_grid=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=15))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_radius=1, + pos_weight=-1, + max_num_grid=192, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.03, + nms=dict(type='nms', iou_threshold=0.3), + max_per_img=100))) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=3665, + warmup_ratio=1.0 / 80, + step=[17, 23]) +runner = dict(type='EpochBasedRunner', max_epochs=25) diff --git a/configs/mmdet/grid_rcnn/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco.py b/configs/mmdet/grid_rcnn/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco.py new file mode 100644 index 00000000..3bc8516e --- /dev/null +++ b/configs/mmdet/grid_rcnn/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco.py @@ -0,0 +1,24 @@ +_base_ = './grid_rcnn_r50_fpn_gn-head_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, 
+ groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=3665, + warmup_ratio=1.0 / 80, + step=[17, 23]) +runner = dict(type='EpochBasedRunner', max_epochs=25) diff --git a/configs/mmdet/grid_rcnn/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco.py b/configs/mmdet/grid_rcnn/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco.py new file mode 100644 index 00000000..c78f8f65 --- /dev/null +++ b/configs/mmdet/grid_rcnn/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco.py @@ -0,0 +1,13 @@ +_base_ = './grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/grid_rcnn/metafile.yml b/configs/mmdet/grid_rcnn/metafile.yml new file mode 100644 index 00000000..d1aa8513 --- /dev/null +++ b/configs/mmdet/grid_rcnn/metafile.yml @@ -0,0 +1,101 @@ +Collections: + - Name: Grid R-CNN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RPN + - Dilated Convolution + - ResNet + - RoIAlign + Paper: + URL: https://arxiv.org/abs/1906.05688 + Title: 'Grid R-CNN' + README: configs/grid_rcnn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/grid_rcnn.py#L6 + Version: v2.0.0 + +Models: + - Name: grid_rcnn_r50_fpn_gn-head_2x_coco + In Collection: Grid R-CNN + Config: configs/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco.py + Metadata: + Training Memory (GB): 
5.1 + inference time (ms/im): + - value: 66.67 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco/grid_rcnn_r50_fpn_gn-head_2x_coco_20200130-6cca8223.pth + + - Name: grid_rcnn_r101_fpn_gn-head_2x_coco + In Collection: Grid R-CNN + Config: configs/grid_rcnn/grid_rcnn_r101_fpn_gn-head_2x_coco.py + Metadata: + Training Memory (GB): 7.0 + inference time (ms/im): + - value: 79.37 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_r101_fpn_gn-head_2x_coco/grid_rcnn_r101_fpn_gn-head_2x_coco_20200309-d6eca030.pth + + - Name: grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco + In Collection: Grid R-CNN + Config: configs/grid_rcnn/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco.py + Metadata: + Training Memory (GB): 8.3 + inference time (ms/im): + - value: 92.59 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco_20200130-d8f0e3ff.pth + + - Name: grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco + In Collection: Grid R-CNN + Config: configs/grid_rcnn/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco.py + Metadata: + Training Memory (GB): 11.3 + inference time (ms/im): + - value: 129.87 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.0 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco_20200204-ec76a754.pth diff --git a/configs/mmdet/groie/README.md b/configs/mmdet/groie/README.md new file mode 100644 index 00000000..989a2ed8 --- /dev/null +++ b/configs/mmdet/groie/README.md @@ -0,0 +1,72 @@ +# GRoIE + +> [A novel Region of Interest Extraction Layer for Instance Segmentation](https://arxiv.org/abs/2004.13665) + + + +## Abstract + +Given the wide diffusion of deep neural network architectures for computer vision tasks, several new applications are nowadays more and more feasible. Among them, a particular attention has been recently given to instance segmentation, by exploiting the results achievable by two-stage networks (such as Mask R-CNN or Faster R-CNN), derived from R-CNN. In these complex architectures, a crucial role is played by the Region of Interest (RoI) extraction layer, devoted to extracting a coherent subset of features from a single Feature Pyramid Network (FPN) layer attached on top of a backbone. +This paper is motivated by the need to overcome the limitations of existing RoI extractors which select only one (the best) layer from FPN. Our intuition is that all the layers of FPN retain useful information. Therefore, the proposed layer (called Generic RoI Extractor - GRoIE) introduces non-local building blocks and attention mechanisms to boost the performance. +A comprehensive ablation study at component level is conducted to find the best set of algorithms and parameters for the GRoIE layer. Moreover, GRoIE can be integrated seamlessly with every two-stage architecture for both object detection and instance segmentation tasks. Therefore, the improvements brought about by the use of GRoIE in different state-of-the-art architectures are also evaluated. The proposed layer leads up to gain a 1.1% AP improvement on bounding box detection and 1.7% AP improvement on instance segmentation. + +
+ +
+ +## Introduction + +By Leonardo Rossi, Akbar Karimi and Andrea Prati from +[IMPLab](http://implab.ce.unipr.it/). + +We provide configs to reproduce the results in the paper for +"*A novel Region of Interest Extraction Layer for Instance Segmentation*" +on COCO object detection. + +This paper is motivated by the need to overcome the limitations of existing +RoI extractors which select only one (the best) layer from FPN. + +Our intuition is that all the layers of FPN retain useful information. + +Therefore, the proposed layer (called Generic RoI Extractor - **GRoIE**) +introduces non-local building blocks and attention mechanisms to boost the +performance. + +## Results and Models + +The results on COCO 2017 minival (5k images) are shown in the table below. + +### Application of GRoIE to different architectures + +| Backbone | Method | Lr schd | box AP | mask AP | Config | Download| +| :-------: | :--------------: | :-----: | :----: | :-----: | :-------:| :--------:| +| R-50-FPN | Faster Original | 1x | 37.4 | | [config](../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130_204655.log.json) | +| R-50-FPN | + GRoIE | 1x | 38.3 | | [config](./faster_rcnn_r50_fpn_groie_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/groie/faster_rcnn_r50_fpn_groie_1x_coco/faster_rcnn_r50_fpn_groie_1x_coco_20200604_211715-66ee9516.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/groie/faster_rcnn_r50_fpn_groie_1x_coco/faster_rcnn_r50_fpn_groie_1x_coco_20200604_211715.log.json) | +| R-50-FPN | Grid R-CNN | 1x | 39.1 | | [config](./grid_rcnn_r50_fpn_gn-head_1x_coco.py)| 
[model](https://download.openmmlab.com/mmdetection/v2.0/groie/grid_rcnn_r50_fpn_gn-head_1x_coco/grid_rcnn_r50_fpn_gn-head_1x_coco_20200605_202059-64f00ee8.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/groie/grid_rcnn_r50_fpn_gn-head_1x_coco/grid_rcnn_r50_fpn_gn-head_1x_coco_20200605_202059.log.json) | +| R-50-FPN | + GRoIE | 1x | | | [config](./grid_rcnn_r50_fpn_gn-head_groie_1x_coco.py)|| +| R-50-FPN | Mask R-CNN | 1x | 38.2 | 34.7 | [config](../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py)| [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205_050542.log.json) | +| R-50-FPN | + GRoIE | 1x | 39.0 | 36.0 | [config](./mask_rcnn_r50_fpn_groie_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r50_fpn_groie_1x_coco/mask_rcnn_r50_fpn_groie_1x_coco_20200604_211715-50d90c74.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r50_fpn_groie_1x_coco/mask_rcnn_r50_fpn_groie_1x_coco_20200604_211715.log.json) | +| R-50-FPN | GC-Net | 1x | 40.7 | 36.5 | [config](../gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200202-50b90e5c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200202_085547.log.json) | +| R-50-FPN | + GRoIE | 1x | 41.0 | 37.8 | [config](./mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py) 
|[model](https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco_20200604_211715-42eb79e1.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco_20200604_211715.log.json) | +| R-101-FPN | GC-Net | 1x | 42.2 | 37.8 | [config](../gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200206-8407a3f0.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200206_142508.log.json) | +| R-101-FPN | + GRoIE | 1x | 42.6 | 38.7 | [config](./mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py)| [model](https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco_20200607_224507-8daae01c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco_20200607_224507.log.json) | + +## Citation + +If you use this work or benchmark in your research, please cite this project. 
+ +```latex +@inproceedings{rossi2021novel, + title={A novel region of interest extraction layer for instance segmentation}, + author={Rossi, Leonardo and Karimi, Akbar and Prati, Andrea}, + booktitle={2020 25th International Conference on Pattern Recognition (ICPR)}, + pages={2203--2209}, + year={2021}, + organization={IEEE} +} +``` + +## Contact + +The implementation of GRoIE is currently maintained by +[Leonardo Rossi](https://github.com/hachreak/). diff --git a/configs/mmdet/groie/faster_rcnn_r50_fpn_groie_1x_coco.py b/configs/mmdet/groie/faster_rcnn_r50_fpn_groie_1x_coco.py new file mode 100644 index 00000000..0fc528bf --- /dev/null +++ b/configs/mmdet/groie/faster_rcnn_r50_fpn_groie_1x_coco.py @@ -0,0 +1,25 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' +# model settings +model = dict( + roi_head=dict( + bbox_roi_extractor=dict( + type='GenericRoIExtractor', + aggregation='sum', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32], + pre_cfg=dict( + type='ConvModule', + in_channels=256, + out_channels=256, + kernel_size=5, + padding=2, + inplace=False, + ), + post_cfg=dict( + type='GeneralizedAttention', + in_channels=256, + spatial_range=-1, + num_heads=6, + attention_type='0100', + kv_stride=2)))) diff --git a/configs/mmdet/groie/grid_rcnn_r50_fpn_gn-head_groie_1x_coco.py b/configs/mmdet/groie/grid_rcnn_r50_fpn_gn-head_groie_1x_coco.py new file mode 100644 index 00000000..8e4b4ab2 --- /dev/null +++ b/configs/mmdet/groie/grid_rcnn_r50_fpn_gn-head_groie_1x_coco.py @@ -0,0 +1,45 @@ +_base_ = '../grid_rcnn/grid_rcnn_r50_fpn_gn-head_1x_coco.py' +# model settings +model = dict( + roi_head=dict( + bbox_roi_extractor=dict( + type='GenericRoIExtractor', + aggregation='sum', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32], + pre_cfg=dict( + type='ConvModule', + in_channels=256, + out_channels=256, + kernel_size=5, 
+ padding=2, + inplace=False, + ), + post_cfg=dict( + type='GeneralizedAttention', + in_channels=256, + spatial_range=-1, + num_heads=6, + attention_type='0100', + kv_stride=2)), + grid_roi_extractor=dict( + type='GenericRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32], + pre_cfg=dict( + type='ConvModule', + in_channels=256, + out_channels=256, + kernel_size=5, + padding=2, + inplace=False, + ), + post_cfg=dict( + type='GeneralizedAttention', + in_channels=256, + spatial_range=-1, + num_heads=6, + attention_type='0100', + kv_stride=2)))) diff --git a/configs/mmdet/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py b/configs/mmdet/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py new file mode 100644 index 00000000..8b837221 --- /dev/null +++ b/configs/mmdet/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py @@ -0,0 +1,45 @@ +_base_ = '../gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py' +# model settings +model = dict( + roi_head=dict( + bbox_roi_extractor=dict( + type='GenericRoIExtractor', + aggregation='sum', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32], + pre_cfg=dict( + type='ConvModule', + in_channels=256, + out_channels=256, + kernel_size=5, + padding=2, + inplace=False, + ), + post_cfg=dict( + type='GeneralizedAttention', + in_channels=256, + spatial_range=-1, + num_heads=6, + attention_type='0100', + kv_stride=2)), + mask_roi_extractor=dict( + type='GenericRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32], + pre_cfg=dict( + type='ConvModule', + in_channels=256, + out_channels=256, + kernel_size=5, + padding=2, + inplace=False, + ), + post_cfg=dict( + type='GeneralizedAttention', + in_channels=256, + spatial_range=-1, + num_heads=6, + 
attention_type='0100', + kv_stride=2)))) diff --git a/configs/mmdet/groie/mask_rcnn_r50_fpn_groie_1x_coco.py b/configs/mmdet/groie/mask_rcnn_r50_fpn_groie_1x_coco.py new file mode 100644 index 00000000..81dfb487 --- /dev/null +++ b/configs/mmdet/groie/mask_rcnn_r50_fpn_groie_1x_coco.py @@ -0,0 +1,45 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' +# model settings +model = dict( + roi_head=dict( + bbox_roi_extractor=dict( + type='GenericRoIExtractor', + aggregation='sum', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32], + pre_cfg=dict( + type='ConvModule', + in_channels=256, + out_channels=256, + kernel_size=5, + padding=2, + inplace=False, + ), + post_cfg=dict( + type='GeneralizedAttention', + in_channels=256, + spatial_range=-1, + num_heads=6, + attention_type='0100', + kv_stride=2)), + mask_roi_extractor=dict( + type='GenericRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32], + pre_cfg=dict( + type='ConvModule', + in_channels=256, + out_channels=256, + kernel_size=5, + padding=2, + inplace=False, + ), + post_cfg=dict( + type='GeneralizedAttention', + in_channels=256, + spatial_range=-1, + num_heads=6, + attention_type='0100', + kv_stride=2)))) diff --git a/configs/mmdet/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py b/configs/mmdet/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py new file mode 100644 index 00000000..852c5ca7 --- /dev/null +++ b/configs/mmdet/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py @@ -0,0 +1,45 @@ +_base_ = '../gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py' +# model settings +model = dict( + roi_head=dict( + bbox_roi_extractor=dict( + type='GenericRoIExtractor', + aggregation='sum', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 
8, 16, 32], + pre_cfg=dict( + type='ConvModule', + in_channels=256, + out_channels=256, + kernel_size=5, + padding=2, + inplace=False, + ), + post_cfg=dict( + type='GeneralizedAttention', + in_channels=256, + spatial_range=-1, + num_heads=6, + attention_type='0100', + kv_stride=2)), + mask_roi_extractor=dict( + type='GenericRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32], + pre_cfg=dict( + type='ConvModule', + in_channels=256, + out_channels=256, + kernel_size=5, + padding=2, + inplace=False, + ), + post_cfg=dict( + type='GeneralizedAttention', + in_channels=256, + spatial_range=-1, + num_heads=6, + attention_type='0100', + kv_stride=2)))) diff --git a/configs/mmdet/groie/metafile.yml b/configs/mmdet/groie/metafile.yml new file mode 100644 index 00000000..269cb393 --- /dev/null +++ b/configs/mmdet/groie/metafile.yml @@ -0,0 +1,93 @@ +Collections: + - Name: GRoIE + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Generic RoI Extractor + - FPN + - RPN + - ResNet + - RoIAlign + Paper: + URL: https://arxiv.org/abs/2004.13665 + Title: 'A novel Region of Interest Extraction Layer for Instance Segmentation' + README: configs/groie/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/roi_heads/roi_extractors/groie.py#L15 + Version: v2.1.0 + +Models: + - Name: faster_rcnn_r50_fpn_groie_1x_coco + In Collection: GRoIE + Config: configs/groie/faster_rcnn_r50_fpn_groie_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/groie/faster_rcnn_r50_fpn_groie_1x_coco/faster_rcnn_r50_fpn_groie_1x_coco_20200604_211715-66ee9516.pth + + - Name: grid_rcnn_r50_fpn_gn-head_groie_1x_coco + In Collection: GRoIE + Config: 
configs/groie/grid_rcnn_r50_fpn_gn-head_groie_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.1 + + - Name: mask_rcnn_r50_fpn_groie_1x_coco + In Collection: GRoIE + Config: configs/groie/mask_rcnn_r50_fpn_groie_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r50_fpn_groie_1x_coco/mask_rcnn_r50_fpn_groie_1x_coco_20200604_211715-50d90c74.pth + + - Name: mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco + In Collection: GRoIE + Config: configs/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco_20200604_211715-42eb79e1.pth + + - Name: mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco + In Collection: GRoIE + Config: configs/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco_20200607_224507-8daae01c.pth diff --git a/configs/mmdet/guided_anchoring/README.md b/configs/mmdet/guided_anchoring/README.md new file mode 100644 index 00000000..b42de99b --- /dev/null +++ b/configs/mmdet/guided_anchoring/README.md @@ 
-0,0 +1,59 @@ +# Guided Anchoring + +> [Region Proposal by Guided Anchoring](https://arxiv.org/abs/1901.03278) + + + +## Abstract + +Region anchors are the cornerstone of modern object detection techniques. State-of-the-art detectors mostly rely on a dense anchoring scheme, where anchors are sampled uniformly over the spatial domain with a predefined set of scales and aspect ratios. In this paper, we revisit this foundational stage. Our study shows that it can be done much more effectively and efficiently. Specifically, we present an alternative scheme, named Guided Anchoring, which leverages semantic features to guide the anchoring. The proposed method jointly predicts the locations where the center of objects of interest are likely to exist as well as the scales and aspect ratios at different locations. On top of predicted anchor shapes, we mitigate the feature inconsistency with a feature adaption module. We also study the use of high-quality proposals to improve detection performance. The anchoring scheme can be seamlessly integrated into proposal methods and detectors. With Guided Anchoring, we achieve 9.1% higher recall on MS COCO with 90% fewer anchors than the RPN baseline. We also adopt Guided Anchoring in Fast R-CNN, Faster R-CNN and RetinaNet, respectively improving the detection mAP by 2.2%, 2.7% and 1.2%. + +
+ +
+ +## Results and Models + +The results on COCO 2017 val are shown in the table below (results on test-dev are usually slightly higher than val). + +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | AR 1000 | Config | Download | +| :----: | :-------------: | :-----: | :-----: | :------: | :------------: | :-----: | :------: | :--------: | +| GA-RPN | R-50-FPN | caffe | 1x | 5.3 | 15.8 | 68.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_coco/ga_rpn_r50_caffe_fpn_1x_coco_20200531-899008a6.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_coco/ga_rpn_r50_caffe_fpn_1x_coco_20200531_011819.log.json) | +| GA-RPN | R-101-FPN | caffe | 1x | 7.3 | 13.0 | 69.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_rpn_r101_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_r101_caffe_fpn_1x_coco/ga_rpn_r101_caffe_fpn_1x_coco_20200531-ca9ba8fb.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_r101_caffe_fpn_1x_coco/ga_rpn_r101_caffe_fpn_1x_coco_20200531_011812.log.json) | +| GA-RPN | X-101-32x4d-FPN | pytorch | 1x | 8.5 | 10.0 | 70.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x_coco/ga_rpn_x101_32x4d_fpn_1x_coco_20200220-c28d1b18.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x_coco/ga_rpn_x101_32x4d_fpn_1x_coco_20200220_221326.log.json) | +| GA-RPN | X-101-64x4d-FPN | pytorch | 1x | 7.1 | 7.5 | 71.2 | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_rpn_x101_64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_x101_64x4d_fpn_1x_coco/ga_rpn_x101_64x4d_fpn_1x_coco_20200225-3c6e1aa2.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_x101_64x4d_fpn_1x_coco/ga_rpn_x101_64x4d_fpn_1x_coco_20200225_152704.log.json) | + +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :------------: | :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :------: | :--------: | +| GA-Faster RCNN | R-50-FPN | caffe | 1x | 5.5 | | 39.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_faster_r50_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_r50_caffe_fpn_1x_coco/ga_faster_r50_caffe_fpn_1x_coco_20200702_000718-a11ccfe6.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_r50_caffe_fpn_1x_coco/ga_faster_r50_caffe_fpn_1x_coco_20200702_000718.log.json) | +| GA-Faster RCNN | R-101-FPN | caffe | 1x | 7.5 | | 41.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_faster_r101_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_r101_caffe_fpn_1x_coco/ga_faster_r101_caffe_fpn_1x_coco_bbox_mAP-0.415_20200505_115528-fb82e499.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_r101_caffe_fpn_1x_coco/ga_faster_r101_caffe_fpn_1x_coco_20200505_115528.log.json) | +| GA-Faster RCNN | X-101-32x4d-FPN | pytorch | 1x | 8.7 | 9.7 | 43.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_faster_x101_32x4d_fpn_1x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_x101_32x4d_fpn_1x_coco/ga_faster_x101_32x4d_fpn_1x_coco_20200215-1ded9da3.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_x101_32x4d_fpn_1x_coco/ga_faster_x101_32x4d_fpn_1x_coco_20200215_184547.log.json) | +| GA-Faster RCNN | X-101-64x4d-FPN | pytorch | 1x | 11.8 | 7.3 | 43.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_faster_x101_64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_x101_64x4d_fpn_1x_coco/ga_faster_x101_64x4d_fpn_1x_coco_20200215-0fa7bde7.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_x101_64x4d_fpn_1x_coco/ga_faster_x101_64x4d_fpn_1x_coco_20200215_104455.log.json) | +| GA-RetinaNet | R-50-FPN | caffe | 1x | 3.5 | 16.8 | 36.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_coco/ga_retinanet_r50_caffe_fpn_1x_coco_20201020-39581c6f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_coco/ga_retinanet_r50_caffe_fpn_1x_coco_20201020_225450.log.json) | +| GA-RetinaNet | R-101-FPN | caffe | 1x | 5.5 | 12.9 | 39.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_retinanet_r101_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_r101_caffe_fpn_1x_coco/ga_retinanet_r101_caffe_fpn_1x_coco_20200531-6266453c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_r101_caffe_fpn_1x_coco/ga_retinanet_r101_caffe_fpn_1x_coco_20200531_012847.log.json) | +| GA-RetinaNet | X-101-32x4d-FPN | pytorch | 1x | 6.9 | 10.6 | 40.5 | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x_coco/ga_retinanet_x101_32x4d_fpn_1x_coco_20200219-40c56caa.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x_coco/ga_retinanet_x101_32x4d_fpn_1x_coco_20200219_223025.log.json) | +| GA-RetinaNet | X-101-64x4d-FPN | pytorch | 1x | 9.9 | 7.7 | 41.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_retinanet_x101_64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_x101_64x4d_fpn_1x_coco/ga_retinanet_x101_64x4d_fpn_1x_coco_20200226-ef9f7f1f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_x101_64x4d_fpn_1x_coco/ga_retinanet_x101_64x4d_fpn_1x_coco_20200226_221123.log.json) | + +- In the Guided Anchoring paper, `score_thr` is set to 0.001 in Fast/Faster RCNN and 0.05 in RetinaNet for both baselines and Guided Anchoring. + +- Performance on the COCO test-dev benchmark is shown as follows. + +| Method | Backbone | Style | Lr schd | Aug Train | Score thr | AP | AP_50 | AP_75 | AP_small | AP_medium | AP_large | Download | +| :------------: | :-------: | :---: | :-----: | :-------: | :-------: | :---: | :---: | :---: | :------: | :-------: | :------: | :------: | +| GA-Faster RCNN | R-101-FPN | caffe | 1x | F | 0.05 | | | | | | | | +| GA-Faster RCNN | R-101-FPN | caffe | 1x | F | 0.001 | | | | | | | | +| GA-RetinaNet | R-101-FPN | caffe | 1x | F | 0.05 | | | | | | | | +| GA-RetinaNet | R-101-FPN | caffe | 2x | T | 0.05 | | | | | | | | + +## Citation + +We provide config files to reproduce the results in the CVPR 2019 paper for [Region Proposal by Guided Anchoring](https://arxiv.org/abs/1901.03278). 
+ +```latex +@inproceedings{wang2019region, + title={Region Proposal by Guided Anchoring}, + author={Jiaqi Wang and Kai Chen and Shuo Yang and Chen Change Loy and Dahua Lin}, + booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, + year={2019} +} +``` diff --git a/configs/mmdet/guided_anchoring/ga_fast_r50_caffe_fpn_1x_coco.py b/configs/mmdet/guided_anchoring/ga_fast_r50_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..8fc203c6 --- /dev/null +++ b/configs/mmdet/guided_anchoring/ga_fast_r50_caffe_fpn_1x_coco.py @@ -0,0 +1,65 @@ +_base_ = '../fast_rcnn/fast_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + roi_head=dict( + bbox_head=dict(bbox_coder=dict(target_stds=[0.05, 0.05, 0.1, 0.1]))), + # model training and testing settings + train_cfg=dict( + rcnn=dict( + assigner=dict(pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6), + sampler=dict(num=256))), + test_cfg=dict(rcnn=dict(score_thr=1e-3))) +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadProposals', num_max_proposals=300), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadProposals', num_max_proposals=None), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + 
flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img', 'proposals']), + ]) +] +data = dict( + train=dict( + proposal_file=data_root + 'proposals/ga_rpn_r50_fpn_1x_train2017.pkl', + pipeline=train_pipeline), + val=dict( + proposal_file=data_root + 'proposals/ga_rpn_r50_fpn_1x_val2017.pkl', + pipeline=test_pipeline), + test=dict( + proposal_file=data_root + 'proposals/ga_rpn_r50_fpn_1x_val2017.pkl', + pipeline=test_pipeline)) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git a/configs/mmdet/guided_anchoring/ga_faster_r101_caffe_fpn_1x_coco.py b/configs/mmdet/guided_anchoring/ga_faster_r101_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..a40e7c6f --- /dev/null +++ b/configs/mmdet/guided_anchoring/ga_faster_r101_caffe_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './ga_faster_r50_caffe_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/configs/mmdet/guided_anchoring/ga_faster_r50_caffe_fpn_1x_coco.py b/configs/mmdet/guided_anchoring/ga_faster_r50_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..b0add92c --- /dev/null +++ b/configs/mmdet/guided_anchoring/ga_faster_r50_caffe_fpn_1x_coco.py @@ -0,0 +1,65 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco.py' +model = dict( + rpn_head=dict( + _delete_=True, + type='GARPNHead', + in_channels=256, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=8, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[8], + strides=[4, 8, 16, 32, 64]), + anchor_coder=dict( + type='DeltaXYWHBBoxCoder', + 
target_means=[.0, .0, .0, .0], + target_stds=[0.07, 0.07, 0.14, 0.14]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.07, 0.07, 0.11, 0.11]), + loc_filter_thr=0.01, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + roi_head=dict( + bbox_head=dict(bbox_coder=dict(target_stds=[0.05, 0.05, 0.1, 0.1]))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + ga_assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + ga_sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + center_ratio=0.2, + ignore_ratio=0.5), + rpn_proposal=dict(nms_post=1000, max_per_img=300), + rcnn=dict( + assigner=dict(pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6), + sampler=dict(type='RandomSampler', num=256))), + test_cfg=dict( + rpn=dict(nms_post=1000, max_per_img=300), rcnn=dict(score_thr=1e-3))) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git a/configs/mmdet/guided_anchoring/ga_faster_r50_fpn_1x_coco.py b/configs/mmdet/guided_anchoring/ga_faster_r50_fpn_1x_coco.py new file mode 100644 index 00000000..e3d82389 --- /dev/null +++ b/configs/mmdet/guided_anchoring/ga_faster_r50_fpn_1x_coco.py @@ -0,0 +1,65 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + rpn_head=dict( + _delete_=True, + type='GARPNHead', + in_channels=256, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=8, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + 
square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[8], + strides=[4, 8, 16, 32, 64]), + anchor_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.07, 0.07, 0.14, 0.14]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.07, 0.07, 0.11, 0.11]), + loc_filter_thr=0.01, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + roi_head=dict( + bbox_head=dict(bbox_coder=dict(target_stds=[0.05, 0.05, 0.1, 0.1]))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + ga_assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + ga_sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + center_ratio=0.2, + ignore_ratio=0.5), + rpn_proposal=dict(nms_post=1000, max_per_img=300), + rcnn=dict( + assigner=dict(pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6), + sampler=dict(type='RandomSampler', num=256))), + test_cfg=dict( + rpn=dict(nms_post=1000, max_per_img=300), rcnn=dict(score_thr=1e-3))) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git a/configs/mmdet/guided_anchoring/ga_faster_x101_32x4d_fpn_1x_coco.py b/configs/mmdet/guided_anchoring/ga_faster_x101_32x4d_fpn_1x_coco.py new file mode 100644 index 00000000..f1dda949 --- /dev/null +++ b/configs/mmdet/guided_anchoring/ga_faster_x101_32x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './ga_faster_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + 
out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/guided_anchoring/ga_faster_x101_64x4d_fpn_1x_coco.py b/configs/mmdet/guided_anchoring/ga_faster_x101_64x4d_fpn_1x_coco.py new file mode 100644 index 00000000..fb9e2afc --- /dev/null +++ b/configs/mmdet/guided_anchoring/ga_faster_x101_64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './ga_faster_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/guided_anchoring/ga_retinanet_r101_caffe_fpn_1x_coco.py b/configs/mmdet/guided_anchoring/ga_retinanet_r101_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..1b1cccd0 --- /dev/null +++ b/configs/mmdet/guided_anchoring/ga_retinanet_r101_caffe_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './ga_retinanet_r50_caffe_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/configs/mmdet/guided_anchoring/ga_retinanet_r101_caffe_fpn_mstrain_2x.py b/configs/mmdet/guided_anchoring/ga_retinanet_r101_caffe_fpn_mstrain_2x.py new file mode 100644 index 00000000..260895b4 --- /dev/null +++ b/configs/mmdet/guided_anchoring/ga_retinanet_r101_caffe_fpn_mstrain_2x.py @@ -0,0 +1,169 @@ +_base_ = '../_base_/default_runtime.py' + +# model settings +model = dict( + type='RetinaNet', + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + 
checkpoint='open-mmlab://detectron2/resnet101_caffe')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5), + bbox_head=dict( + type='GARetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[4], + strides=[8, 16, 32, 64, 128]), + anchor_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loc_filter_thr=0.01, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=0.04, loss_weight=1.0))) +# training and testing settings +train_cfg = dict( + ga_assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0.4, + ignore_iof_thr=-1), + ga_sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + center_ratio=0.2, + ignore_ratio=0.5, + debug=False) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 
1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 480), (1333, 960)], + keep_ratio=True, + multiscale_mode='range'), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +evaluation = dict(interval=1, metric='bbox') +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[16, 22]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git 
a/configs/mmdet/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_coco.py b/configs/mmdet/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..33512011 --- /dev/null +++ b/configs/mmdet/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_coco.py @@ -0,0 +1,62 @@ +_base_ = '../retinanet/retinanet_r50_caffe_fpn_1x_coco.py' +model = dict( + bbox_head=dict( + _delete_=True, + type='GARetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[4], + strides=[8, 16, 32, 64, 128]), + anchor_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loc_filter_thr=0.01, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=0.04, loss_weight=1.0)), + # training and testing settings + train_cfg=dict( + ga_assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0.4, + ignore_iof_thr=-1), + ga_sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + assigner=dict(neg_iou_thr=0.5, min_pos_iou=0.0), + center_ratio=0.2, + ignore_ratio=0.5)) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git a/configs/mmdet/guided_anchoring/ga_retinanet_r50_fpn_1x_coco.py 
b/configs/mmdet/guided_anchoring/ga_retinanet_r50_fpn_1x_coco.py new file mode 100644 index 00000000..76947235 --- /dev/null +++ b/configs/mmdet/guided_anchoring/ga_retinanet_r50_fpn_1x_coco.py @@ -0,0 +1,62 @@ +_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py' +model = dict( + bbox_head=dict( + _delete_=True, + type='GARetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[4], + strides=[8, 16, 32, 64, 128]), + anchor_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loc_filter_thr=0.01, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=0.04, loss_weight=1.0)), + # training and testing settings + train_cfg=dict( + ga_assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0.4, + ignore_iof_thr=-1), + ga_sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + assigner=dict(neg_iou_thr=0.5, min_pos_iou=0.0), + center_ratio=0.2, + ignore_ratio=0.5)) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git a/configs/mmdet/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x_coco.py b/configs/mmdet/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x_coco.py new file mode 100644 index 00000000..c5eb34f5 --- /dev/null 
+++ b/configs/mmdet/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './ga_retinanet_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/guided_anchoring/ga_retinanet_x101_64x4d_fpn_1x_coco.py b/configs/mmdet/guided_anchoring/ga_retinanet_x101_64x4d_fpn_1x_coco.py new file mode 100644 index 00000000..5c69a6f8 --- /dev/null +++ b/configs/mmdet/guided_anchoring/ga_retinanet_x101_64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './ga_retinanet_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/guided_anchoring/ga_rpn_r101_caffe_fpn_1x_coco.py b/configs/mmdet/guided_anchoring/ga_rpn_r101_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..039703ec --- /dev/null +++ b/configs/mmdet/guided_anchoring/ga_rpn_r101_caffe_fpn_1x_coco.py @@ -0,0 +1,8 @@ +_base_ = './ga_rpn_r50_caffe_fpn_1x_coco.py' +# model settings +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/configs/mmdet/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_coco.py b/configs/mmdet/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..7830894a --- /dev/null +++ b/configs/mmdet/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_coco.py @@ -0,0 +1,58 @@ +_base_ = '../rpn/rpn_r50_caffe_fpn_1x_coco.py' +model = dict( + rpn_head=dict( + _delete_=True, + 
type='GARPNHead', + in_channels=256, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=8, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[8], + strides=[4, 8, 16, 32, 64]), + anchor_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.07, 0.07, 0.14, 0.14]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.07, 0.07, 0.11, 0.11]), + loc_filter_thr=0.01, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + rpn=dict( + ga_assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + ga_sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + center_ratio=0.2, + ignore_ratio=0.5)), + test_cfg=dict(rpn=dict(nms_post=1000))) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git a/configs/mmdet/guided_anchoring/ga_rpn_r50_fpn_1x_coco.py b/configs/mmdet/guided_anchoring/ga_rpn_r50_fpn_1x_coco.py new file mode 100644 index 00000000..27ab3e73 --- /dev/null +++ b/configs/mmdet/guided_anchoring/ga_rpn_r50_fpn_1x_coco.py @@ -0,0 +1,58 @@ +_base_ = '../rpn/rpn_r50_fpn_1x_coco.py' +model = dict( + rpn_head=dict( + _delete_=True, + type='GARPNHead', + in_channels=256, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=8, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + 
strides=[4, 8, 16, 32, 64]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[8], + strides=[4, 8, 16, 32, 64]), + anchor_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.07, 0.07, 0.14, 0.14]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.07, 0.07, 0.11, 0.11]), + loc_filter_thr=0.01, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + rpn=dict( + ga_assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + ga_sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + center_ratio=0.2, + ignore_ratio=0.5)), + test_cfg=dict(rpn=dict(nms_post=1000))) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git a/configs/mmdet/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x_coco.py b/configs/mmdet/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x_coco.py new file mode 100644 index 00000000..cccc985f --- /dev/null +++ b/configs/mmdet/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './ga_rpn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/guided_anchoring/ga_rpn_x101_64x4d_fpn_1x_coco.py 
b/configs/mmdet/guided_anchoring/ga_rpn_x101_64x4d_fpn_1x_coco.py new file mode 100644 index 00000000..4e134d23 --- /dev/null +++ b/configs/mmdet/guided_anchoring/ga_rpn_x101_64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './ga_rpn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/guided_anchoring/metafile.yml b/configs/mmdet/guided_anchoring/metafile.yml new file mode 100644 index 00000000..3019d4a1 --- /dev/null +++ b/configs/mmdet/guided_anchoring/metafile.yml @@ -0,0 +1,246 @@ +Collections: + - Name: Guided Anchoring + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - Guided Anchoring + - ResNet + Paper: + URL: https://arxiv.org/abs/1901.03278 + Title: 'Region Proposal by Guided Anchoring' + README: configs/guided_anchoring/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/dense_heads/ga_retina_head.py#L10 + Version: v2.0.0 + +Models: + - Name: ga_rpn_r50_caffe_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.3 + inference time (ms/im): + - value: 63.29 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Region Proposal + Dataset: COCO + Metrics: + AR@1000: 68.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_coco/ga_rpn_r50_caffe_fpn_1x_coco_20200531-899008a6.pth + + - Name: ga_rpn_r101_caffe_fpn_1x_coco + In Collection: Guided Anchoring + Config: 
configs/guided_anchoring/ga_rpn_r101_caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.3 + inference time (ms/im): + - value: 76.92 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Region Proposal + Dataset: COCO + Metrics: + AR@1000: 69.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_r101_caffe_fpn_1x_coco/ga_rpn_r101_caffe_fpn_1x_coco_20200531-ca9ba8fb.pth + + - Name: ga_rpn_x101_32x4d_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 8.5 + inference time (ms/im): + - value: 100 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Region Proposal + Dataset: COCO + Metrics: + AR@1000: 70.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x_coco/ga_rpn_x101_32x4d_fpn_1x_coco_20200220-c28d1b18.pth + + - Name: ga_rpn_x101_64x4d_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga_rpn_x101_64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.1 + inference time (ms/im): + - value: 133.33 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Region Proposal + Dataset: COCO + Metrics: + AR@1000: 70.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_x101_64x4d_fpn_1x_coco/ga_rpn_x101_64x4d_fpn_1x_coco_20200225-3c6e1aa2.pth + + - Name: ga_faster_r50_caffe_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga_faster_r50_caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.5 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.6 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_r50_caffe_fpn_1x_coco/ga_faster_r50_caffe_fpn_1x_coco_20200702_000718-a11ccfe6.pth + + - Name: ga_faster_r101_caffe_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga_faster_r101_caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.5 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_r101_caffe_fpn_1x_coco/ga_faster_r101_caffe_fpn_1x_coco_bbox_mAP-0.415_20200505_115528-fb82e499.pth + + - Name: ga_faster_x101_32x4d_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga_faster_x101_32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 8.7 + inference time (ms/im): + - value: 103.09 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_x101_32x4d_fpn_1x_coco/ga_faster_x101_32x4d_fpn_1x_coco_20200215-1ded9da3.pth + + - Name: ga_faster_x101_64x4d_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga_faster_x101_64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 11.8 + inference time (ms/im): + - value: 136.99 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_x101_64x4d_fpn_1x_coco/ga_faster_x101_64x4d_fpn_1x_coco_20200215-0fa7bde7.pth + + - Name: ga_retinanet_r50_caffe_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.5 + inference 
time (ms/im): + - value: 59.52 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_coco/ga_retinanet_r50_caffe_fpn_1x_coco_20201020-39581c6f.pth + + - Name: ga_retinanet_r101_caffe_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga_retinanet_r101_caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.5 + inference time (ms/im): + - value: 77.52 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_r101_caffe_fpn_1x_coco/ga_retinanet_r101_caffe_fpn_1x_coco_20200531-6266453c.pth + + - Name: ga_retinanet_x101_32x4d_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.9 + inference time (ms/im): + - value: 94.34 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x_coco/ga_retinanet_x101_32x4d_fpn_1x_coco_20200219-40c56caa.pth + + - Name: ga_retinanet_x101_64x4d_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga_retinanet_x101_64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 9.9 + inference time (ms/im): + - value: 129.87 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.3 + 
Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_x101_64x4d_fpn_1x_coco/ga_retinanet_x101_64x4d_fpn_1x_coco_20200226-ef9f7f1f.pth diff --git a/configs/mmdet/hrnet/README.md b/configs/mmdet/hrnet/README.md new file mode 100644 index 00000000..f1a9d964 --- /dev/null +++ b/configs/mmdet/hrnet/README.md @@ -0,0 +1,101 @@ +# HRNet + +> [Deep High-Resolution Representation Learning for Human Pose Estimation](https://arxiv.org/abs/1902.09212) + + + +## Abstract + +This is an official pytorch implementation of Deep High-Resolution Representation Learning for Human Pose Estimation. In this work, we are interested in the human pose estimation problem with a focus on learning reliable high-resolution representations. Most existing methods recover high-resolution representations from low-resolution representations produced by a high-to-low resolution network. Instead, our proposed network maintains high-resolution representations through the whole process. We start from a high-resolution subnetwork as the first stage, gradually add high-to-low resolution subnetworks one by one to form more stages, and connect the multi-resolution subnetworks in parallel. We conduct repeated multi-scale fusions such that each of the high-to-low resolution representations receives information from other parallel representations over and over, leading to rich high-resolution representations. As a result, the predicted keypoint heatmap is potentially more accurate and spatially more precise. We empirically demonstrate the effectiveness of our network through the superior pose estimation results over two benchmark datasets: the COCO keypoint detection dataset and the MPII Human Pose dataset. + +High-resolution representation learning plays an essential role in many vision problems, e.g., pose estimation and semantic segmentation. 
The high-resolution network (HRNet), recently developed for human pose estimation, maintains high-resolution representations through the whole process by connecting high-to-low resolution convolutions in parallel and produces strong high-resolution representations by repeatedly conducting fusions across parallel convolutions. +In this paper, we conduct a further study on high-resolution representations by introducing a simple yet effective modification and apply it to a wide range of vision tasks. We augment the high-resolution representation by aggregating the (upsampled) representations from all the parallel convolutions rather than only the representation from the high-resolution convolution as done in HRNet. This simple modification leads to stronger representations, evidenced by superior results. We show top results in semantic segmentation on Cityscapes, LIP, and PASCAL Context, and facial landmark detection on AFLW, COFW, 300W, and WFLW. In addition, we build a multi-level representation from the high-resolution representation and apply it to the Faster R-CNN object detection framework and the extended frameworks. The proposed approach achieves superior results to existing single-model networks on COCO object detection. + +
+ +
+ +## Results and Models + +### Faster R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :-------------:|:------:| :------:| :--------:| +| HRNetV2p-W18 | pytorch | 1x | 6.6 | 13.4 | 36.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco/faster_rcnn_hrnetv2p_w18_1x_coco_20200130-56651a6d.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco/faster_rcnn_hrnetv2p_w18_1x_coco_20200130_211246.log.json) | +| HRNetV2p-W18 | pytorch | 2x | 6.6 | - | 38.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco/faster_rcnn_hrnetv2p_w18_2x_coco_20200702_085731-a4ec0611.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco/faster_rcnn_hrnetv2p_w18_2x_coco_20200702_085731.log.json) | +| HRNetV2p-W32 | pytorch | 1x | 9.0 | 12.4 | 40.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/faster_rcnn_hrnetv2p_w32_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w32_1x_coco/faster_rcnn_hrnetv2p_w32_1x_coco_20200130-6e286425.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w32_1x_coco/faster_rcnn_hrnetv2p_w32_1x_coco_20200130_204442.log.json) | +| HRNetV2p-W32 | pytorch | 2x | 9.0 | - | 41.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/faster_rcnn_hrnetv2p_w32_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w32_2x_coco/faster_rcnn_hrnetv2p_w32_2x_coco_20200529_015927-976a9c15.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w32_2x_coco/faster_rcnn_hrnetv2p_w32_2x_coco_20200529_015927.log.json) | +| HRNetV2p-W40 | pytorch | 1x | 10.4 | 10.5 | 41.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/faster_rcnn_hrnetv2p_w40_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w40_1x_coco/faster_rcnn_hrnetv2p_w40_1x_coco_20200210-95c1f5ce.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w40_1x_coco/faster_rcnn_hrnetv2p_w40_1x_coco_20200210_125315.log.json) | +| HRNetV2p-W40 | pytorch | 2x | 10.4 | - | 42.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/faster_rcnn_hrnetv2p_w40_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w40_2x_coco/faster_rcnn_hrnetv2p_w40_2x_coco_20200512_161033-0f236ef4.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w40_2x_coco/faster_rcnn_hrnetv2p_w40_2x_coco_20200512_161033.log.json) | + +### Mask R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :-------------:|:------:| :------:|:------:|:--------:| +| HRNetV2p-W18 | pytorch | 1x | 7.0 | 11.7 | 37.7 | 34.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/mask_rcnn_hrnetv2p_w18_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w18_1x_coco/mask_rcnn_hrnetv2p_w18_1x_coco_20200205-1c3d78ed.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w18_1x_coco/mask_rcnn_hrnetv2p_w18_1x_coco_20200205_232523.log.json) | +| HRNetV2p-W18 | pytorch | 2x | 7.0 | - | 39.8 | 36.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/mask_rcnn_hrnetv2p_w18_2x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w18_2x_coco/mask_rcnn_hrnetv2p_w18_2x_coco_20200212-b3c825b1.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w18_2x_coco/mask_rcnn_hrnetv2p_w18_2x_coco_20200212_134222.log.json) | +| HRNetV2p-W32 | pytorch | 1x | 9.4 | 11.3 | 41.2 | 37.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/mask_rcnn_hrnetv2p_w32_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w32_1x_coco/mask_rcnn_hrnetv2p_w32_1x_coco_20200207-b29f616e.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w32_1x_coco/mask_rcnn_hrnetv2p_w32_1x_coco_20200207_055017.log.json) | +| HRNetV2p-W32 | pytorch | 2x | 9.4 | - | 42.5 | 37.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/mask_rcnn_hrnetv2p_w32_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w32_2x_coco/mask_rcnn_hrnetv2p_w32_2x_coco_20200213-45b75b4d.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w32_2x_coco/mask_rcnn_hrnetv2p_w32_2x_coco_20200213_150518.log.json) | +| HRNetV2p-W40 | pytorch | 1x | 10.9 | | 42.1 | 37.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/mask_rcnn_hrnetv2p_w40_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w40_1x_coco/mask_rcnn_hrnetv2p_w40_1x_coco_20200511_015646-66738b35.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w40_1x_coco/mask_rcnn_hrnetv2p_w40_1x_coco_20200511_015646.log.json) | +| HRNetV2p-W40 | pytorch | 2x | 10.9 | | 42.8 | 38.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/mask_rcnn_hrnetv2p_w40_2x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w40_2x_coco/mask_rcnn_hrnetv2p_w40_2x_coco_20200512_163732-aed5e4ab.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w40_2x_coco/mask_rcnn_hrnetv2p_w40_2x_coco_20200512_163732.log.json) | + +### Cascade R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :-------------:|:------:| :------: | :--------: | +| HRNetV2p-W18 | pytorch | 20e | 7.0 | 11.0 | 41.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/cascade_rcnn_hrnetv2p_w18_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w18_20e_coco/cascade_rcnn_hrnetv2p_w18_20e_coco_20200210-434be9d7.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w18_20e_coco/cascade_rcnn_hrnetv2p_w18_20e_coco_20200210_105632.log.json) | +| HRNetV2p-W32 | pytorch | 20e | 9.4 | 11.0 | 43.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/cascade_rcnn_hrnetv2p_w32_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w32_20e_coco/cascade_rcnn_hrnetv2p_w32_20e_coco_20200208-928455a4.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w32_20e_coco/cascade_rcnn_hrnetv2p_w32_20e_coco_20200208_160511.log.json) | +| HRNetV2p-W40 | pytorch | 20e | 10.8 | | 43.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/cascade_rcnn_hrnetv2p_w40_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w40_20e_coco/cascade_rcnn_hrnetv2p_w40_20e_coco_20200512_161112-75e47b04.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w40_20e_coco/cascade_rcnn_hrnetv2p_w40_20e_coco_20200512_161112.log.json) | + +### Cascade 
Mask R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :-------------:|:------:| :------:|:------:|:--------:| +| HRNetV2p-W18 | pytorch | 20e | 8.5 | 8.5 |41.6 |36.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco/cascade_mask_rcnn_hrnetv2p_w18_20e_coco_20200210-b543cd2b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco/cascade_mask_rcnn_hrnetv2p_w18_20e_coco_20200210_093149.log.json) | +| HRNetV2p-W32 | pytorch | 20e | | 8.3 |44.3 |38.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco/cascade_mask_rcnn_hrnetv2p_w32_20e_coco_20200512_154043-39d9cf7b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco/cascade_mask_rcnn_hrnetv2p_w32_20e_coco_20200512_154043.log.json) | +| HRNetV2p-W40 | pytorch | 20e | 12.5 | |45.1 |39.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/cascade_mask_rcnn_hrnetv2p_w40_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w40_20e_coco/cascade_mask_rcnn_hrnetv2p_w40_20e_coco_20200527_204922-969c4610.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w40_20e_coco/cascade_mask_rcnn_hrnetv2p_w40_20e_coco_20200527_204922.log.json) | + +### Hybrid Task Cascade (HTC) + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :-------------:|:------:| 
:------:|:------:|:--------:| +| HRNetV2p-W18 | pytorch | 20e | 10.8 | 4.7 | 42.8 | 37.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/htc_hrnetv2p_w18_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w18_20e_coco/htc_hrnetv2p_w18_20e_coco_20200210-b266988c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w18_20e_coco/htc_hrnetv2p_w18_20e_coco_20200210_182735.log.json) | +| HRNetV2p-W32 | pytorch | 20e | 13.1 | 4.9 | 45.4 | 39.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/htc_hrnetv2p_w32_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w32_20e_coco/htc_hrnetv2p_w32_20e_coco_20200207-7639fa12.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w32_20e_coco/htc_hrnetv2p_w32_20e_coco_20200207_193153.log.json) | +| HRNetV2p-W40 | pytorch | 20e | 14.6 | | 46.4 | 40.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/htc_hrnetv2p_w40_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w40_20e_coco/htc_hrnetv2p_w40_20e_coco_20200529_183411-417c4d5b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w40_20e_coco/htc_hrnetv2p_w40_20e_coco_20200529_183411.log.json) | + +### FCOS + +| Backbone | Style | GN | MS train | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +|:---------:|:-------:|:-------:|:--------:|:-------:|:------:|:------:|:------:|:------:|:--------:| +|HRNetV2p-W18| pytorch | Y | N | 1x | 13.0 | 12.9 | 35.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco_20201212_100710-4ad151de.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco_20201212_100710.log.json) | +|HRNetV2p-W18| pytorch | Y | N | 2x | 13.0 | - | 38.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco_20201212_101110-5c575fa5.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco_20201212_101110.log.json) | +|HRNetV2p-W32| pytorch | Y | N | 1x | 17.5 | 12.9 | 39.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco_20201211_134730-cb8055c0.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco_20201211_134730.log.json) | +|HRNetV2p-W32| pytorch | Y | N | 2x | 17.5 | - | 40.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco_20201212_112133-77b6b9bb.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco_20201212_112133.log.json) | +|HRNetV2p-W18| pytorch | Y | Y | 2x | 13.0 | 12.9 | 38.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco_20201212_111651-441e9d9f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco_20201212_111651.log.json) | +|HRNetV2p-W32| pytorch | Y | Y | 2x | 17.5 | 12.4 | 41.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco_20201212_090846-b6f2b49f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco_20201212_090846.log.json) | +|HRNetV2p-W40| pytorch | Y | Y | 2x | 20.3 | 10.8 | 42.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco_20201212_124752-f22d2ce5.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco_20201212_124752.log.json) | + +**Note:** + +- The `28e` schedule in HTC indicates decreasing the lr at 24 and 27 epochs, with a total of 28 epochs. +- HRNetV2 ImageNet pretrained models are in [HRNets for Image Classification](https://github.com/HRNet/HRNet-Image-Classification).
+ +## Citation + +```latex +@inproceedings{SunXLW19, + title={Deep High-Resolution Representation Learning for Human Pose Estimation}, + author={Ke Sun and Bin Xiao and Dong Liu and Jingdong Wang}, + booktitle={CVPR}, + year={2019} +} + +@article{SunZJCXLMWLW19, + title={High-Resolution Representations for Labeling Pixels and Regions}, + author={Ke Sun and Yang Zhao and Borui Jiang and Tianheng Cheng and Bin Xiao + and Dong Liu and Yadong Mu and Xinggang Wang and Wenyu Liu and Jingdong Wang}, + journal = {CoRR}, + volume = {abs/1904.04514}, + year={2019} +} +``` diff --git a/configs/mmdet/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py b/configs/mmdet/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py new file mode 100644 index 00000000..839cf3eb --- /dev/null +++ b/configs/mmdet/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py @@ -0,0 +1,11 @@ +_base_ = './cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py' +# model settings +model = dict( + backbone=dict( + extra=dict( + stage2=dict(num_channels=(18, 36)), + stage3=dict(num_channels=(18, 36, 72)), + stage4=dict(num_channels=(18, 36, 72, 144))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')), + neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256)) diff --git a/configs/mmdet/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py b/configs/mmdet/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py new file mode 100644 index 00000000..99426027 --- /dev/null +++ b/configs/mmdet/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py @@ -0,0 +1,40 @@ +_base_ = '../cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + _delete_=True, + type='HRNet', + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + 
num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')), + neck=dict( + _delete_=True, + type='HRFPN', + in_channels=[32, 64, 128, 256], + out_channels=256)) +# learning policy +lr_config = dict(step=[16, 19]) +runner = dict(type='EpochBasedRunner', max_epochs=20) diff --git a/configs/mmdet/hrnet/cascade_mask_rcnn_hrnetv2p_w40_20e_coco.py b/configs/mmdet/hrnet/cascade_mask_rcnn_hrnetv2p_w40_20e_coco.py new file mode 100644 index 00000000..10d5e83c --- /dev/null +++ b/configs/mmdet/hrnet/cascade_mask_rcnn_hrnetv2p_w40_20e_coco.py @@ -0,0 +1,12 @@ +_base_ = './cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py' +# model settings +model = dict( + backbone=dict( + type='HRNet', + extra=dict( + stage2=dict(num_channels=(40, 80)), + stage3=dict(num_channels=(40, 80, 160)), + stage4=dict(num_channels=(40, 80, 160, 320))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')), + neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256)) diff --git a/configs/mmdet/hrnet/cascade_rcnn_hrnetv2p_w18_20e_coco.py b/configs/mmdet/hrnet/cascade_rcnn_hrnetv2p_w18_20e_coco.py new file mode 100644 index 00000000..ebd5e202 --- /dev/null +++ b/configs/mmdet/hrnet/cascade_rcnn_hrnetv2p_w18_20e_coco.py @@ -0,0 +1,11 @@ +_base_ = './cascade_rcnn_hrnetv2p_w32_20e_coco.py' +# model settings +model = dict( + backbone=dict( + extra=dict( + stage2=dict(num_channels=(18, 36)), + stage3=dict(num_channels=(18, 36, 72)), + stage4=dict(num_channels=(18, 36, 72, 144))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')), + neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256)) diff --git a/configs/mmdet/hrnet/cascade_rcnn_hrnetv2p_w32_20e_coco.py b/configs/mmdet/hrnet/cascade_rcnn_hrnetv2p_w32_20e_coco.py 
new file mode 100644 index 00000000..e7f89a9e --- /dev/null +++ b/configs/mmdet/hrnet/cascade_rcnn_hrnetv2p_w32_20e_coco.py @@ -0,0 +1,40 @@ +_base_ = '../cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + _delete_=True, + type='HRNet', + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')), + neck=dict( + _delete_=True, + type='HRFPN', + in_channels=[32, 64, 128, 256], + out_channels=256)) +# learning policy +lr_config = dict(step=[16, 19]) +runner = dict(type='EpochBasedRunner', max_epochs=20) diff --git a/configs/mmdet/hrnet/cascade_rcnn_hrnetv2p_w40_20e_coco.py b/configs/mmdet/hrnet/cascade_rcnn_hrnetv2p_w40_20e_coco.py new file mode 100644 index 00000000..265e8d63 --- /dev/null +++ b/configs/mmdet/hrnet/cascade_rcnn_hrnetv2p_w40_20e_coco.py @@ -0,0 +1,12 @@ +_base_ = './cascade_rcnn_hrnetv2p_w32_20e_coco.py' +# model settings +model = dict( + backbone=dict( + type='HRNet', + extra=dict( + stage2=dict(num_channels=(40, 80)), + stage3=dict(num_channels=(40, 80, 160)), + stage4=dict(num_channels=(40, 80, 160, 320))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')), + neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256)) diff --git a/configs/mmdet/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco.py b/configs/mmdet/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco.py new file mode 100644 index 00000000..1df2c3db --- /dev/null +++ b/configs/mmdet/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco.py 
@@ -0,0 +1,11 @@ +_base_ = './faster_rcnn_hrnetv2p_w32_1x_coco.py' +# model settings +model = dict( + backbone=dict( + extra=dict( + stage2=dict(num_channels=(18, 36)), + stage3=dict(num_channels=(18, 36, 72)), + stage4=dict(num_channels=(18, 36, 72, 144))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')), + neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256)) diff --git a/configs/mmdet/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco.py b/configs/mmdet/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco.py new file mode 100644 index 00000000..a4b987a1 --- /dev/null +++ b/configs/mmdet/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco.py @@ -0,0 +1,5 @@ +_base_ = './faster_rcnn_hrnetv2p_w18_1x_coco.py' + +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/hrnet/faster_rcnn_hrnetv2p_w32_1x_coco.py b/configs/mmdet/hrnet/faster_rcnn_hrnetv2p_w32_1x_coco.py new file mode 100644 index 00000000..be058099 --- /dev/null +++ b/configs/mmdet/hrnet/faster_rcnn_hrnetv2p_w32_1x_coco.py @@ -0,0 +1,37 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + _delete_=True, + type='HRNet', + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')), + neck=dict( + _delete_=True, + type='HRFPN', + in_channels=[32, 64, 128, 256], + out_channels=256)) diff --git a/configs/mmdet/hrnet/faster_rcnn_hrnetv2p_w32_2x_coco.py 
b/configs/mmdet/hrnet/faster_rcnn_hrnetv2p_w32_2x_coco.py new file mode 100644 index 00000000..63c87171 --- /dev/null +++ b/configs/mmdet/hrnet/faster_rcnn_hrnetv2p_w32_2x_coco.py @@ -0,0 +1,4 @@ +_base_ = './faster_rcnn_hrnetv2p_w32_1x_coco.py' +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/hrnet/faster_rcnn_hrnetv2p_w40_1x_coco.py b/configs/mmdet/hrnet/faster_rcnn_hrnetv2p_w40_1x_coco.py new file mode 100644 index 00000000..886a7c90 --- /dev/null +++ b/configs/mmdet/hrnet/faster_rcnn_hrnetv2p_w40_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = './faster_rcnn_hrnetv2p_w32_1x_coco.py' +model = dict( + backbone=dict( + type='HRNet', + extra=dict( + stage2=dict(num_channels=(40, 80)), + stage3=dict(num_channels=(40, 80, 160)), + stage4=dict(num_channels=(40, 80, 160, 320))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')), + neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256)) diff --git a/configs/mmdet/hrnet/faster_rcnn_hrnetv2p_w40_2x_coco.py b/configs/mmdet/hrnet/faster_rcnn_hrnetv2p_w40_2x_coco.py new file mode 100644 index 00000000..585cc2c3 --- /dev/null +++ b/configs/mmdet/hrnet/faster_rcnn_hrnetv2p_w40_2x_coco.py @@ -0,0 +1,4 @@ +_base_ = './faster_rcnn_hrnetv2p_w40_1x_coco.py' +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco.py b/configs/mmdet/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco.py new file mode 100644 index 00000000..fd662bd1 --- /dev/null +++ b/configs/mmdet/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco.py @@ -0,0 +1,10 @@ +_base_ = './fcos_hrnetv2p_w32_gn-head_4x4_1x_coco.py' +model = dict( + backbone=dict( + extra=dict( + stage2=dict(num_channels=(18, 36)), + stage3=dict(num_channels=(18, 36, 72)), + stage4=dict(num_channels=(18, 36, 72, 144))), + init_cfg=dict( + type='Pretrained', 
checkpoint='open-mmlab://msra/hrnetv2_w18')), + neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256)) diff --git a/configs/mmdet/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco.py b/configs/mmdet/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco.py new file mode 100644 index 00000000..34975959 --- /dev/null +++ b/configs/mmdet/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco.py @@ -0,0 +1,4 @@ +_base_ = './fcos_hrnetv2p_w18_gn-head_4x4_1x_coco.py' +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/hrnet/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco.py b/configs/mmdet/hrnet/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco.py new file mode 100644 index 00000000..37bfdae9 --- /dev/null +++ b/configs/mmdet/hrnet/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco.py @@ -0,0 +1,10 @@ +_base_ = './fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco.py' +model = dict( + backbone=dict( + extra=dict( + stage2=dict(num_channels=(18, 36)), + stage3=dict(num_channels=(18, 36, 72)), + stage4=dict(num_channels=(18, 36, 72, 144))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')), + neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256)) diff --git a/configs/mmdet/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco.py b/configs/mmdet/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco.py new file mode 100644 index 00000000..10617f24 --- /dev/null +++ b/configs/mmdet/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco.py @@ -0,0 +1,70 @@ +_base_ = '../fcos/fcos_r50_caffe_fpn_gn-head_4x4_1x_coco.py' +model = dict( + backbone=dict( + _delete_=True, + type='HRNet', + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, 
+ block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')), + neck=dict( + _delete_=True, + type='HRFPN', + in_channels=[32, 64, 128, 256], + out_channels=256, + stride=2, + num_outs=5)) +img_norm_cfg = dict( + mean=[103.53, 116.28, 123.675], std=[57.375, 57.12, 58.395], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco.py b/configs/mmdet/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco.py new file mode 100644 index 00000000..7b381307 --- /dev/null +++ b/configs/mmdet/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco.py @@ -0,0 +1,4 @@ +_base_ = './fcos_hrnetv2p_w32_gn-head_4x4_1x_coco.py' +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/hrnet/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco.py 
b/configs/mmdet/hrnet/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco.py new file mode 100644 index 00000000..482f8872 --- /dev/null +++ b/configs/mmdet/hrnet/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco.py @@ -0,0 +1,39 @@ +_base_ = './fcos_hrnetv2p_w32_gn-head_4x4_1x_coco.py' +img_norm_cfg = dict( + mean=[103.53, 116.28, 123.675], std=[57.375, 57.12, 58.395], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/hrnet/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco.py b/configs/mmdet/hrnet/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco.py new file mode 100644 index 00000000..0ae9dbe3 --- /dev/null +++ b/configs/mmdet/hrnet/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco.py @@ -0,0 +1,11 @@ +_base_ = './fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco.py' +model = dict( + backbone=dict( + type='HRNet', + extra=dict( + stage2=dict(num_channels=(40, 80)), + stage3=dict(num_channels=(40, 80, 160)), + 
stage4=dict(num_channels=(40, 80, 160, 320))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')), + neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256)) diff --git a/configs/mmdet/hrnet/htc_hrnetv2p_w18_20e_coco.py b/configs/mmdet/hrnet/htc_hrnetv2p_w18_20e_coco.py new file mode 100644 index 00000000..3c2eb1dd --- /dev/null +++ b/configs/mmdet/hrnet/htc_hrnetv2p_w18_20e_coco.py @@ -0,0 +1,10 @@ +_base_ = './htc_hrnetv2p_w32_20e_coco.py' +model = dict( + backbone=dict( + extra=dict( + stage2=dict(num_channels=(18, 36)), + stage3=dict(num_channels=(18, 36, 72)), + stage4=dict(num_channels=(18, 36, 72, 144))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')), + neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256)) diff --git a/configs/mmdet/hrnet/htc_hrnetv2p_w32_20e_coco.py b/configs/mmdet/hrnet/htc_hrnetv2p_w32_20e_coco.py new file mode 100644 index 00000000..545cb83e --- /dev/null +++ b/configs/mmdet/hrnet/htc_hrnetv2p_w32_20e_coco.py @@ -0,0 +1,37 @@ +_base_ = '../htc/htc_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + _delete_=True, + type='HRNet', + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')), + neck=dict( + _delete_=True, + type='HRFPN', + in_channels=[32, 64, 128, 256], + out_channels=256)) diff --git a/configs/mmdet/hrnet/htc_hrnetv2p_w40_20e_coco.py b/configs/mmdet/hrnet/htc_hrnetv2p_w40_20e_coco.py new file mode 100644 
index 00000000..94bff1bc --- /dev/null +++ b/configs/mmdet/hrnet/htc_hrnetv2p_w40_20e_coco.py @@ -0,0 +1,11 @@ +_base_ = './htc_hrnetv2p_w32_20e_coco.py' +model = dict( + backbone=dict( + type='HRNet', + extra=dict( + stage2=dict(num_channels=(40, 80)), + stage3=dict(num_channels=(40, 80, 160)), + stage4=dict(num_channels=(40, 80, 160, 320))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')), + neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256)) diff --git a/configs/mmdet/hrnet/htc_hrnetv2p_w40_28e_coco.py b/configs/mmdet/hrnet/htc_hrnetv2p_w40_28e_coco.py new file mode 100644 index 00000000..7067e8b6 --- /dev/null +++ b/configs/mmdet/hrnet/htc_hrnetv2p_w40_28e_coco.py @@ -0,0 +1,4 @@ +_base_ = './htc_hrnetv2p_w40_20e_coco.py' +# learning policy +lr_config = dict(step=[24, 27]) +runner = dict(type='EpochBasedRunner', max_epochs=28) diff --git a/configs/mmdet/hrnet/htc_x101_64x4d_fpn_16x1_28e_coco.py b/configs/mmdet/hrnet/htc_x101_64x4d_fpn_16x1_28e_coco.py new file mode 100644 index 00000000..815f2857 --- /dev/null +++ b/configs/mmdet/hrnet/htc_x101_64x4d_fpn_16x1_28e_coco.py @@ -0,0 +1,4 @@ +_base_ = '../htc/htc_x101_64x4d_fpn_16x1_20e_coco.py' +# learning policy +lr_config = dict(step=[24, 27]) +runner = dict(type='EpochBasedRunner', max_epochs=28) diff --git a/configs/mmdet/hrnet/mask_rcnn_hrnetv2p_w18_1x_coco.py b/configs/mmdet/hrnet/mask_rcnn_hrnetv2p_w18_1x_coco.py new file mode 100644 index 00000000..cb12200e --- /dev/null +++ b/configs/mmdet/hrnet/mask_rcnn_hrnetv2p_w18_1x_coco.py @@ -0,0 +1,10 @@ +_base_ = './mask_rcnn_hrnetv2p_w32_1x_coco.py' +model = dict( + backbone=dict( + extra=dict( + stage2=dict(num_channels=(18, 36)), + stage3=dict(num_channels=(18, 36, 72)), + stage4=dict(num_channels=(18, 36, 72, 144))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')), + neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256)) diff --git 
a/configs/mmdet/hrnet/mask_rcnn_hrnetv2p_w18_2x_coco.py b/configs/mmdet/hrnet/mask_rcnn_hrnetv2p_w18_2x_coco.py new file mode 100644 index 00000000..ca62682a --- /dev/null +++ b/configs/mmdet/hrnet/mask_rcnn_hrnetv2p_w18_2x_coco.py @@ -0,0 +1,4 @@ +_base_ = './mask_rcnn_hrnetv2p_w18_1x_coco.py' +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/hrnet/mask_rcnn_hrnetv2p_w32_1x_coco.py b/configs/mmdet/hrnet/mask_rcnn_hrnetv2p_w32_1x_coco.py new file mode 100644 index 00000000..d5f0eb56 --- /dev/null +++ b/configs/mmdet/hrnet/mask_rcnn_hrnetv2p_w32_1x_coco.py @@ -0,0 +1,37 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + _delete_=True, + type='HRNet', + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')), + neck=dict( + _delete_=True, + type='HRFPN', + in_channels=[32, 64, 128, 256], + out_channels=256)) diff --git a/configs/mmdet/hrnet/mask_rcnn_hrnetv2p_w32_2x_coco.py b/configs/mmdet/hrnet/mask_rcnn_hrnetv2p_w32_2x_coco.py new file mode 100644 index 00000000..63d5d139 --- /dev/null +++ b/configs/mmdet/hrnet/mask_rcnn_hrnetv2p_w32_2x_coco.py @@ -0,0 +1,4 @@ +_base_ = './mask_rcnn_hrnetv2p_w32_1x_coco.py' +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/hrnet/mask_rcnn_hrnetv2p_w40_1x_coco.py 
b/configs/mmdet/hrnet/mask_rcnn_hrnetv2p_w40_1x_coco.py new file mode 100644 index 00000000..5a76f4b0 --- /dev/null +++ b/configs/mmdet/hrnet/mask_rcnn_hrnetv2p_w40_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = './mask_rcnn_hrnetv2p_w18_1x_coco.py' +model = dict( + backbone=dict( + type='HRNet', + extra=dict( + stage2=dict(num_channels=(40, 80)), + stage3=dict(num_channels=(40, 80, 160)), + stage4=dict(num_channels=(40, 80, 160, 320))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')), + neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256)) diff --git a/configs/mmdet/hrnet/mask_rcnn_hrnetv2p_w40_2x_coco.py b/configs/mmdet/hrnet/mask_rcnn_hrnetv2p_w40_2x_coco.py new file mode 100644 index 00000000..3a2a5106 --- /dev/null +++ b/configs/mmdet/hrnet/mask_rcnn_hrnetv2p_w40_2x_coco.py @@ -0,0 +1,4 @@ +_base_ = './mask_rcnn_hrnetv2p_w40_1x_coco.py' +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/hrnet/metafile.yml b/configs/mmdet/hrnet/metafile.yml new file mode 100644 index 00000000..ac36efa9 --- /dev/null +++ b/configs/mmdet/hrnet/metafile.yml @@ -0,0 +1,971 @@ +Models: + - Name: faster_rcnn_hrnetv2p_w18_1x_coco + In Collection: Faster R-CNN + Config: configs/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco.py + Metadata: + Training Memory (GB): 6.6 + inference time (ms/im): + - value: 74.63 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco/faster_rcnn_hrnetv2p_w18_1x_coco_20200130-56651a6d.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution 
Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: faster_rcnn_hrnetv2p_w18_2x_coco + In Collection: Faster R-CNN + Config: configs/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco.py + Metadata: + Training Memory (GB): 6.6 + inference time (ms/im): + - value: 74.63 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco/faster_rcnn_hrnetv2p_w18_2x_coco_20200702_085731-a4ec0611.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: faster_rcnn_hrnetv2p_w32_1x_coco + In Collection: Faster R-CNN + Config: configs/hrnet/faster_rcnn_hrnetv2p_w32_1x_coco.py + Metadata: + Training Memory (GB): 9.0 + inference time (ms/im): + - value: 80.65 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w32_1x_coco/faster_rcnn_hrnetv2p_w32_1x_coco_20200130-6e286425.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation 
Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: faster_rcnn_hrnetv2p_w32_2x_coco + In Collection: Faster R-CNN + Config: configs/hrnet/faster_rcnn_hrnetv2p_w32_2x_coco.py + Metadata: + Training Memory (GB): 9.0 + inference time (ms/im): + - value: 80.65 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w32_2x_coco/faster_rcnn_hrnetv2p_w32_2x_coco_20200529_015927-976a9c15.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: faster_rcnn_hrnetv2p_w40_1x_coco + In Collection: Faster R-CNN + Config: configs/hrnet/faster_rcnn_hrnetv2p_w40_1x_coco.py + Metadata: + Training Memory (GB): 10.4 + inference time (ms/im): + - value: 95.24 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w40_1x_coco/faster_rcnn_hrnetv2p_w40_1x_coco_20200210-95c1f5ce.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual 
Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: faster_rcnn_hrnetv2p_w40_2x_coco + In Collection: Faster R-CNN + Config: configs/hrnet/faster_rcnn_hrnetv2p_w40_2x_coco.py + Metadata: + Training Memory (GB): 10.4 + inference time (ms/im): + - value: 95.24 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w40_2x_coco/faster_rcnn_hrnetv2p_w40_2x_coco_20200512_161033-0f236ef4.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: mask_rcnn_hrnetv2p_w18_1x_coco + In Collection: Mask R-CNN + Config: configs/hrnet/mask_rcnn_hrnetv2p_w18_1x_coco.py + Metadata: + Training Memory (GB): 7.0 + inference time (ms/im): + - value: 85.47 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 34.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w18_1x_coco/mask_rcnn_hrnetv2p_w18_1x_coco_20200205-1c3d78ed.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep 
High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: mask_rcnn_hrnetv2p_w18_2x_coco + In Collection: Mask R-CNN + Config: configs/hrnet/mask_rcnn_hrnetv2p_w18_2x_coco.py + Metadata: + Training Memory (GB): 7.0 + inference time (ms/im): + - value: 85.47 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w18_2x_coco/mask_rcnn_hrnetv2p_w18_2x_coco_20200212-b3c825b1.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: mask_rcnn_hrnetv2p_w32_1x_coco + In Collection: Mask R-CNN + Config: configs/hrnet/mask_rcnn_hrnetv2p_w32_1x_coco.py + Metadata: + Training Memory (GB): 9.4 + inference time (ms/im): + - value: 88.5 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.1 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w32_1x_coco/mask_rcnn_hrnetv2p_w32_1x_coco_20200207-b29f616e.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: mask_rcnn_hrnetv2p_w32_2x_coco + In Collection: Mask R-CNN + Config: configs/hrnet/mask_rcnn_hrnetv2p_w32_2x_coco.py + Metadata: + Training Memory (GB): 9.4 + inference time (ms/im): + - value: 88.5 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w32_2x_coco/mask_rcnn_hrnetv2p_w32_2x_coco_20200213-45b75b4d.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: mask_rcnn_hrnetv2p_w40_1x_coco + In Collection: Mask R-CNN + Config: configs/hrnet/mask_rcnn_hrnetv2p_w40_1x_coco.py + Metadata: + Training Memory (GB): 10.9 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.5 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w40_1x_coco/mask_rcnn_hrnetv2p_w40_1x_coco_20200511_015646-66738b35.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: mask_rcnn_hrnetv2p_w40_2x_coco + In Collection: Mask R-CNN + Config: configs/hrnet/mask_rcnn_hrnetv2p_w40_2x_coco.py + Metadata: + Training Memory (GB): 10.9 + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w40_2x_coco/mask_rcnn_hrnetv2p_w40_2x_coco_20200512_163732-aed5e4ab.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: cascade_rcnn_hrnetv2p_w18_20e_coco + In Collection: Cascade R-CNN + Config: configs/hrnet/cascade_rcnn_hrnetv2p_w18_20e_coco.py + Metadata: + Training Memory (GB): 7.0 + inference time (ms/im): + - value: 90.91 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w18_20e_coco/cascade_rcnn_hrnetv2p_w18_20e_coco_20200210-434be9d7.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: cascade_rcnn_hrnetv2p_w32_20e_coco + In Collection: Cascade R-CNN + Config: configs/hrnet/cascade_rcnn_hrnetv2p_w32_20e_coco.py + Metadata: + Training Memory (GB): 9.4 + inference time (ms/im): + - value: 90.91 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w32_20e_coco/cascade_rcnn_hrnetv2p_w32_20e_coco_20200208-928455a4.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: cascade_rcnn_hrnetv2p_w40_20e_coco + In Collection: Cascade R-CNN + Config: configs/hrnet/cascade_rcnn_hrnetv2p_w40_20e_coco.py + Metadata: + Training Memory (GB): 10.8 + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.8 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w40_20e_coco/cascade_rcnn_hrnetv2p_w40_20e_coco_20200512_161112-75e47b04.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: cascade_mask_rcnn_hrnetv2p_w18_20e_coco + In Collection: Cascade R-CNN + Config: configs/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py + Metadata: + Training Memory (GB): 8.5 + inference time (ms/im): + - value: 117.65 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco/cascade_mask_rcnn_hrnetv2p_w18_20e_coco_20200210-b543cd2b.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: cascade_mask_rcnn_hrnetv2p_w32_20e_coco + In Collection: Cascade R-CNN + Config: configs/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py + Metadata: + inference time (ms/im): + - value: 120.48 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - 
Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco/cascade_mask_rcnn_hrnetv2p_w32_20e_coco_20200512_154043-39d9cf7b.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: cascade_mask_rcnn_hrnetv2p_w40_20e_coco + In Collection: Cascade R-CNN + Config: configs/hrnet/cascade_mask_rcnn_hrnetv2p_w40_20e_coco.py + Metadata: + Training Memory (GB): 12.5 + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w40_20e_coco/cascade_mask_rcnn_hrnetv2p_w40_20e_coco_20200527_204922-969c4610.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: htc_hrnetv2p_w18_20e_coco + In Collection: HTC + Config: configs/hrnet/htc_hrnetv2p_w18_20e_coco.py + Metadata: + Training Memory (GB): 10.8 + inference time (ms/im): + - value: 212.77 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: 
+ - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w18_20e_coco/htc_hrnetv2p_w18_20e_coco_20200210-b266988c.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: htc_hrnetv2p_w32_20e_coco + In Collection: HTC + Config: configs/hrnet/htc_hrnetv2p_w32_20e_coco.py + Metadata: + Training Memory (GB): 13.1 + inference time (ms/im): + - value: 204.08 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w32_20e_coco/htc_hrnetv2p_w32_20e_coco_20200207-7639fa12.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: htc_hrnetv2p_w40_20e_coco + In Collection: HTC + Config: configs/hrnet/htc_hrnetv2p_w40_20e_coco.py + Metadata: + Training Memory (GB): 14.6 + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box 
AP: 46.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w40_20e_coco/htc_hrnetv2p_w40_20e_coco_20200529_183411-417c4d5b.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: fcos_hrnetv2p_w18_gn-head_4x4_1x_coco + In Collection: FCOS + Config: configs/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco.py + Metadata: + Training Resources: 4x V100 GPUs + Batch Size: 16 + Training Memory (GB): 13.0 + inference time (ms/im): + - value: 77.52 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 35.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco_20201212_100710-4ad151de.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: fcos_hrnetv2p_w18_gn-head_4x4_2x_coco + In Collection: FCOS + Config: configs/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco.py + Metadata: + Training Resources: 4x V100 GPUs + Batch Size: 16 + Training Memory (GB): 13.0 + inference time (ms/im): + - value: 77.52 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay 
+ Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco_20201212_101110-5c575fa5.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: fcos_hrnetv2p_w32_gn-head_4x4_1x_coco + In Collection: FCOS + Config: configs/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco.py + Metadata: + Training Resources: 4x V100 GPUs + Batch Size: 16 + Training Memory (GB): 17.5 + inference time (ms/im): + - value: 77.52 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco_20201211_134730-cb8055c0.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: fcos_hrnetv2p_w32_gn-head_4x4_2x_coco + In Collection: FCOS + Config: configs/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco.py + Metadata: + Training Resources: 4x V100 GPUs + Batch Size: 16 + Training Memory (GB): 17.5 + inference time (ms/im): + - value: 77.52 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Training Data: COCO + Training 
Techniques: + - SGD with Momentum + - Weight Decay + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco_20201212_112133-77b6b9bb.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco + In Collection: FCOS + Config: configs/hrnet/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco.py + Metadata: + Training Resources: 4x V100 GPUs + Batch Size: 16 + Training Memory (GB): 13.0 + inference time (ms/im): + - value: 77.52 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco_20201212_111651-441e9d9f.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco + In Collection: FCOS + Config: configs/hrnet/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco.py + Metadata: + Training Resources: 4x V100 GPUs + Batch Size: 16 + Training Memory (GB): 17.5 + inference time (ms/im): + - value: 
80.65 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco_20201212_090846-b6f2b49f.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco + In Collection: FCOS + Config: configs/hrnet/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco.py + Metadata: + Training Resources: 4x V100 GPUs + Batch Size: 16 + Training Memory (GB): 20.3 + inference time (ms/im): + - value: 92.59 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco_20201212_124752-f22d2ce5.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 diff --git a/configs/mmdet/htc/README.md b/configs/mmdet/htc/README.md new file mode 100644 index 00000000..c57a5d18 --- 
/dev/null +++ b/configs/mmdet/htc/README.md @@ -0,0 +1,67 @@ +# HTC + +> [Hybrid Task Cascade for Instance Segmentation](https://arxiv.org/abs/1901.07518) + + + +## Abstract + +Cascade is a classic yet powerful architecture that has boosted performance on various tasks. However, how to introduce cascade to instance segmentation remains an open question. A simple combination of Cascade R-CNN and Mask R-CNN only brings limited gain. In exploring a more effective approach, we find that the key to a successful instance segmentation cascade is to fully leverage the reciprocal relationship between detection and segmentation. In this work, we propose a new framework, Hybrid Task Cascade (HTC), which differs in two important aspects: (1) instead of performing cascaded refinement on these two tasks separately, it interweaves them for a joint multi-stage processing; (2) it adopts a fully convolutional branch to provide spatial context, which can help distinguishing hard foreground from cluttered background. Overall, this framework can learn more discriminative features progressively while integrating complementary features together in each stage. Without bells and whistles, a single HTC obtains 38.4 and 1.5 improvement over a strong Cascade Mask R-CNN baseline on MSCOCO dataset. Moreover, our overall system achieves 48.6 mask AP on the test-challenge split, ranking 1st in the COCO 2018 Challenge Object Detection Task. + +
+ +
+ +## Introduction + +HTC requires COCO and [COCO-stuff](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip) dataset for training. You need to download and extract it in the COCO dataset path. +The directory should be like this. + +```none +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +| | ├── stuffthingmaps +``` + +## Results and Models + +The results on COCO 2017val are shown in the below table. (results on test-dev are usually slightly higher than val) + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +|:---------:|:-------:|:-------:|:--------:|:--------------:|:------:|:-------:|:------:|:--------:| +| R-50-FPN | pytorch | 1x | 8.2 | 5.8 | 42.3 | 37.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/htc/htc_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_1x_coco/htc_r50_fpn_1x_coco_20200317-7332cf16.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_1x_coco/htc_r50_fpn_1x_coco_20200317_070435.log.json) | +| R-50-FPN | pytorch | 20e | 8.2 | - | 43.3 | 38.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/htc/htc_r50_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_20e_coco/htc_r50_fpn_20e_coco_20200319-fe28c577.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_20e_coco/htc_r50_fpn_20e_coco_20200319_070313.log.json) | +| R-101-FPN | pytorch | 20e | 10.2 | 5.5 | 44.8 | 39.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/htc/htc_r101_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r101_fpn_20e_coco/htc_r101_fpn_20e_coco_20200317-9b41b48f.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r101_fpn_20e_coco/htc_r101_fpn_20e_coco_20200317_153107.log.json) | +| X-101-32x4d-FPN | pytorch |20e| 11.4 | 5.0 | 46.1 | 40.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/htc/htc_x101_32x4d_fpn_16x1_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_32x4d_fpn_16x1_20e_coco/htc_x101_32x4d_fpn_16x1_20e_coco_20200318-de97ae01.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_32x4d_fpn_16x1_20e_coco/htc_x101_32x4d_fpn_16x1_20e_coco_20200318_034519.log.json) | +| X-101-64x4d-FPN | pytorch |20e| 14.5 | 4.4 | 47.0 | 41.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/htc/htc_x101_64x4d_fpn_16x1_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_16x1_20e_coco/htc_x101_64x4d_fpn_16x1_20e_coco_20200318-b181fd7a.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_16x1_20e_coco/htc_x101_64x4d_fpn_16x1_20e_coco_20200318_081711.log.json) | + +- In the HTC paper and COCO 2018 Challenge, `score_thr` is set to 0.001 for both baselines and HTC. +- We use 8 GPUs with 2 images/GPU for R-50 and R-101 models, and 16 GPUs with 1 image/GPU for X-101 models. + If you would like to train X-101 HTC with 8 GPUs, you need to change the lr from 0.02 to 0.01. + +We also provide a powerful HTC with DCN and multi-scale training model. No testing augmentation is used. 
+ +| Backbone | Style | DCN | training scales | Lr schd | box AP | mask AP | Config | Download | +|:----------------:|:-------:|:-----:|:---------------:|:-------:|:------:|:-------:|:------:|:--------:| +| X-101-64x4d-FPN | pytorch | c3-c5 | 400~1400 | 20e | 50.4 | 43.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco_20200312-946fd751.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco_20200312_203410.log.json) | + +## Citation + +We provide config files to reproduce the results in the CVPR 2019 paper for [Hybrid Task Cascade](https://arxiv.org/abs/1901.07518). + +```latex +@inproceedings{chen2019hybrid, + title={Hybrid task cascade for instance segmentation}, + author={Chen, Kai and Pang, Jiangmiao and Wang, Jiaqi and Xiong, Yu and Li, Xiaoxiao and Sun, Shuyang and Feng, Wansen and Liu, Ziwei and Shi, Jianping and Ouyang, Wanli and Chen Change Loy and Dahua Lin}, + booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, + year={2019} +} +``` diff --git a/configs/mmdet/htc/htc_r101_fpn_20e_coco.py b/configs/mmdet/htc/htc_r101_fpn_20e_coco.py new file mode 100644 index 00000000..b42297bf --- /dev/null +++ b/configs/mmdet/htc/htc_r101_fpn_20e_coco.py @@ -0,0 +1,9 @@ +_base_ = './htc_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) +# learning policy +lr_config = dict(step=[16, 19]) +runner = dict(type='EpochBasedRunner', max_epochs=20) diff --git a/configs/mmdet/htc/htc_r50_fpn_1x_coco.py b/configs/mmdet/htc/htc_r50_fpn_1x_coco.py new file mode 
100644 index 00000000..1e8e18a0 --- /dev/null +++ b/configs/mmdet/htc/htc_r50_fpn_1x_coco.py @@ -0,0 +1,56 @@ +_base_ = './htc_without_semantic_r50_fpn_1x_coco.py' +model = dict( + roi_head=dict( + semantic_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[8]), + semantic_head=dict( + type='FusedSemanticHead', + num_ins=5, + fusion_level=1, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=183, + loss_seg=dict( + type='CrossEntropyLoss', ignore_index=255, loss_weight=0.2)))) +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='SegRescale', scale_factor=1 / 8), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict( + seg_prefix=data_root + 'stuffthingmaps/train2017/', + pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/htc/htc_r50_fpn_20e_coco.py b/configs/mmdet/htc/htc_r50_fpn_20e_coco.py new file mode 100644 index 00000000..7d2e0116 --- /dev/null +++ 
b/configs/mmdet/htc/htc_r50_fpn_20e_coco.py @@ -0,0 +1,4 @@ +_base_ = './htc_r50_fpn_1x_coco.py' +# learning policy +lr_config = dict(step=[16, 19]) +runner = dict(type='EpochBasedRunner', max_epochs=20) diff --git a/configs/mmdet/htc/htc_without_semantic_r50_fpn_1x_coco.py b/configs/mmdet/htc/htc_without_semantic_r50_fpn_1x_coco.py new file mode 100644 index 00000000..565104f4 --- /dev/null +++ b/configs/mmdet/htc/htc_without_semantic_r50_fpn_1x_coco.py @@ -0,0 +1,236 @@ +_base_ = [ + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + type='HybridTaskCascade', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type='HybridTaskCascadeRoIHead', + interleaved=True, + mask_info_flow=True, + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + 
type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=[ + dict( + type='HTCMaskHead', + with_conv_res=False, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)), + dict( + type='HTCMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)), + dict( + type='HTCMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)) + ]), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + 
type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.001, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + 
dict(type='Collect', keys=['img']), + ]) +] +data = dict( + val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/htc/htc_x101_32x4d_fpn_16x1_20e_coco.py b/configs/mmdet/htc/htc_x101_32x4d_fpn_16x1_20e_coco.py new file mode 100644 index 00000000..0c834f28 --- /dev/null +++ b/configs/mmdet/htc/htc_x101_32x4d_fpn_16x1_20e_coco.py @@ -0,0 +1,19 @@ +_base_ = './htc_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) +data = dict(samples_per_gpu=1, workers_per_gpu=1) +# learning policy +lr_config = dict(step=[16, 19]) +runner = dict(type='EpochBasedRunner', max_epochs=20) diff --git a/configs/mmdet/htc/htc_x101_64x4d_fpn_16x1_20e_coco.py b/configs/mmdet/htc/htc_x101_64x4d_fpn_16x1_20e_coco.py new file mode 100644 index 00000000..8b0d962b --- /dev/null +++ b/configs/mmdet/htc/htc_x101_64x4d_fpn_16x1_20e_coco.py @@ -0,0 +1,19 @@ +_base_ = './htc_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) +data = dict(samples_per_gpu=1, workers_per_gpu=1) +# learning policy +lr_config = dict(step=[16, 19]) +runner = dict(type='EpochBasedRunner', max_epochs=20) diff --git a/configs/mmdet/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco.py b/configs/mmdet/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco.py new file mode 100644 index 00000000..c8d87033 --- /dev/null +++ 
b/configs/mmdet/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco.py @@ -0,0 +1,43 @@ +_base_ = './htc_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) +# dataset settings +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True), + dict( + type='Resize', + img_scale=[(1600, 400), (1600, 1400)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='SegRescale', scale_factor=1 / 8), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), +] +data = dict( + samples_per_gpu=1, workers_per_gpu=1, train=dict(pipeline=train_pipeline)) +# learning policy +lr_config = dict(step=[16, 19]) +runner = dict(type='EpochBasedRunner', max_epochs=20) diff --git a/configs/mmdet/htc/metafile.yml b/configs/mmdet/htc/metafile.yml new file mode 100644 index 00000000..acd038c7 --- /dev/null +++ b/configs/mmdet/htc/metafile.yml @@ -0,0 +1,165 @@ +Collections: + - Name: HTC + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - HTC + - RPN + - ResNet + - ResNeXt + - RoIAlign + Paper: + URL: https://arxiv.org/abs/1901.07518 + Title: 'Hybrid Task Cascade for Instance Segmentation' + README: 
configs/htc/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/htc.py#L6 + Version: v2.0.0 + +Models: + - Name: htc_r50_fpn_1x_coco + In Collection: HTC + Config: configs/htc/htc_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 8.2 + inference time (ms/im): + - value: 172.41 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_1x_coco/htc_r50_fpn_1x_coco_20200317-7332cf16.pth + + - Name: htc_r50_fpn_20e_coco + In Collection: HTC + Config: configs/htc/htc_r50_fpn_20e_coco.py + Metadata: + Training Memory (GB): 8.2 + inference time (ms/im): + - value: 172.41 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_20e_coco/htc_r50_fpn_20e_coco_20200319-fe28c577.pth + + - Name: htc_r101_fpn_20e_coco + In Collection: HTC + Config: configs/htc/htc_r101_fpn_20e_coco.py + Metadata: + Training Memory (GB): 10.2 + inference time (ms/im): + - value: 181.82 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r101_fpn_20e_coco/htc_r101_fpn_20e_coco_20200317-9b41b48f.pth + + - Name: htc_x101_32x4d_fpn_16x1_20e_coco + In Collection: HTC + Config: configs/htc/htc_x101_32x4d_fpn_16x1_20e_coco.py + Metadata: + 
Training Resources: 16x V100 GPUs + Batch Size: 16 + Training Memory (GB): 11.4 + inference time (ms/im): + - value: 200 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_32x4d_fpn_16x1_20e_coco/htc_x101_32x4d_fpn_16x1_20e_coco_20200318-de97ae01.pth + + - Name: htc_x101_64x4d_fpn_16x1_20e_coco + In Collection: HTC + Config: configs/htc/htc_x101_64x4d_fpn_16x1_20e_coco.py + Metadata: + Training Resources: 16x V100 GPUs + Batch Size: 16 + Training Memory (GB): 14.5 + inference time (ms/im): + - value: 227.27 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_16x1_20e_coco/htc_x101_64x4d_fpn_16x1_20e_coco_20200318-b181fd7a.pth + + - Name: htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco + In Collection: HTC + Config: configs/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco.py + Metadata: + Training Resources: 16x V100 GPUs + Batch Size: 16 + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 43.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco_20200312-946fd751.pth diff --git a/configs/mmdet/instaboost/README.md b/configs/mmdet/instaboost/README.md new file mode 100644 index 00000000..083a9e7b --- /dev/null +++ 
b/configs/mmdet/instaboost/README.md @@ -0,0 +1,58 @@ +# Instaboost + +> [Instaboost: Boosting instance segmentation via probability map guided copy-pasting](https://arxiv.org/abs/1908.07801) + + + +## Abstract + +Instance segmentation requires a large number of training samples to achieve satisfactory performance and benefits from proper data augmentation. To enlarge the training set and increase the diversity, previous methods have investigated using data annotation from other domain (e.g. bbox, point) in a weakly supervised mechanism. In this paper, we present a simple, efficient and effective method to augment the training set using the existing instance mask annotations. Exploiting the pixel redundancy of the background, we are able to improve the performance of Mask R-CNN for 1.7 mAP on COCO dataset and 3.3 mAP on Pascal VOC dataset by simply introducing random jittering to objects. Furthermore, we propose a location probability map based approach to explore the feasible locations that objects can be placed based on local appearance similarity. With the guidance of such map, we boost the performance of R101-Mask R-CNN on instance segmentation from 35.7 mAP to 37.9 mAP without modifying the backbone or network structure. Our method is simple to implement and does not increase the computational complexity. It can be integrated into the training pipeline of any instance segmentation model without affecting the training and inference efficiency. + +
+ +
+ +## Introduction + +The configs in this directory are the implementation of the ICCV 2019 paper "InstaBoost: Boosting Instance Segmentation Via Probability Map Guided Copy-Pasting" and are provided by the authors of the paper. InstaBoost is a data augmentation method for object detection and instance segmentation. The paper has been released on [`arXiv`](https://arxiv.org/abs/1908.07801). + +## Usage + +### Requirements + +You need to install `instaboostfast` before using it. + +```shell +pip install instaboostfast +``` + +The code and more details can be found [here](https://github.com/GothicAi/Instaboost). + +### Integration with MMDetection + +InstaBoost has already been integrated into the data pipeline, so all you need to do is add or change the **InstaBoost** configuration after **LoadImageFromFile**. We have provided examples like [this](mask_rcnn_r50_fpn_instaboost_4x_coco.py). You can refer to [`InstaBoostConfig`](https://github.com/GothicAi/InstaBoost-pypi#instaboostconfig) for more details. + +## Results and Models + +- All models were trained on `coco_2017_train` and tested on `coco_2017_val` for convenience of evaluation and comparison. In the paper, the results are obtained from `test-dev`. +- To balance accuracy and training time when using InstaBoost, models released on this page are all trained for 48 epochs. Other training and testing configs strictly follow the original framework. +- For results and models in MMDetection V1.x, please refer to [Instaboost](https://github.com/GothicAi/Instaboost). 
+ +| Network | Backbone | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :--------: | :-----: | :------: | :------------: | :------:| :-----: | :------: | :-----------------: | +| Mask R-CNN | R-50-FPN | 4x | 4.4 | 17.5 | 40.6 | 36.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco/mask_rcnn_r50_fpn_instaboost_4x_coco_20200307-d025f83a.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco/mask_rcnn_r50_fpn_instaboost_4x_coco_20200307_223635.log.json) | +| Mask R-CNN | R-101-FPN | 4x | 6.4 | | 42.5 | 38.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/instaboost/mask_rcnn_r101_fpn_instaboost_4x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_r101_fpn_instaboost_4x_coco/mask_rcnn_r101_fpn_instaboost_4x_coco_20200703_235738-f23f3a5f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_r101_fpn_instaboost_4x_coco/mask_rcnn_r101_fpn_instaboost_4x_coco_20200703_235738.log.json) | +| Mask R-CNN | X-101-64x4d-FPN | 4x | 10.7 | | 44.7 | 39.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/instaboost/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco_20200515_080947-8ed58c1b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco_20200515_080947.log.json) | +| Cascade R-CNN | R-50-FPN | 4x | 6.0 | 12.0 | 43.7 | 38.0 | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/instaboost/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/instaboost/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco_20200307-c19d98d9.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/instaboost/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco_20200307_223646.log.json) | + +## Citation + +```latex +@inproceedings{fang2019instaboost, + title={Instaboost: Boosting instance segmentation via probability map guided copy-pasting}, + author={Fang, Hao-Shu and Sun, Jianhua and Wang, Runzhong and Gou, Minghao and Li, Yong-Lu and Lu, Cewu}, + booktitle={Proceedings of the IEEE International Conference on Computer Vision}, + pages={682--691}, + year={2019} +} +``` diff --git a/configs/mmdet/instaboost/cascade_mask_rcnn_r101_fpn_instaboost_4x_coco.py b/configs/mmdet/instaboost/cascade_mask_rcnn_r101_fpn_instaboost_4x_coco.py new file mode 100644 index 00000000..9d0515d7 --- /dev/null +++ b/configs/mmdet/instaboost/cascade_mask_rcnn_r101_fpn_instaboost_4x_coco.py @@ -0,0 +1,7 @@ +_base_ = './cascade_mask_rcnn_r50_fpn_instaboost_4x_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/instaboost/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco.py b/configs/mmdet/instaboost/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco.py new file mode 100644 index 00000000..a89a81f5 --- /dev/null +++ b/configs/mmdet/instaboost/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco.py @@ -0,0 +1,28 @@ +_base_ = '../cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='InstaBoost', + action_candidate=('normal', 'horizontal', 
'skip'), + action_prob=(1, 0, 0), + scale=(0.8, 1.2), + dx=15, + dy=15, + theta=(-1, 1), + color_prob=0.5, + hflag=False, + aug_ratio=0.5), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +data = dict(train=dict(pipeline=train_pipeline)) +# learning policy +lr_config = dict(step=[32, 44]) +runner = dict(type='EpochBasedRunner', max_epochs=48) diff --git a/configs/mmdet/instaboost/cascade_mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco.py b/configs/mmdet/instaboost/cascade_mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco.py new file mode 100644 index 00000000..d67b7992 --- /dev/null +++ b/configs/mmdet/instaboost/cascade_mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco.py @@ -0,0 +1,14 @@ +_base_ = './cascade_mask_rcnn_r50_fpn_instaboost_4x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/instaboost/mask_rcnn_r101_fpn_instaboost_4x_coco.py b/configs/mmdet/instaboost/mask_rcnn_r101_fpn_instaboost_4x_coco.py new file mode 100644 index 00000000..ebbb43e9 --- /dev/null +++ b/configs/mmdet/instaboost/mask_rcnn_r101_fpn_instaboost_4x_coco.py @@ -0,0 +1,6 @@ +_base_ = './mask_rcnn_r50_fpn_instaboost_4x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco.py b/configs/mmdet/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco.py new file mode 
100644 index 00000000..55ca62b7 --- /dev/null +++ b/configs/mmdet/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco.py @@ -0,0 +1,28 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='InstaBoost', + action_candidate=('normal', 'horizontal', 'skip'), + action_prob=(1, 0, 0), + scale=(0.8, 1.2), + dx=15, + dy=15, + theta=(-1, 1), + color_prob=0.5, + hflag=False, + aug_ratio=0.5), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +data = dict(train=dict(pipeline=train_pipeline)) +# learning policy +lr_config = dict(step=[32, 44]) +runner = dict(type='EpochBasedRunner', max_epochs=48) diff --git a/configs/mmdet/instaboost/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco.py b/configs/mmdet/instaboost/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco.py new file mode 100644 index 00000000..2010f448 --- /dev/null +++ b/configs/mmdet/instaboost/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco.py @@ -0,0 +1,14 @@ +_base_ = './mask_rcnn_r50_fpn_instaboost_4x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/instaboost/metafile.yml b/configs/mmdet/instaboost/metafile.yml new file mode 100644 index 00000000..325283d3 --- /dev/null +++ b/configs/mmdet/instaboost/metafile.yml @@ -0,0 +1,99 @@ +Collections: + - Name: InstaBoost + 
Metadata: + Training Data: COCO + Training Techniques: + - InstaBoost + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Paper: + URL: https://arxiv.org/abs/1908.07801 + Title: 'Instaboost: Boosting instance segmentation via probability map guided copy-pasting' + README: configs/instaboost/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/datasets/pipelines/instaboost.py#L7 + Version: v2.0.0 + +Models: + - Name: mask_rcnn_r50_fpn_instaboost_4x_coco + In Collection: InstaBoost + Config: configs/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco.py + Metadata: + Training Memory (GB): 4.4 + inference time (ms/im): + - value: 57.14 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 48 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco/mask_rcnn_r50_fpn_instaboost_4x_coco_20200307-d025f83a.pth + + - Name: mask_rcnn_r101_fpn_instaboost_4x_coco + In Collection: InstaBoost + Config: configs/instaboost/mask_rcnn_r101_fpn_instaboost_4x_coco.py + Metadata: + Training Memory (GB): 6.4 + Epochs: 48 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_r101_fpn_instaboost_4x_coco/mask_rcnn_r101_fpn_instaboost_4x_coco_20200703_235738-f23f3a5f.pth + + - Name: mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco + In Collection: InstaBoost + Config: configs/instaboost/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco.py + Metadata: + Training Memory (GB): 10.7 + Epochs: 48 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.7 + - Task: Instance Segmentation + Dataset: COCO + 
Metrics: + mask AP: 39.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco_20200515_080947-8ed58c1b.pth + + - Name: cascade_mask_rcnn_r50_fpn_instaboost_4x_coco + In Collection: InstaBoost + Config: configs/instaboost/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco.py + Metadata: + Training Memory (GB): 6.0 + inference time (ms/im): + - value: 83.33 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 48 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/instaboost/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco_20200307-c19d98d9.pth diff --git a/configs/mmdet/lad/README.md b/configs/mmdet/lad/README.md new file mode 100644 index 00000000..a5ded4f8 --- /dev/null +++ b/configs/mmdet/lad/README.md @@ -0,0 +1,44 @@ +# LAD + +> [Improving Object Detection by Label Assignment Distillation](https://arxiv.org/abs/2108.10520) + + + +## Abstract + +Label assignment in object detection aims to assign targets, foreground or background, to sampled regions in an image. Unlike labeling for image classification, this problem is not well defined due to the object's bounding box. In this paper, we investigate the problem from a perspective of distillation, hence we call Label Assignment Distillation (LAD). Our initial motivation is very simple, we use a teacher network to generate labels for the student. This can be achieved in two ways: either using the teacher's prediction as the direct targets (soft label), or through the hard labels dynamically assigned by the teacher (LAD). Our experiments reveal that: (i) LAD is more effective than soft-label, but they are complementary. 
(ii) Using LAD, a smaller teacher can also improve a larger student significantly, while soft-label can't. We then introduce Co-learning LAD, in which two networks simultaneously learn from scratch and the role of teacher and student are dynamically interchanged. Using PAA-ResNet50 as a teacher, our LAD techniques can improve detectors PAA-ResNet101 and PAA-ResNeXt101 to 46AP and 47.5AP on the COCO test-dev set. With a stronger teacher PAA-SwinB, we improve the students PAA-ResNet50 to 43.7AP by only 1x schedule training and standard setting, and PAA-ResNet101 to 47.9AP, significantly surpassing the current methods. + +
+ +
+ +## Results and Models + +We provide config files to reproduce the object detection results in the +WACV 2022 paper for Improving Object Detection by Label Assignment +Distillation. + +### PAA with LAD + +| Teacher | Student | Training schedule | AP (val) | Config | +| :-------: | :-----: | :---------------: | :------: | :----------------------------------------------------: | +| -- | R-50 | 1x | 40.4 | | +| -- | R-101 | 1x | 42.6 | | +| R-101 | R-50 | 1x | 41.6 | [config](configs/lad/lad_r50_paa_r101_fpn_coco_1x.py) | +| R-50 | R-101 | 1x | 43.2 | [config](configs/lad/lad_r101_paa_r50_fpn_coco_1x.py) | + +## Note + +- Meaning of Config name: lad_r50(student model)_paa(based on paa)_r101(teacher model)_fpn(neck)_coco(dataset)_1x(12 epoch).py +- Results may fluctuate by about 0.2 mAP. + +## Citation + +```latex +@inproceedings{nguyen2021improving, + title={Improving Object Detection by Label Assignment Distillation}, + author={Chuong H. Nguyen and Thuy C. Nguyen and Tuan N. Tang and Nam L. H. 
Phan}, + booktitle = {WACV}, + year={2022} +} +``` diff --git a/configs/mmdet/lad/lad_r101_paa_r50_fpn_coco_1x.py b/configs/mmdet/lad/lad_r101_paa_r50_fpn_coco_1x.py new file mode 100644 index 00000000..4877d95b --- /dev/null +++ b/configs/mmdet/lad/lad_r101_paa_r50_fpn_coco_1x.py @@ -0,0 +1,126 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +teacher_ckpt = 'https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1x_coco/paa_r50_fpn_1x_coco_20200821-936edec3.pth' # noqa +model = dict( + type='LAD', + # student + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + bbox_head=dict( + type='LADHead', + reg_decoded_bbox=True, + score_voting=True, + topk=9, + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=1.3), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5)), + # teacher + teacher_ckpt=teacher_ckpt, + teacher_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + teacher_neck=dict( + type='FPN', + 
in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + teacher_bbox_head=dict( + type='LADHead', + reg_decoded_bbox=True, + score_voting=True, + topk=9, + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=1.3), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.1, + neg_iou_thr=0.1, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + score_voting=True, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) +data = dict(samples_per_gpu=8, workers_per_gpu=4) +optimizer = dict(lr=0.01) +fp16 = dict(loss_scale=512.) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/mmdet/lad/lad_r50_paa_r101_fpn_coco_1x.py b/configs/mmdet/lad/lad_r50_paa_r101_fpn_coco_1x.py new file mode 100644 index 00000000..29bbe693 --- /dev/null +++ b/configs/mmdet/lad/lad_r50_paa_r101_fpn_coco_1x.py @@ -0,0 +1,125 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +teacher_ckpt = 'http://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_1x_coco/paa_r101_fpn_1x_coco_20200821-0a1825a4.pth' # noqa +model = dict( + type='LAD', + # student + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + bbox_head=dict( + type='LADHead', + reg_decoded_bbox=True, + score_voting=True, + topk=9, + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=1.3), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5)), + # teacher + teacher_ckpt=teacher_ckpt, + teacher_backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + 
teacher_neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + teacher_bbox_head=dict( + type='LADHead', + reg_decoded_bbox=True, + score_voting=True, + topk=9, + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=1.3), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.1, + neg_iou_thr=0.1, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + score_voting=True, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) +data = dict(samples_per_gpu=8, workers_per_gpu=4) +optimizer = dict(lr=0.01) +fp16 = dict(loss_scale=512.) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/mmdet/lad/metafile.yml b/configs/mmdet/lad/metafile.yml new file mode 100644 index 00000000..5076f28d --- /dev/null +++ b/configs/mmdet/lad/metafile.yml @@ -0,0 +1,42 @@ +Collections: + - Name: Label Assignment Distillation + Metadata: + Training Data: COCO + Training Techniques: + - Label Assignment Distillation + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/2108.10520 + Title: 'Improving Object Detection by Label Assignment Distillation' + README: configs/lad/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.19.0/mmdet/models/detectors/lad.py#L10 + Version: v2.19.0 + +Models: + - Name: lad_r50_paa_r101_fpn_coco_1x + In Collection: Label Assignment Distillation + Config: configs/lad/lad_r50_paa_r101_fpn_coco_1x.py + Metadata: + Teacher: R-101 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.6 + + - Name: lad_r101_paa_r50_fpn_coco_1x + In Collection: Label Assignment Distillation + Config: configs/lad/lad_r101_paa_r50_fpn_coco_1x.py + Metadata: + Teacher: R-50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.2 diff --git a/configs/mmdet/ld/README.md b/configs/mmdet/ld/README.md new file mode 100644 index 00000000..d4c2fd17 --- /dev/null +++ b/configs/mmdet/ld/README.md @@ -0,0 +1,43 @@ +# LD + +> [Localization Distillation for Dense Object Detection](https://arxiv.org/abs/2102.12252) + + + +## Abstract + +Knowledge distillation (KD) has witnessed its powerful capability in learning compact models in object detection. Previous KD methods for object detection mostly focus on imitating deep features within the imitation regions instead of mimicking classification logits due to its inefficiency in distilling localization information. 
In this paper, by reformulating the knowledge distillation process on localization, we present a novel localization distillation (LD) method which can efficiently transfer the localization knowledge from the teacher to the student. Moreover, we also heuristically introduce the concept of valuable localization region that can aid to selectively distill the semantic and localization knowledge for a certain region. Combining these two new components, for the first time, we show that logit mimicking can outperform feature imitation and localization knowledge distillation is more important and efficient than semantic knowledge for distilling object detectors. Our distillation scheme is simple as well as effective and can be easily applied to different dense object detectors. Experiments show that our LD can boost the AP score of GFocal-ResNet-50 with a single-scale 1× training schedule from 40.1 to 42.1 on the COCO benchmark without any sacrifice on the inference speed. + +
+ +
+ +## Results and Models + +### GFocalV1 with LD + +| Teacher | Student | Training schedule | Mini-batch size | AP (val) | AP50 (val) | AP75 (val) | Config | +| :-------: | :-----: | :---------------: | :-------------: | :------: | :--------: | :--------: | :--------------: | +| -- | R-18 | 1x | 6 | 35.8 | 53.1 | 38.2 | | +| R-101 | R-18 | 1x | 6 | 36.5 | 52.9 | 39.3 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/ld/ld_r18_gflv1_r101_fpn_coco_1x.py) | +| -- | R-34 | 1x | 6 | 38.9 | 56.6 | 42.2 | | +| R-101 | R-34 | 1x | 6 | 39.8 | 56.6 | 43.1 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/ld/ld_r34_gflv1_r101_fpn_coco_1x.py) | +| -- | R-50 | 1x | 6 | 40.1 | 58.2 | 43.1 | | +| R-101 | R-50 | 1x | 6 | 41.1 | 58.7 | 44.9 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/ld/ld_r50_gflv1_r101_fpn_coco_1x.py) | +| -- | R-101 | 2x | 6 | 44.6 | 62.9 | 48.4 | | +| R-101-DCN | R-101 | 2x | 6 | 45.4 | 63.1 | 49.5 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/ld/ld_r101_gflv1_r101dcn_fpn_coco_2x.py) | + +## Note + +- Meaning of Config name: ld_r18(student model)_gflv1(based on gflv1)_r101(teacher model)_fpn(neck)_coco(dataset)_1x(12 epoch).py + +## Citation + +```latex +@Inproceedings{zheng2022LD, + title={Localization Distillation for Dense Object Detection}, + author= {Zheng, Zhaohui and Ye, Rongguang and Wang, Ping and Ren, Dongwei and Zuo, Wangmeng and Hou, Qibin and Cheng, Mingming}, + booktitle={CVPR}, + year={2022} +} +``` diff --git a/configs/mmdet/ld/ld_r101_gflv1_r101dcn_fpn_coco_2x.py b/configs/mmdet/ld/ld_r101_gflv1_r101dcn_fpn_coco_2x.py new file mode 100644 index 00000000..1cbdb4cf --- /dev/null +++ b/configs/mmdet/ld/ld_r101_gflv1_r101dcn_fpn_coco_2x.py @@ -0,0 +1,44 @@ +_base_ = ['./ld_r18_gflv1_r101_fpn_coco_1x.py'] +teacher_ckpt = 
'https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20200630_102002-134b07df.pth' # noqa +model = dict( + teacher_config='configs/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py', + teacher_ckpt=teacher_ckpt, + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5)) + +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) +# multi-scale training +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 480), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +data = dict(train=dict(pipeline=train_pipeline)) diff --git a/configs/mmdet/ld/ld_r18_gflv1_r101_fpn_coco_1x.py b/configs/mmdet/ld/ld_r18_gflv1_r101_fpn_coco_1x.py new file mode 100644 index 00000000..18dce814 --- /dev/null +++ b/configs/mmdet/ld/ld_r18_gflv1_r101_fpn_coco_1x.py @@ -0,0 +1,62 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +teacher_ckpt = 'https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_mstrain_2x_coco/gfl_r101_fpn_mstrain_2x_coco_20200629_200126-dd12f847.pth' # noqa +model = dict( + 
type='KnowledgeDistillationSingleStageDetector', + teacher_config='configs/gfl/gfl_r101_fpn_mstrain_2x_coco.py', + teacher_ckpt=teacher_ckpt, + backbone=dict( + type='ResNet', + depth=18, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')), + neck=dict( + type='FPN', + in_channels=[64, 128, 256, 512], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + bbox_head=dict( + type='LDHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_dfl=dict(type='DistributionFocalLoss', loss_weight=0.25), + loss_ld=dict( + type='KnowledgeDistillationKLDivLoss', loss_weight=0.25, T=10), + reg_max=16, + loss_bbox=dict(type='GIoULoss', loss_weight=2.0)), + # training and testing settings + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/mmdet/ld/ld_r34_gflv1_r101_fpn_coco_1x.py b/configs/mmdet/ld/ld_r34_gflv1_r101_fpn_coco_1x.py new file mode 100644 index 00000000..3b6996d4 --- /dev/null +++ b/configs/mmdet/ld/ld_r34_gflv1_r101_fpn_coco_1x.py @@ -0,0 +1,19 @@ +_base_ = ['./ld_r18_gflv1_r101_fpn_coco_1x.py'] +model = dict( + backbone=dict( + type='ResNet', + depth=34, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + 
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet34')), + neck=dict( + type='FPN', + in_channels=[64, 128, 256, 512], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5)) diff --git a/configs/mmdet/ld/ld_r50_gflv1_r101_fpn_coco_1x.py b/configs/mmdet/ld/ld_r50_gflv1_r101_fpn_coco_1x.py new file mode 100644 index 00000000..2b18785a --- /dev/null +++ b/configs/mmdet/ld/ld_r50_gflv1_r101_fpn_coco_1x.py @@ -0,0 +1,19 @@ +_base_ = ['./ld_r18_gflv1_r101_fpn_coco_1x.py'] +model = dict( + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5)) diff --git a/configs/mmdet/ld/metafile.yml b/configs/mmdet/ld/metafile.yml new file mode 100644 index 00000000..d555a6df --- /dev/null +++ b/configs/mmdet/ld/metafile.yml @@ -0,0 +1,72 @@ +Collections: + - Name: Localization Distillation + Metadata: + Training Data: COCO + Training Techniques: + - Localization Distillation + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/2102.12252 + Title: 'Localization Distillation for Dense Object Detection' + README: configs/ld/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.11.0/mmdet/models/dense_heads/ld_head.py#L11 + Version: v2.11.0 + +Models: + - Name: ld_r18_gflv1_r101_fpn_coco_1x + In Collection: Localization Distillation + Config: configs/ld/ld_r18_gflv1_r101_fpn_coco_1x.py + Metadata: + Teacher: R-101 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.5 + box AP@0.5: 52.9 + box AP@0.75: 39.3 + + - Name: ld_r34_gflv1_r101_fpn_coco_1x + In 
Collection: Localization Distillation + Config: configs/ld/ld_r34_gflv1_r101_fpn_coco_1x.py + Metadata: + Teacher: R-101 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.8 + box AP@0.5: 56.6 + box AP@0.75: 43.1 + + - Name: ld_r50_gflv1_r101_fpn_coco_1x + In Collection: Localization Distillation + Config: configs/ld/ld_r50_gflv1_r101_fpn_coco_1x.py + Metadata: + Teacher: R-101 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.1 + box AP@0.5: 58.7 + box AP@0.75: 44.9 + + - Name: ld_r101_gflv1_r101dcn_fpn_coco_1x + In Collection: Localization Distillation + Config: configs/ld/ld_r101_gflv1_r101dcn_fpn_coco_1x.py + Metadata: + Teacher: R-101-DCN + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.4 + box AP@0.5: 63.1 + box AP@0.75: 49.5 diff --git a/configs/mmdet/legacy_1.x/README.md b/configs/mmdet/legacy_1.x/README.md new file mode 100644 index 00000000..38a2a0e4 --- /dev/null +++ b/configs/mmdet/legacy_1.x/README.md @@ -0,0 +1,53 @@ +# Legacy Configs in MMDetection V1.x + + + +Configs in this directory implement the legacy configs used by MMDetection V1.x and its model zoos. + +To help users convert their models from V1.x to MMDetection V2.0, we provide v1.x configs to run inference with the converted v1.x models. +Due to the BC-breaking changes in MMDetection V2.0 from MMDetection V1.x, running inference with the same model weights in these two versions will produce different results. The resulting absolute AP difference is within 1%, as can be found in the following table. + +## Usage + +To upgrade the model version, the users need to do the following steps. + +### 1. Convert model weights + +There are three main differences in the model weights between V1.x and V2.0 codebases. + +1. Since the class order in every detector's classification branch is reordered, all the legacy model weights need to go through the conversion process. +2. 
The regression and segmentation heads no longer contain the background channel. Weights in these background channels should be removed to fit in the current codebase. +3. For two-stage detectors, their weights need to be upgraded since MMDetection V2.0 refactors all the two-stage detectors with `RoIHead`. + +The users can do the same modification as mentioned above for the self-implemented +detectors. We provide a script `tools/model_converters/upgrade_model_version.py` to convert the model weights in the V1.x model zoo. + +```bash +python tools/model_converters/upgrade_model_version.py ${OLD_MODEL_PATH} ${NEW_MODEL_PATH} --num-classes ${NUM_CLASSES} + +``` + +- OLD_MODEL_PATH: the path to load the model weights in 1.x version. +- NEW_MODEL_PATH: the path to save the converted model weights in 2.0 version. +- NUM_CLASSES: number of classes of the original model weights. Usually it is 81 for COCO dataset, 21 for VOC dataset. + The number of classes in V2.0 models should be equal to that in V1.x models - 1. + +### 2. Use configs with legacy settings + +After converting the model weights, check out the v1.2 release to find the corresponding config file that uses the legacy settings. +The V1.x models usually need these three legacy modules: `LegacyAnchorGenerator`, `LegacyDeltaXYWHBBoxCoder`, and `RoIAlign(aligned=False)`. +For models using ResNet Caffe backbones, they also need to change the pretrain name and the corresponding `img_norm_cfg`. +An example is in [`retinanet_r50_caffe_fpn_1x_coco_v1.py`](retinanet_r50_caffe_fpn_1x_coco_v1.py) +Then use the config to test the model weights. For most models, the obtained results should be close to that in V1.x. +We provide configs of some common structures in this directory. + +## Performance + +The performance changes after converting the models in this directory are listed as follows. 
+| Method | Style | Lr schd | V1.x box AP | V1.x mask AP | V2.0 box AP | V2.0 mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------:| :-----: |:------:| :-----: | :-------: |:------------------------------------------------------------------------------------------------------------------------------: | +| Mask R-CNN R-50-FPN | pytorch | 1x | 37.3 | 34.2 | 36.8 | 33.9 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/legacy_1.x/mask_rcnn_r50_fpn_1x_coco_v1.py) | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_fpn_1x_20181010-069fa190.pth)| +| RetinaNet R-50-FPN | caffe | 1x | 35.8 | - | 35.4 | - | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/legacy_1.x/retinanet_r50_caffe_fpn_1x_coco_v1.py) | - | +| RetinaNet R-50-FPN | pytorch | 1x | 35.6 |-|35.2| -| [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/legacy_1.x/retinanet_r50_fpn_1x_coco_v1.py) | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_r50_fpn_1x_20181125-7b0c2548.pth) | +| Cascade Mask R-CNN R-50-FPN | pytorch | 1x | 41.2 | 35.7 |40.8| 35.6| [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/legacy_1.x/cascade_mask_rcnn_r50_fpn_1x_coco_v1.py) | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r50_fpn_1x_20181123-88b170c9.pth) | +| SSD300-VGG16 | caffe | 120e | 25.7 |-|25.4|-| [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/legacy_1.x/ssd300_coco_v1.py) | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ssd300_coco_vgg16_caffe_120e_20181221-84d7110b.pth) | diff --git a/configs/mmdet/legacy_1.x/cascade_mask_rcnn_r50_fpn_1x_coco_v1.py b/configs/mmdet/legacy_1.x/cascade_mask_rcnn_r50_fpn_1x_coco_v1.py new file mode 100644 index 00000000..fc9d0048 --- /dev/null +++ 
b/configs/mmdet/legacy_1.x/cascade_mask_rcnn_r50_fpn_1x_coco_v1.py @@ -0,0 +1,79 @@ +_base_ = [ + '../_base_/models/cascade_mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='CascadeRCNN', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + anchor_generator=dict(type='LegacyAnchorGenerator', center_offset=0.5), + bbox_coder=dict( + type='LegacyDeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0])), + roi_head=dict( + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', + output_size=7, + sampling_ratio=2, + aligned=False)), + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + reg_class_agnostic=True, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='LegacyDeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2])), + dict( + type='Shared2FCBBoxHead', + reg_class_agnostic=True, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='LegacyDeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1])), + dict( + type='Shared2FCBBoxHead', + reg_class_agnostic=True, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='LegacyDeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067])), + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', + output_size=14, + 
sampling_ratio=2, + aligned=False)))) +dist_params = dict(backend='nccl', port=29515) diff --git a/configs/mmdet/legacy_1.x/faster_rcnn_r50_fpn_1x_coco_v1.py b/configs/mmdet/legacy_1.x/faster_rcnn_r50_fpn_1x_coco_v1.py new file mode 100644 index 00000000..8c573bef --- /dev/null +++ b/configs/mmdet/legacy_1.x/faster_rcnn_r50_fpn_1x_coco_v1.py @@ -0,0 +1,38 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + type='FasterRCNN', + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + rpn_head=dict( + type='RPNHead', + anchor_generator=dict( + type='LegacyAnchorGenerator', + center_offset=0.5, + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict(type='LegacyDeltaXYWHBBoxCoder'), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', + output_size=7, + sampling_ratio=2, + aligned=False), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + bbox_coder=dict(type='LegacyDeltaXYWHBBoxCoder'), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn_proposal=dict(max_per_img=2000), + rcnn=dict(assigner=dict(match_low_quality=True)))) diff --git a/configs/mmdet/legacy_1.x/mask_rcnn_r50_fpn_1x_coco_v1.py b/configs/mmdet/legacy_1.x/mask_rcnn_r50_fpn_1x_coco_v1.py new file mode 100644 index 00000000..04581bbc --- /dev/null +++ b/configs/mmdet/legacy_1.x/mask_rcnn_r50_fpn_1x_coco_v1.py @@ -0,0 +1,34 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + rpn_head=dict( + 
anchor_generator=dict(type='LegacyAnchorGenerator', center_offset=0.5), + bbox_coder=dict(type='LegacyDeltaXYWHBBoxCoder'), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', + output_size=7, + sampling_ratio=2, + aligned=False)), + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', + output_size=14, + sampling_ratio=2, + aligned=False)), + bbox_head=dict( + bbox_coder=dict(type='LegacyDeltaXYWHBBoxCoder'), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))), + + # model training and testing settings + train_cfg=dict( + rpn_proposal=dict(max_per_img=2000), + rcnn=dict(assigner=dict(match_low_quality=True)))) diff --git a/configs/mmdet/legacy_1.x/retinanet_r50_caffe_fpn_1x_coco_v1.py b/configs/mmdet/legacy_1.x/retinanet_r50_caffe_fpn_1x_coco_v1.py new file mode 100644 index 00000000..a63d248c --- /dev/null +++ b/configs/mmdet/legacy_1.x/retinanet_r50_caffe_fpn_1x_coco_v1.py @@ -0,0 +1,41 @@ +_base_ = './retinanet_r50_fpn_1x_coco_v1.py' +model = dict( + backbone=dict( + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron/resnet50_caffe'))) +# use caffe img_norm +img_norm_cfg = dict( + mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + 
dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/legacy_1.x/retinanet_r50_fpn_1x_coco_v1.py b/configs/mmdet/legacy_1.x/retinanet_r50_fpn_1x_coco_v1.py new file mode 100644 index 00000000..6198b971 --- /dev/null +++ b/configs/mmdet/legacy_1.x/retinanet_r50_fpn_1x_coco_v1.py @@ -0,0 +1,17 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + bbox_head=dict( + type='RetinaHead', + anchor_generator=dict( + type='LegacyAnchorGenerator', + center_offset=0.5, + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict(type='LegacyDeltaXYWHBBoxCoder'), + loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0))) diff --git a/configs/mmdet/legacy_1.x/ssd300_coco_v1.py b/configs/mmdet/legacy_1.x/ssd300_coco_v1.py new file mode 100644 index 00000000..65ccc1e5 --- /dev/null +++ b/configs/mmdet/legacy_1.x/ssd300_coco_v1.py @@ -0,0 +1,84 @@ +_base_ = [ + '../_base_/models/ssd300.py', '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] +# model settings +input_size = 300 +model = dict( + bbox_head=dict( + type='SSDHead', + anchor_generator=dict( + type='LegacySSDAnchorGenerator', + scale_major=False, + input_size=input_size, + basesize_ratio_range=(0.15, 0.9), + strides=[8, 16, 32, 64, 100, 300], + ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]]), + bbox_coder=dict( + type='LegacyDeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]))) +# dataset 
settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Expand', + mean=img_norm_cfg['mean'], + to_rgb=img_norm_cfg['to_rgb'], + ratio_range=(1, 4)), + dict( + type='MinIoURandomCrop', + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3), + dict(type='Resize', img_scale=(300, 300), keep_ratio=False), + dict(type='Normalize', **img_norm_cfg), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(300, 300), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=False), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=3, + train=dict( + _delete_=True, + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline)), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=2e-3, momentum=0.9, weight_decay=5e-4) +optimizer_config = dict(_delete_=True) +dist_params = dict(backend='nccl', port=29555) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/mmdet/libra_rcnn/README.md b/configs/mmdet/libra_rcnn/README.md new file mode 100644 index 00000000..35446f6a --- /dev/null +++ b/configs/mmdet/libra_rcnn/README.md @@ -0,0 +1,53 @@ +# Libra R-CNN + +> [Libra R-CNN: Towards Balanced Learning for Object Detection](https://arxiv.org/abs/1904.02701) + + + +## Abstract + +Compared with model architectures, the training process, which is also crucial to the success of detectors, has received relatively less attention in object detection. In this work, we carefully revisit the standard training practice of detectors, and find that the detection performance is often limited by the imbalance during the training process, which generally consists in three levels - sample level, feature level, and objective level. To mitigate the adverse effects caused thereby, we propose Libra R-CNN, a simple but effective framework towards balanced learning for object detection. It integrates three novel components: IoU-balanced sampling, balanced feature pyramid, and balanced L1 loss, respectively for reducing the imbalance at sample, feature, and objective level. Benefitted from the overall balanced design, Libra R-CNN significantly improves the detection performance. Without bells and whistles, it achieves 2.5 points and 2.0 points higher Average Precision (AP) than FPN Faster R-CNN and RetinaNet respectively on MSCOCO. + +Instance recognition is rapidly advanced along with the developments of various deep convolutional neural networks. Compared to the architectures of networks, the training process, which is also crucial to the success of detectors, has received relatively less attention. 
In this work, we carefully revisit the standard training practice of detectors, and find that the detection performance is often limited by the imbalance during the training process, which generally consists in three levels - sample level, feature level, and objective level. To mitigate the adverse effects caused thereby, we propose Libra R-CNN, a simple yet effective framework towards balanced learning for instance recognition. It integrates IoU-balanced sampling, balanced feature pyramid, and objective re-weighting, respectively for reducing the imbalance at sample, feature, and objective level. Extensive experiments conducted on MS COCO, LVIS and Pascal VOC datasets prove the effectiveness of the overall balanced design. + +
+ +
+ +## Results and Models + +The results on COCO 2017val are shown in the below table. (results on test-dev are usually slightly higher than val) + +| Architecture | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +|:------------:|:---------------:|:-------:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:| +| Faster R-CNN | R-50-FPN | pytorch | 1x | 4.6 | 19.0 | 38.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco/libra_faster_rcnn_r50_fpn_1x_coco_20200130-3afee3a9.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco/libra_faster_rcnn_r50_fpn_1x_coco_20200130_204655.log.json) | +| Fast R-CNN | R-50-FPN | pytorch | 1x | | | | | +| Faster R-CNN | R-101-FPN | pytorch | 1x | 6.5 | 14.4 | 40.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/libra_rcnn/libra_faster_rcnn_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_r101_fpn_1x_coco/libra_faster_rcnn_r101_fpn_1x_coco_20200203-8dba6a5a.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_r101_fpn_1x_coco/libra_faster_rcnn_r101_fpn_1x_coco_20200203_001405.log.json) | +| Faster R-CNN | X-101-64x4d-FPN | pytorch | 1x | 10.8 | 8.5 | 42.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/libra_rcnn/libra_faster_rcnn_x101_64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_x101_64x4d_fpn_1x_coco/libra_faster_rcnn_x101_64x4d_fpn_1x_coco_20200315-3a7d0488.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_x101_64x4d_fpn_1x_coco/libra_faster_rcnn_x101_64x4d_fpn_1x_coco_20200315_231625.log.json) | +| RetinaNet | 
R-50-FPN | pytorch | 1x | 4.2 | 17.7 | 37.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/libra_rcnn/libra_retinanet_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_retinanet_r50_fpn_1x_coco/libra_retinanet_r50_fpn_1x_coco_20200205-804d94ce.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_retinanet_r50_fpn_1x_coco/libra_retinanet_r50_fpn_1x_coco_20200205_112757.log.json) | + +## Citation + +We provide config files to reproduce the results in the CVPR 2019 paper [Libra R-CNN](https://arxiv.org/pdf/1904.02701.pdf). + +The extended version of [Libra R-CNN](https://arxiv.org/pdf/2108.10175.pdf) is accepted by IJCV. + +```latex +@inproceedings{pang2019libra, + title={Libra R-CNN: Towards Balanced Learning for Object Detection}, + author={Pang, Jiangmiao and Chen, Kai and Shi, Jianping and Feng, Huajun and Ouyang, Wanli and Dahua Lin}, + booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, + year={2019} +} + +@article{pang2021towards, + title={Towards Balanced Learning for Instance Recognition}, + author={Pang, Jiangmiao and Chen, Kai and Li, Qi and Xu, Zhihai and Feng, Huajun and Shi, Jianping and Ouyang, Wanli and Lin, Dahua}, + journal={International Journal of Computer Vision}, + volume={129}, + number={5}, + pages={1376--1393}, + year={2021}, + publisher={Springer} +} +``` diff --git a/configs/mmdet/libra_rcnn/libra_fast_rcnn_r50_fpn_1x_coco.py b/configs/mmdet/libra_rcnn/libra_fast_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 00000000..efbedc86 --- /dev/null +++ b/configs/mmdet/libra_rcnn/libra_fast_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,50 @@ +_base_ = '../fast_rcnn/fast_rcnn_r50_fpn_1x_coco.py' +# model settings +model = dict( + neck=[ + dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + dict( + type='BFP', + in_channels=256, + num_levels=5, + refine_level=2, + refine_type='non_local') + ], + 
roi_head=dict( + bbox_head=dict( + loss_bbox=dict( + _delete_=True, + type='BalancedL1Loss', + alpha=0.5, + gamma=1.5, + beta=1.0, + loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rcnn=dict( + sampler=dict( + _delete_=True, + type='CombinedSampler', + num=512, + pos_fraction=0.25, + add_gt_as_proposals=True, + pos_sampler=dict(type='InstanceBalancedPosSampler'), + neg_sampler=dict( + type='IoUBalancedNegSampler', + floor_thr=-1, + floor_fraction=0, + num_bins=3))))) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +data = dict( + train=dict(proposal_file=data_root + + 'libra_proposals/rpn_r50_fpn_1x_train2017.pkl'), + val=dict(proposal_file=data_root + + 'libra_proposals/rpn_r50_fpn_1x_val2017.pkl'), + test=dict(proposal_file=data_root + + 'libra_proposals/rpn_r50_fpn_1x_val2017.pkl')) diff --git a/configs/mmdet/libra_rcnn/libra_faster_rcnn_r101_fpn_1x_coco.py b/configs/mmdet/libra_rcnn/libra_faster_rcnn_r101_fpn_1x_coco.py new file mode 100644 index 00000000..e899706b --- /dev/null +++ b/configs/mmdet/libra_rcnn/libra_faster_rcnn_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './libra_faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco.py b/configs/mmdet/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 00000000..89a0d7b2 --- /dev/null +++ b/configs/mmdet/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,41 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' +# model settings +model = dict( + neck=[ + dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + dict( + type='BFP', + in_channels=256, + num_levels=5, + refine_level=2, + refine_type='non_local') + ], + roi_head=dict( + bbox_head=dict( + loss_bbox=dict( + _delete_=True, + type='BalancedL1Loss', + 
alpha=0.5, + gamma=1.5, + beta=1.0, + loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict(sampler=dict(neg_pos_ub=5), allowed_border=-1), + rcnn=dict( + sampler=dict( + _delete_=True, + type='CombinedSampler', + num=512, + pos_fraction=0.25, + add_gt_as_proposals=True, + pos_sampler=dict(type='InstanceBalancedPosSampler'), + neg_sampler=dict( + type='IoUBalancedNegSampler', + floor_thr=-1, + floor_fraction=0, + num_bins=3))))) diff --git a/configs/mmdet/libra_rcnn/libra_faster_rcnn_x101_64x4d_fpn_1x_coco.py b/configs/mmdet/libra_rcnn/libra_faster_rcnn_x101_64x4d_fpn_1x_coco.py new file mode 100644 index 00000000..06740a77 --- /dev/null +++ b/configs/mmdet/libra_rcnn/libra_faster_rcnn_x101_64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './libra_faster_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/libra_rcnn/libra_retinanet_r50_fpn_1x_coco.py b/configs/mmdet/libra_rcnn/libra_retinanet_r50_fpn_1x_coco.py new file mode 100644 index 00000000..be274209 --- /dev/null +++ b/configs/mmdet/libra_rcnn/libra_retinanet_r50_fpn_1x_coco.py @@ -0,0 +1,26 @@ +_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py' +# model settings +model = dict( + neck=[ + dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_input', + num_outs=5), + dict( + type='BFP', + in_channels=256, + num_levels=5, + refine_level=1, + refine_type='non_local') + ], + bbox_head=dict( + loss_bbox=dict( + _delete_=True, + type='BalancedL1Loss', + alpha=0.5, + gamma=1.5, + beta=0.11, + loss_weight=1.0))) diff --git a/configs/mmdet/libra_rcnn/metafile.yml b/configs/mmdet/libra_rcnn/metafile.yml new file mode 
100644 index 00000000..8c327959 --- /dev/null +++ b/configs/mmdet/libra_rcnn/metafile.yml @@ -0,0 +1,99 @@ +Collections: + - Name: Libra R-CNN + Metadata: + Training Data: COCO + Training Techniques: + - IoU-Balanced Sampling + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Balanced Feature Pyramid + Paper: + URL: https://arxiv.org/abs/1904.02701 + Title: 'Libra R-CNN: Towards Balanced Learning for Object Detection' + README: configs/libra_rcnn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/necks/bfp.py#L10 + Version: v2.0.0 + +Models: + - Name: libra_faster_rcnn_r50_fpn_1x_coco + In Collection: Libra R-CNN + Config: configs/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.6 + inference time (ms/im): + - value: 52.63 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco/libra_faster_rcnn_r50_fpn_1x_coco_20200130-3afee3a9.pth + + - Name: libra_faster_rcnn_r101_fpn_1x_coco + In Collection: Libra R-CNN + Config: configs/libra_rcnn/libra_faster_rcnn_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.5 + inference time (ms/im): + - value: 69.44 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_r101_fpn_1x_coco/libra_faster_rcnn_r101_fpn_1x_coco_20200203-8dba6a5a.pth + + - Name: libra_faster_rcnn_x101_64x4d_fpn_1x_coco + In Collection: Libra R-CNN + Config: configs/libra_rcnn/libra_faster_rcnn_x101_64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 10.8 + inference time (ms/im): + - 
value: 117.65 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_x101_64x4d_fpn_1x_coco/libra_faster_rcnn_x101_64x4d_fpn_1x_coco_20200315-3a7d0488.pth + + - Name: libra_retinanet_r50_fpn_1x_coco + In Collection: Libra R-CNN + Config: configs/libra_rcnn/libra_retinanet_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.2 + inference time (ms/im): + - value: 56.5 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_retinanet_r50_fpn_1x_coco/libra_retinanet_r50_fpn_1x_coco_20200205-804d94ce.pth diff --git a/configs/mmdet/lvis/README.md b/configs/mmdet/lvis/README.md new file mode 100644 index 00000000..5c805648 --- /dev/null +++ b/configs/mmdet/lvis/README.md @@ -0,0 +1,54 @@ +# LVIS + +> [LVIS: A Dataset for Large Vocabulary Instance Segmentation](https://arxiv.org/abs/1908.03195) + + + +## Abstract + +Progress on object detection is enabled by datasets that focus the research community's attention on open challenges. This process led us from simple images to complex scenes and from bounding boxes to segmentation masks. In this work, we introduce LVIS (pronounced `el-vis'): a new dataset for Large Vocabulary Instance Segmentation. We plan to collect ~2 million high-quality instance segmentation masks for over 1000 entry-level object categories in 164k images. Due to the Zipfian distribution of categories in natural images, LVIS naturally has a long tail of categories with few training samples. 
Given that state-of-the-art deep learning methods for object detection perform poorly in the low-sample regime, we believe that our dataset poses an important and exciting new scientific challenge. + +
+ +
+ +## Common Setting + +* Please follow the [install guide](../../docs/get_started.md#install-mmdetection) to install the open-mmlab forked cocoapi first. +* Run the following script to install our forked lvis-api. + + ```shell + pip install git+https://github.com/lvis-dataset/lvis-api.git + ``` + +* All experiments use the oversample strategy described [here](../../docs/tutorials/customize_dataset.md#class-balanced-dataset) with oversample threshold `1e-3`. +* The size of LVIS v0.5 is half of COCO, so schedule `2x` in LVIS runs roughly the same number of iterations as `1x` in COCO. + +## Results and models of LVIS v0.5 + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------: |:--------: | +| R-50-FPN | pytorch | 2x | - | - | 26.1 | 25.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis-dbd06831.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis_20200531_160435.log.json) | +| R-101-FPN | pytorch | 2x | - | - | 27.1 | 27.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis-54582ee2.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis_20200601_134748.log.json) | +| X-101-32x4d-FPN | pytorch | 2x | - | - | 26.7 | 26.9 | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis-3cf55ea2.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis_20200531_221749.log.json) | +| X-101-64x4d-FPN | pytorch | 2x | - | - | 26.4 | 26.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis-1c99a5ad.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis_20200601_194651.log.json) | + +## Results and models of LVIS v1 + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------: | :--------: | +| R-50-FPN | pytorch | 1x | 9.1 | - | 22.5 | 21.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1-aa78ac3d.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1-20200829_061305.log.json) | +| R-101-FPN | pytorch | 1x | 10.8 | - | 24.6 | 23.6 | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1-ec55ce32.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1-20200829_070959.log.json) | +| X-101-32x4d-FPN | pytorch | 1x | 11.8 | - | 26.7 | 25.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1-ebbc5c81.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1-20200829_071317.log.json) | +| X-101-64x4d-FPN | pytorch | 1x | 14.6 | - | 27.2 | 25.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1-43d9edfe.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1-20200830_060206.log.json) | + +## Citation + +```latex +@inproceedings{gupta2019lvis, + title={{LVIS}: A Dataset for Large Vocabulary Instance Segmentation}, + author={Gupta, Agrim and Dollar, Piotr and Girshick, Ross}, + booktitle={Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition}, + year={2019} +} +``` diff --git 
a/configs/mmdet/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1.py b/configs/mmdet/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1.py new file mode 100644 index 00000000..0f017f58 --- /dev/null +++ b/configs/mmdet/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1.py @@ -0,0 +1,6 @@ +_base_ = './mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py b/configs/mmdet/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py new file mode 100644 index 00000000..637f4a63 --- /dev/null +++ b/configs/mmdet/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py @@ -0,0 +1,6 @@ +_base_ = './mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1.py b/configs/mmdet/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1.py new file mode 100644 index 00000000..92ddb526 --- /dev/null +++ b/configs/mmdet/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1.py @@ -0,0 +1,31 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/lvis_v1_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + roi_head=dict( + bbox_head=dict(num_classes=1203), mask_head=dict(num_classes=1203)), + test_cfg=dict( + rcnn=dict( + score_thr=0.0001, + # LVIS allows up to 300 + max_per_img=300))) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 
800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +data = dict(train=dict(dataset=dict(pipeline=train_pipeline))) diff --git a/configs/mmdet/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py b/configs/mmdet/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py new file mode 100644 index 00000000..d53c5dc6 --- /dev/null +++ b/configs/mmdet/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py @@ -0,0 +1,31 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/lvis_v0.5_instance.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] +model = dict( + roi_head=dict( + bbox_head=dict(num_classes=1230), mask_head=dict(num_classes=1230)), + test_cfg=dict( + rcnn=dict( + score_thr=0.0001, + # LVIS allows up to 300 + max_per_img=300))) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +data = dict(train=dict(dataset=dict(pipeline=train_pipeline))) diff --git a/configs/mmdet/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1.py b/configs/mmdet/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1.py new file mode 100644 index 00000000..a6115c1a --- /dev/null +++ 
b/configs/mmdet/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1.py @@ -0,0 +1,14 @@ +_base_ = './mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py b/configs/mmdet/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py new file mode 100644 index 00000000..96b62523 --- /dev/null +++ b/configs/mmdet/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py @@ -0,0 +1,14 @@ +_base_ = './mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1.py b/configs/mmdet/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1.py new file mode 100644 index 00000000..0f95a732 --- /dev/null +++ b/configs/mmdet/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1.py @@ -0,0 +1,14 @@ +_base_ = './mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py 
b/configs/mmdet/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py new file mode 100644 index 00000000..986acda5 --- /dev/null +++ b/configs/mmdet/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py @@ -0,0 +1,14 @@ +_base_ = './mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/mask2former/README.md b/configs/mmdet/mask2former/README.md new file mode 100644 index 00000000..481e7593 --- /dev/null +++ b/configs/mmdet/mask2former/README.md @@ -0,0 +1,60 @@ +# Mask2Former + +> [Masked-attention Mask Transformer for Universal Image Segmentation](http://arxiv.org/abs/2112.01527) + + + +## Abstract + +Image segmentation is about grouping pixels with different semantics, e.g., category or instance membership, where each choice of semantics defines a task. While only the semantics of each task differ, current research focuses on designing specialized architectures for each task. We present Masked-attention Mask Transformer (Mask2Former), a new architecture capable of addressing any image segmentation task (panoptic, instance or semantic). Its key components include masked attention, which extracts localized features by constraining cross-attention within predicted mask regions. In addition to reducing the research effort by at least three times, it outperforms the best specialized architectures by a significant margin on four popular datasets. Most notably, Mask2Former sets a new state-of-the-art for panoptic segmentation (57.8 PQ on COCO), instance segmentation (50.1 AP on COCO) and semantic segmentation (57.7 mIoU on ADE20K). + +
+ +
+ +## Introduction + +Mask2Former requires COCO and [COCO-panoptic](http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip) dataset for training and evaluation. You need to download and extract it in the COCO dataset path. +The directory should be like this. + +```none +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── annotations +| | | ├── instances_train2017.json +| | | ├── instances_val2017.json +│ │ │ ├── panoptic_train2017.json +│ │ │ ├── panoptic_train2017 +│ │ │ ├── panoptic_val2017.json +│ │ │ ├── panoptic_val2017 +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +``` + +## Results and Models + +| Backbone | style | Pretrain | Lr schd | Mem (GB) | Inf time (fps) | PQ | box mAP | mask mAP | Config | Download | +| :------: | :-----: | :----------: | :-----: | :------: | :------------: | :---: | :-----: | :------: | :-------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | pytorch | ImageNet-1K | 50e | 13.9 | - | 51.9 | 44.8 | 41.9 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r50_lsj_8x2_50e_coco/mask2former_r50_lsj_8x2_50e_coco_20220326_224516-0091ce2b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r50_lsj_8x2_50e_coco/mask2former_r50_lsj_8x2_50e_coco_20220326_224516.log.json) | +| R-101 | pytorch | 
ImageNet-1K | 50e | 16.1 | - | 52.4 | 45.3 | 42.4 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r101_lsj_8x2_50e_coco/mask2former_r101_lsj_8x2_50e_coco_20220329_225104-bb4df090.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r101_lsj_8x2_50e_coco/mask2former_r101_lsj_8x2_50e_coco_20220329_225104.log.json) | +| Swin-T | - | ImageNet-1K | 50e | 15.9 | - | 53.4 | 46.3 | 43.4 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_20220326_224553-c92f921c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_20220326_224553.log.json) | +| Swin-S | - | ImageNet-1K | 50e | 19.1 | - | 54.5 | 47.8 | 44.5 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_20220329_225200-9f633bcf.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_20220329_225200.log.json) | +| Swin-B | - | ImageNet-1K | 50e | 26.0 | - | 55.1 | 48.2 | 44.9 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_20220331_002244-1db756b2.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_20220331_002244.log.json) | +| Swin-B | - | ImageNet-21K | 50e | 25.8 | - | 56.3 | 50.0 | 46.3 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco_20220329_230021-89d7c1b1.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco_20220329_230021.log.json) | +| Swin-L | - | ImageNet-21K | 100e | 21.1 | - | 57.6 | 52.2 | 48.5 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco_20220407_104949-c481ee28.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco_20220407_104949.log.json) | + +## Citation + +```latex +@article{cheng2021mask2former, + title={Masked-attention Mask Transformer for Universal Image Segmentation}, + author={Bowen Cheng and Ishan Misra and Alexander G. 
Schwing and Alexander Kirillov and Rohit Girdhar}, + journal={arXiv}, + year={2021} +} +``` diff --git a/configs/mmdet/mask2former/mask2former_r101_lsj_8x2_50e_coco.py b/configs/mmdet/mask2former/mask2former_r101_lsj_8x2_50e_coco.py new file mode 100644 index 00000000..27050585 --- /dev/null +++ b/configs/mmdet/mask2former/mask2former_r101_lsj_8x2_50e_coco.py @@ -0,0 +1,7 @@ +_base_ = './mask2former_r50_lsj_8x2_50e_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/mask2former/mask2former_r50_lsj_8x2_50e_coco.py b/configs/mmdet/mask2former/mask2former_r50_lsj_8x2_50e_coco.py new file mode 100644 index 00000000..2c23625e --- /dev/null +++ b/configs/mmdet/mask2former/mask2former_r50_lsj_8x2_50e_coco.py @@ -0,0 +1,253 @@ +_base_ = [ + '../_base_/datasets/coco_panoptic.py', '../_base_/default_runtime.py' +] +num_things_classes = 80 +num_stuff_classes = 53 +num_classes = num_things_classes + num_stuff_classes +model = dict( + type='Mask2Former', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + panoptic_head=dict( + type='Mask2FormerHead', + in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + num_queries=100, + num_transformer_feat_level=3, + pixel_decoder=dict( + type='MSDeformAttnPixelDecoder', + num_outs=3, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiScaleDeformableAttention', + embed_dims=256, + 
num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=False, + norm_cfg=None, + init_cfg=None), + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', inplace=True)), + operation_order=('self_attn', 'norm', 'ffn', 'norm')), + init_cfg=None), + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + init_cfg=None), + enforce_decoder_input_project=False, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + transformer_decoder=dict( + type='DetrTransformerDecoder', + return_intermediate=True, + num_layers=9, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True), + feedforward_channels=2048, + operation_order=('cross_attn', 'norm', 'self_attn', 'norm', + 'ffn', 'norm')), + init_cfg=None), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice=dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0)), + panoptic_fusion_head=dict( + type='MaskFormerFusionHead', + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + loss_panoptic=None, + init_cfg=None), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='MaskHungarianAssigner', + 
cls_cost=dict(type='ClassificationCost', weight=2.0), + mask_cost=dict( + type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True), + dice_cost=dict( + type='DiceCost', weight=5.0, pred_act=True, eps=1.0)), + sampler=dict(type='MaskPseudoSampler')), + test_cfg=dict( + panoptic_on=True, + # For now, the dataset does not support + # evaluating semantic segmentation metrics. + semantic_on=False, + instance_on=True, + # max_per_image is for instance segmentation. + max_per_image=100, + iou_thr=0.8, + # In Mask2Former's panoptic postprocessing, + # mask areas whose score is less than 0.5 are filtered out. + filter_low_score=True), + init_cfg=None) + +# dataset settings +image_size = (1024, 1024) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict( + type='LoadPanopticAnnotations', + with_bbox=True, + with_mask=True, + with_seg=True), + dict(type='RandomFlip', flip_ratio=0.5), + # large scale jittering + dict( + type='Resize', + img_scale=image_size, + ratio_range=(0.1, 2.0), + multiscale_mode='range', + keep_ratio=True), + dict( + type='RandomCrop', + crop_size=image_size, + crop_type='absolute', + recompute_bbox=True, + allow_negative_crop=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=image_size), + dict(type='DefaultFormatBundle', img_to_float=True), + dict( + type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data_root = 'data/coco/' +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + 
train=dict(pipeline=train_pipeline), + val=dict( + pipeline=test_pipeline, + ins_ann_file=data_root + 'annotations/instances_val2017.json', + ), + test=dict( + pipeline=test_pipeline, + ins_ann_file=data_root + 'annotations/instances_val2017.json', + )) + +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +# optimizer +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-8, + betas=(0.9, 0.999), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi, + }, + norm_decay_mult=0.0)) +optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2)) + +# learning policy +lr_config = dict( + policy='step', + gamma=0.1, + by_epoch=False, + step=[327778, 355092], + warmup='linear', + warmup_by_epoch=False, + warmup_ratio=1.0, # no warmup + warmup_iters=10) + +max_iters = 368750 +runner = dict(type='IterBasedRunner', max_iters=max_iters) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook', by_epoch=False), + dict(type='TensorboardLoggerHook', by_epoch=False) + ]) +interval = 5000 +workflow = [('train', interval)] +checkpoint_config = dict( + by_epoch=False, interval=interval, save_last=True, max_keep_ckpts=3) + +# Before 365001th iteration, we do evaluation every 5000 iterations. +# After 365000th iteration, we do evaluation every 368750 iterations, +# which means that we do evaluation at the end of training. 
+dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)] +evaluation = dict( + interval=interval, + dynamic_intervals=dynamic_intervals, + metric=['PQ', 'bbox', 'segm']) diff --git a/configs/mmdet/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py b/configs/mmdet/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py new file mode 100644 index 00000000..d0cf3762 --- /dev/null +++ b/configs/mmdet/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py @@ -0,0 +1,5 @@ +_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth' # noqa + +model = dict( + backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=pretrained))) diff --git a/configs/mmdet/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py b/configs/mmdet/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py new file mode 100644 index 00000000..d2a58259 --- /dev/null +++ b/configs/mmdet/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py @@ -0,0 +1,42 @@ +_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384.pth' # noqa + +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + pretrain_img_size=384, + embed_dims=128, + depths=depths, + num_heads=[4, 8, 16, 32], + window_size=12, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + panoptic_head=dict(in_channels=[128, 256, 512, 1024])) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embedding, +# query_embedding, level_embedding to decay_mult=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 
'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optimizer = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mmdet/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py b/configs/mmdet/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py new file mode 100644 index 00000000..13aa28c4 --- /dev/null +++ b/configs/mmdet/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py @@ -0,0 +1,26 @@ +_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa + +model = dict( + backbone=dict( + embed_dims=192, + num_heads=[6, 12, 24, 48], + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + panoptic_head=dict(num_queries=200, in_channels=[192, 384, 768, 1536])) + +data = dict(samples_per_gpu=1, workers_per_gpu=1) + +lr_config = dict(step=[655556, 710184]) + +max_iters = 737500 +runner = dict(type='IterBasedRunner', max_iters=max_iters) + +# Before 735001th iteration, we do evaluation every 5000 iterations. +# After 735000th iteration, we do evaluation every 737500 iterations, +# which means that we do evaluation at the end of training. 
+interval = 5000 +dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)] +evaluation = dict( + interval=interval, + dynamic_intervals=dynamic_intervals, + metric=['PQ', 'bbox', 'segm']) diff --git a/configs/mmdet/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py b/configs/mmdet/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py new file mode 100644 index 00000000..7b1b05ab --- /dev/null +++ b/configs/mmdet/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py @@ -0,0 +1,37 @@ +_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth' # noqa + +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + depths=depths, init_cfg=dict(type='Pretrained', + checkpoint=pretrained))) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embedding, +# query_embedding, level_embedding to decay_mult=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optimizer = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git 
a/configs/mmdet/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py b/configs/mmdet/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py new file mode 100644 index 00000000..70e3103e --- /dev/null +++ b/configs/mmdet/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py @@ -0,0 +1,62 @@ +_base_ = ['./mask2former_r50_lsj_8x2_50e_coco.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa + +depths = [2, 2, 6, 2] +model = dict( + type='Mask2Former', + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=96, + depths=depths, + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + convert_weights=True, + frozen_stages=-1, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + panoptic_head=dict( + type='Mask2FormerHead', in_channels=[96, 192, 384, 768]), + init_cfg=None) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embedding, +# query_embedding, level_embedding to decay_mult=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi 
+ for stage_id in range(len(depths) - 1) +}) +# optimizer +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-8, + betas=(0.9, 0.999), + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mmdet/mask2former/metafile.yml b/configs/mmdet/mask2former/metafile.yml new file mode 100644 index 00000000..2ceed805 --- /dev/null +++ b/configs/mmdet/mask2former/metafile.yml @@ -0,0 +1,159 @@ +Collections: + - Name: Mask2Former + Metadata: + Training Data: COCO + Training Techniques: + - AdamW + - Weight Decay + Training Resources: 8x A100 GPUs + Architecture: + - Mask2Former + Paper: + URL: https://arxiv.org/pdf/2112.01527 + Title: 'Masked-attention Mask Transformer for Universal Image Segmentation' + README: configs/mask2former/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.23.0/mmdet/models/detectors/mask2former.py#L7 + Version: v2.23.0 + +Models: +- Name: mask2former_r50_lsj_8x2_50e_coco + In Collection: Mask2Former + Config: configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py + Metadata: + Training Memory (GB): 13.9 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.9 + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 51.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r50_lsj_8x2_50e_coco/mask2former_r50_lsj_8x2_50e_coco_20220326_224516-0091ce2b.pth +- Name: mask2former_r101_lsj_8x2_50e_coco + In Collection: Mask2Former + Config: configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py + Metadata: + Training Memory (GB): 16.1 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 42.4 + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 52.4 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r101_lsj_8x2_50e_coco/mask2former_r101_lsj_8x2_50e_coco_20220329_225104-bb4df090.pth +- Name: mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco + In Collection: Mask2Former + Config: configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py + Metadata: + Training Memory (GB): 15.9 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 43.4 + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 53.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_20220326_224553-c92f921c.pth +- Name: mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco + In Collection: Mask2Former + Config: configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py + Metadata: + Training Memory (GB): 19.1 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 44.5 + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 54.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_20220329_225200-9f633bcf.pth +- Name: mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco + In Collection: Mask2Former + Config: configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py + Metadata: + Training Memory (GB): 26.0 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 48.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 44.9 + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 55.1 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_20220331_002244-1db756b2.pth +- Name: mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco + In Collection: Mask2Former + Config: configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py + Metadata: + Training Memory (GB): 25.8 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 46.3 + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 56.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco_20220329_230021-89d7c1b1.pth +- Name: mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco + In Collection: Mask2Former + Config: configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py + Metadata: + Training Memory (GB): 21.1 + Iterations: 737500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 52.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 48.5 + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 57.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco_20220407_104949-c481ee28.pth diff --git a/configs/mmdet/mask_rcnn/README.md b/configs/mmdet/mask_rcnn/README.md new file mode 100644 index 00000000..9336dd35 --- /dev/null +++ b/configs/mmdet/mask_rcnn/README.md @@ -0,0 +1,59 @@ +# Mask R-CNN + +> [Mask R-CNN](https://arxiv.org/abs/1703.06870) + + + +## Abstract + +We present a conceptually simple, flexible, and general framework for object instance segmentation. 
Our approach efficiently detects objects in an image while simultaneously generating a high-quality segmentation mask for each instance. The method, called Mask R-CNN, extends Faster R-CNN by adding a branch for predicting an object mask in parallel with the existing branch for bounding box recognition. Mask R-CNN is simple to train and adds only a small overhead to Faster R-CNN, running at 5 fps. Moreover, Mask R-CNN is easy to generalize to other tasks, e.g., allowing us to estimate human poses in the same framework. We show top results in all three tracks of the COCO suite of challenges, including instance segmentation, bounding-box object detection, and person keypoint detection. Without bells and whistles, Mask R-CNN outperforms all existing, single-model entries on every task, including the COCO 2016 challenge winners. We hope our simple and effective approach will serve as a solid baseline and help ease future research in instance-level recognition. + +
+ +
+ +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------: | :--------: | +| R-50-FPN | caffe | 1x | 4.3 | | 38.0 | 34.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco/mask_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.38__segm_mAP-0.344_20200504_231812-0ebd1859.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco/mask_rcnn_r50_caffe_fpn_1x_coco_20200504_231812.log.json) | +| R-50-FPN | pytorch | 1x | 4.4 | 16.1 | 38.2 | 34.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205_050542.log.json) | +| R-50-FPN (FP16) | pytorch | 1x | 3.6 | 24.1 | 38.1 | 34.7 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r50_fpn_fp16_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_1x_coco/mask_rcnn_r50_fpn_fp16_1x_coco_20200205-59faf7e4.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_1x_coco/mask_rcnn_r50_fpn_fp16_1x_coco_20200205_130539.log.json) | +| R-50-FPN | pytorch | 2x | - | - | 39.2 | 35.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r50_fpn_2x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_2x_coco/mask_rcnn_r50_fpn_2x_coco_bbox_mAP-0.392__segm_mAP-0.354_20200505_003907-3e542a40.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_2x_coco/mask_rcnn_r50_fpn_2x_coco_20200505_003907.log.json) | +| R-101-FPN | caffe | 1x | | | 40.4 | 36.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco/mask_rcnn_r101_caffe_fpn_1x_coco_20200601_095758-805e06c1.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco/mask_rcnn_r101_caffe_fpn_1x_coco_20200601_095758.log.json)| +| R-101-FPN | pytorch | 1x | 6.4 | 13.5 | 40.0 | 36.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_1x_coco/mask_rcnn_r101_fpn_1x_coco_20200204-1efe0ed5.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_1x_coco/mask_rcnn_r101_fpn_1x_coco_20200204_144809.log.json) | +| R-101-FPN | pytorch | 2x | - | - | 40.8 | 36.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r101_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_2x_coco/mask_rcnn_r101_fpn_2x_coco_bbox_mAP-0.408__segm_mAP-0.366_20200505_071027-14b391c7.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_2x_coco/mask_rcnn_r101_fpn_2x_coco_20200505_071027.log.json) | +| X-101-32x4d-FPN | pytorch | 1x | 7.6 | 11.3 | 41.9 | 37.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco/mask_rcnn_x101_32x4d_fpn_1x_coco_20200205-478d0b67.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco/mask_rcnn_x101_32x4d_fpn_1x_coco_20200205_034906.log.json) | +| X-101-32x4d-FPN | pytorch | 2x | - | - | 42.2 | 37.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco/mask_rcnn_x101_32x4d_fpn_2x_coco_bbox_mAP-0.422__segm_mAP-0.378_20200506_004702-faef898c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco/mask_rcnn_x101_32x4d_fpn_2x_coco_20200506_004702.log.json) | +| X-101-64x4d-FPN | pytorch | 1x | 10.7 | 8.0 | 42.8 | 38.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco/mask_rcnn_x101_64x4d_fpn_1x_coco_20200201-9352eb0d.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco/mask_rcnn_x101_64x4d_fpn_1x_coco_20200201_124310.log.json) | +| X-101-64x4d-FPN | pytorch | 2x | - | - | 42.7 | 38.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco/mask_rcnn_x101_64x4d_fpn_2x_coco_20200509_224208-39d6f70c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco/mask_rcnn_x101_64x4d_fpn_2x_coco_20200509_224208.log.json)| +| X-101-32x8d-FPN | pytorch | 1x | - | - | 42.8 | 38.3 | | + +## Pre-trained Models + +We also train some models with longer schedules and multi-scale 
training. Users can fine-tune them for downstream tasks. + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------: | :--------: | +| [R-50-FPN](./mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco.py) | caffe | 2x | 4.3 | | 40.3 | 36.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco_bbox_mAP-0.403__segm_mAP-0.365_20200504_231822-a75c98ce.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco_20200504_231822.log.json) +| [R-50-FPN](./mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco.py) | caffe | 3x | 4.3 | | 40.8 | 37.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_20200504_163245.log.json) +| [R-50-FPN](./mask_rcnn_r50_fpn_mstrain-poly_3x_coco.py) | pytorch| 3x | 4.1 | | 40.9 | 37.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_fpn_mstrain-poly_3x_coco_20210524_201154-21b550bb.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_fpn_mstrain-poly_3x_coco_20210524_201154.log.json) +| [R-101-FPN](./mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco.py) | caffe | 3x | 5.9 | | 42.9 | 38.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco_20210526_132339-3c33ce02.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco_20210526_132339.log.json) +| [R-101-FPN](./mask_rcnn_r101_fpn_mstrain-poly_3x_coco.py) | pytorch| 3x | 6.1 | | 42.7 | 38.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r101_fpn_mstrain-poly_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_mstrain-poly_3x_coco/mask_rcnn_r101_fpn_mstrain-poly_3x_coco_20210524_200244-5675c317.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_mstrain-poly_3x_coco/mask_rcnn_r101_fpn_mstrain-poly_3x_coco_20210524_200244.log.json) +| [X-101-32x4d-FPN](./mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco.py) | pytorch| 3x | 7.3 | | 43.6 | 39.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco_20210524_201410-abcd7859.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco_20210524_201410.log.json) +| 
[X-101-32x8d-FPN](./mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco.py) | pytorch | 1x | - | | 43.6 | 39.0 | +| [X-101-32x8d-FPN](./mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco.py) | pytorch | 3x | 10.3 | | 44.3 | 39.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco_20210607_161042-8bd2c639.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco_20210607_161042.log.json) +| [X-101-64x4d-FPN](./mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco.py) | pytorch | 3x | 10.4 | | 44.5 | 39.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco_20210526_120447-c376f129.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco_20210526_120447.log.json) + +## Citation + +```latex +@article{He_2017, + title={Mask R-CNN}, + journal={2017 IEEE International Conference on Computer Vision (ICCV)}, + publisher={IEEE}, + author={He, Kaiming and Gkioxari, Georgia and Dollar, Piotr and Girshick, Ross}, + year={2017}, + month={Oct} +} +``` diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..95b324f5 --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './mask_rcnn_r50_caffe_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + 
init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco.py new file mode 100644 index 00000000..e39781dc --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco.py @@ -0,0 +1,55 @@ +_base_ = [ + '../common/mstrain-poly_3x_coco_instance.py', + '../_base_/models/mask_rcnn_r50_fpn.py' +] + +model = dict( + backbone=dict( + depth=101, + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +data = dict( + train=dict(dataset=dict(pipeline=train_pipeline)), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py new file mode 
100644 index 00000000..b7986e85 --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_r101_fpn_2x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_r101_fpn_2x_coco.py new file mode 100644 index 00000000..c9059d53 --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_r101_fpn_2x_coco.py @@ -0,0 +1,6 @@ +_base_ = './mask_rcnn_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_r101_fpn_mstrain-poly_3x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_r101_fpn_mstrain-poly_3x_coco.py new file mode 100644 index 00000000..0696cbe7 --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_r101_fpn_mstrain-poly_3x_coco.py @@ -0,0 +1,10 @@ +_base_ = [ + '../common/mstrain-poly_3x_coco_instance.py', + '../_base_/models/mask_rcnn_r50_fpn.py' +] + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_r50_caffe_c4_1x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_r50_caffe_c4_1x_coco.py new file mode 100644 index 00000000..a44c0183 --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_r50_caffe_c4_1x_coco.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_caffe_c4.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + 
dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..5a23f8c7 --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco.py @@ -0,0 +1,40 @@ +_base_ = './mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(requires_grad=False), + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + 
transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco.py new file mode 100644 index 00000000..6308e404 --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco.py @@ -0,0 +1,49 @@ +_base_ = './mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(requires_grad=False), + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + 
train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco.py new file mode 100644 index 00000000..4f7150ca --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco.py @@ -0,0 +1,4 @@ +_base_ = './mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco.py' +# learning policy +lr_config = dict(step=[16, 23]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco.py new file mode 100644 index 00000000..1b48a210 --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco.py @@ -0,0 +1,4 @@ +_base_ = './mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco.py' +# learning policy +lr_config = dict(step=[28, 34]) +runner = dict(type='EpochBasedRunner', max_epochs=36) diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain_1x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain_1x_coco.py new file mode 100644 index 00000000..bebbaaab --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain_1x_coco.py @@ -0,0 +1,45 @@ +_base_ = './mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(requires_grad=False), + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + 
dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py b/configs/mmdet/mask_rcnn/mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py new file mode 100644 index 00000000..3f8079d3 --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py @@ -0,0 +1,61 @@ +_base_ = './mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(requires_grad=False), + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + rpn_head=dict( + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + bbox_roi_extractor=dict( + roi_layer=dict( + type='RoIAlign', + output_size=7, + sampling_ratio=2, + aligned=False)), + bbox_head=dict( + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + mask_roi_extractor=dict( + roi_layer=dict( + type='RoIAlign', + output_size=14, + sampling_ratio=2, + aligned=False)))) +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict(type='Resize', img_scale=(1333, 
800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 00000000..6a6c9246 --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_r50_fpn_2x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_r50_fpn_2x_coco.py new file mode 100644 index 00000000..932b1f90 --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_r50_fpn_2x_coco.py @@ -0,0 +1,5 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_r50_fpn_fp16_1x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_r50_fpn_fp16_1x_coco.py new file mode 100644 index 00000000..fb8289b0 --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_r50_fpn_fp16_1x_coco.py @@ -0,0 +1,3 @@ +_base_ = './mask_rcnn_r50_fpn_1x_coco.py' +# fp16 settings +fp16 = dict(loss_scale=512.) 
diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco.py new file mode 100644 index 00000000..b3d9242c --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco.py @@ -0,0 +1,4 @@ +_base_ = [ + '../common/mstrain-poly_3x_coco_instance.py', + '../_base_/models/mask_rcnn_r50_fpn.py' +] diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_r50_fpn_poly_1x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_r50_fpn_poly_1x_coco.py new file mode 100644 index 00000000..9eb6d57e --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_r50_fpn_poly_1x_coco.py @@ -0,0 +1,23 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +data = dict(train=dict(pipeline=train_pipeline)) diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py new file mode 100644 index 00000000..a8b3799b --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './mask_rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + 
type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco.py new file mode 100644 index 00000000..2cd3cee5 --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco.py @@ -0,0 +1,14 @@ +_base_ = './mask_rcnn_r101_fpn_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco.py new file mode 100644 index 00000000..b698a7d2 --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco.py @@ -0,0 +1,18 @@ +_base_ = [ + '../common/mstrain-poly_3x_coco_instance.py', + '../_base_/models/mask_rcnn_r50_fpn.py' +] + +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco.py new file mode 100644 index 00000000..108ea4e3 --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco.py @@ -0,0 +1,65 @@ +_base_ = './mask_rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=8, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + style='pytorch', + init_cfg=dict( + 
type='Pretrained', + checkpoint='open-mmlab://detectron2/resnext101_32x8d'))) + +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], + std=[57.375, 57.120, 58.395], + to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco.py new file mode 100644 index 00000000..6b912f69 --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco.py @@ -0,0 +1,60 @@ +_base_ = './mask_rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + 
depth=101, + groups=32, + base_width=8, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + style='pytorch', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnext101_32x8d'))) + +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], + std=[57.375, 57.120, 58.395], + to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco.py new file mode 100644 index 00000000..8ba0e9c2 --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco.py @@ -0,0 +1,85 @@ +_base_ = [ + '../common/mstrain-poly_3x_coco_instance.py', + '../_base_/models/mask_rcnn_r50_fpn.py' +] + +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=8, 
+ num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + style='pytorch', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnext101_32x8d'))) + +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], + std=[57.375, 57.120, 58.395], + to_rgb=False) + +# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)], +# multiscale_mode='range' +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +# Use RepeatDataset to speed up training +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type='RepeatDataset', + times=3, + dataset=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline)), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) diff 
--git a/configs/mmdet/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco.py new file mode 100644 index 00000000..2333b03a --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './mask_rcnn_x101_32x4d_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco.py new file mode 100644 index 00000000..6074cca2 --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco.py @@ -0,0 +1,14 @@ +_base_ = './mask_rcnn_x101_32x4d_fpn_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/mask_rcnn/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco.py b/configs/mmdet/mask_rcnn/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco.py new file mode 100644 index 00000000..9f9cb1c4 --- /dev/null +++ b/configs/mmdet/mask_rcnn/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco.py @@ -0,0 +1,18 @@ +_base_ = [ + '../common/mstrain-poly_3x_coco_instance.py', + '../_base_/models/mask_rcnn_r50_fpn.py' +] + +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff 
--git a/configs/mmdet/mask_rcnn/metafile.yml b/configs/mmdet/mask_rcnn/metafile.yml new file mode 100644 index 00000000..f74bdf30 --- /dev/null +++ b/configs/mmdet/mask_rcnn/metafile.yml @@ -0,0 +1,447 @@ +Collections: + - Name: Mask R-CNN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Softmax + - RPN + - Convolution + - Dense Connections + - FPN + - ResNet + - RoIAlign + Paper: + URL: https://arxiv.org/abs/1703.06870v3 + Title: "Mask R-CNN" + README: configs/mask_rcnn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/mask_rcnn.py#L6 + Version: v2.0.0 + +Models: + - Name: mask_rcnn_r50_caffe_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.3 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 34.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco/mask_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.38__segm_mAP-0.344_20200504_231812-0ebd1859.pth + + - Name: mask_rcnn_r50_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.4 + inference time (ms/im): + - value: 62.11 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 34.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth + + - Name: mask_rcnn_r50_fpn_fp16_1x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask_rcnn_r50_fpn_fp16_1x_coco.py 
+ Metadata: + Training Memory (GB): 3.6 + Training Techniques: + - SGD with Momentum + - Weight Decay + - Mixed Precision Training + inference time (ms/im): + - value: 41.49 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP16 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 34.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_1x_coco/mask_rcnn_r50_fpn_fp16_1x_coco_20200205-59faf7e4.pth + + - Name: mask_rcnn_r50_fpn_2x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask_rcnn_r50_fpn_2x_coco.py + Metadata: + Training Memory (GB): 4.4 + inference time (ms/im): + - value: 62.11 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 35.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_2x_coco/mask_rcnn_r50_fpn_2x_coco_bbox_mAP-0.392__segm_mAP-0.354_20200505_003907-3e542a40.pth + + - Name: mask_rcnn_r101_caffe_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco/mask_rcnn_r101_caffe_fpn_1x_coco_20200601_095758-805e06c1.pth + + - Name: mask_rcnn_r101_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.4 + inference time (ms/im): + - value: 74.07 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: 
(800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_1x_coco/mask_rcnn_r101_fpn_1x_coco_20200204-1efe0ed5.pth + + - Name: mask_rcnn_r101_fpn_2x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask_rcnn_r101_fpn_2x_coco.py + Metadata: + Training Memory (GB): 6.4 + inference time (ms/im): + - value: 74.07 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_2x_coco/mask_rcnn_r101_fpn_2x_coco_bbox_mAP-0.408__segm_mAP-0.366_20200505_071027-14b391c7.pth + + - Name: mask_rcnn_x101_32x4d_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.6 + inference time (ms/im): + - value: 88.5 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco/mask_rcnn_x101_32x4d_fpn_1x_coco_20200205-478d0b67.pth + + - Name: mask_rcnn_x101_32x4d_fpn_2x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco.py + Metadata: + Training Memory (GB): 7.6 + inference time (ms/im): + - value: 88.5 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + 
box AP: 42.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco/mask_rcnn_x101_32x4d_fpn_2x_coco_bbox_mAP-0.422__segm_mAP-0.378_20200506_004702-faef898c.pth + + - Name: mask_rcnn_x101_64x4d_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 10.7 + inference time (ms/im): + - value: 125 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco/mask_rcnn_x101_64x4d_fpn_1x_coco_20200201-9352eb0d.pth + + - Name: mask_rcnn_x101_64x4d_fpn_2x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco.py + Metadata: + Training Memory (GB): 10.7 + inference time (ms/im): + - value: 125 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco/mask_rcnn_x101_64x4d_fpn_2x_coco_20200509_224208-39d6f70c.pth + + - Name: mask_rcnn_x101_32x8d_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 10.7 + inference time (ms/im): + - value: 125 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.8 + - Task: Instance Segmentation + 
Dataset: COCO + Metrics: + mask AP: 38.3 + + - Name: mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco.py + Metadata: + Training Memory (GB): 4.3 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco_bbox_mAP-0.403__segm_mAP-0.365_20200504_231822-a75c98ce.pth + + - Name: mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco.py + Metadata: + Training Memory (GB): 4.3 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth + + - Name: mask_rcnn_r50_fpn_mstrain-poly_3x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco.py + Metadata: + Training Memory (GB): 4.1 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_fpn_mstrain-poly_3x_coco_20210524_201154-21b550bb.pth + + - Name: mask_rcnn_r101_fpn_mstrain-poly_3x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask_rcnn_r101_fpn_mstrain-poly_3x_coco.py + Metadata: + Training Memory (GB): 6.1 + Epochs: 36 + Results: + - Task: Object Detection + 
Dataset: COCO + Metrics: + box AP: 42.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_mstrain-poly_3x_coco/mask_rcnn_r101_fpn_mstrain-poly_3x_coco_20210524_200244-5675c317.pth + + - Name: mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco.py + Metadata: + Training Memory (GB): 5.9 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco_20210526_132339-3c33ce02.pth + + - Name: mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco.py + Metadata: + Training Memory (GB): 7.3 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco_20210524_201410-abcd7859.pth + + - Name: mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.0 + + - Name: mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco.py + Metadata: + Training Memory (GB): 10.3 + Epochs: 36 + Results: + - Task: Object 
Detection + Dataset: COCO + Metrics: + box AP: 44.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco_20210607_161042-8bd2c639.pth + + - Name: mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco.py + Metadata: + Epochs: 36 + Training Memory (GB): 10.4 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco_20210526_120447-c376f129.pth diff --git a/configs/mmdet/maskformer/README.md b/configs/mmdet/maskformer/README.md new file mode 100644 index 00000000..fa394f90 --- /dev/null +++ b/configs/mmdet/maskformer/README.md @@ -0,0 +1,52 @@ +# MaskFormer + +> [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) + + + +## Abstract + +Modern approaches typically formulate semantic segmentation as a per-pixel classification task, while instance-level segmentation is handled with an alternative mask classification. Our key insight: mask classification is sufficiently general to solve both semantic- and instance-level segmentation tasks in a unified manner using the exact same model, loss, and training procedure. Following this observation, we propose MaskFormer, a simple mask classification model which predicts a set of binary masks, each associated with a single global class label prediction. Overall, the proposed mask classification-based method simplifies the landscape of effective approaches to semantic and panoptic segmentation tasks and shows excellent empirical results. 
In particular, we observe that MaskFormer outperforms per-pixel classification baselines when the number of classes is large. Our mask classification-based method outperforms both current state-of-the-art semantic (55.6 mIoU on ADE20K) and panoptic segmentation (52.7 PQ on COCO) models. + +
+ +
+ +## Introduction + +MaskFormer requires COCO and [COCO-panoptic](http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip) dataset for training and evaluation. You need to download and extract it in the COCO dataset path. +The directory should be like this. + +```none +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── panoptic_train2017.json +│ │ │ ├── panoptic_train2017 +│ │ │ ├── panoptic_val2017.json +│ │ │ ├── panoptic_val2017 +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +``` + +## Results and Models + +| Backbone | style | Lr schd | Mem (GB) | Inf time (fps) | PQ | SQ | RQ | PQ_th | SQ_th | RQ_th | PQ_st | SQ_st | RQ_st | Config | Download | detail | +|:--------:|:-------:|:-------:|:--------:|:--------------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------:| +| R-50 | pytorch | 75e | 16.2 | - | 46.854 | 80.617 | 57.085 | 51.089 | 81.511 | 61.853 | 40.463 | 79.269 | 49.888 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956-bc2699cb.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956.log.json) | This version was mentioned in Table XI, in paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) | +| Swin-L | pytorch | 300e | 27.2 | - | 53.249 | 81.704 | 64.231 | 58.798 | 82.923 | 70.282 | 44.874 | 79.863 | 55.097 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/maskformer/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco_20220326_221612-061b4eb8.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco_20220326_221612.log.json) | - | +## Citation + +```latex +@inproceedings{cheng2021maskformer, + title={Per-Pixel Classification is Not All You Need for Semantic Segmentation}, + author={Bowen Cheng and Alexander G. 
Schwing and Alexander Kirillov}, + booktitle={NeurIPS}, + year={2021} +} +``` diff --git a/configs/mmdet/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py b/configs/mmdet/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py new file mode 100644 index 00000000..46b3c135 --- /dev/null +++ b/configs/mmdet/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py @@ -0,0 +1,238 @@ +_base_ = [ + '../_base_/datasets/coco_panoptic.py', '../_base_/default_runtime.py' +] +num_things_classes = 80 +num_stuff_classes = 53 +num_classes = num_things_classes + num_stuff_classes +model = dict( + type='MaskFormer', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + panoptic_head=dict( + type='MaskFormerHead', + in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside + feat_channels=256, + out_channels=256, + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + num_queries=100, + pixel_decoder=dict( + type='TransformerEncoderPixelDecoder', + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.1, + proj_drop=0.1, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.1, + dropout_layer=None, + add_identity=True), + operation_order=('self_attn', 'norm', 'ffn', 'norm'), + norm_cfg=dict(type='LN'), + init_cfg=None, + batch_first=False), + init_cfg=None), + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True)), + enforce_decoder_input_project=False, + 
positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + transformer_decoder=dict( + type='DetrTransformerDecoder', + return_intermediate=True, + num_layers=6, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.1, + proj_drop=0.1, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.1, + dropout_layer=None, + add_identity=True), + # the following parameter was not used, + # just make current api happy + feedforward_channels=2048, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')), + init_cfg=None), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + reduction='mean', + loss_weight=20.0), + loss_dice=dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=1.0)), + panoptic_fusion_head=dict( + type='MaskFormerFusionHead', + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + loss_panoptic=None, + init_cfg=None), + train_cfg=dict( + assigner=dict( + type='MaskHungarianAssigner', + cls_cost=dict(type='ClassificationCost', weight=1.0), + mask_cost=dict( + type='FocalLossCost', weight=20.0, binary_input=True), + dice_cost=dict( + type='DiceCost', weight=1.0, pred_act=True, eps=1.0)), + sampler=dict(type='MaskPseudoSampler')), + test_cfg=dict( + panoptic_on=True, + # For now, the dataset does not support + # evaluating semantic segmentation metric. + semantic_on=False, + instance_on=False, + # max_per_image is for instance segmentation. 
+ max_per_image=100, + object_mask_thr=0.8, + iou_thr=0.8, + # In MaskFormer's panoptic postprocessing, + # it will not filter masks whose score is smaller than 0.5 . + filter_low_score=False), + init_cfg=None) + +# dataset settings +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadPanopticAnnotations', + with_bbox=True, + with_mask=True, + with_seg=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='AutoAugment', + policies=[[ + dict( + type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + multiscale_mode='value', + keep_ratio=True) + ], + [ + dict( + type='Resize', + img_scale=[(400, 1333), (500, 1333), (600, 1333)], + multiscale_mode='value', + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + multiscale_mode='value', + override=True, + keep_ratio=True) + ]]), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=1), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=1), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=1, + workers_per_gpu=1, + train=dict(pipeline=train_pipeline), + 
val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) + +# optimizer +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.0001, + eps=1e-8, + betas=(0.9, 0.999), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'query_embed': dict(lr_mult=1.0, decay_mult=0.0) + }, + norm_decay_mult=0.0)) +optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2)) + +# learning policy +lr_config = dict( + policy='step', + gamma=0.1, + by_epoch=True, + step=[50], + warmup='linear', + warmup_by_epoch=False, + warmup_ratio=1.0, # no warmup + warmup_iters=10) +runner = dict(type='EpochBasedRunner', max_epochs=75) diff --git a/configs/mmdet/maskformer/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco.py b/configs/mmdet/maskformer/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco.py new file mode 100644 index 00000000..bc23c54d --- /dev/null +++ b/configs/mmdet/maskformer/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco.py @@ -0,0 +1,67 @@ +_base_ = './maskformer_r50_mstrain_16x1_75e_coco.py' + +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + _delete_=True, + type='SwinTransformer', + pretrain_img_size=384, + embed_dims=192, + patch_size=4, + window_size=12, + mlp_ratio=4, + depths=depths, + num_heads=[6, 12, 24, 48], + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + panoptic_head=dict( + in_channels=[192, 384, 768, 1536], # pass to pixel_decoder inside + pixel_decoder=dict( + _delete_=True, + type='PixelDecoder', + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU')), + enforce_decoder_input_project=True)) + +# weight_decay = 0.01 +# norm_weight_decay = 0.0 +# 
embed_weight_decay = 0.0 +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +norm_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'norm': norm_multi, + 'absolute_pos_embed': embed_multi, + 'relative_position_bias_table': embed_multi, + 'query_embed': embed_multi +} + +# optimizer +optimizer = dict( + type='AdamW', + lr=6e-5, + weight_decay=0.01, + eps=1e-8, + betas=(0.9, 0.999), + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) +optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2)) + +# learning policy +lr_config = dict( + policy='step', + gamma=0.1, + by_epoch=True, + step=[250], + warmup='linear', + warmup_by_epoch=False, + warmup_ratio=1e-6, + warmup_iters=1500) +runner = dict(type='EpochBasedRunner', max_epochs=300) diff --git a/configs/mmdet/maskformer/metafile.yml b/configs/mmdet/maskformer/metafile.yml new file mode 100644 index 00000000..6530fa14 --- /dev/null +++ b/configs/mmdet/maskformer/metafile.yml @@ -0,0 +1,43 @@ +Collections: + - Name: MaskFormer + Metadata: + Training Data: COCO + Training Techniques: + - AdamW + - Weight Decay + Training Resources: 16x V100 GPUs + Architecture: + - MaskFormer + Paper: + URL: https://arxiv.org/pdf/2107.06278 + Title: 'Per-Pixel Classification is Not All You Need for Semantic Segmentation' + README: configs/maskformer/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/mmdet/models/detectors/maskformer.py#L7 + Version: v2.22.0 + +Models: + - Name: maskformer_r50_mstrain_16x1_75e_coco + In Collection: MaskFormer + Config: configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py + Metadata: + Training Memory (GB): 16.2 + Epochs: 75 + Results: + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 46.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956-bc2699cb.pth + - Name: maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco + In 
Collection: MaskFormer + Config: configs/maskformer/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco.py + Metadata: + Training Memory (GB): 27.2 + Epochs: 300 + Results: + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 53.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco_20220326_221612-061b4eb8.pth diff --git a/configs/mmdet/ms_rcnn/README.md b/configs/mmdet/ms_rcnn/README.md new file mode 100644 index 00000000..44508c06 --- /dev/null +++ b/configs/mmdet/ms_rcnn/README.md @@ -0,0 +1,36 @@ +# MS R-CNN + +> [Mask Scoring R-CNN](https://arxiv.org/abs/1903.00241) + + + +## Abstract + +Letting a deep network be aware of the quality of its own predictions is an interesting yet important problem. In the task of instance segmentation, the confidence of instance classification is used as mask quality score in most instance segmentation frameworks. However, the mask quality, quantified as the IoU between the instance mask and its ground truth, is usually not well correlated with classification score. In this paper, we study this problem and propose Mask Scoring R-CNN which contains a network block to learn the quality of the predicted instance masks. The proposed network block takes the instance feature and the corresponding predicted mask together to regress the mask IoU. The mask scoring strategy calibrates the misalignment between mask quality and mask score, and improves instance segmentation performance by prioritizing more accurate mask predictions during COCO AP evaluation. By extensive evaluations on the COCO dataset, Mask Scoring R-CNN brings consistent and noticeable gain with different models, and outperforms the state-of-the-art Mask R-CNN. We hope our simple and effective approach will provide a new direction for improving instance segmentation. + +
+ +
+ +## Results and Models + +| Backbone | style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +|:-------------:|:----------:|:-------:|:--------:|:--------------:|:------:|:-------:|:------:|:--------:| +| R-50-FPN | caffe | 1x | 4.5 | | 38.2 | 36.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco/ms_rcnn_r50_caffe_fpn_1x_coco_20200702_180848-61c9355e.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco/ms_rcnn_r50_caffe_fpn_1x_coco_20200702_180848.log.json) | +| R-50-FPN | caffe | 2x | - | - | 38.8 | 36.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ms_rcnn/ms_rcnn_r50_caffe_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r50_caffe_fpn_2x_coco/ms_rcnn_r50_caffe_fpn_2x_coco_bbox_mAP-0.388__segm_mAP-0.363_20200506_004738-ee87b137.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r50_caffe_fpn_2x_coco/ms_rcnn_r50_caffe_fpn_2x_coco_20200506_004738.log.json) | +| R-101-FPN | caffe | 1x | 6.5 | | 40.4 | 37.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ms_rcnn/ms_rcnn_r101_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r101_caffe_fpn_1x_coco/ms_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.404__segm_mAP-0.376_20200506_004755-b9b12a37.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r101_caffe_fpn_1x_coco/ms_rcnn_r101_caffe_fpn_1x_coco_20200506_004755.log.json) | +| R-101-FPN | caffe | 2x | - | - | 41.1 | 38.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ms_rcnn/ms_rcnn_r101_caffe_fpn_2x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r101_caffe_fpn_2x_coco/ms_rcnn_r101_caffe_fpn_2x_coco_bbox_mAP-0.411__segm_mAP-0.381_20200506_011134-5f3cc74f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r101_caffe_fpn_2x_coco/ms_rcnn_r101_caffe_fpn_2x_coco_20200506_011134.log.json) | +| R-X101-32x4d | pytorch | 1x | 7.9 | 11.0 | 41.8 | 38.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ms_rcnn/ms_rcnn_x101_32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_32x4d_fpn_1x_coco/ms_rcnn_x101_32x4d_fpn_1x_coco_20200206-81fd1740.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_32x4d_fpn_1x_coco/ms_rcnn_x101_32x4d_fpn_1x_coco_20200206_100113.log.json) | +| R-X101-64x4d | pytorch | 1x | 11.0 | 8.0 | 43.0 | 39.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ms_rcnn/ms_rcnn_x101_64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_64x4d_fpn_1x_coco/ms_rcnn_x101_64x4d_fpn_1x_coco_20200206-86ba88d2.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_64x4d_fpn_1x_coco/ms_rcnn_x101_64x4d_fpn_1x_coco_20200206_091744.log.json) | +| R-X101-64x4d | pytorch | 2x | 11.0 | 8.0 | 42.6 | 39.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ms_rcnn/ms_rcnn_x101_64x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_64x4d_fpn_2x_coco/ms_rcnn_x101_64x4d_fpn_2x_coco_20200308-02a445e2.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_64x4d_fpn_2x_coco/ms_rcnn_x101_64x4d_fpn_2x_coco_20200308_012247.log.json) | + +## Citation + +```latex +@inproceedings{huang2019msrcnn, + title={Mask Scoring R-CNN}, + author={Zhaojin Huang and Lichao Huang and Yongchao Gong and Chang Huang and Xinggang Wang}, + booktitle={IEEE Conference on 
Computer Vision and Pattern Recognition}, + year={2019}, +} +``` diff --git a/configs/mmdet/ms_rcnn/metafile.yml b/configs/mmdet/ms_rcnn/metafile.yml new file mode 100644 index 00000000..a6c7dc59 --- /dev/null +++ b/configs/mmdet/ms_rcnn/metafile.yml @@ -0,0 +1,159 @@ +Collections: + - Name: Mask Scoring R-CNN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RPN + - FPN + - ResNet + - RoIAlign + Paper: + URL: https://arxiv.org/abs/1903.00241 + Title: 'Mask Scoring R-CNN' + README: configs/ms_rcnn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/mask_scoring_rcnn.py#L6 + Version: v2.0.0 + +Models: + - Name: ms_rcnn_r50_caffe_fpn_1x_coco + In Collection: Mask Scoring R-CNN + Config: configs/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.5 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco/ms_rcnn_r50_caffe_fpn_1x_coco_20200702_180848-61c9355e.pth + + - Name: ms_rcnn_r50_caffe_fpn_2x_coco + In Collection: Mask Scoring R-CNN + Config: configs/ms_rcnn/ms_rcnn_r50_caffe_fpn_2x_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r50_caffe_fpn_2x_coco/ms_rcnn_r50_caffe_fpn_2x_coco_bbox_mAP-0.388__segm_mAP-0.363_20200506_004738-ee87b137.pth + + - Name: ms_rcnn_r101_caffe_fpn_1x_coco + In Collection: Mask Scoring R-CNN + Config: configs/ms_rcnn/ms_rcnn_r101_caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.5 + Epochs: 12 + Results: + - Task: Object Detection + 
Dataset: COCO + Metrics: + box AP: 40.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r101_caffe_fpn_1x_coco/ms_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.404__segm_mAP-0.376_20200506_004755-b9b12a37.pth + + - Name: ms_rcnn_r101_caffe_fpn_2x_coco + In Collection: Mask Scoring R-CNN + Config: configs/ms_rcnn/ms_rcnn_r101_caffe_fpn_2x_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r101_caffe_fpn_2x_coco/ms_rcnn_r101_caffe_fpn_2x_coco_bbox_mAP-0.411__segm_mAP-0.381_20200506_011134-5f3cc74f.pth + + - Name: ms_rcnn_x101_32x4d_fpn_1x_coco + In Collection: Mask Scoring R-CNN + Config: configs/ms_rcnn/ms_rcnn_x101_32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.9 + inference time (ms/im): + - value: 90.91 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_32x4d_fpn_1x_coco/ms_rcnn_x101_32x4d_fpn_1x_coco_20200206-81fd1740.pth + + - Name: ms_rcnn_x101_64x4d_fpn_1x_coco + In Collection: Mask Scoring R-CNN + Config: configs/ms_rcnn/ms_rcnn_x101_64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 11.0 + inference time (ms/im): + - value: 125 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.5 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_64x4d_fpn_1x_coco/ms_rcnn_x101_64x4d_fpn_1x_coco_20200206-86ba88d2.pth + + - Name: ms_rcnn_x101_64x4d_fpn_2x_coco + In Collection: Mask Scoring R-CNN + Config: configs/ms_rcnn/ms_rcnn_x101_64x4d_fpn_2x_coco.py + Metadata: + Training Memory (GB): 11.0 + inference time (ms/im): + - value: 125 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_64x4d_fpn_2x_coco/ms_rcnn_x101_64x4d_fpn_2x_coco_20200308-02a445e2.pth diff --git a/configs/mmdet/ms_rcnn/ms_rcnn_r101_caffe_fpn_1x_coco.py b/configs/mmdet/ms_rcnn/ms_rcnn_r101_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..9b7dcbbf --- /dev/null +++ b/configs/mmdet/ms_rcnn/ms_rcnn_r101_caffe_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './ms_rcnn_r50_caffe_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/configs/mmdet/ms_rcnn/ms_rcnn_r101_caffe_fpn_2x_coco.py b/configs/mmdet/ms_rcnn/ms_rcnn_r101_caffe_fpn_2x_coco.py new file mode 100644 index 00000000..202bcced --- /dev/null +++ b/configs/mmdet/ms_rcnn/ms_rcnn_r101_caffe_fpn_2x_coco.py @@ -0,0 +1,4 @@ +_base_ = './ms_rcnn_r101_caffe_fpn_1x_coco.py' +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco.py b/configs/mmdet/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..5845125a --- /dev/null +++ b/configs/mmdet/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco.py @@ -0,0 +1,16 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco.py' +model = dict( + 
type='MaskScoringRCNN', + roi_head=dict( + type='MaskScoringRoIHead', + mask_iou_head=dict( + type='MaskIoUHead', + num_convs=4, + num_fcs=2, + roi_feat_size=14, + in_channels=256, + conv_out_channels=256, + fc_out_channels=1024, + num_classes=80)), + # model training and testing settings + train_cfg=dict(rcnn=dict(mask_thr_binary=0.5))) diff --git a/configs/mmdet/ms_rcnn/ms_rcnn_r50_caffe_fpn_2x_coco.py b/configs/mmdet/ms_rcnn/ms_rcnn_r50_caffe_fpn_2x_coco.py new file mode 100644 index 00000000..008a70ae --- /dev/null +++ b/configs/mmdet/ms_rcnn/ms_rcnn_r50_caffe_fpn_2x_coco.py @@ -0,0 +1,4 @@ +_base_ = './ms_rcnn_r50_caffe_fpn_1x_coco.py' +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/ms_rcnn/ms_rcnn_r50_fpn_1x_coco.py b/configs/mmdet/ms_rcnn/ms_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 00000000..0a163ce4 --- /dev/null +++ b/configs/mmdet/ms_rcnn/ms_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,16 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' +model = dict( + type='MaskScoringRCNN', + roi_head=dict( + type='MaskScoringRoIHead', + mask_iou_head=dict( + type='MaskIoUHead', + num_convs=4, + num_fcs=2, + roi_feat_size=14, + in_channels=256, + conv_out_channels=256, + fc_out_channels=1024, + num_classes=80)), + # model training and testing settings + train_cfg=dict(rcnn=dict(mask_thr_binary=0.5))) diff --git a/configs/mmdet/ms_rcnn/ms_rcnn_x101_32x4d_fpn_1x_coco.py b/configs/mmdet/ms_rcnn/ms_rcnn_x101_32x4d_fpn_1x_coco.py new file mode 100644 index 00000000..20479bbd --- /dev/null +++ b/configs/mmdet/ms_rcnn/ms_rcnn_x101_32x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './ms_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', 
checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/ms_rcnn/ms_rcnn_x101_64x4d_fpn_1x_coco.py b/configs/mmdet/ms_rcnn/ms_rcnn_x101_64x4d_fpn_1x_coco.py new file mode 100644 index 00000000..ee5b7341 --- /dev/null +++ b/configs/mmdet/ms_rcnn/ms_rcnn_x101_64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './ms_rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/ms_rcnn/ms_rcnn_x101_64x4d_fpn_2x_coco.py b/configs/mmdet/ms_rcnn/ms_rcnn_x101_64x4d_fpn_2x_coco.py new file mode 100644 index 00000000..54c605b9 --- /dev/null +++ b/configs/mmdet/ms_rcnn/ms_rcnn_x101_64x4d_fpn_2x_coco.py @@ -0,0 +1,4 @@ +_base_ = './ms_rcnn_x101_64x4d_fpn_1x_coco.py' +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/nas_fcos/README.md b/configs/mmdet/nas_fcos/README.md new file mode 100644 index 00000000..74453c6b --- /dev/null +++ b/configs/mmdet/nas_fcos/README.md @@ -0,0 +1,35 @@ +# NAS-FCOS + +> [NAS-FCOS: Fast Neural Architecture Search for Object Detection](https://arxiv.org/abs/1906.04423) + + + +## Abstract + +The success of deep neural networks relies on significant architecture engineering. Recently neural architecture search (NAS) has emerged as a promise to greatly reduce manual effort in network design by automatically searching for optimal architectures, although typically such algorithms need an excessive amount of computational resources, e.g., a few thousand GPU-days. To date, on challenging vision tasks such as object detection, NAS, especially fast versions of NAS, is less studied. 
Here we propose to search for the decoder structure of object detectors with search efficiency being taken into consideration. To be more specific, we aim to efficiently search for the feature pyramid network (FPN) as well as the prediction head of a simple anchor-free object detector, namely FCOS, using a tailored reinforcement learning paradigm. With carefully designed search space, search algorithms and strategies for evaluating network quality, we are able to efficiently search a top-performing detection architecture within 4 days using 8 V100 GPUs. The discovered architecture surpasses state-of-the-art object detection models (such as Faster R-CNN, RetinaNet and FCOS) by 1.5 to 3.5 points in AP on the COCO dataset, with comparable computation complexity and memory footprint, demonstrating the efficacy of the proposed NAS for object detection. + +
+ +
+ +## Results and Models + +| Head | Backbone | Style | GN-head | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +|:---------:|:---------:|:-------:|:-------:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:| +| NAS-FCOSHead | R-50 | caffe | Y | 1x | | | 39.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200520-1bdba3ce.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200520.log.json) | +| FCOSHead | R-50 | caffe | Y | 1x | | | 38.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200521-7fdcbce0.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200521.log.json) | + +**Notes:** + +- To be consistent with the author's implementation, we use 4 GPUs with 4 images/GPU. 
+ +## Citation + +```latex +@article{wang2019fcos, + title={Nas-fcos: Fast neural architecture search for object detection}, + author={Wang, Ning and Gao, Yang and Chen, Hao and Wang, Peng and Tian, Zhi and Shen, Chunhua}, + journal={arXiv preprint arXiv:1906.04423}, + year={2019} +} +``` diff --git a/configs/mmdet/nas_fcos/metafile.yml b/configs/mmdet/nas_fcos/metafile.yml new file mode 100644 index 00000000..1ea28cfc --- /dev/null +++ b/configs/mmdet/nas_fcos/metafile.yml @@ -0,0 +1,44 @@ +Collections: + - Name: NAS-FCOS + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 4x V100 GPUs + Architecture: + - FPN + - NAS-FCOS + - ResNet + Paper: + URL: https://arxiv.org/abs/1906.04423 + Title: 'NAS-FCOS: Fast Neural Architecture Search for Object Detection' + README: configs/nas_fcos/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/detectors/nasfcos.py#L6 + Version: v2.1.0 + +Models: + - Name: nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco + In Collection: NAS-FCOS + Config: configs/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200520-1bdba3ce.pth + + - Name: nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco + In Collection: NAS-FCOS + Config: configs/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200521-7fdcbce0.pth diff --git 
a/configs/mmdet/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco.py b/configs/mmdet/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco.py new file mode 100644 index 00000000..a455c928 --- /dev/null +++ b/configs/mmdet/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco.py @@ -0,0 +1,100 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + type='NASFCOS', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False, eps=0), + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + neck=dict( + type='NASFCOS_FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5, + norm_cfg=dict(type='BN'), + conv_cfg=dict(type='DCNv2', deform_groups=2)), + bbox_head=dict( + type='FCOSHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + norm_cfg=dict(type='GN', num_groups=32), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='IoULoss', loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), 
keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=2, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) + +optimizer = dict( + lr=0.01, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.)) diff --git a/configs/mmdet/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco.py b/configs/mmdet/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco.py new file mode 100644 index 00000000..b7794925 --- /dev/null +++ b/configs/mmdet/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco.py @@ -0,0 +1,99 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + type='NASFCOS', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False, eps=0), + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + neck=dict( + type='NASFCOS_FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5, + norm_cfg=dict(type='BN'), + conv_cfg=dict(type='DCNv2', deform_groups=2)), + bbox_head=dict( + type='NASFCOSHead', + num_classes=80, + in_channels=256, + feat_channels=256, + strides=[8, 16, 32, 
64, 128], + norm_cfg=dict(type='GN', num_groups=32), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='IoULoss', loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=2, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) + +optimizer = dict( + lr=0.01, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.)) diff --git a/configs/mmdet/nas_fpn/README.md b/configs/mmdet/nas_fpn/README.md new file mode 100644 index 00000000..7b39eec5 --- /dev/null +++ b/configs/mmdet/nas_fpn/README.md @@ -0,0 +1,36 @@ +# NAS-FPN + +> [NAS-FPN: Learning Scalable Feature 
Pyramid Architecture for Object Detection](https://arxiv.org/abs/1904.07392) + + + +## Abstract + +Current state-of-the-art convolutional architectures for object detection are manually designed. Here we aim to learn a better architecture of feature pyramid network for object detection. We adopt Neural Architecture Search and discover a new feature pyramid architecture in a novel scalable search space covering all cross-scale connections. The discovered architecture, named NAS-FPN, consists of a combination of top-down and bottom-up connections to fuse features across scales. NAS-FPN, combined with various backbone models in the RetinaNet framework, achieves better accuracy and latency tradeoff compared to state-of-the-art object detection models. NAS-FPN improves mobile detection accuracy by 2 AP compared to state-of-the-art SSDLite with MobileNetV2 model in [32] and achieves 48.3 AP which surpasses Mask R-CNN [10] detection accuracy with less computation time. + +
+ +
+ +## Results and Models + +We benchmark the new training schedule (crop training, large batch, unfrozen BN, 50 epochs) introduced in NAS-FPN. RetinaNet is used in the paper. + +| Backbone | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +|:-----------:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:| +| R-50-FPN | 50e | 12.9 | 22.9 | 37.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/nas_fpn/retinanet_r50_fpn_crop640_50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/nas_fpn/retinanet_r50_fpn_crop640_50e_coco/retinanet_r50_fpn_crop640_50e_coco-9b953d76.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/nas_fpn/retinanet_r50_fpn_crop640_50e_coco/retinanet_r50_fpn_crop640_50e_coco_20200529_095329.log.json) | +| R-50-NASFPN | 50e | 13.2 | 23.0 | 40.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco/retinanet_r50_nasfpn_crop640_50e_coco-0ad1f644.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco/retinanet_r50_nasfpn_crop640_50e_coco_20200528_230008.log.json) | + +**Note**: We find that it is unstable to train NAS-FPN and there is a small chance that results can be 3% mAP lower. 
+ +## Citation + +```latex +@inproceedings{ghiasi2019fpn, + title={Nas-fpn: Learning scalable feature pyramid architecture for object detection}, + author={Ghiasi, Golnaz and Lin, Tsung-Yi and Le, Quoc V}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + pages={7036--7045}, + year={2019} +} +``` diff --git a/configs/mmdet/nas_fpn/metafile.yml b/configs/mmdet/nas_fpn/metafile.yml new file mode 100644 index 00000000..ab8d6497 --- /dev/null +++ b/configs/mmdet/nas_fpn/metafile.yml @@ -0,0 +1,59 @@ +Collections: + - Name: NAS-FPN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - NAS-FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/1904.07392 + Title: 'NAS-FPN: Learning Scalable Feature Pyramid Architecture for Object Detection' + README: configs/nas_fpn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/necks/nas_fpn.py#L67 + Version: v2.0.0 + +Models: + - Name: retinanet_r50_fpn_crop640_50e_coco + In Collection: NAS-FPN + Config: configs/nas_fpn/retinanet_r50_fpn_crop640_50e_coco.py + Metadata: + Training Memory (GB): 12.9 + inference time (ms/im): + - value: 43.67 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/nas_fpn/retinanet_r50_fpn_crop640_50e_coco/retinanet_r50_fpn_crop640_50e_coco-9b953d76.pth + + - Name: retinanet_r50_nasfpn_crop640_50e_coco + In Collection: NAS-FPN + Config: configs/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco.py + Metadata: + Training Memory (GB): 13.2 + inference time (ms/im): + - value: 43.48 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: 
+ box AP: 40.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco/retinanet_r50_nasfpn_crop640_50e_coco-0ad1f644.pth diff --git a/configs/mmdet/nas_fpn/retinanet_r50_fpn_crop640_50e_coco.py b/configs/mmdet/nas_fpn/retinanet_r50_fpn_crop640_50e_coco.py new file mode 100644 index 00000000..e4408fe8 --- /dev/null +++ b/configs/mmdet/nas_fpn/retinanet_r50_fpn_crop640_50e_coco.py @@ -0,0 +1,85 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py' +] +cudnn_benchmark = True +norm_cfg = dict(type='BN', requires_grad=True) +model = dict( + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + relu_before_extra_convs=True, + no_norm_on_lateral=True, + norm_cfg=norm_cfg), + bbox_head=dict(type='RetinaSepBNHead', num_ins=5, norm_cfg=norm_cfg), + # training and testing settings + train_cfg=dict(assigner=dict(neg_iou_thr=0.5))) +# dataset settings +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=(640, 640), + ratio_range=(0.8, 1.2), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(640, 640)), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=(640, 640)), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), 
+ dict(type='Pad', size_divisor=64), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict( + type='SGD', + lr=0.08, + momentum=0.9, + weight_decay=0.0001, + paramwise_cfg=dict(norm_decay_mult=0, bypass_duplicate=True)) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.1, + step=[30, 40]) +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=50) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/mmdet/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco.py b/configs/mmdet/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco.py new file mode 100644 index 00000000..1387a10f --- /dev/null +++ b/configs/mmdet/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco.py @@ -0,0 +1,84 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py' +] +cudnn_benchmark = True +# model settings +norm_cfg = dict(type='BN', requires_grad=True) +model = dict( + type='RetinaNet', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict(type='NASFPN', stack_times=7, norm_cfg=norm_cfg), + bbox_head=dict(type='RetinaSepBNHead', num_ins=5, norm_cfg=norm_cfg), + # training and testing settings + train_cfg=dict(assigner=dict(neg_iou_thr=0.5))) +# dataset settings +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 
57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=(640, 640), + ratio_range=(0.8, 1.2), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(640, 640)), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=(640, 640)), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=128), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict( + type='SGD', + lr=0.08, + momentum=0.9, + weight_decay=0.0001, + paramwise_cfg=dict(norm_decay_mult=0, bypass_duplicate=True)) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.1, + step=[30, 40]) +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=50) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/mmdet/openimages/README.md b/configs/mmdet/openimages/README.md new file mode 100644 index 00000000..a2f2c136 --- /dev/null +++ b/configs/mmdet/openimages/README.md @@ -0,0 +1,143 @@ +# Open Images Dataset + +> [Open Images Dataset](https://arxiv.org/abs/1811.00982) + + +## Abstract + + +#### Open Images v6 + +[Open Images](https://storage.googleapis.com/openimages/web/index.html) is a dataset of ~9M images annotated with image-level labels, +object bounding boxes, object segmentation masks, visual relationships, +and localized narratives: + +- It contains a total of 16M bounding boxes for 600 object classes on +1.9M images, making it the largest existing dataset with object location +annotations. The boxes have been largely manually drawn by professional +annotators to ensure accuracy and consistency. The images are very diverse +and often contain complex scenes with several objects (8.3 per image on +average). + +- Open Images also offers visual relationship annotations, indicating pairs +of objects in particular relations (e.g. "woman playing guitar", "beer on +table"), object properties (e.g. "table is wooden"), and human actions (e.g. +"woman is jumping"). In total it has 3.3M annotations from 1,466 distinct +relationship triplets. + +- In V5 we added segmentation masks for 2.8M object instances in 350 classes. +Segmentation masks mark the outline of objects, which characterizes their +spatial extent to a much higher level of detail. + +- In V6 we added 675k localized narratives: multimodal descriptions of images +consisting of synchronized voice, text, and mouse traces over the objects being +described. (Note we originally launched localized narratives only on train in V6, +but since July 2020 we also have validation and test covered.) + +- Finally, the dataset is annotated with 59.9M image-level labels spanning 19,957 +classes. 
+ +We believe that having a single dataset with unified annotations for image +classification, object detection, visual relationship detection, instance +segmentation, and multimodal image descriptions will enable to study these +tasks jointly and stimulate progress towards genuine scene understanding. + + +
+ +
+ +#### Open Images Challenge 2019 + +[Open Images Challenges 2019](https://storage.googleapis.com/openimages/web/challenge2019.html) is based on the V5 release of the Open +Images dataset. The images of the dataset are very varied and +often contain complex scenes with several objects (explore the dataset). + +## Citation + +``` +@article{OpenImages, + author = {Alina Kuznetsova and Hassan Rom and Neil Alldrin and Jasper Uijlings and Ivan Krasin and Jordi Pont-Tuset and Shahab Kamali and Stefan Popov and Matteo Malloci and Alexander Kolesnikov and Tom Duerig and Vittorio Ferrari}, + title = {The Open Images Dataset V4: Unified image classification, object detection, and visual relationship detection at scale}, + year = {2020}, + journal = {IJCV} +} +``` + +## Prepare Dataset + +1. You need to download and extract Open Images dataset. + +2. The Open Images dataset does not have image metas (width and height of the image), +which will be used during evaluation. We suggest to get test image metas before +training/testing by using `tools/misc/get_image_metas.py`. + + **Usage** + ```shell + python tools/misc/get_image_metas.py ${CONFIG} \ + --out ${OUTPUT FILE NAME} + ``` + +3. 
The directory should be like this: + + ```none + mmdetection + ├── mmdet + ├── tools + ├── configs + ├── data + │ ├── OpenImages + │ │ ├── annotations + │ │ │ ├── bbox_labels_600_hierarchy.json + │ │ │ ├── class-descriptions-boxable.csv + │ │ │ ├── oidv6-train-annotations-bbox.csv + │ │ │ ├── validation-annotations-bbox.csv + │ │ │ ├── validation-annotations-human-imagelabels-boxable.csv + │ │ │ ├── validation-image-metas.pkl # get from script + │ │ ├── challenge2019 + │ │ │ ├── challenge-2019-train-detection-bbox.txt + │ │ │ ├── challenge-2019-validation-detection-bbox.txt + │ │ │ ├── class_label_tree.np + │ │ │ ├── class_sample_train.pkl + │ │ │ ├── challenge-2019-validation-detection-human-imagelabels.csv # download from official website + │ │ │ ├── challenge-2019-validation-metas.pkl # get from script + │ │ ├── OpenImages + │ │ │ ├── train # training images + │ │ │ ├── test # testing images + │ │ │ ├── validation # validation images + ``` + +**Note**: +1. The training and validation images of Open Images Challenge dataset are based on +Open Images v6, but the test images are different. +2. The Open Images Challenges annotations are obtained from [TSD](https://github.com/Sense-X/TSD). +You can also download the annotations from the [official website](https://storage.googleapis.com/openimages/web/challenge2019_downloads.html), +and set data.train.type=OpenImagesDataset, data.val.type=OpenImagesDataset, and data.test.type=OpenImagesDataset in the config. +3. If users do not want to use `validation-annotations-human-imagelabels-boxable.csv` and `challenge-2019-validation-detection-human-imagelabels.csv`, +users can set `data.val.load_image_level_labels=False` and `data.test.load_image_level_labels=False` in the config. +Please note that loading image-level labels is the default behavior of the Open Images evaluation metric. 
+More details please refer to the [official website](https://storage.googleapis.com/openimages/web/evaluation.html) + +## Results and Models + +| Architecture | Backbone | Style | Lr schd | Sampler | Mem (GB) | Inf time (fps) | box AP | Config | Download | +|:------------:|:---------:|:-------:|:-------:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:| +| Faster R-CNN | R-50 | pytorch | 1x | Group Sampler | 7.7 | - | 51.6 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_20211130_231159-e87ab7ce.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_20211130_231159.log.json) | +| Faster R-CNN | R-50 | pytorch | 1x | Class Aware Sampler | 7.7 | - | 60.0 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_20220306_202424-98c630e5.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_20220306_202424.log.json) | +| Faster R-CNN (Challenge 2019) | R-50 | pytorch | 1x | Group Sampler | 7.7 | - | 54.9 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge_20220114_045100-0e79e5df.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge_20220114_045100.log.json) | +| Faster R-CNN (Challenge 2019) | R-50 | pytorch | 1x | Class Aware Sampler | 7.1 | - | 65.0 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge_20220221_192021-34c402d9.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge_20220221_192021.log.json) | +| Retinanet | R-50 | pytorch | 1x | Group Sampler | 6.6 | - | 61.5 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/openimages/retinanet_r50_fpn_32x2_1x_openimages.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/retinanet_r50_fpn_32x2_1x_openimages/retinanet_r50_fpn_32x2_1x_openimages_20211223_071954-d2ae5462.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/openimages/retinanet_r50_fpn_32x2_1x_openimages/retinanet_r50_fpn_32x2_1x_openimages_20211223_071954.log.json) | +| SSD | VGG16 | pytorch | 36e | Group Sampler | 10.8 | - | 35.4 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/openimages/ssd300_32x8_36e_openimages.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/ssd300_32x8_36e_openimages/ssd300_32x8_36e_openimages_20211224_000232-dce93846.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/openimages/ssd300_32x8_36e_openimages/ssd300_32x8_36e_openimages_20211224_000232.log.json) | + +**Notes:** + +- 'cas' is short for 'Class Aware Sampler' + +### Results of consider image level labels + +| Architecture | Sampler | Consider Image 
Level Labels | box AP| +|:------------:|:-------:|:---------------------------:|:-----:| +|Faster R-CNN r50 (Challenge 2019)| Group Sampler| w/o | 62.19 | +|Faster R-CNN r50 (Challenge 2019)| Group Sampler| w/ | 54.87 | +|Faster R-CNN r50 (Challenge 2019)| Class Aware Sampler| w/o | 71.77 | +|Faster R-CNN r50 (Challenge 2019)| Class Aware Sampler| w/ | 64.98 | diff --git a/configs/mmdet/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages.py b/configs/mmdet/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages.py new file mode 100644 index 00000000..3dfc341b --- /dev/null +++ b/configs/mmdet/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages.py @@ -0,0 +1,23 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_fpn.py', + '../_base_/datasets/openimages_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict(roi_head=dict(bbox_head=dict(num_classes=601))) + +# Using 32 GPUS while training +optimizer = dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=26000, + warmup_ratio=1.0 / 64, + step=[8, 11]) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (32 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/mmdet/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge.py b/configs/mmdet/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge.py new file mode 100644 index 00000000..c8900adc --- /dev/null +++ b/configs/mmdet/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge.py @@ -0,0 +1,47 @@ +_base_ = ['faster_rcnn_r50_fpn_32x2_1x_openimages.py'] + +model = dict( + roi_head=dict(bbox_head=dict(num_classes=500)), + test_cfg=dict(rcnn=dict(score_thr=0.01))) + +# dataset settings +dataset_type = 'OpenImagesChallengeDataset' +data_root = 'data/OpenImages/' +data = dict( + train=dict( + type=dataset_type, + ann_file=data_root + + 'challenge2019/challenge-2019-train-detection-bbox.txt', + img_prefix=data_root + 'OpenImages/', + label_file=data_root + 'challenge2019/cls-label-description.csv', + hierarchy_file=data_root + 'challenge2019/class_label_tree.np'), + val=dict( + type=dataset_type, + ann_file=data_root + + 'challenge2019/challenge-2019-validation-detection-bbox.txt', + img_prefix=data_root + 'OpenImages/', + label_file=data_root + 'challenge2019/cls-label-description.csv', + hierarchy_file=data_root + 'challenge2019/class_label_tree.np', + meta_file=data_root + + 'challenge2019/challenge-2019-validation-metas.pkl', + image_level_ann_file=data_root + + 'challenge2019/challenge-2019-validation-detection-' + 'human-imagelabels.csv'), + test=dict( + type=dataset_type, + ann_file=data_root + + 'challenge2019/challenge-2019-validation-detection-bbox.txt', + img_prefix=data_root + 'OpenImages/', + label_file=data_root + 'challenge2019/cls-label-description.csv', + hierarchy_file=data_root + 'challenge2019/class_label_tree.np', + meta_file=data_root + + 'challenge2019/challenge-2019-validation-metas.pkl', + image_level_ann_file=data_root + + 'challenge2019/challenge-2019-validation-detection-' + 'human-imagelabels.csv')) +evaluation = 
dict(interval=1, metric='mAP') + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/mmdet/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages.py b/configs/mmdet/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages.py new file mode 100644 index 00000000..88d029d6 --- /dev/null +++ b/configs/mmdet/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages.py @@ -0,0 +1,5 @@ +_base_ = ['faster_rcnn_r50_fpn_32x2_1x_openimages.py'] + +# Use ClassAwareSampler +data = dict( + train_dataloader=dict(class_aware_sampler=dict(num_sample_class=1))) diff --git a/configs/mmdet/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge.py b/configs/mmdet/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge.py new file mode 100644 index 00000000..26bd64e6 --- /dev/null +++ b/configs/mmdet/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge.py @@ -0,0 +1,5 @@ +_base_ = ['faster_rcnn_r50_fpn_32x2_1x_openimages_challenge.py'] + +# Use ClassAwareSampler +data = dict( + train_dataloader=dict(class_aware_sampler=dict(num_sample_class=1))) diff --git a/configs/mmdet/openimages/metafile.yml b/configs/mmdet/openimages/metafile.yml new file mode 100644 index 00000000..9be17261 --- /dev/null +++ b/configs/mmdet/openimages/metafile.yml @@ -0,0 +1,102 @@ +Models: + - Name: faster_rcnn_r50_fpn_32x2_1x_openimages + In Collection: Faster R-CNN + Config: configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages.py + Metadata: + Training Memory (GB): 7.7 + Epochs: 12 + Training Data: Open Images v6 + Training Techniques: + - SGD with Momentum + - Weight Decay + Results: + - Task: Object Detection + Dataset: Open Images v6 + Metrics: + box AP: 51.6 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_20211130_231159-e87ab7ce.pth + + - Name: retinanet_r50_fpn_32x2_1x_openimages + In Collection: RetinaNet + Config: configs/openimages/retinanet_r50_fpn_32x2_1x_openimages.py + Metadata: + Training Memory (GB): 6.6 + Epochs: 12 + Training Data: Open Images v6 + Training Techniques: + - SGD with Momentum + - Weight Decay + Results: + - Task: Object Detection + Dataset: Open Images v6 + Metrics: + box AP: 61.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/retinanet_r50_fpn_32x2_1x_openimages/retinanet_r50_fpn_32x2_1x_openimages_20211223_071954-d2ae5462.pth + + - Name: ssd300_32x8_36e_openimages + In Collection: SSD + Config: configs/openimages/ssd300_32x8_36e_openimages.py + Metadata: + Training Memory (GB): 10.8 + Epochs: 36 + Training Data: Open Images v6 + Training Techniques: + - SGD with Momentum + - Weight Decay + Results: + - Task: Object Detection + Dataset: Open Images v6 + Metrics: + box AP: 35.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/ssd300_32x8_36e_openimages/ssd300_32x8_36e_openimages_20211224_000232-dce93846.pth + + - Name: faster_rcnn_r50_fpn_32x2_1x_openimages_challenge + In Collection: Faster R-CNN + Config: configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge.py + Metadata: + Training Memory (GB): 7.7 + Epochs: 12 + Training Data: Open Images Challenge 2019 + Training Techniques: + - SGD with Momentum + - Weight Decay + Results: + - Task: Object Detection + Dataset: Open Images Challenge 2019 + Metrics: + box AP: 54.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge_20220114_045100-0e79e5df.pth + + - Name: faster_rcnn_r50_fpn_32x2_cas_1x_openimages + In Collection: Faster R-CNN + Config: 
configs/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages.py + Metadata: + Training Memory (GB): 7.7 + Epochs: 12 + Training Data: Open Images v6 + Training Techniques: + - SGD with Momentum + - Weight Decay + Results: + - Task: Object Detection + Dataset: Open Images v6 + Metrics: + box AP: 60.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_20220306_202424-98c630e5.pth + + - Name: faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge + In Collection: Faster R-CNN + Config: configs/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge.py + Metadata: + Training Memory (GB): 7.1 + Epochs: 12 + Training Data: Open Images Challenge 2019 + Training Techniques: + - SGD with Momentum + - Weight Decay + Results: + - Task: Object Detection + Dataset: Open Images Challenge 2019 + Metrics: + box AP: 65.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge_20220221_192021-34c402d9.pth diff --git a/configs/mmdet/openimages/retinanet_r50_fpn_32x2_1x_openimages.py b/configs/mmdet/openimages/retinanet_r50_fpn_32x2_1x_openimages.py new file mode 100644 index 00000000..0191aa16 --- /dev/null +++ b/configs/mmdet/openimages/retinanet_r50_fpn_32x2_1x_openimages.py @@ -0,0 +1,22 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/openimages_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict(bbox_head=dict(num_classes=601)) + +optimizer = dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=26000, + warmup_ratio=1.0 / 64, + step=[8, 11]) + +# NOTE: `auto_scale_lr` is for 
automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/mmdet/openimages/ssd300_32x8_36e_openimages.py b/configs/mmdet/openimages/ssd300_32x8_36e_openimages.py new file mode 100644 index 00000000..e2565b98 --- /dev/null +++ b/configs/mmdet/openimages/ssd300_32x8_36e_openimages.py @@ -0,0 +1,83 @@ +_base_ = [ + '../_base_/models/ssd300.py', '../_base_/datasets/openimages_detection.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_1x.py' +] +model = dict( + bbox_head=dict( + num_classes=601, + anchor_generator=dict(basesize_ratio_range=(0.2, 0.9)))) +# dataset settings +dataset_type = 'OpenImagesDataset' +data_root = 'data/OpenImages/' +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, normed_bbox=True), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Expand', + mean=img_norm_cfg['mean'], + to_rgb=img_norm_cfg['to_rgb'], + ratio_range=(1, 4)), + dict( + type='MinIoURandomCrop', + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3), + dict(type='Resize', img_scale=(300, 300), keep_ratio=False), + dict(type='Normalize', **img_norm_cfg), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(300, 300), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=False), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=8, # using 32 GPUS while training. 
+ workers_per_gpu=0, # workers_per_gpu > 0 may cause out-of-memory errors + train=dict( + _delete_=True, + type='RepeatDataset', + times=3, + dataset=dict( + type=dataset_type, + ann_file=data_root + + 'annotations/oidv6-train-annotations-bbox.csv', + img_prefix=data_root + 'OpenImages/train/', + label_file=data_root + + 'annotations/class-descriptions-boxable.csv', + hierarchy_file=data_root + + 'annotations/bbox_labels_600_hierarchy.json', + pipeline=train_pipeline)), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.04, momentum=0.9, weight_decay=5e-4) +optimizer_config = dict() +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=20000, + warmup_ratio=0.001, + step=[8, 11]) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/mmdet/paa/README.md b/configs/mmdet/paa/README.md new file mode 100644 index 00000000..0f299004 --- /dev/null +++ b/configs/mmdet/paa/README.md @@ -0,0 +1,47 @@ +# PAA + +> [Probabilistic Anchor Assignment with IoU Prediction for Object Detection](https://arxiv.org/abs/2007.08103) + + + +## Abstract + +In object detection, determining which anchors to assign as positive or negative samples, known as anchor assignment, has been revealed as a core procedure that can significantly affect a model's performance. In this paper we propose a novel anchor assignment strategy that adaptively separates anchors into positive and negative samples for a ground truth bounding box according to the model's learning status such that it is able to reason about the separation in a probabilistic manner. To do so we first calculate the scores of anchors conditioned on the model and fit a probability distribution to these scores. 
The model is then trained with anchors separated into positive and negative samples according to their probabilities. Moreover, we investigate the gap between the training and testing objectives and propose to predict the Intersection-over-Unions of detected boxes as a measure of localization quality to reduce the discrepancy. The combined score of classification and localization qualities serving as a box selection metric in non-maximum suppression well aligns with the proposed anchor assignment strategy and leads significant performance improvements. The proposed methods only add a single convolutional layer to RetinaNet baseline and does not require multiple anchors per location, so are efficient. Experimental results verify the effectiveness of the proposed methods. Especially, our models set new records for single-stage detectors on MS COCO test-dev dataset with various backbones. + +
+ +
+ +## Results and Models + +We provide config files to reproduce the object detection results in the +ECCV 2020 paper for Probabilistic Anchor Assignment with IoU +Prediction for Object Detection. + +| Backbone | Lr schd | Mem (GB) | Score voting | box AP | Config | Download | +|:-----------:|:-------:|:--------:|:------------:|:------:|:------:|:--------:| +| R-50-FPN | 12e | 3.7 | True | 40.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/paa/paa_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1x_coco/paa_r50_fpn_1x_coco_20200821-936edec3.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1x_coco/paa_r50_fpn_1x_coco_20200821-936edec3.log.json) | +| R-50-FPN | 12e | 3.7 | False | 40.2 | - | +| R-50-FPN | 18e | 3.7 | True | 41.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/paa/paa_r50_fpn_1.5x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1.5x_coco/paa_r50_fpn_1.5x_coco_20200823-805d6078.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1.5x_coco/paa_r50_fpn_1.5x_coco_20200823-805d6078.log.json) | +| R-50-FPN | 18e | 3.7 | False | 41.2 | - | +| R-50-FPN | 24e | 3.7 | True | 41.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/paa/paa_r50_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_2x_coco/paa_r50_fpn_2x_coco_20200821-c98bfc4e.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_2x_coco/paa_r50_fpn_2x_coco_20200821-c98bfc4e.log.json) | +| R-50-FPN | 36e | 3.7 | True | 43.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/paa/paa_r50_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_mstrain_3x_coco/paa_r50_fpn_mstrain_3x_coco_20210121_145722-06a6880b.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_mstrain_3x_coco/paa_r50_fpn_mstrain_3x_coco_20210121_145722.log.json) | +| R-101-FPN | 12e | 6.2 | True | 42.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/paa/paa_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_1x_coco/paa_r101_fpn_1x_coco_20200821-0a1825a4.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_1x_coco/paa_r101_fpn_1x_coco_20200821-0a1825a4.log.json) | +| R-101-FPN | 12e | 6.2 | False | 42.4 | - | +| R-101-FPN | 24e | 6.2 | True | 43.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/paa/paa_r101_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_2x_coco/paa_r101_fpn_2x_coco_20200821-6829f96b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_2x_coco/paa_r101_fpn_2x_coco_20200821-6829f96b.log.json) | +| R-101-FPN | 36e | 6.2 | True | 45.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/paa/paa_r101_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_mstrain_3x_coco/paa_r101_fpn_mstrain_3x_coco_20210122_084202-83250d22.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_mstrain_3x_coco/paa_r101_fpn_mstrain_3x_coco_20210122_084202.log.json) | + +**Note**: + +1. We find that the performance is unstable with 1x setting and may fluctuate by about 0.2 mAP. We report the best results. 
+ +## Citation + +```latex +@inproceedings{paa-eccv2020, + title={Probabilistic Anchor Assignment with IoU Prediction for Object Detection}, + author={Kim, Kang and Lee, Hee Seok}, + booktitle = {ECCV}, + year={2020} +} +``` diff --git a/configs/mmdet/paa/metafile.yml b/configs/mmdet/paa/metafile.yml new file mode 100644 index 00000000..e08b663a --- /dev/null +++ b/configs/mmdet/paa/metafile.yml @@ -0,0 +1,104 @@ +Collections: + - Name: PAA + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - Probabilistic Anchor Assignment + - ResNet + Paper: + URL: https://arxiv.org/abs/2007.08103 + Title: 'Probabilistic Anchor Assignment with IoU Prediction for Object Detection' + README: configs/paa/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.4.0/mmdet/models/detectors/paa.py#L6 + Version: v2.4.0 + +Models: + - Name: paa_r50_fpn_1x_coco + In Collection: PAA + Config: configs/paa/paa_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.7 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1x_coco/paa_r50_fpn_1x_coco_20200821-936edec3.pth + + - Name: paa_r50_fpn_1.5x_coco + In Collection: PAA + Config: configs/paa/paa_r50_fpn_1.5x_coco.py + Metadata: + Training Memory (GB): 3.7 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1.5x_coco/paa_r50_fpn_1.5x_coco_20200823-805d6078.pth + + - Name: paa_r50_fpn_2x_coco + In Collection: PAA + Config: configs/paa/paa_r50_fpn_2x_coco.py + Metadata: + Training Memory (GB): 3.7 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_2x_coco/paa_r50_fpn_2x_coco_20200821-c98bfc4e.pth + + - Name: 
paa_r50_fpn_mstrain_3x_coco + In Collection: PAA + Config: configs/paa/paa_r50_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 3.7 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_mstrain_3x_coco/paa_r50_fpn_mstrain_3x_coco_20210121_145722-06a6880b.pth + + - Name: paa_r101_fpn_1x_coco + In Collection: PAA + Config: configs/paa/paa_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.2 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_1x_coco/paa_r101_fpn_1x_coco_20200821-0a1825a4.pth + + - Name: paa_r101_fpn_2x_coco + In Collection: PAA + Config: configs/paa/paa_r101_fpn_2x_coco.py + Metadata: + Training Memory (GB): 6.2 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_2x_coco/paa_r101_fpn_2x_coco_20200821-6829f96b.pth + + - Name: paa_r101_fpn_mstrain_3x_coco + In Collection: PAA + Config: configs/paa/paa_r101_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 6.2 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_mstrain_3x_coco/paa_r101_fpn_mstrain_3x_coco_20210122_084202-83250d22.pth diff --git a/configs/mmdet/paa/paa_r101_fpn_1x_coco.py b/configs/mmdet/paa/paa_r101_fpn_1x_coco.py new file mode 100644 index 00000000..94f1c278 --- /dev/null +++ b/configs/mmdet/paa/paa_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './paa_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/paa/paa_r101_fpn_2x_coco.py b/configs/mmdet/paa/paa_r101_fpn_2x_coco.py new file mode 100644 index 00000000..641ef764 --- /dev/null +++ 
b/configs/mmdet/paa/paa_r101_fpn_2x_coco.py @@ -0,0 +1,3 @@ +_base_ = './paa_r101_fpn_1x_coco.py' +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/paa/paa_r101_fpn_mstrain_3x_coco.py b/configs/mmdet/paa/paa_r101_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..71858ed6 --- /dev/null +++ b/configs/mmdet/paa/paa_r101_fpn_mstrain_3x_coco.py @@ -0,0 +1,6 @@ +_base_ = './paa_r50_fpn_mstrain_3x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/paa/paa_r50_fpn_1.5x_coco.py b/configs/mmdet/paa/paa_r50_fpn_1.5x_coco.py new file mode 100644 index 00000000..aabce4af --- /dev/null +++ b/configs/mmdet/paa/paa_r50_fpn_1.5x_coco.py @@ -0,0 +1,3 @@ +_base_ = './paa_r50_fpn_1x_coco.py' +lr_config = dict(step=[12, 16]) +runner = dict(type='EpochBasedRunner', max_epochs=18) diff --git a/configs/mmdet/paa/paa_r50_fpn_1x_coco.py b/configs/mmdet/paa/paa_r50_fpn_1x_coco.py new file mode 100644 index 00000000..4c9c4aa7 --- /dev/null +++ b/configs/mmdet/paa/paa_r50_fpn_1x_coco.py @@ -0,0 +1,70 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='PAA', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + bbox_head=dict( + type='PAAHead', + reg_decoded_bbox=True, + score_voting=True, + topk=9, + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + 
octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=1.3), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.1, + neg_iou_thr=0.1, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/mmdet/paa/paa_r50_fpn_2x_coco.py b/configs/mmdet/paa/paa_r50_fpn_2x_coco.py new file mode 100644 index 00000000..663d2c0d --- /dev/null +++ b/configs/mmdet/paa/paa_r50_fpn_2x_coco.py @@ -0,0 +1,3 @@ +_base_ = './paa_r50_fpn_1x_coco.py' +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/paa/paa_r50_fpn_mstrain_3x_coco.py b/configs/mmdet/paa/paa_r50_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..91fa28cd --- /dev/null +++ b/configs/mmdet/paa/paa_r50_fpn_mstrain_3x_coco.py @@ -0,0 +1,20 @@ +_base_ = './paa_r50_fpn_1x_coco.py' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + 
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +data = dict(train=dict(pipeline=train_pipeline)) +lr_config = dict(step=[28, 34]) +runner = dict(type='EpochBasedRunner', max_epochs=36) diff --git a/configs/mmdet/pafpn/README.md b/configs/mmdet/pafpn/README.md new file mode 100644 index 00000000..4a406af6 --- /dev/null +++ b/configs/mmdet/pafpn/README.md @@ -0,0 +1,34 @@ +# PAFPN + +> [Path Aggregation Network for Instance Segmentation](https://arxiv.org/abs/1803.01534) + + + +## Abstract + +The way that information propagates in neural networks is of great importance. In this paper, we propose Path Aggregation Network (PANet) aiming at boosting information flow in proposal-based instance segmentation framework. Specifically, we enhance the entire feature hierarchy with accurate localization signals in lower layers by bottom-up path augmentation, which shortens the information path between lower layers and topmost feature. We present adaptive feature pooling, which links feature grid and all feature levels to make useful information in each feature level propagate directly to following proposal subnetworks. A complementary branch capturing different views for each proposal is created to further improve mask prediction. These improvements are simple to implement, with subtle extra computational overhead. Our PANet reaches the 1st place in the COCO 2017 Challenge Instance Segmentation task and the 2nd place in Object Detection task without large-batch training. It is also state-of-the-art on MVD and Cityscapes. + +
+ +
+ +## Results and Models + +| Backbone | style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +|:-------------:|:----------:|:-------:|:--------:|:--------------:|:------:|:-------:|:------:|:--------:| +| R-50-FPN | pytorch | 1x | 4.0 | 17.2 | 37.5 | | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pafpn/faster_rcnn_r50_pafpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pafpn/faster_rcnn_r50_pafpn_1x_coco/faster_rcnn_r50_pafpn_1x_coco_bbox_mAP-0.375_20200503_105836-b7b4b9bd.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pafpn/faster_rcnn_r50_pafpn_1x_coco/faster_rcnn_r50_pafpn_1x_coco_20200503_105836.log.json) | + +## Citation + +```latex +@inproceedings{liu2018path, + author = {Shu Liu and + Lu Qi and + Haifang Qin and + Jianping Shi and + Jiaya Jia}, + title = {Path Aggregation Network for Instance Segmentation}, + booktitle = {Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2018} +} +``` diff --git a/configs/mmdet/pafpn/faster_rcnn_r50_pafpn_1x_coco.py b/configs/mmdet/pafpn/faster_rcnn_r50_pafpn_1x_coco.py new file mode 100644 index 00000000..b2fdef91 --- /dev/null +++ b/configs/mmdet/pafpn/faster_rcnn_r50_pafpn_1x_coco.py @@ -0,0 +1,8 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' + +model = dict( + neck=dict( + type='PAFPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5)) diff --git a/configs/mmdet/pafpn/metafile.yml b/configs/mmdet/pafpn/metafile.yml new file mode 100644 index 00000000..f9cf97c8 --- /dev/null +++ b/configs/mmdet/pafpn/metafile.yml @@ -0,0 +1,38 @@ +Collections: + - Name: PAFPN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - PAFPN + Paper: + URL: https://arxiv.org/abs/1803.01534 + Title: 'Path Aggregation Network for Instance Segmentation' + README: 
configs/pafpn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/necks/pafpn.py#L11 + Version: v2.0.0 + +Models: + - Name: faster_rcnn_r50_pafpn_1x_coco + In Collection: PAFPN + Config: configs/pafpn/faster_rcnn_r50_pafpn_1x_coco.py + Metadata: + Training Memory (GB): 4.0 + inference time (ms/im): + - value: 58.14 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pafpn/faster_rcnn_r50_pafpn_1x_coco/faster_rcnn_r50_pafpn_1x_coco_bbox_mAP-0.375_20200503_105836-b7b4b9bd.pth diff --git a/configs/mmdet/panoptic_fpn/README.md b/configs/mmdet/panoptic_fpn/README.md new file mode 100644 index 00000000..bc89293e --- /dev/null +++ b/configs/mmdet/panoptic_fpn/README.md @@ -0,0 +1,62 @@ +# Panoptic FPN + +> [Panoptic feature pyramid networks](https://arxiv.org/abs/1901.02446) + + + +## Abstract + +The recently introduced panoptic segmentation task has renewed our community's interest in unifying the tasks of instance segmentation (for thing classes) and semantic segmentation (for stuff classes). However, current state-of-the-art methods for this joint task use separate and dissimilar networks for instance and semantic segmentation, without performing any shared computation. In this work, we aim to unify these methods at the architectural level, designing a single network for both tasks. Our approach is to endow Mask R-CNN, a popular instance segmentation method, with a semantic segmentation branch using a shared Feature Pyramid Network (FPN) backbone. Surprisingly, this simple baseline not only remains effective for instance segmentation, but also yields a lightweight, top-performing method for semantic segmentation. 
In this work, we perform a detailed study of this minimally extended version of Mask R-CNN with FPN, which we refer to as Panoptic FPN, and show it is a robust and accurate baseline for both tasks. Given its effectiveness and conceptual simplicity, we hope our method can serve as a strong baseline and aid future research in panoptic segmentation. + +
+ +
+ +## Dataset + +PanopticFPN requires COCO and [COCO-panoptic](http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip) dataset for training and evaluation. You need to download and extract it in the COCO dataset path. +The directory should be like this. + +```none +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── panoptic_train2017.json +│ │ │ ├── panoptic_train2017 +│ │ │ ├── panoptic_val2017.json +│ │ │ ├── panoptic_val2017 +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +``` + +## Results and Models + +| Backbone | style | Lr schd | Mem (GB) | Inf time (fps) | PQ | SQ | RQ | PQ_th | SQ_th | RQ_th | PQ_st | SQ_st | RQ_st | Config | Download | +|:-------------:|:----------:|:-------:|:--------:|:--------------:|:----:|:----:|:----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:------:|:--------:| +| R-50-FPN | pytorch | 1x | 4.7 | | 40.2 | 77.8 | 49.3 | 47.8 | 80.9 | 57.5 | 28.9 | 73.1 | 37.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco/panoptic_fpn_r50_fpn_1x_coco_20210821_101153-9668fd13.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco/panoptic_fpn_r50_fpn_1x_coco_20210821_101153.log.json) | +| R-50-FPN | pytorch | 3x | - | - | 42.5 | 78.1 | 51.7 | 50.3 | 81.5 | 60.3 | 30.7 | 73.0 | 38.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/panoptic_fpn/panoptic_fpn_r50_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r50_fpn_mstrain_3x_coco/panoptic_fpn_r50_fpn_mstrain_3x_coco_20210824_171155-5650f98b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r50_fpn_mstrain_3x_coco/panoptic_fpn_r50_fpn_mstrain_3x_coco_20210824_171155.log.json) | +| 
R-101-FPN | pytorch | 1x | 6.7 | | 42.2 | 78.3 | 51.4 | 50.1 | 81.4 | 59.9 | 30.3 | 73.6 | 38.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco/panoptic_fpn_r101_fpn_1x_coco_20210820_193950.log.json) | +| R-101-FPN | pytorch | 3x | - | - | 44.1 | 78.9 | 53.6 | 52.1 | 81.7 | 62.3 | 32.0 | 74.6 | 40.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/panoptic_fpn/panoptic_fpn_r101_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r101_fpn_mstrain_3x_coco/panoptic_fpn_r101_fpn_mstrain_3x_coco_20210823_114712-9c99acc4.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r101_fpn_mstrain_3x_coco/panoptic_fpn_r101_fpn_mstrain_3x_coco_20210823_114712.log.json) | + +## Citation + +The base method for panoptic segmentation task. 
+ +```latex +@inproceedings{kirillov2018panopticfpn, + author = { + Alexander Kirillov and + Ross Girshick and + Kaiming He and + Piotr Dollar + }, + title = {Panoptic Feature Pyramid Networks}, + booktitle = {Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2019} +} +``` diff --git a/configs/mmdet/panoptic_fpn/metafile.yml b/configs/mmdet/panoptic_fpn/metafile.yml new file mode 100644 index 00000000..8c9d39dc --- /dev/null +++ b/configs/mmdet/panoptic_fpn/metafile.yml @@ -0,0 +1,70 @@ +Collections: + - Name: PanopticFPN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - PanopticFPN + Paper: + URL: https://arxiv.org/pdf/1901.02446 + Title: 'Panoptic feature pyramid networks' + README: configs/panoptic_fpn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/detectors/panoptic_fpn.py#L7 + Version: v2.16.0 + +Models: + - Name: panoptic_fpn_r50_fpn_1x_coco + In Collection: PanopticFPN + Config: configs/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.6 + Epochs: 12 + Results: + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 40.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco/panoptic_fpn_r50_fpn_1x_coco_20210821_101153-9668fd13.pth + + - Name: panoptic_fpn_r50_fpn_mstrain_3x_coco + In Collection: PanopticFPN + Config: configs/panoptic_fpn/panoptic_fpn_r50_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 4.6 + Epochs: 36 + Results: + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 42.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r50_fpn_mstrain_3x_coco/panoptic_fpn_r50_fpn_mstrain_3x_coco_20210824_171155-5650f98b.pth + + - Name: panoptic_fpn_r101_fpn_1x_coco + In Collection: PanopticFPN + Config: 
configs/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.5 + Epochs: 12 + Results: + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 42.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth + + - Name: panoptic_fpn_r101_fpn_mstrain_3x_coco + In Collection: PanopticFPN + Config: configs/panoptic_fpn/panoptic_fpn_r101_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 6.5 + Epochs: 36 + Results: + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 44.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r101_fpn_mstrain_3x_coco/panoptic_fpn_r101_fpn_mstrain_3x_coco_20210823_114712-9c99acc4.pth diff --git a/configs/mmdet/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco.py b/configs/mmdet/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco.py new file mode 100644 index 00000000..78b80798 --- /dev/null +++ b/configs/mmdet/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './panoptic_fpn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/panoptic_fpn/panoptic_fpn_r101_fpn_mstrain_3x_coco.py b/configs/mmdet/panoptic_fpn/panoptic_fpn_r101_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..057e4811 --- /dev/null +++ b/configs/mmdet/panoptic_fpn/panoptic_fpn_r101_fpn_mstrain_3x_coco.py @@ -0,0 +1,6 @@ +_base_ = './panoptic_fpn_r50_fpn_mstrain_3x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py b/configs/mmdet/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py new file mode 100644 index 00000000..29955246 --- /dev/null +++ b/configs/mmdet/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py @@ -0,0 
+1,33 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_panoptic.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='PanopticFPN', + semantic_head=dict( + type='PanopticFPNHead', + num_things_classes=80, + num_stuff_classes=53, + in_channels=256, + inner_channels=128, + start_level=0, + end_level=4, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), + conv_cfg=None, + loss_seg=dict( + type='CrossEntropyLoss', ignore_index=255, loss_weight=0.5)), + panoptic_fusion_head=dict( + type='HeuristicFusionHead', + num_things_classes=80, + num_stuff_classes=53), + test_cfg=dict( + panoptic=dict( + score_thr=0.6, + max_per_img=100, + mask_thr_binary=0.5, + mask_overlap=0.5, + nms=dict(type='nms', iou_threshold=0.5, class_agnostic=True), + stuff_area_limit=4096))) + +custom_hooks = [] diff --git a/configs/mmdet/panoptic_fpn/panoptic_fpn_r50_fpn_mstrain_3x_coco.py b/configs/mmdet/panoptic_fpn/panoptic_fpn_r50_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..b5109353 --- /dev/null +++ b/configs/mmdet/panoptic_fpn/panoptic_fpn_r50_fpn_mstrain_3x_coco.py @@ -0,0 +1,61 @@ +_base_ = './panoptic_fpn_r50_fpn_1x_coco.py' + +# dataset settings +dataset_type = 'CocoPanopticDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)], +# multiscale_mode='range' +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadPanopticAnnotations', + with_bbox=True, + with_mask=True, + with_seg=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='SegRescale', scale_factor=1 / 4), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + 
keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +# Use RepeatDataset to speed up training +data = dict( + train=dict( + _delete_=True, + type='RepeatDataset', + times=3, + dataset=dict( + type=dataset_type, + ann_file=data_root + 'annotations/panoptic_train2017.json', + img_prefix=data_root + 'train2017/', + seg_prefix=data_root + 'annotations/panoptic_train2017/', + pipeline=train_pipeline)), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/pascal_voc/README.md b/configs/mmdet/pascal_voc/README.md new file mode 100644 index 00000000..25797bcb --- /dev/null +++ b/configs/mmdet/pascal_voc/README.md @@ -0,0 +1,40 @@ +# Pascal VOC + +> [The Pascal Visual Object Classes (VOC) Challenge](https://link.springer.com/article/10.1007/s11263-009-0275-4) + + + +## Abstract + +The Pascal Visual Object Classes (VOC) challenge is a benchmark in visual object category recognition and detection, providing the vision and machine learning communities with a standard dataset of images and annotation, and standard evaluation procedures. Organised annually from 2005 to present, the challenge and its associated dataset has become accepted as the benchmark for object detection. + +This paper describes the dataset and evaluation procedure. We review the state-of-the-art in evaluated methods for both classification and detection, analyse whether the methods are statistically different, what they are learning from the images (e.g. the object or its context), and what the methods find easy or confuse. 
The paper concludes with lessons learnt in the three year history of the challenge, and proposes directions for future improvement and extension. + +
+ +
+ +## Results and Models + +| Architecture | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +|:------------:|:---------:|:-------:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:| +| Faster R-CNN C4 | R-50 | caffe | 18k | | - | 80.9 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pascal_voc/faster_rcnn_r50_caffe_c4_mstrain_18k_voc0712.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/faster_rcnn_r50_caffe_c4_mstrain_18k_voc0712/faster_rcnn_r50_caffe_c4_mstrain_18k_voc0712_20220314_234327-847a14d2.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/faster_rcnn_r50_caffe_c4_mstrain_18k_voc0712/faster_rcnn_r50_caffe_c4_mstrain_18k_voc0712_20220314_234327.log.json) | +| Faster R-CNN | R-50 | pytorch | 1x | 2.6 | - | 80.4 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712/faster_rcnn_r50_fpn_1x_voc0712_20220320_192712-54bef0f3.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712/faster_rcnn_r50_fpn_1x_voc0712_20220320_192712.log.json) | +| Retinanet | R-50 | pytorch | 1x | 2.1 | - | 77.3 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pascal_voc/retinanet_r50_fpn_1x_voc0712.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/retinanet_r50_fpn_1x_voc0712/retinanet_r50_fpn_1x_voc0712_20200617-47cbdd0e.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/retinanet_r50_fpn_1x_voc0712/retinanet_r50_fpn_1x_voc0712_20200616_014642.log.json) | +| SSD300 | VGG16 | - | 120e | - | - | 76.5 
|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pascal_voc/ssd300_voc0712.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/ssd300_voc0712/ssd300_voc0712_20220320_194658-17edda1b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/ssd300_voc0712/ssd300_voc0712_20220320_194658.log.json) | +| SSD512 | VGG16 | - | 120e | - | - | 79.5 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pascal_voc/ssd512_voc0712.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/ssd512_voc0712/ssd512_voc0712_20220320_194717-03cefefe.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/ssd512_voc0712/ssd512_voc0712_20220320_194717.log.json) | + +## Citation + +```latex +@Article{Everingham10, + author = "Everingham, M. and Van~Gool, L. and Williams, C. K. I. and Winn, J. and Zisserman, A.", + title = "The Pascal Visual Object Classes (VOC) Challenge", + journal = "International Journal of Computer Vision", + volume = "88", + year = "2010", + number = "2", + month = jun, + pages = "303--338", +} +``` diff --git a/configs/mmdet/pascal_voc/faster_rcnn_r50_caffe_c4_mstrain_18k_voc0712.py b/configs/mmdet/pascal_voc/faster_rcnn_r50_caffe_c4_mstrain_18k_voc0712.py new file mode 100644 index 00000000..7bb1d736 --- /dev/null +++ b/configs/mmdet/pascal_voc/faster_rcnn_r50_caffe_c4_mstrain_18k_voc0712.py @@ -0,0 +1,81 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_caffe_c4.py', + '../_base_/default_runtime.py' +] +model = dict(roi_head=dict(bbox_head=dict(num_classes=20))) + +# dataset settings +dataset_type = 'VOCDataset' +data_root = 'data/VOCdevkit/' +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 480), (1333, 512), (1333, 544), (1333, 576), + (1333, 608), (1333, 640), (1333, 
672), (1333, 704), + (1333, 736), (1333, 768), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=[ + data_root + 'VOC2007/ImageSets/Main/trainval.txt', + data_root + 'VOC2012/ImageSets/Main/trainval.txt' + ], + img_prefix=[data_root + 'VOC2007/', data_root + 'VOC2012/'], + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', + img_prefix=data_root + 'VOC2007/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', + img_prefix=data_root + 'VOC2007/', + pipeline=test_pipeline)) + +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) + +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=100, + warmup_ratio=0.001, + step=[12000, 16000]) + +# Runner type +runner = dict(type='IterBasedRunner', max_iters=18000) + +checkpoint_config = dict(interval=3000) +evaluation = dict(interval=3000, metric='mAP') diff --git a/configs/mmdet/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712.py b/configs/mmdet/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712.py new file mode 100644 index 00000000..7866aceb --- /dev/null +++ 
b/configs/mmdet/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712.py @@ -0,0 +1,14 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_fpn.py', '../_base_/datasets/voc0712.py', + '../_base_/default_runtime.py' +] +model = dict(roi_head=dict(bbox_head=dict(num_classes=20))) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +# actual epoch = 3 * 3 = 9 +lr_config = dict(policy='step', step=[3]) +# runtime settings +runner = dict( + type='EpochBasedRunner', max_epochs=4) # actual epoch = 4 * 3 = 12 diff --git a/configs/mmdet/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712_cocofmt.py b/configs/mmdet/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712_cocofmt.py new file mode 100644 index 00000000..12eee2c1 --- /dev/null +++ b/configs/mmdet/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712_cocofmt.py @@ -0,0 +1,75 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_fpn.py', '../_base_/datasets/voc0712.py', + '../_base_/default_runtime.py' +] +model = dict(roi_head=dict(bbox_head=dict(num_classes=20))) + +CLASSES = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', + 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', + 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor') + +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/VOCdevkit/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1000, 600), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1000, 600), + flip=False, + 
transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type='RepeatDataset', + times=3, + dataset=dict( + type=dataset_type, + ann_file='data/voc0712_trainval.json', + img_prefix='data/VOCdevkit', + pipeline=train_pipeline, + classes=CLASSES)), + val=dict( + type=dataset_type, + ann_file='data/voc07_test.json', + img_prefix='data/VOCdevkit', + pipeline=test_pipeline, + classes=CLASSES), + test=dict( + type=dataset_type, + ann_file='data/voc07_test.json', + img_prefix='data/VOCdevkit', + pipeline=test_pipeline, + classes=CLASSES)) +evaluation = dict(interval=1, metric='bbox') + +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +# actual epoch = 3 * 3 = 9 +lr_config = dict(policy='step', step=[3]) +# runtime settings +runner = dict( + type='EpochBasedRunner', max_epochs=4) # actual epoch = 4 * 3 = 12 diff --git a/configs/mmdet/pascal_voc/retinanet_r50_fpn_1x_voc0712.py b/configs/mmdet/pascal_voc/retinanet_r50_fpn_1x_voc0712.py new file mode 100644 index 00000000..b4b050dd --- /dev/null +++ b/configs/mmdet/pascal_voc/retinanet_r50_fpn_1x_voc0712.py @@ -0,0 +1,14 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', '../_base_/datasets/voc0712.py', + '../_base_/default_runtime.py' +] +model = dict(bbox_head=dict(num_classes=20)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +# actual epoch = 3 * 3 = 9 +lr_config = dict(policy='step', step=[3]) +# runtime settings +runner = dict( + type='EpochBasedRunner', max_epochs=4) # actual epoch = 4 * 3 = 12 diff --git a/configs/mmdet/pascal_voc/ssd300_voc0712.py 
b/configs/mmdet/pascal_voc/ssd300_voc0712.py new file mode 100644 index 00000000..e7008aef --- /dev/null +++ b/configs/mmdet/pascal_voc/ssd300_voc0712.py @@ -0,0 +1,74 @@ +_base_ = [ + '../_base_/models/ssd300.py', '../_base_/datasets/voc0712.py', + '../_base_/default_runtime.py' +] +model = dict( + bbox_head=dict( + num_classes=20, anchor_generator=dict(basesize_ratio_range=(0.2, + 0.9)))) +# dataset settings +dataset_type = 'VOCDataset' +data_root = 'data/VOCdevkit/' +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Expand', + mean=img_norm_cfg['mean'], + to_rgb=img_norm_cfg['to_rgb'], + ratio_range=(1, 4)), + dict( + type='MinIoURandomCrop', + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3), + dict(type='Resize', img_scale=(300, 300), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(300, 300), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=False), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=3, + train=dict( + type='RepeatDataset', times=10, dataset=dict(pipeline=train_pipeline)), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=1e-3, momentum=0.9, weight_decay=5e-4) +optimizer_config = dict() +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + 
warmup_ratio=0.001, + step=[16, 20]) +checkpoint_config = dict(interval=1) +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=24) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/mmdet/pascal_voc/ssd512_voc0712.py b/configs/mmdet/pascal_voc/ssd512_voc0712.py new file mode 100644 index 00000000..f4627c2d --- /dev/null +++ b/configs/mmdet/pascal_voc/ssd512_voc0712.py @@ -0,0 +1,57 @@ +_base_ = 'ssd300_voc0712.py' +input_size = 512 +model = dict( + neck=dict( + out_channels=(512, 1024, 512, 256, 256, 256, 256), + level_strides=(2, 2, 2, 2, 1), + level_paddings=(1, 1, 1, 1, 1), + last_kernel_size=4), + bbox_head=dict( + in_channels=(512, 1024, 512, 256, 256, 256, 256), + anchor_generator=dict( + input_size=input_size, + strides=[8, 16, 32, 64, 128, 256, 512], + basesize_ratio_range=(0.15, 0.9), + ratios=([2], [2, 3], [2, 3], [2, 3], [2, 3], [2], [2])))) +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Expand', + mean=img_norm_cfg['mean'], + to_rgb=img_norm_cfg['to_rgb'], + ratio_range=(1, 4)), + dict( + type='MinIoURandomCrop', + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3), + dict(type='Resize', img_scale=(512, 512), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(512, 512), + flip=False, + transforms=[ + dict(type='Resize', 
keep_ratio=False), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(dataset=dict(pipeline=train_pipeline)), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/pisa/README.md b/configs/mmdet/pisa/README.md new file mode 100644 index 00000000..d5329418 --- /dev/null +++ b/configs/mmdet/pisa/README.md @@ -0,0 +1,50 @@ +# PISA + +> [Prime Sample Attention in Object Detection](https://arxiv.org/abs/1904.04821) + + + +## Abstract + +It is a common paradigm in object detection frameworks to treat all samples equally and target at maximizing the performance on average. In this work, we revisit this paradigm through a careful study on how different samples contribute to the overall performance measured in terms of mAP. Our study suggests that the samples in each mini-batch are neither independent nor equally important, and therefore a better classifier on average does not necessarily mean higher mAP. Motivated by this study, we propose the notion of Prime Samples, those that play a key role in driving the detection performance. We further develop a simple yet effective sampling and learning strategy called PrIme Sample Attention (PISA) that directs the focus of the training process towards such samples. Our experiments demonstrate that it is often more effective to focus on prime samples than hard samples when training a detector. Particularly, On the MSCOCO dataset, PISA outperforms the random sampling baseline and hard mining schemes, e.g., OHEM and Focal Loss, consistently by around 2% on both single-stage and two-stage detectors, even with a strong backbone ResNeXt-101. + +
+ +
+ +## Results and Models + +| PISA | Network | Backbone | Lr schd | box AP | mask AP | Config | Download | +|:----:|:-------:|:-------------------:|:-------:|:------:|:-------:|:------:|:--------:| +| × | Faster R-CNN | R-50-FPN | 1x | 36.4 | | - | +| √ | Faster R-CNN | R-50-FPN | 1x | 38.4 | | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pisa/pisa_faster_rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_faster_rcnn_r50_fpn_1x_coco/pisa_faster_rcnn_r50_fpn_1x_coco-dea93523.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_faster_rcnn_r50_fpn_1x_coco/pisa_faster_rcnn_r50_fpn_1x_coco_20200506_185619.log.json) | +| × | Faster R-CNN | X101-32x4d-FPN | 1x | 40.1 | | - | +| √ | Faster R-CNN | X101-32x4d-FPN | 1x | 41.9 | | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pisa/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco-e4accec4.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco_20200505_181503.log.json) | +| × | Mask R-CNN | R-50-FPN | 1x | 37.3 | 34.2 | - | +| √ | Mask R-CNN | R-50-FPN | 1x | 39.1 | 35.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pisa/pisa_mask_rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_mask_rcnn_r50_fpn_1x_coco/pisa_mask_rcnn_r50_fpn_1x_coco-dfcedba6.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_mask_rcnn_r50_fpn_1x_coco/pisa_mask_rcnn_r50_fpn_1x_coco_20200508_150500.log.json) | +| × | Mask R-CNN | X101-32x4d-FPN | 1x | 41.1 | 37.1 | - | +| √ | Mask R-CNN | X101-32x4d-FPN | 1x | | | | +| × | RetinaNet | R-50-FPN | 1x | 35.6 | | - | +| √ | RetinaNet | R-50-FPN | 1x | 36.9 | | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pisa/pisa_retinanet_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_retinanet_r50_fpn_1x_coco/pisa_retinanet_r50_fpn_1x_coco-76409952.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_retinanet_r50_fpn_1x_coco/pisa_retinanet_r50_fpn_1x_coco_20200504_014311.log.json) | +| × | RetinaNet | X101-32x4d-FPN | 1x | 39.0 | | - | +| √ | RetinaNet | X101-32x4d-FPN | 1x | 40.7 | | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pisa/pisa_retinanet_x101_32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_retinanet_x101_32x4d_fpn_1x_coco/pisa_retinanet_x101_32x4d_fpn_1x_coco-a0c13c73.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_retinanet_x101_32x4d_fpn_1x_coco/pisa_retinanet_x101_32x4d_fpn_1x_coco_20200505_001404.log.json) | +| × | SSD300 | VGG16 | 1x | 25.6 | | - | +| √ | SSD300 | VGG16 | 1x | 27.6 | | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pisa/pisa_ssd300_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_ssd300_coco/pisa_ssd300_coco-710e3ac9.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_ssd300_coco/pisa_ssd300_coco_20200504_144325.log.json) | +| × | SSD512 | VGG16 | 1x | 29.3 | | - | +| √ | SSD512 | VGG16 | 1x | 31.8 | | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pisa/pisa_ssd512_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_ssd512_coco/pisa_ssd512_coco-247addee.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_ssd512_coco/pisa_ssd512_coco_20200508_131030.log.json) | + +**Notes:** + +- In the original paper, all models are trained and tested on mmdet v1.x, thus results may not be exactly the same with this release on v2.0. 
+- It is noted PISA only modifies the training pipeline so the inference time remains the same with the baseline. + +## Citation + +```latex +@inproceedings{cao2019prime, + title={Prime sample attention in object detection}, + author={Cao, Yuhang and Chen, Kai and Loy, Chen Change and Lin, Dahua}, + booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, + year={2020} +} +``` diff --git a/configs/mmdet/pisa/metafile.yml b/configs/mmdet/pisa/metafile.yml new file mode 100644 index 00000000..cd43afb0 --- /dev/null +++ b/configs/mmdet/pisa/metafile.yml @@ -0,0 +1,110 @@ +Collections: + - Name: PISA + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - PISA + - RPN + - ResNet + - RoIPool + Paper: + URL: https://arxiv.org/abs/1904.04821 + Title: 'Prime Sample Attention in Object Detection' + README: configs/pisa/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/roi_heads/pisa_roi_head.py#L8 + Version: v2.1.0 + +Models: + - Name: pisa_faster_rcnn_r50_fpn_1x_coco + In Collection: PISA + Config: configs/pisa/pisa_faster_rcnn_r50_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_faster_rcnn_r50_fpn_1x_coco/pisa_faster_rcnn_r50_fpn_1x_coco-dea93523.pth + + - Name: pisa_faster_rcnn_x101_32x4d_fpn_1x_coco + In Collection: PISA + Config: configs/pisa/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco-e4accec4.pth + + - Name: pisa_mask_rcnn_r50_fpn_1x_coco + In Collection: PISA + Config: configs/pisa/pisa_mask_rcnn_r50_fpn_1x_coco.py + Metadata: 
+ Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 35.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_mask_rcnn_r50_fpn_1x_coco/pisa_mask_rcnn_r50_fpn_1x_coco-dfcedba6.pth + + - Name: pisa_retinanet_r50_fpn_1x_coco + In Collection: PISA + Config: configs/pisa/pisa_retinanet_r50_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_retinanet_r50_fpn_1x_coco/pisa_retinanet_r50_fpn_1x_coco-76409952.pth + + - Name: pisa_retinanet_x101_32x4d_fpn_1x_coco + In Collection: PISA + Config: configs/pisa/pisa_retinanet_x101_32x4d_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_retinanet_x101_32x4d_fpn_1x_coco/pisa_retinanet_x101_32x4d_fpn_1x_coco-a0c13c73.pth + + - Name: pisa_ssd300_coco + In Collection: PISA + Config: configs/pisa/pisa_ssd300_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 27.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_ssd300_coco/pisa_ssd300_coco-710e3ac9.pth + + - Name: pisa_ssd512_coco + In Collection: PISA + Config: configs/pisa/pisa_ssd512_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 31.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_ssd512_coco/pisa_ssd512_coco-247addee.pth diff --git a/configs/mmdet/pisa/pisa_faster_rcnn_r50_fpn_1x_coco.py b/configs/mmdet/pisa/pisa_faster_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 00000000..71e65b0b --- /dev/null +++ b/configs/mmdet/pisa/pisa_faster_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,30 @@ +_base_ = 
'../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' + +model = dict( + roi_head=dict( + type='PISARoIHead', + bbox_head=dict( + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))), + train_cfg=dict( + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + sampler=dict( + type='ScoreHLRSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True, + k=0.5, + bias=0.), + isr=dict(k=2, bias=0), + carl=dict(k=1, bias=0.2))), + test_cfg=dict( + rpn=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0))) diff --git a/configs/mmdet/pisa/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco.py b/configs/mmdet/pisa/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco.py new file mode 100644 index 00000000..16edd99d --- /dev/null +++ b/configs/mmdet/pisa/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco.py @@ -0,0 +1,30 @@ +_base_ = '../faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco.py' + +model = dict( + roi_head=dict( + type='PISARoIHead', + bbox_head=dict( + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))), + train_cfg=dict( + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + sampler=dict( + type='ScoreHLRSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True, + k=0.5, + bias=0.), + isr=dict(k=2, bias=0), + carl=dict(k=1, bias=0.2))), + test_cfg=dict( + rpn=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0))) diff --git a/configs/mmdet/pisa/pisa_mask_rcnn_r50_fpn_1x_coco.py b/configs/mmdet/pisa/pisa_mask_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 00000000..047a2934 --- /dev/null +++ b/configs/mmdet/pisa/pisa_mask_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,30 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' + +model = dict( + roi_head=dict( + type='PISARoIHead', + 
bbox_head=dict( + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))), + train_cfg=dict( + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + sampler=dict( + type='ScoreHLRSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True, + k=0.5, + bias=0.), + isr=dict(k=2, bias=0), + carl=dict(k=1, bias=0.2))), + test_cfg=dict( + rpn=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0))) diff --git a/configs/mmdet/pisa/pisa_mask_rcnn_x101_32x4d_fpn_1x_coco.py b/configs/mmdet/pisa/pisa_mask_rcnn_x101_32x4d_fpn_1x_coco.py new file mode 100644 index 00000000..2186a8f6 --- /dev/null +++ b/configs/mmdet/pisa/pisa_mask_rcnn_x101_32x4d_fpn_1x_coco.py @@ -0,0 +1,30 @@ +_base_ = '../mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py' + +model = dict( + roi_head=dict( + type='PISARoIHead', + bbox_head=dict( + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))), + train_cfg=dict( + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + sampler=dict( + type='ScoreHLRSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True, + k=0.5, + bias=0.), + isr=dict(k=2, bias=0), + carl=dict(k=1, bias=0.2))), + test_cfg=dict( + rpn=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0))) diff --git a/configs/mmdet/pisa/pisa_retinanet_r50_fpn_1x_coco.py b/configs/mmdet/pisa/pisa_retinanet_r50_fpn_1x_coco.py new file mode 100644 index 00000000..70f89e22 --- /dev/null +++ b/configs/mmdet/pisa/pisa_retinanet_r50_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py' + +model = dict( + bbox_head=dict( + type='PISARetinaHead', + loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0)), + train_cfg=dict(isr=dict(k=2., bias=0.), 
carl=dict(k=1., bias=0.2))) diff --git a/configs/mmdet/pisa/pisa_retinanet_x101_32x4d_fpn_1x_coco.py b/configs/mmdet/pisa/pisa_retinanet_x101_32x4d_fpn_1x_coco.py new file mode 100644 index 00000000..b97b6720 --- /dev/null +++ b/configs/mmdet/pisa/pisa_retinanet_x101_32x4d_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = '../retinanet/retinanet_x101_32x4d_fpn_1x_coco.py' + +model = dict( + bbox_head=dict( + type='PISARetinaHead', + loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0)), + train_cfg=dict(isr=dict(k=2., bias=0.), carl=dict(k=1., bias=0.2))) diff --git a/configs/mmdet/pisa/pisa_ssd300_coco.py b/configs/mmdet/pisa/pisa_ssd300_coco.py new file mode 100644 index 00000000..b5cc0064 --- /dev/null +++ b/configs/mmdet/pisa/pisa_ssd300_coco.py @@ -0,0 +1,8 @@ +_base_ = '../ssd/ssd300_coco.py' + +model = dict( + bbox_head=dict(type='PISASSDHead'), + train_cfg=dict(isr=dict(k=2., bias=0.), carl=dict(k=1., bias=0.2))) + +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git a/configs/mmdet/pisa/pisa_ssd512_coco.py b/configs/mmdet/pisa/pisa_ssd512_coco.py new file mode 100644 index 00000000..3219d6d6 --- /dev/null +++ b/configs/mmdet/pisa/pisa_ssd512_coco.py @@ -0,0 +1,8 @@ +_base_ = '../ssd/ssd512_coco.py' + +model = dict( + bbox_head=dict(type='PISASSDHead'), + train_cfg=dict(isr=dict(k=2., bias=0.), carl=dict(k=1., bias=0.2))) + +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git a/configs/mmdet/point_rend/README.md b/configs/mmdet/point_rend/README.md new file mode 100644 index 00000000..a55560af --- /dev/null +++ b/configs/mmdet/point_rend/README.md @@ -0,0 +1,33 @@ +# PointRend + +> [PointRend: Image Segmentation as Rendering](https://arxiv.org/abs/1912.08193) + + + +## Abstract + +We present a new method for efficient high-quality image segmentation of objects and scenes. 
By analogizing classical computer graphics methods for efficient rendering with over- and undersampling challenges faced in pixel labeling tasks, we develop a unique perspective of image segmentation as a rendering problem. From this vantage, we present the PointRend (Point-based Rendering) neural network module: a module that performs point-based segmentation predictions at adaptively selected locations based on an iterative subdivision algorithm. PointRend can be flexibly applied to both instance and semantic segmentation tasks by building on top of existing state-of-the-art models. While many concrete implementations of the general idea are possible, we show that a simple design already achieves excellent results. Qualitatively, PointRend outputs crisp object boundaries in regions that are over-smoothed by previous methods. Quantitatively, PointRend yields significant gains on COCO and Cityscapes, for both instance and semantic segmentation. PointRend's efficiency enables output resolutions that are otherwise impractical in terms of memory or computation compared to existing approaches. + +
+ +
+ +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------: | :--------: | +| R-50-FPN | caffe | 1x | 4.6 | | 38.4 | 36.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco/point_rend_r50_caffe_fpn_mstrain_1x_coco-1bcb5fb4.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco/point_rend_r50_caffe_fpn_mstrain_1x_coco_20200612_161407.log.json) | +| R-50-FPN | caffe | 3x | 4.6 | | 41.0 | 38.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/point_rend/point_rend_r50_caffe_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/point_rend/point_rend_r50_caffe_fpn_mstrain_3x_coco/point_rend_r50_caffe_fpn_mstrain_3x_coco-e0ebb6b7.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/point_rend/point_rend_r50_caffe_fpn_mstrain_3x_coco/point_rend_r50_caffe_fpn_mstrain_3x_coco_20200614_002632.log.json) | + +Note: All models are trained with multi-scale, the input image shorter side is randomly scaled to one of (640, 672, 704, 736, 768, 800). 
+ +## Citation + +```latex +@InProceedings{kirillov2019pointrend, + title={{PointRend}: Image Segmentation as Rendering}, + author={Alexander Kirillov and Yuxin Wu and Kaiming He and Ross Girshick}, + journal={ArXiv:1912.08193}, + year={2019} +} +``` diff --git a/configs/mmdet/point_rend/metafile.yml b/configs/mmdet/point_rend/metafile.yml new file mode 100644 index 00000000..82aea05b --- /dev/null +++ b/configs/mmdet/point_rend/metafile.yml @@ -0,0 +1,54 @@ +Collections: + - Name: PointRend + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - PointRend + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/1912.08193 + Title: 'PointRend: Image Segmentation as Rendering' + README: configs/point_rend/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.2.0/mmdet/models/detectors/point_rend.py#L6 + Version: v2.2.0 + +Models: + - Name: point_rend_r50_caffe_fpn_mstrain_1x_coco + In Collection: PointRend + Config: configs/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco.py + Metadata: + Training Memory (GB): 4.6 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco/point_rend_r50_caffe_fpn_mstrain_1x_coco-1bcb5fb4.pth + + - Name: point_rend_r50_caffe_fpn_mstrain_3x_coco + In Collection: PointRend + Config: configs/point_rend/point_rend_r50_caffe_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 4.6 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.0 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/point_rend/point_rend_r50_caffe_fpn_mstrain_3x_coco/point_rend_r50_caffe_fpn_mstrain_3x_coco-e0ebb6b7.pth diff --git a/configs/mmdet/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco.py b/configs/mmdet/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco.py new file mode 100644 index 00000000..0c0e563d --- /dev/null +++ b/configs/mmdet/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco.py @@ -0,0 +1,44 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain_1x_coco.py' +# model settings +model = dict( + type='PointRend', + roi_head=dict( + type='PointRendRoIHead', + mask_roi_extractor=dict( + type='GenericRoIExtractor', + aggregation='concat', + roi_layer=dict( + _delete_=True, type='SimpleRoIAlign', output_size=14), + out_channels=256, + featmap_strides=[4]), + mask_head=dict( + _delete_=True, + type='CoarseMaskHead', + num_fcs=2, + in_channels=256, + conv_out_channels=256, + fc_out_channels=1024, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)), + point_head=dict( + type='MaskPointHead', + num_fcs=3, + in_channels=256, + fc_channels=256, + num_classes=80, + coarse_pred_each_layer=True, + loss_point=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rcnn=dict( + mask_size=7, + num_points=14 * 14, + oversample_ratio=3, + importance_sample_ratio=0.75)), + test_cfg=dict( + rcnn=dict( + subdivision_steps=5, + subdivision_num_points=28 * 28, + scale_factor=2))) diff --git a/configs/mmdet/point_rend/point_rend_r50_caffe_fpn_mstrain_3x_coco.py b/configs/mmdet/point_rend/point_rend_r50_caffe_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..169278e5 --- /dev/null +++ b/configs/mmdet/point_rend/point_rend_r50_caffe_fpn_mstrain_3x_coco.py @@ -0,0 +1,4 @@ +_base_ = './point_rend_r50_caffe_fpn_mstrain_1x_coco.py' +# learning policy +lr_config = dict(step=[28, 34]) +runner = 
dict(type='EpochBasedRunner', max_epochs=36) diff --git a/configs/mmdet/pvt/README.md b/configs/mmdet/pvt/README.md new file mode 100644 index 00000000..25528d0d --- /dev/null +++ b/configs/mmdet/pvt/README.md @@ -0,0 +1,57 @@ +# PVT + +> [Pyramid vision transformer: A versatile backbone for dense prediction without convolutions](https://arxiv.org/abs/2102.12122) + + + +## Abstract + +Although using convolutional neural networks (CNNs) as backbones achieves great successes in computer vision, this work investigates a simple backbone network useful for many dense prediction tasks without convolutions. Unlike the recently-proposed Transformer model (e.g., ViT) that is specially designed for image classification, we propose Pyramid Vision Transformer (PVT), which overcomes the difficulties of porting Transformer to various dense prediction tasks. PVT has several merits compared to prior arts. (1) Different from ViT that typically has low-resolution outputs and high computational and memory cost, PVT can be not only trained on dense partitions of the image to achieve high output resolution, which is important for dense predictions but also using a progressive shrinking pyramid to reduce computations of large feature maps. (2) PVT inherits the advantages from both CNN and Transformer, making it a unified backbone in various vision tasks without convolutions by simply replacing CNN backbones. (3) We validate PVT by conducting extensive experiments, showing that it boosts the performance of many downstream tasks, e.g., object detection, semantic, and instance segmentation. For example, with a comparable number of parameters, RetinaNet+PVT achieves 40.4 AP on the COCO dataset, surpassing RetinaNet+ResNet50 (36.3 AP) by 4.1 absolute AP. We hope PVT could serve as an alternative and useful backbone for pixel-level predictions and facilitate future researches. + +Transformer recently has shown encouraging progresses in computer vision. 
In this work, we present new baselines by improving the original Pyramid Vision Transformer (abbreviated as PVTv1) by adding three designs, including (1) overlapping patch embedding, (2) convolutional feed-forward networks, and (3) linear complexity attention layers. +With these modifications, our PVTv2 significantly improves PVTv1 on three tasks e.g., classification, detection, and segmentation. Moreover, PVTv2 achieves comparable or better performances than recent works such as Swin Transformer. We hope this work will facilitate state-of-the-art Transformer researches in computer vision. + +
+ +
+ +## Results and Models + +### RetinaNet (PVTv1) + +| Backbone | Lr schd | Mem (GB) | box AP | Config | Download | +|:-----------:|:-------:|:--------:|:------:|:------:|:--------:| +| PVT-Tiny | 12e |8.5 |36.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pvt/retinanet_pvt-t_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-t_fpn_1x_coco/retinanet_pvt-t_fpn_1x_coco_20210831_103110-17b566bd.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-t_fpn_1x_coco/retinanet_pvt-t_fpn_1x_coco_20210831_103110.log.json) | +| PVT-Small | 12e |14.5 |40.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pvt/retinanet_pvt-s_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-s_fpn_1x_coco/retinanet_pvt-s_fpn_1x_coco_20210906_142921-b6c94a5b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-s_fpn_1x_coco/retinanet_pvt-s_fpn_1x_coco_20210906_142921.log.json) | +| PVT-Medium | 12e |20.9 |41.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pvt/retinanet_pvt-m_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-m_fpn_1x_coco/retinanet_pvt-m_fpn_1x_coco_20210831_103243-55effa1b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-m_fpn_1x_coco/retinanet_pvt-m_fpn_1x_coco_20210831_103243.log.json) | + +### RetinaNet (PVTv2) + +| Backbone | Lr schd | Mem (GB) | box AP | Config | Download | +|:-----------:|:-------:|:--------:|:------:|:------:|:--------:| +| PVTv2-B0 | 12e |7.4 |37.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pvt/retinanet_pvtv2-b0_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b0_fpn_1x_coco/retinanet_pvtv2-b0_fpn_1x_coco_20210831_103157-13e9aabe.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b0_fpn_1x_coco/retinanet_pvtv2-b0_fpn_1x_coco_20210831_103157.log.json) | +| PVTv2-B1 | 12e |9.5 |41.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pvt/retinanet_pvtv2-b1_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b1_fpn_1x_coco/retinanet_pvtv2-b1_fpn_1x_coco_20210831_103318-7e169a7d.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b1_fpn_1x_coco/retinanet_pvtv2-b1_fpn_1x_coco_20210831_103318.log.json) | +| PVTv2-B2 | 12e |16.2 |44.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pvt/retinanet_pvtv2-b2_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b2_fpn_1x_coco/retinanet_pvtv2-b2_fpn_1x_coco_20210901_174843-529f0b9a.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b2_fpn_1x_coco/retinanet_pvtv2-b2_fpn_1x_coco_20210901_174843.log.json) | +| PVTv2-B3 | 12e |23.0 |46.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pvt/retinanet_pvtv2-b3_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b3_fpn_1x_coco/retinanet_pvtv2-b3_fpn_1x_coco_20210903_151512-8357deff.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b3_fpn_1x_coco/retinanet_pvtv2-b3_fpn_1x_coco_20210903_151512.log.json) | +| PVTv2-B4 | 12e |17.0 |46.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pvt/retinanet_pvtv2-b4_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b4_fpn_1x_coco/retinanet_pvtv2-b4_fpn_1x_coco_20210901_170151-83795c86.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b4_fpn_1x_coco/retinanet_pvtv2-b4_fpn_1x_coco_20210901_170151.log.json) | +| PVTv2-B5 | 12e |18.7 |46.1 | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pvt/retinanet_pvtv2-b5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b5_fpn_1x_coco/retinanet_pvtv2-b5_fpn_1x_coco_20210902_201800-3420eb57.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b5_fpn_1x_coco/retinanet_pvtv2-b5_fpn_1x_coco_20210902_201800.log.json) | + +## Citation + +```latex +@article{wang2021pyramid, + title={Pyramid vision transformer: A versatile backbone for dense prediction without convolutions}, + author={Wang, Wenhai and Xie, Enze and Li, Xiang and Fan, Deng-Ping and Song, Kaitao and Liang, Ding and Lu, Tong and Luo, Ping and Shao, Ling}, + journal={arXiv preprint arXiv:2102.12122}, + year={2021} +} +``` + +```latex +@article{wang2021pvtv2, + title={PVTv2: Improved Baselines with Pyramid Vision Transformer}, + author={Wang, Wenhai and Xie, Enze and Li, Xiang and Fan, Deng-Ping and Song, Kaitao and Liang, Ding and Lu, Tong and Luo, Ping and Shao, Ling}, + journal={arXiv preprint arXiv:2106.13797}, + year={2021} +} +``` diff --git a/configs/mmdet/pvt/metafile.yml b/configs/mmdet/pvt/metafile.yml new file mode 100644 index 00000000..58843784 --- /dev/null +++ b/configs/mmdet/pvt/metafile.yml @@ -0,0 +1,243 @@ +Models: + - Name: retinanet_pvt-t_fpn_1x_coco + In Collection: RetinaNet + Config: configs/pvt/retinanet_pvt-t_fpn_1x_coco.py + Metadata: + Training Memory (GB): 8.5 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x NVIDIA V100 GPUs + Architecture: + - PyramidVisionTransformer + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-t_fpn_1x_coco/retinanet_pvt-t_fpn_1x_coco_20210831_103110-17b566bd.pth + Paper: + URL: https://arxiv.org/abs/2102.12122 + Title: "Pyramid Vision Transformer: A Versatile Backbone for Dense 
Prediction without Convolutions" + README: configs/pvt/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L315 + Version: 2.17.0 + + - Name: retinanet_pvt-s_fpn_1x_coco + In Collection: RetinaNet + Config: configs/pvt/retinanet_pvt-s_fpn_1x_coco.py + Metadata: + Training Memory (GB): 14.5 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x NVIDIA V100 GPUs + Architecture: + - PyramidVisionTransformer + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-s_fpn_1x_coco/retinanet_pvt-s_fpn_1x_coco_20210906_142921-b6c94a5b.pth + Paper: + URL: https://arxiv.org/abs/2102.12122 + Title: "Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions" + README: configs/pvt/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L315 + Version: 2.17.0 + + - Name: retinanet_pvt-m_fpn_1x_coco + In Collection: RetinaNet + Config: configs/pvt/retinanet_pvt-m_fpn_1x_coco.py + Metadata: + Training Memory (GB): 20.9 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x NVIDIA V100 GPUs + Architecture: + - PyramidVisionTransformer + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-m_fpn_1x_coco/retinanet_pvt-m_fpn_1x_coco_20210831_103243-55effa1b.pth + Paper: + URL: https://arxiv.org/abs/2102.12122 + Title: "Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions" + README: configs/pvt/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L315 + Version: 2.17.0 + + - Name: retinanet_pvtv2-b0_fpn_1x_coco + In 
Collection: RetinaNet + Config: configs/pvt/retinanet_pvtv2-b0_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.4 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x NVIDIA V100 GPUs + Architecture: + - PyramidVisionTransformerV2 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b0_fpn_1x_coco/retinanet_pvtv2-b0_fpn_1x_coco_20210831_103157-13e9aabe.pth + Paper: + URL: https://arxiv.org/abs/2106.13797 + Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer" + README: configs/pvt/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543 + Version: 2.17.0 + + - Name: retinanet_pvtv2-b1_fpn_1x_coco + In Collection: RetinaNet + Config: configs/pvt/retinanet_pvtv2-b1_fpn_1x_coco.py + Metadata: + Training Memory (GB): 9.5 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x NVIDIA V100 GPUs + Architecture: + - PyramidVisionTransformerV2 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b1_fpn_1x_coco/retinanet_pvtv2-b1_fpn_1x_coco_20210831_103318-7e169a7d.pth + Paper: + URL: https://arxiv.org/abs/2106.13797 + Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer" + README: configs/pvt/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543 + Version: 2.17.0 + + - Name: retinanet_pvtv2-b2_fpn_1x_coco + In Collection: RetinaNet + Config: configs/pvt/retinanet_pvtv2-b2_fpn_1x_coco.py + Metadata: + Training Memory (GB): 16.2 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x NVIDIA V100 GPUs + Architecture: + - 
PyramidVisionTransformerV2 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b2_fpn_1x_coco/retinanet_pvtv2-b2_fpn_1x_coco_20210901_174843-529f0b9a.pth + Paper: + URL: https://arxiv.org/abs/2106.13797 + Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer" + README: configs/pvt/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543 + Version: 2.17.0 + + - Name: retinanet_pvtv2-b3_fpn_1x_coco + In Collection: RetinaNet + Config: configs/pvt/retinanet_pvtv2-b3_fpn_1x_coco.py + Metadata: + Training Memory (GB): 23.0 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x NVIDIA V100 GPUs + Architecture: + - PyramidVisionTransformerV2 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b3_fpn_1x_coco/retinanet_pvtv2-b3_fpn_1x_coco_20210903_151512-8357deff.pth + Paper: + URL: https://arxiv.org/abs/2106.13797 + Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer" + README: configs/pvt/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543 + Version: 2.17.0 + + - Name: retinanet_pvtv2-b4_fpn_1x_coco + In Collection: RetinaNet + Config: configs/pvt/retinanet_pvtv2-b4_fpn_1x_coco.py + Metadata: + Training Memory (GB): 17.0 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x NVIDIA V100 GPUs + Architecture: + - PyramidVisionTransformerV2 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b4_fpn_1x_coco/retinanet_pvtv2-b4_fpn_1x_coco_20210901_170151-83795c86.pth + Paper: + URL: 
https://arxiv.org/abs/2106.13797 + Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer" + README: configs/pvt/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543 + Version: 2.17.0 + + - Name: retinanet_pvtv2-b5_fpn_1x_coco + In Collection: RetinaNet + Config: configs/pvt/retinanet_pvtv2-b5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 18.7 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x NVIDIA V100 GPUs + Architecture: + - PyramidVisionTransformerV2 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b5_fpn_1x_coco/retinanet_pvtv2-b5_fpn_1x_coco_20210902_201800-3420eb57.pth + Paper: + URL: https://arxiv.org/abs/2106.13797 + Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer" + README: configs/pvt/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543 + Version: 2.17.0 diff --git a/configs/mmdet/pvt/retinanet_pvt-l_fpn_1x_coco.py b/configs/mmdet/pvt/retinanet_pvt-l_fpn_1x_coco.py new file mode 100644 index 00000000..e299f2a0 --- /dev/null +++ b/configs/mmdet/pvt/retinanet_pvt-l_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = 'retinanet_pvt-t_fpn_1x_coco.py' +model = dict( + backbone=dict( + num_layers=[3, 8, 27, 3], + init_cfg=dict(checkpoint='https://github.com/whai362/PVT/' + 'releases/download/v2/pvt_large.pth'))) +fp16 = dict(loss_scale=dict(init_scale=512)) diff --git a/configs/mmdet/pvt/retinanet_pvt-m_fpn_1x_coco.py b/configs/mmdet/pvt/retinanet_pvt-m_fpn_1x_coco.py new file mode 100644 index 00000000..b888f788 --- /dev/null +++ b/configs/mmdet/pvt/retinanet_pvt-m_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = 'retinanet_pvt-t_fpn_1x_coco.py' +model = dict( + backbone=dict( + num_layers=[3, 4, 18, 3], + 
init_cfg=dict(checkpoint='https://github.com/whai362/PVT/' + 'releases/download/v2/pvt_medium.pth'))) diff --git a/configs/mmdet/pvt/retinanet_pvt-s_fpn_1x_coco.py b/configs/mmdet/pvt/retinanet_pvt-s_fpn_1x_coco.py new file mode 100644 index 00000000..46603488 --- /dev/null +++ b/configs/mmdet/pvt/retinanet_pvt-s_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = 'retinanet_pvt-t_fpn_1x_coco.py' +model = dict( + backbone=dict( + num_layers=[3, 4, 6, 3], + init_cfg=dict(checkpoint='https://github.com/whai362/PVT/' + 'releases/download/v2/pvt_small.pth'))) diff --git a/configs/mmdet/pvt/retinanet_pvt-t_fpn_1x_coco.py b/configs/mmdet/pvt/retinanet_pvt-t_fpn_1x_coco.py new file mode 100644 index 00000000..a6cff7d0 --- /dev/null +++ b/configs/mmdet/pvt/retinanet_pvt-t_fpn_1x_coco.py @@ -0,0 +1,16 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='RetinaNet', + backbone=dict( + _delete_=True, + type='PyramidVisionTransformer', + num_layers=[2, 2, 2, 2], + init_cfg=dict(checkpoint='https://github.com/whai362/PVT/' + 'releases/download/v2/pvt_tiny.pth')), + neck=dict(in_channels=[64, 128, 320, 512])) +# optimizer +optimizer = dict(_delete_=True, type='AdamW', lr=0.0001, weight_decay=0.0001) diff --git a/configs/mmdet/pvt/retinanet_pvtv2-b0_fpn_1x_coco.py b/configs/mmdet/pvt/retinanet_pvtv2-b0_fpn_1x_coco.py new file mode 100644 index 00000000..cbe2295d --- /dev/null +++ b/configs/mmdet/pvt/retinanet_pvtv2-b0_fpn_1x_coco.py @@ -0,0 +1,17 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='RetinaNet', + backbone=dict( + _delete_=True, + type='PyramidVisionTransformerV2', + embed_dims=32, + num_layers=[2, 2, 2, 2], + init_cfg=dict(checkpoint='https://github.com/whai362/PVT/' + 
'releases/download/v2/pvt_v2_b0.pth')), + neck=dict(in_channels=[32, 64, 160, 256])) +# optimizer +optimizer = dict(_delete_=True, type='AdamW', lr=0.0001, weight_decay=0.0001) diff --git a/configs/mmdet/pvt/retinanet_pvtv2-b1_fpn_1x_coco.py b/configs/mmdet/pvt/retinanet_pvtv2-b1_fpn_1x_coco.py new file mode 100644 index 00000000..5374c509 --- /dev/null +++ b/configs/mmdet/pvt/retinanet_pvtv2-b1_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = 'retinanet_pvtv2-b0_fpn_1x_coco.py' +model = dict( + backbone=dict( + embed_dims=64, + init_cfg=dict(checkpoint='https://github.com/whai362/PVT/' + 'releases/download/v2/pvt_v2_b1.pth')), + neck=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/mmdet/pvt/retinanet_pvtv2-b2_fpn_1x_coco.py b/configs/mmdet/pvt/retinanet_pvtv2-b2_fpn_1x_coco.py new file mode 100644 index 00000000..cf9a18de --- /dev/null +++ b/configs/mmdet/pvt/retinanet_pvtv2-b2_fpn_1x_coco.py @@ -0,0 +1,8 @@ +_base_ = 'retinanet_pvtv2-b0_fpn_1x_coco.py' +model = dict( + backbone=dict( + embed_dims=64, + num_layers=[3, 4, 6, 3], + init_cfg=dict(checkpoint='https://github.com/whai362/PVT/' + 'releases/download/v2/pvt_v2_b2.pth')), + neck=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/mmdet/pvt/retinanet_pvtv2-b3_fpn_1x_coco.py b/configs/mmdet/pvt/retinanet_pvtv2-b3_fpn_1x_coco.py new file mode 100644 index 00000000..7a47f820 --- /dev/null +++ b/configs/mmdet/pvt/retinanet_pvtv2-b3_fpn_1x_coco.py @@ -0,0 +1,8 @@ +_base_ = 'retinanet_pvtv2-b0_fpn_1x_coco.py' +model = dict( + backbone=dict( + embed_dims=64, + num_layers=[3, 4, 18, 3], + init_cfg=dict(checkpoint='https://github.com/whai362/PVT/' + 'releases/download/v2/pvt_v2_b3.pth')), + neck=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/mmdet/pvt/retinanet_pvtv2-b4_fpn_1x_coco.py b/configs/mmdet/pvt/retinanet_pvtv2-b4_fpn_1x_coco.py new file mode 100644 index 00000000..9891d7bd --- /dev/null +++ b/configs/mmdet/pvt/retinanet_pvtv2-b4_fpn_1x_coco.py @@ -0,0 +1,18 @@ +_base_ = 
'retinanet_pvtv2-b0_fpn_1x_coco.py' +model = dict( + backbone=dict( + embed_dims=64, + num_layers=[3, 8, 27, 3], + init_cfg=dict(checkpoint='https://github.com/whai362/PVT/' + 'releases/download/v2/pvt_v2_b4.pth')), + neck=dict(in_channels=[64, 128, 320, 512])) +# optimizer +optimizer = dict( + _delete_=True, type='AdamW', lr=0.0001 / 1.4, weight_decay=0.0001) +# dataset settings +data = dict(samples_per_gpu=1, workers_per_gpu=1) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (1 samples per GPU) +auto_scale_lr = dict(base_batch_size=8) diff --git a/configs/mmdet/pvt/retinanet_pvtv2-b5_fpn_1x_coco.py b/configs/mmdet/pvt/retinanet_pvtv2-b5_fpn_1x_coco.py new file mode 100644 index 00000000..a9fea2eb --- /dev/null +++ b/configs/mmdet/pvt/retinanet_pvtv2-b5_fpn_1x_coco.py @@ -0,0 +1,19 @@ +_base_ = 'retinanet_pvtv2-b0_fpn_1x_coco.py' +model = dict( + backbone=dict( + embed_dims=64, + num_layers=[3, 6, 40, 3], + mlp_ratios=(4, 4, 4, 4), + init_cfg=dict(checkpoint='https://github.com/whai362/PVT/' + 'releases/download/v2/pvt_v2_b5.pth')), + neck=dict(in_channels=[64, 128, 320, 512])) +# optimizer +optimizer = dict( + _delete_=True, type='AdamW', lr=0.0001 / 1.4, weight_decay=0.0001) +# dataset settings +data = dict(samples_per_gpu=1, workers_per_gpu=1) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (1 samples per GPU) +auto_scale_lr = dict(base_batch_size=8) diff --git a/configs/mmdet/queryinst/README.md b/configs/mmdet/queryinst/README.md new file mode 100644 index 00000000..c041662f --- /dev/null +++ b/configs/mmdet/queryinst/README.md @@ -0,0 +1,36 @@ +# QueryInst + +> [Instances as Queries](https://openaccess.thecvf.com/content/ICCV2021/html/Fang_Instances_As_Queries_ICCV_2021_paper.html) + + + +## Abstract + +We present QueryInst, a new perspective for instance segmentation. 
QueryInst is a multi-stage end-to-end system that treats instances of interest as learnable queries, enabling query based object detectors, e.g., Sparse R-CNN, to have strong instance segmentation performance. The attributes of instances such as categories, bounding boxes, instance masks, and instance association embeddings are represented by queries in a unified manner. In QueryInst, a query is shared by both detection and segmentation via dynamic convolutions and driven by parallelly-supervised multi-stage learning. We conduct extensive experiments on three challenging benchmarks, i.e., COCO, CityScapes, and YouTube-VIS to evaluate the effectiveness of QueryInst in object detection, instance segmentation, and video instance segmentation tasks. For the first time, we demonstrate that a simple end-to-end query based framework can achieve the state-of-the-art performance in various instance-level recognition tasks. + +
+ +
+ +## Results and Models + +| Model | Backbone | Style | Lr schd | Number of Proposals |Multi-Scale| RandomCrop | box AP | mask AP | Config | Download | +|:------------:|:---------:|:-------:|:-------:|:-------: |:-------: |:---------:|:------:|:------:|:------:|:--------:| +| QueryInst | R-50-FPN | pytorch | 1x | 100 | False | False | 42.0 | 37.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/queryinst/queryinst_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_1x_coco/queryinst_r50_fpn_1x_coco_20210907_084916-5a8f1998.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_1x_coco/queryinst_r50_fpn_1x_coco_20210907_084916.log.json) | +| QueryInst | R-50-FPN | pytorch | 3x | 100 | True | False | 44.8 | 39.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco/queryinst_r50_fpn_mstrain_480-800_3x_coco_20210901_103643-7837af86.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco/queryinst_r50_fpn_mstrain_480-800_3x_coco_20210901_103643.log.json) | +| QueryInst | R-50-FPN | pytorch | 3x | 300 | True | True | 47.5 | 41.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20210904_101802-85cffbd8.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20210904_101802.log.json) | +| QueryInst | 
R-101-FPN | pytorch | 3x | 100 | True | False | 46.4 | 41.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/queryinst/queryinst_r101_fpn_mstrain_480-800_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r101_fpn_mstrain_480-800_3x_coco/queryinst_r101_fpn_mstrain_480-800_3x_coco_20210904_104048-91f9995b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r101_fpn_mstrain_480-800_3x_coco/queryinst_r101_fpn_mstrain_480-800_3x_coco_20210904_104048.log.json) | +| QueryInst | R-101-FPN | pytorch | 3x | 300 | True | True | 49.0 | 42.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/queryinst/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20210904_153621-76cce59f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20210904_153621.log.json) | + +## Citation + +```latex +@InProceedings{Fang_2021_ICCV, + author = {Fang, Yuxin and Yang, Shusheng and Wang, Xinggang and Li, Yu and Fang, Chen and Shan, Ying and Feng, Bin and Liu, Wenyu}, + title = {Instances As Queries}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, + month = {October}, + year = {2021}, + pages = {6910-6919} +} +``` diff --git a/configs/mmdet/queryinst/metafile.yml b/configs/mmdet/queryinst/metafile.yml new file mode 100644 index 00000000..da7f0a72 --- /dev/null +++ b/configs/mmdet/queryinst/metafile.yml @@ -0,0 +1,100 @@ +Collections: + - Name: QueryInst + Metadata: + Training Data: COCO + Training Techniques: + - AdamW + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - 
ResNet + - QueryInst + Paper: + URL: https://openaccess.thecvf.com/content/ICCV2021/papers/Fang_Instances_As_Queries_ICCV_2021_paper.pdf + Title: 'Instances as Queries' + README: configs/queryinst/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/queryinst.py + Version: v2.18.0 + +Models: + - Name: queryinst_r50_fpn_1x_coco + In Collection: QueryInst + Config: configs/queryinst/queryinst_r50_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_1x_coco/queryinst_r50_fpn_1x_coco_20210907_084916-5a8f1998.pth + + - Name: queryinst_r50_fpn_mstrain_480-800_3x_coco + In Collection: QueryInst + Config: configs/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco/queryinst_r50_fpn_mstrain_480-800_3x_coco_20210901_103643-7837af86.pth + + - Name: queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco + In Collection: QueryInst + Config: configs/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20210904_101802-85cffbd8.pth + + - Name: queryinst_r101_fpn_mstrain_480-800_3x_coco + In Collection: QueryInst + Config: 
configs/queryinst/queryinst_r101_fpn_mstrain_480-800_3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r101_fpn_mstrain_480-800_3x_coco/queryinst_r101_fpn_mstrain_480-800_3x_coco_20210904_104048-91f9995b.pth + + - Name: queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco + In Collection: QueryInst + Config: configs/queryinst/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 49.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 42.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20210904_153621-76cce59f.pth diff --git a/configs/mmdet/queryinst/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py b/configs/mmdet/queryinst/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py new file mode 100644 index 00000000..fd138f5a --- /dev/null +++ b/configs/mmdet/queryinst/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py @@ -0,0 +1,7 @@ +_base_ = './queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/queryinst/queryinst_r101_fpn_mstrain_480-800_3x_coco.py b/configs/mmdet/queryinst/queryinst_r101_fpn_mstrain_480-800_3x_coco.py new file mode 100644 index 00000000..07cae19c --- /dev/null +++ b/configs/mmdet/queryinst/queryinst_r101_fpn_mstrain_480-800_3x_coco.py @@ -0,0 +1,7 @@ +_base_ = './queryinst_r50_fpn_mstrain_480-800_3x_coco.py' + +model = dict( + backbone=dict( + 
depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/queryinst/queryinst_r50_fpn_1x_coco.py b/configs/mmdet/queryinst/queryinst_r50_fpn_1x_coco.py new file mode 100644 index 00000000..48f5773b --- /dev/null +++ b/configs/mmdet/queryinst/queryinst_r50_fpn_1x_coco.py @@ -0,0 +1,138 @@ +_base_ = [ + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +num_stages = 6 +num_proposals = 100 +model = dict( + type='QueryInst', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=0, + add_extra_convs='on_input', + num_outs=4), + rpn_head=dict( + type='EmbeddingRPNHead', + num_proposals=num_proposals, + proposal_feature_channel=256), + roi_head=dict( + type='SparseRoIHead', + num_stages=num_stages, + stage_loss_weights=[1] * num_stages, + proposal_feature_channel=256, + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='DIIHead', + num_classes=80, + num_ffn_fcs=2, + num_heads=8, + num_cls_fcs=1, + num_reg_fcs=3, + feedforward_channels=2048, + in_channels=256, + dropout=0.0, + ffn_act_cfg=dict(type='ReLU', inplace=True), + dynamic_conv_cfg=dict( + type='DynamicConv', + in_channels=256, + feat_channels=64, + out_channels=256, + input_feat_shape=7, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN')), + 
loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + clip_border=False, + target_means=[0., 0., 0., 0.], + target_stds=[0.5, 0.5, 1., 1.])) for _ in range(num_stages) + ], + mask_head=[ + dict( + type='DynamicMaskHead', + dynamic_conv_cfg=dict( + type='DynamicConv', + in_channels=256, + feat_channels=64, + out_channels=256, + input_feat_shape=14, + with_proj=False, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN')), + num_convs=4, + num_classes=80, + roi_feat_size=14, + in_channels=256, + conv_kernel_size=3, + conv_out_channels=256, + class_agnostic=False, + norm_cfg=dict(type='BN'), + upsample_cfg=dict(type='deconv', scale_factor=2), + loss_mask=dict( + type='DiceLoss', + loss_weight=8.0, + use_sigmoid=True, + activate=False, + eps=1e-5)) for _ in range(num_stages) + ]), + # training and testing settings + train_cfg=dict( + rpn=None, + rcnn=[ + dict( + assigner=dict( + type='HungarianAssigner', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=5.0), + iou_cost=dict(type='IoUCost', iou_mode='giou', + weight=2.0)), + sampler=dict(type='PseudoSampler'), + pos_weight=1, + mask_size=28, + ) for _ in range(num_stages) + ]), + test_cfg=dict( + rpn=None, rcnn=dict(max_per_img=num_proposals, mask_thr_binary=0.5))) + +# optimizer +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.0001, + weight_decay=0.0001, + paramwise_cfg=dict( + custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)})) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2)) +# learning policy +lr_config = dict(policy='step', step=[8, 11], warmup_iters=1000) +runner = dict(type='EpochBasedRunner', max_epochs=12) diff --git 
a/configs/mmdet/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py b/configs/mmdet/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py new file mode 100644 index 00000000..3089b3c6 --- /dev/null +++ b/configs/mmdet/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py @@ -0,0 +1,54 @@ +_base_ = './queryinst_r50_fpn_mstrain_480-800_3x_coco.py' +num_proposals = 300 +model = dict( + rpn_head=dict(num_proposals=num_proposals), + test_cfg=dict( + _delete_=True, + rpn=None, + rcnn=dict(max_per_img=num_proposals, mask_thr_binary=0.5))) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# augmentation strategy originates from DETR. +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='AutoAugment', + policies=[[ + dict( + type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + multiscale_mode='value', + keep_ratio=True) + ], + [ + dict( + type='Resize', + img_scale=[(400, 1333), (500, 1333), (600, 1333)], + multiscale_mode='value', + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + multiscale_mode='value', + override=True, + keep_ratio=True) + ]]), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']) +] +data = dict(train=dict(pipeline=train_pipeline)) diff --git 
a/configs/mmdet/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco.py b/configs/mmdet/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco.py new file mode 100644 index 00000000..89e2cd10 --- /dev/null +++ b/configs/mmdet/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco.py @@ -0,0 +1,23 @@ +_base_ = './queryinst_r50_fpn_1x_coco.py' + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +min_values = (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(1333, value) for value in min_values], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']) +] + +data = dict(train=dict(pipeline=train_pipeline)) +lr_config = dict(policy='step', step=[27, 33]) +runner = dict(type='EpochBasedRunner', max_epochs=36) diff --git a/configs/mmdet/regnet/README.md b/configs/mmdet/regnet/README.md new file mode 100644 index 00000000..cb32d9b8 --- /dev/null +++ b/configs/mmdet/regnet/README.md @@ -0,0 +1,122 @@ +# RegNet + +> [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) + + + +## Abstract + + In this work, we present a new network design paradigm. Our goal is to help advance the understanding of network design and discover design principles that generalize across settings. Instead of focusing on designing individual network instances, we design network design spaces that parametrize populations of networks. The overall process is analogous to classic manual design of networks, but elevated to the design space level. 
Using our methodology we explore the structure aspect of network design and arrive at a low-dimensional design space consisting of simple, regular networks that we call RegNet. The core insight of the RegNet parametrization is surprisingly simple: widths and depths of good networks can be explained by a quantized linear function. We analyze the RegNet design space and arrive at interesting findings that do not match the current practice of network design. The RegNet design space provides simple and fast networks that work well across a wide range of flop regimes. Under comparable training settings and flops, the RegNet models outperform the popular EfficientNet models while being up to 5x faster on GPUs. + +
+ +
+ +## Introduction + +We implement RegNetX and RegNetY models in detection systems and provide their first results on Mask R-CNN, Faster R-CNN and RetinaNet. + +The pre-trained models are converted from the [model zoo of pycls](https://github.com/facebookresearch/pycls/blob/master/MODEL_ZOO.md). + +## Usage + +To use a RegNet model, there are two steps: + +1. Convert the model to the ResNet-style format supported by MMDetection +2. Modify the backbone and neck in the config accordingly + +### Convert model + +We have already prepared models with FLOPs ranging from 400M to 12G in our model zoo. + +For more general usage, we also provide the script `regnet2mmdet.py` in the tools directory to convert the keys of models pretrained by [pycls](https://github.com/facebookresearch/pycls/) to +ResNet-style checkpoints used in MMDetection. + +```bash +python -u tools/model_converters/regnet2mmdet.py ${PRETRAIN_PATH} ${STORE_PATH} +``` + +This script converts the model from `PRETRAIN_PATH` and stores the converted model in `STORE_PATH`. + +### Modify config + +Users can modify the `depth` of the backbone and the corresponding keys in `arch` in the config according to the configs in the [pycls model zoo](https://github.com/facebookresearch/pycls/blob/master/MODEL_ZOO.md). +The parameter `in_channels` in FPN can be found in Figures 15 & 16 of the paper (`wi` in the legend). +This directory already provides some configs with their performance, using RegNetX from the 800MF to the 12GF level. +For other pre-trained models or self-implemented RegNet models, users are responsible for checking these parameters themselves. + +**Note**: Although Fig. 15 & 16 also provide `w0`, `wa`, `wm`, `group_w`, and `bot_mul` for `arch`, they are quantized and thus inaccurate; using them sometimes produces a different backbone that does not match the keys in the pre-trained model. 
+ +## Results and Models + +### Mask R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :---------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------: | :--------: | +| [R-50-FPN](../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py)| pytorch | 1x | 4.4 | 12.0 | 38.2 | 34.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205_050542.log.json) | +|[RegNetX-3.2GF-FPN](./mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py)| pytorch | 1x |5.0 ||40.3|36.6|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco/mask_rcnn_regnetx-3.2GF_fpn_1x_coco_20200520_163141-2a9d1814.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco/mask_rcnn_regnetx-3.2GF_fpn_1x_coco_20200520_163141.log.json) | +|[RegNetX-4.0GF-FPN](./mask_rcnn_regnetx-4GF_fpn_1x_coco.py)| pytorch | 1x |5.5||41.5|37.4|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-4GF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-4GF_fpn_1x_coco/mask_rcnn_regnetx-4GF_fpn_1x_coco_20200517_180217-32e9c92d.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-4GF_fpn_1x_coco/mask_rcnn_regnetx-4GF_fpn_1x_coco_20200517_180217.log.json) | +| [R-101-FPN](../mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py)| pytorch | 1x | 6.4 | 10.3 | 40.0 | 36.1 | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_1x_coco/mask_rcnn_r101_fpn_1x_coco_20200204-1efe0ed5.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_1x_coco/mask_rcnn_r101_fpn_1x_coco_20200204_144809.log.json) | +|[RegNetX-6.4GF-FPN](./mask_rcnn_regnetx-6.4GF_fpn_1x_coco.py)| pytorch | 1x |6.1 ||41.0|37.1|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-6.4GF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-6.4GF_fpn_1x_coco/mask_rcnn_regnetx-6.4GF_fpn_1x_coco_20200517_180439-3a7aae83.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-6.4GF_fpn_1x_coco/mask_rcnn_regnetx-6.4GF_fpn_1x_coco_20200517_180439.log.json) | +| [X-101-32x4d-FPN](../mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py) | pytorch | 1x | 7.6 | 9.4 | 41.9 | 37.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco/mask_rcnn_x101_32x4d_fpn_1x_coco_20200205-478d0b67.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco/mask_rcnn_x101_32x4d_fpn_1x_coco_20200205_034906.log.json) | +|[RegNetX-8.0GF-FPN](./mask_rcnn_regnetx-8GF_fpn_1x_coco.py)| pytorch | 1x |6.4 ||41.7|37.5|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-8GF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-8GF_fpn_1x_coco/mask_rcnn_regnetx-8GF_fpn_1x_coco_20200517_180515-09daa87e.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-8GF_fpn_1x_coco/mask_rcnn_regnetx-8GF_fpn_1x_coco_20200517_180515.log.json) | +|[RegNetX-12GF-FPN](./mask_rcnn_regnetx-12GF_fpn_1x_coco.py)| pytorch | 1x |7.4 ||42.2|38|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-12GF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-12GF_fpn_1x_coco/mask_rcnn_regnetx-12GF_fpn_1x_coco_20200517_180552-b538bd8b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-12GF_fpn_1x_coco/mask_rcnn_regnetx-12GF_fpn_1x_coco_20200517_180552.log.json) | +|[RegNetX-3.2GF-FPN-DCN-C3-C5](./mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco.py)| pytorch | 1x |5.0 ||40.3|36.6|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco_20200520_172726-75f40794.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco_20200520_172726.log.json) | + +### Faster R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :---------: | :-----: | :-----: | :------: | :------------: | :----: | :------: | :--------: | +| [R-50-FPN](../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py)| pytorch | 1x | 4.0 | 18.2 | 37.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130_204655.log.json) | +|[RegNetX-3.2GF-FPN](./faster_rcnn_regnetx-3.2GF_fpn_1x_coco.py)| pytorch | 1x | 4.5||39.9|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/faster_rcnn_regnetx-3.2GF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_1x_coco/faster_rcnn_regnetx-3.2GF_fpn_1x_coco_20200517_175927-126fd9bf.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_1x_coco/faster_rcnn_regnetx-3.2GF_fpn_1x_coco_20200517_175927.log.json) | +|[RegNetX-3.2GF-FPN](./faster_rcnn_regnetx-3.2GF_fpn_2x_coco.py)| pytorch | 2x | 4.5||41.1|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/faster_rcnn_regnetx-3.2GF_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_2x_coco/faster_rcnn_regnetx-3.2GF_fpn_2x_coco_20200520_223955-e2081918.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_2x_coco/faster_rcnn_regnetx-3.2GF_fpn_2x_coco_20200520_223955.log.json) | + +### RetinaNet + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :---------: | :-----: | :-----: | :------: | :------------: | :----: | :------: | :--------: | +| [R-50-FPN](../retinanet/retinanet_r50_fpn_1x_coco.py) | pytorch | 1x | 3.8 | 16.6 | 36.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_1x_coco/retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_1x_coco/retinanet_r50_fpn_1x_coco_20200130_002941.log.json) | 
+|[RegNetX-800MF-FPN](./retinanet_regnetx-800MF_fpn_1x_coco.py)| pytorch | 1x |2.5||35.6|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/retinanet_regnetx-800MF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-800MF_fpn_1x_coco/retinanet_regnetx-800MF_fpn_1x_coco_20200517_191403-f6f91d10.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-800MF_fpn_1x_coco/retinanet_regnetx-800MF_fpn_1x_coco_20200517_191403.log.json) | +|[RegNetX-1.6GF-FPN](./retinanet_regnetx-1.6GF_fpn_1x_coco.py)| pytorch | 1x |3.3||37.3|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco/retinanet_regnetx-1.6GF_fpn_1x_coco_20200517_191403-37009a9d.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco/retinanet_regnetx-1.6GF_fpn_1x_coco_20200517_191403.log.json) | +|[RegNetX-3.2GF-FPN](./retinanet_regnetx-3.2GF_fpn_1x_coco.py)| pytorch | 1x |4.2 ||39.1|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco/retinanet_regnetx-3.2GF_fpn_1x_coco_20200520_163141-cb1509e8.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco/retinanet_regnetx-3.2GF_fpn_1x_coco_20200520_163141.log.json) | + +### Pre-trained models + +We also train some models with longer schedules and multi-scale training. The users could finetune them for downstream tasks. 
+ +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-----: | :-----: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------: | :--------: | +|Faster RCNN |[RegNetX-400MF-FPN](./faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py)| pytorch | 3x |2.3 ||37.1|-|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco_20210526_095112-e1967c37.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco_20210526_095112.log.json) | +|Faster RCNN |[RegNetX-800MF-FPN](./faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py)| pytorch | 3x |2.8 ||38.8|-|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco_20210526_095118-a2c70b20.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco_20210526_095118.log.json) | +|Faster RCNN |[RegNetX-1.6GF-FPN](./faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py)| pytorch | 3x |3.4 ||40.5|-|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-1_20210526_095325-94aa46cc.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-1_20210526_095325.log.json) | +|Faster RCNN |[RegNetX-3.2GF-FPN](./faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py)| pytorch | 3x |4.4 ||42.3|-|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-3_20210526_095152-e16a5227.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-3_20210526_095152.log.json) | +|Faster RCNN |[RegNetX-4GF-FPN](./faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py)| pytorch | 3x |4.9 ||42.8|-|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco_20210526_095201-65eaf841.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco_20210526_095201.log.json) | +|Mask RCNN |[RegNetX-3.2GF-FPN](./mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py)| pytorch | 3x |5.0 ||43.1|38.7|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco_20200521_202221-99879813.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco_20200521_202221.log.json) | +|Mask RCNN 
|[RegNetX-400MF-FPN](./mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco.py)| pytorch | 3x |2.5 ||37.6|34.4|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco_20210601_235443-8aac57a4.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco_20210601_235443.log.json) | +|Mask RCNN |[RegNetX-800MF-FPN](./mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco.py)| pytorch | 3x |2.9 ||39.5|36.1|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco_20210602_210641-715d51f5.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco_20210602_210641.log.json) | +|Mask RCNN |[RegNetX-1.6GF-FPN](./mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco.py)| pytorch | 3x |3.6 ||40.9|37.5|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-1_20210602_210641-6764cff5.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-1_20210602_210641.log.json) | +|Mask RCNN |[RegNetX-3.2GF-FPN](./mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py) | pytorch | 3x |5.0 
||43.1|38.7|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco_20200521_202221-99879813.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco_20200521_202221.log.json) | +|Mask RCNN |[RegNetX-4GF-FPN](./mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco.py) | pytorch | 3x |5.1 ||43.4|39.2|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco_20210602_032621-00f0331c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco_20210602_032621.log.json) | +|Cascade Mask RCNN |[RegNetX-400MF-FPN](./cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py)| pytorch | 3x |4.3||41.6|36.4|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco_20210715_211619-5142f449.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco_20210715_211619.log.json) | +|Cascade Mask RCNN |[RegNetX-800MF-FPN](./cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py)| pytorch | 3x 
|4.8||42.8|37.6|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco_20210715_211616-dcbd13f4.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco_20210715_211616.log.json) | +|Cascade Mask RCNN |[RegNetX-1.6GF-FPN](./cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py)| pytorch | 3x |5.4||44.5|39.0|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-1_20210715_211616-75f29a61.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-1_20210715_211616.log.json) | +|Cascade Mask RCNN |[RegNetX-3.2GF-FPN](./cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py)| pytorch | 3x |6.4||45.8|40.0|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-3_20210715_211616-b9c2c58b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-3_20210715_211616.log.json) | +|Cascade Mask RCNN |[RegNetX-4GF-FPN](./cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py) | pytorch | 3x 
|6.9||45.8|40.0|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco_20210715_212034-cbb1be4c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco_20210715_212034.log.json) | + +### Notice + +1. The models are trained using a different weight decay, i.e., `weight_decay=5e-5`, according to the setting in ImageNet training. This brings an improvement of at least 0.7 AP absolute, but does not improve the model using ResNet-50. +2. RetinaNets using RegNets are trained with a learning rate of 0.02 and gradient clipping. We find that using a learning rate of 0.02 can improve the results by at least 0.7 AP absolute, and that gradient clipping is necessary to stabilize the training. However, this does not improve the performance of ResNet-50-FPN RetinaNet. 
+ +## Citation + +```latex +@article{radosavovic2020designing, + title={Designing Network Design Spaces}, + author={Ilija Radosavovic and Raj Prateek Kosaraju and Ross Girshick and Kaiming He and Piotr Dollár}, + year={2020}, + eprint={2003.13678}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/configs/mmdet/regnet/cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py b/configs/mmdet/regnet/cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..358d85aa --- /dev/null +++ b/configs/mmdet/regnet/cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py @@ -0,0 +1,17 @@ +_base_ = 'cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_1.6gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf')), + neck=dict( + type='FPN', + in_channels=[72, 168, 408, 912], + out_channels=256, + num_outs=5)) diff --git a/configs/mmdet/regnet/cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py b/configs/mmdet/regnet/cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..84645718 --- /dev/null +++ b/configs/mmdet/regnet/cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py @@ -0,0 +1,63 @@ +_base_ = [ + '../common/mstrain_3x_coco_instance.py', + '../_base_/models/cascade_mask_rcnn_r50_fpn.py' +] +model = dict( + backbone=dict( + _delete_=True, + type='RegNet', + arch='regnetx_3.2gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')), + neck=dict( + type='FPN', + in_channels=[96, 192, 432, 1008], + out_channels=256, + num_outs=5)) +img_norm_cfg = dict( + # The mean and std are used in PyCls when training RegNets 
+ mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + to_rgb=False) +train_pipeline = [ + # Images are converted to float32 directly after loading in PyCls + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +data = dict( + train=dict(dataset=dict(pipeline=train_pipeline)), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) + +optimizer = dict(weight_decay=0.00005) diff --git a/configs/mmdet/regnet/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py b/configs/mmdet/regnet/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..2a8990a6 --- /dev/null +++ b/configs/mmdet/regnet/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py @@ -0,0 +1,17 @@ +_base_ = 'cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_400mf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_400mf')), + neck=dict( + type='FPN', + in_channels=[32, 64, 160, 384], + out_channels=256, + num_outs=5)) diff --git 
a/configs/mmdet/regnet/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py b/configs/mmdet/regnet/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..31578634 --- /dev/null +++ b/configs/mmdet/regnet/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py @@ -0,0 +1,17 @@ +_base_ = 'cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_4.0gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_4.0gf')), + neck=dict( + type='FPN', + in_channels=[80, 240, 560, 1360], + out_channels=256, + num_outs=5)) diff --git a/configs/mmdet/regnet/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py b/configs/mmdet/regnet/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..41376ad8 --- /dev/null +++ b/configs/mmdet/regnet/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py @@ -0,0 +1,17 @@ +_base_ = 'cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_800mf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_800mf')), + neck=dict( + type='FPN', + in_channels=[64, 128, 288, 672], + out_channels=256, + num_outs=5)) diff --git a/configs/mmdet/regnet/faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py b/configs/mmdet/regnet/faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..385b5ca7 --- /dev/null +++ b/configs/mmdet/regnet/faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py @@ -0,0 +1,17 @@ +_base_ = 'faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_1.6gf', + out_indices=(0, 1, 
2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf')), + neck=dict( + type='FPN', + in_channels=[72, 168, 408, 912], + out_channels=256, + num_outs=5)) diff --git a/configs/mmdet/regnet/faster_rcnn_regnetx-3.2GF_fpn_1x_coco.py b/configs/mmdet/regnet/faster_rcnn_regnetx-3.2GF_fpn_1x_coco.py new file mode 100644 index 00000000..88d270e3 --- /dev/null +++ b/configs/mmdet/regnet/faster_rcnn_regnetx-3.2GF_fpn_1x_coco.py @@ -0,0 +1,57 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + backbone=dict( + _delete_=True, + type='RegNet', + arch='regnetx_3.2gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')), + neck=dict( + type='FPN', + in_channels=[96, 192, 432, 1008], + out_channels=256, + num_outs=5)) +img_norm_cfg = dict( + # The mean and std are used in PyCls when training RegNets + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + 
dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005) diff --git a/configs/mmdet/regnet/faster_rcnn_regnetx-3.2GF_fpn_2x_coco.py b/configs/mmdet/regnet/faster_rcnn_regnetx-3.2GF_fpn_2x_coco.py new file mode 100644 index 00000000..612490b4 --- /dev/null +++ b/configs/mmdet/regnet/faster_rcnn_regnetx-3.2GF_fpn_2x_coco.py @@ -0,0 +1,3 @@ +_base_ = './faster_rcnn_regnetx-3.2GF_fpn_1x_coco.py' +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/regnet/faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py b/configs/mmdet/regnet/faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..b7e6e1a3 --- /dev/null +++ b/configs/mmdet/regnet/faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py @@ -0,0 +1,61 @@ +_base_ = [ + '../common/mstrain_3x_coco.py', '../_base_/models/faster_rcnn_r50_fpn.py' +] +model = dict( + backbone=dict( + _delete_=True, + type='RegNet', + arch='regnetx_3.2gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')), + neck=dict( + type='FPN', + in_channels=[96, 192, 432, 1008], + out_channels=256, + num_outs=5)) +img_norm_cfg = dict( + # The mean and std are used in PyCls when training RegNets + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', 
size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +data = dict( + train=dict(dataset=dict(pipeline=train_pipeline)), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) + +optimizer = dict(weight_decay=0.00005) diff --git a/configs/mmdet/regnet/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py b/configs/mmdet/regnet/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..0a05f6e4 --- /dev/null +++ b/configs/mmdet/regnet/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py @@ -0,0 +1,17 @@ +_base_ = 'faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_400mf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_400mf')), + neck=dict( + type='FPN', + in_channels=[32, 64, 160, 384], + out_channels=256, + num_outs=5)) diff --git a/configs/mmdet/regnet/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py b/configs/mmdet/regnet/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..98b3fc2b --- /dev/null +++ b/configs/mmdet/regnet/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py @@ -0,0 +1,17 @@ +_base_ = 'faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_4.0gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + 
style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_4.0gf')), + neck=dict( + type='FPN', + in_channels=[80, 240, 560, 1360], + out_channels=256, + num_outs=5)) diff --git a/configs/mmdet/regnet/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py b/configs/mmdet/regnet/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..67f448bd --- /dev/null +++ b/configs/mmdet/regnet/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py @@ -0,0 +1,17 @@ +_base_ = 'faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_800mf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_800mf')), + neck=dict( + type='FPN', + in_channels=[64, 128, 288, 672], + out_channels=256, + num_outs=5)) diff --git a/configs/mmdet/regnet/mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco.py b/configs/mmdet/regnet/mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco.py new file mode 100644 index 00000000..7970c3c8 --- /dev/null +++ b/configs/mmdet/regnet/mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco.py @@ -0,0 +1,26 @@ +_base_ = [ + '../common/mstrain-poly_3x_coco_instance.py', + '../_base_/models/mask_rcnn_r50_fpn.py' +] + +model = dict( + backbone=dict( + _delete_=True, + type='RegNet', + arch='regnetx_1.6gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf')), + neck=dict( + type='FPN', + in_channels=[72, 168, 408, 912], + out_channels=256, + num_outs=5)) + +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git 
a/configs/mmdet/regnet/mask_rcnn_regnetx-12GF_fpn_1x_coco.py b/configs/mmdet/regnet/mask_rcnn_regnetx-12GF_fpn_1x_coco.py new file mode 100644 index 00000000..ce3661cf --- /dev/null +++ b/configs/mmdet/regnet/mask_rcnn_regnetx-12GF_fpn_1x_coco.py @@ -0,0 +1,17 @@ +_base_ = './mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_12gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_12gf')), + neck=dict( + type='FPN', + in_channels=[224, 448, 896, 2240], + out_channels=256, + num_outs=5)) diff --git a/configs/mmdet/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py b/configs/mmdet/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py new file mode 100644 index 00000000..44bf0d11 --- /dev/null +++ b/configs/mmdet/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py @@ -0,0 +1,58 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + backbone=dict( + _delete_=True, + type='RegNet', + arch='regnetx_3.2gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')), + neck=dict( + type='FPN', + in_channels=[96, 192, 432, 1008], + out_channels=256, + num_outs=5)) +img_norm_cfg = dict( + # The mean and std are used in PyCls when training RegNets + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + to_rgb=False) +train_pipeline = [ + # Images are converted to float32 directly after loading in PyCls + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', 
flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005) diff --git a/configs/mmdet/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco.py b/configs/mmdet/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco.py new file mode 100644 index 00000000..5b534281 --- /dev/null +++ b/configs/mmdet/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = 'mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf'))) diff --git a/configs/mmdet/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py b/configs/mmdet/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..aca64d33 --- /dev/null +++ b/configs/mmdet/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py @@ -0,0 +1,66 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + backbone=dict( + _delete_=True, + type='RegNet', + arch='regnetx_3.2gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + 
norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')), + neck=dict( + type='FPN', + in_channels=[96, 192, 432, 1008], + out_channels=256, + num_outs=5)) +img_norm_cfg = dict( + # The mean and std are used in PyCls when training RegNets + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005) +lr_config = dict(step=[28, 34]) +runner = dict(type='EpochBasedRunner', max_epochs=36) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git a/configs/mmdet/regnet/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco.py b/configs/mmdet/regnet/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco.py new file mode 100644 index 00000000..c38dfa6a --- /dev/null +++ b/configs/mmdet/regnet/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco.py 
@@ -0,0 +1,26 @@ +_base_ = [ + '../common/mstrain-poly_3x_coco_instance.py', + '../_base_/models/mask_rcnn_r50_fpn.py' +] + +model = dict( + backbone=dict( + _delete_=True, + type='RegNet', + arch='regnetx_400mf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_400mf')), + neck=dict( + type='FPN', + in_channels=[32, 64, 160, 384], + out_channels=256, + num_outs=5)) + +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git a/configs/mmdet/regnet/mask_rcnn_regnetx-4GF_fpn_1x_coco.py b/configs/mmdet/regnet/mask_rcnn_regnetx-4GF_fpn_1x_coco.py new file mode 100644 index 00000000..874d485b --- /dev/null +++ b/configs/mmdet/regnet/mask_rcnn_regnetx-4GF_fpn_1x_coco.py @@ -0,0 +1,17 @@ +_base_ = './mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_4.0gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_4.0gf')), + neck=dict( + type='FPN', + in_channels=[80, 240, 560, 1360], + out_channels=256, + num_outs=5)) diff --git a/configs/mmdet/regnet/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco.py b/configs/mmdet/regnet/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco.py new file mode 100644 index 00000000..f0b65eab --- /dev/null +++ b/configs/mmdet/regnet/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco.py @@ -0,0 +1,26 @@ +_base_ = [ + '../common/mstrain-poly_3x_coco_instance.py', + '../_base_/models/mask_rcnn_r50_fpn.py' +] + +model = dict( + backbone=dict( + _delete_=True, + type='RegNet', + arch='regnetx_4.0gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + 
norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_4.0gf')), + neck=dict( + type='FPN', + in_channels=[80, 240, 560, 1360], + out_channels=256, + num_outs=5)) + +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git a/configs/mmdet/regnet/mask_rcnn_regnetx-6.4GF_fpn_1x_coco.py b/configs/mmdet/regnet/mask_rcnn_regnetx-6.4GF_fpn_1x_coco.py new file mode 100644 index 00000000..99387d86 --- /dev/null +++ b/configs/mmdet/regnet/mask_rcnn_regnetx-6.4GF_fpn_1x_coco.py @@ -0,0 +1,17 @@ +_base_ = './mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_6.4gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_6.4gf')), + neck=dict( + type='FPN', + in_channels=[168, 392, 784, 1624], + out_channels=256, + num_outs=5)) diff --git a/configs/mmdet/regnet/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco.py b/configs/mmdet/regnet/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco.py new file mode 100644 index 00000000..335ebabf --- /dev/null +++ b/configs/mmdet/regnet/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco.py @@ -0,0 +1,26 @@ +_base_ = [ + '../common/mstrain-poly_3x_coco_instance.py', + '../_base_/models/mask_rcnn_r50_fpn.py' +] + +model = dict( + backbone=dict( + _delete_=True, + type='RegNet', + arch='regnetx_800mf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_800mf')), + neck=dict( + type='FPN', + in_channels=[64, 128, 288, 672], + out_channels=256, + num_outs=5)) + +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005) 
+optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git a/configs/mmdet/regnet/mask_rcnn_regnetx-8GF_fpn_1x_coco.py b/configs/mmdet/regnet/mask_rcnn_regnetx-8GF_fpn_1x_coco.py new file mode 100644 index 00000000..1e7832ff --- /dev/null +++ b/configs/mmdet/regnet/mask_rcnn_regnetx-8GF_fpn_1x_coco.py @@ -0,0 +1,17 @@ +_base_ = './mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_8.0gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_8.0gf')), + neck=dict( + type='FPN', + in_channels=[80, 240, 720, 1920], + out_channels=256, + num_outs=5)) diff --git a/configs/mmdet/regnet/metafile.yml b/configs/mmdet/regnet/metafile.yml new file mode 100644 index 00000000..ecd39531 --- /dev/null +++ b/configs/mmdet/regnet/metafile.yml @@ -0,0 +1,797 @@ +Models: + - Name: mask_rcnn_regnetx-3.2GF_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.0 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco/mask_rcnn_regnetx-3.2GF_fpn_1x_coco_20200520_163141-2a9d1814.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: mask_rcnn_regnetx-4GF_fpn_1x_coco + In Collection: Mask R-CNN + Config: 
configs/regnet/mask_rcnn_regnetx-4GF_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.5 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-4GF_fpn_1x_coco/mask_rcnn_regnetx-4GF_fpn_1x_coco_20200517_180217-32e9c92d.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: mask_rcnn_regnetx-6.4GF_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask_rcnn_regnetx-6.4GF_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.1 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-6.4GF_fpn_1x_coco/mask_rcnn_regnetx-6.4GF_fpn_1x_coco_20200517_180439-3a7aae83.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: mask_rcnn_regnetx-8GF_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask_rcnn_regnetx-8GF_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.4 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight 
Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-8GF_fpn_1x_coco/mask_rcnn_regnetx-8GF_fpn_1x_coco_20200517_180515-09daa87e.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: mask_rcnn_regnetx-12GF_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask_rcnn_regnetx-12GF_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.4 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-12GF_fpn_1x_coco/mask_rcnn_regnetx-12GF_fpn_1x_coco_20200517_180552-b538bd8b.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco.py + Metadata: + Training Memory (GB): 5.0 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.3 + - Task: 
Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco_20200520_172726-75f40794.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: faster_rcnn_regnetx-3.2GF_fpn_1x_coco + In Collection: Faster R-CNN + Config: configs/regnet/faster_rcnn_regnetx-3.2GF_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.5 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_1x_coco/faster_rcnn_regnetx-3.2GF_fpn_1x_coco_20200517_175927-126fd9bf.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: faster_rcnn_regnetx-3.2GF_fpn_2x_coco + In Collection: Faster R-CNN + Config: configs/regnet/faster_rcnn_regnetx-3.2GF_fpn_2x_coco.py + Metadata: + Training Memory (GB): 4.5 + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_2x_coco/faster_rcnn_regnetx-3.2GF_fpn_2x_coco_20200520_223955-e2081918.pth + Paper: + URL: 
https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: retinanet_regnetx-800MF_fpn_1x_coco + In Collection: RetinaNet + Config: configs/regnet/retinanet_regnetx-800MF_fpn_1x_coco.py + Metadata: + Training Memory (GB): 2.5 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 35.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-800MF_fpn_1x_coco/retinanet_regnetx-800MF_fpn_1x_coco_20200517_191403-f6f91d10.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: retinanet_regnetx-1.6GF_fpn_1x_coco + In Collection: RetinaNet + Config: configs/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.3 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco/retinanet_regnetx-1.6GF_fpn_1x_coco_20200517_191403-37009a9d.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: retinanet_regnetx-3.2GF_fpn_1x_coco + In Collection: RetinaNet + Config: 
configs/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.2 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco/retinanet_regnetx-3.2GF_fpn_1x_coco_20200520_163141-cb1509e8.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco + In Collection: Faster R-CNN + Config: configs/regnet/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 2.3 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco_20210526_095112-e1967c37.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco + In Collection: Faster R-CNN + Config: configs/regnet/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 2.8 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + 
Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco_20210526_095118-a2c70b20.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco + In Collection: Faster R-CNN + Config: configs/regnet/faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 3.4 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-1_20210526_095325-94aa46cc.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco + In Collection: Faster R-CNN + Config: configs/regnet/faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 4.4 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-3_20210526_095152-e16a5227.pth + Paper: + URL: 
https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco + In Collection: Faster R-CNN + Config: configs/regnet/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 4.9 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco_20210526_095201-65eaf841.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 5.0 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco_20200521_202221-99879813.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: 
https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco.py + Metadata: + Training Memory (GB): 2.5 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 34.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco_20210601_235443-8aac57a4.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco.py + Metadata: + Training Memory (GB): 2.9 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco_20210602_210641-715d51f5.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: 
https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco.py + Metadata: + Training Memory (GB): 3.6 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-1_20210602_210641-6764cff5.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 5.0 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-1_20210602_210641-6e63e19c.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: 
mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco.py + Metadata: + Training Memory (GB): 5.1 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco_20210602_032621-00f0331c.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco + In Collection: Cascade R-CNN + Config: configs/regnet/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 4.3 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco_20210715_211619-5142f449.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco + In Collection: 
Cascade R-CNN + Config: configs/regnet/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 4.8 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco_20210715_211616-dcbd13f4.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco + In Collection: Cascade R-CNN + Config: configs/regnet/cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 5.4 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-1_20210715_211616-75f29a61.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco + In Collection: Cascade R-CNN + Config: 
configs/regnet/cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 6.4 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-3_20210715_211616-b9c2c58b.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco + In Collection: Cascade R-CNN + Config: configs/regnet/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py + Metadata: + Training Memory (GB): 6.9 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco_20210715_212034-cbb1be4c.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 diff --git a/configs/mmdet/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco.py b/configs/mmdet/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco.py new file mode 100644 index 
00000000..7395c1bf --- /dev/null +++ b/configs/mmdet/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco.py @@ -0,0 +1,17 @@ +_base_ = './retinanet_regnetx-3.2GF_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_1.6gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf')), + neck=dict( + type='FPN', + in_channels=[72, 168, 408, 912], + out_channels=256, + num_outs=5)) diff --git a/configs/mmdet/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco.py b/configs/mmdet/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco.py new file mode 100644 index 00000000..f05307c4 --- /dev/null +++ b/configs/mmdet/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco.py @@ -0,0 +1,59 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + backbone=dict( + _delete_=True, + type='RegNet', + arch='regnetx_3.2gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')), + neck=dict( + type='FPN', + in_channels=[96, 192, 432, 1008], + out_channels=256, + num_outs=5)) +img_norm_cfg = dict( + # The mean and std are used in PyCls when training RegNets + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + 
dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git a/configs/mmdet/regnet/retinanet_regnetx-800MF_fpn_1x_coco.py b/configs/mmdet/regnet/retinanet_regnetx-800MF_fpn_1x_coco.py new file mode 100644 index 00000000..f6f89893 --- /dev/null +++ b/configs/mmdet/regnet/retinanet_regnetx-800MF_fpn_1x_coco.py @@ -0,0 +1,17 @@ +_base_ = './retinanet_regnetx-3.2GF_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_800mf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_800mf')), + neck=dict( + type='FPN', + in_channels=[64, 128, 288, 672], + out_channels=256, + num_outs=5)) diff --git a/configs/mmdet/reppoints/README.md b/configs/mmdet/reppoints/README.md new file mode 100644 index 00000000..205a8732 --- /dev/null +++ b/configs/mmdet/reppoints/README.md @@ -0,0 +1,59 @@ +# RepPoints + +> [RepPoints: Point Set Representation for Object Detection](https://arxiv.org/abs/1904.11490) + + + +## Abstract + +Modern object detectors rely heavily on rectangular bounding boxes, such as anchors, proposals and the final predictions, to represent objects at various recognition stages. 
The bounding box is convenient to use but provides only a coarse localization of objects and leads to a correspondingly coarse extraction of object features. In this paper, we present RepPoints(representative points), a new finer representation of objects as a set of sample points useful for both localization and recognition. Given ground truth localization and recognition targets for training, RepPoints learn to automatically arrange themselves in a manner that bounds the spatial extent of an object and indicates semantically significant local areas. They furthermore do not require the use of anchors to sample a space of bounding boxes. We show that an anchor-free object detector based on RepPoints can be as effective as the state-of-the-art anchor-based detection methods, with 46.5 AP and 67.4 AP50 on the COCO test-dev detection benchmark, using ResNet-101 model. + +
+ +
+ +## Introduction + +By [Ze Yang](https://yangze.tech/), [Shaohui Liu](http://b1ueber2y.me/), and [Han Hu](https://ancientmooner.github.io/). + +We provide code support and configuration files to reproduce the results in the paper for +["RepPoints: Point Set Representation for Object Detection"](https://arxiv.org/abs/1904.11490) on COCO object detection. + +**RepPoints**, initially described in [arXiv](https://arxiv.org/abs/1904.11490), is a new representation method for visual objects, on which visual understanding tasks are typically centered. Visual object representation, aiming at both geometric description and appearance feature extraction, is conventionally achieved by `bounding box + RoIPool (RoIAlign)`. The bounding box representation is convenient to use; however, it provides only a rectangular localization of objects that lacks geometric precision and may consequently degrade feature quality. Our new representation, RepPoints, models objects by a `point set` instead of a `bounding box`, which learns to adaptively position themselves over an object in a manner that circumscribes the object’s `spatial extent` and enables `semantically aligned feature extraction`. This richer and more flexible representation maintains the convenience of bounding boxes while facilitating various visual understanding applications. This repo demonstrated the effectiveness of RepPoints for COCO object detection. + +Another feature of this repo is the demonstration of an `anchor-free detector`, which can be as effective as state-of-the-art anchor-based detection methods. The anchor-free detector can utilize either `bounding box` or `RepPoints` as the basic object representation. + +## Results and Models + +The results on COCO 2017val are shown in the table below. 
+ +| Method | Backbone | GN | Anchor | convert func | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +|:---------:|:-------------:|:---:|:------:|:------------:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:| +| BBox | R-50-FPN | Y | single | - | 1x | 3.9 | 15.9 | 36.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/reppoints/bbox_r50_grid_fpn_gn-neck+head_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco_20200329-c98bfa96.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco_20200329_145916.log.json) | +| BBox | R-50-FPN | Y | none | - | 1x | 3.9 | 15.4 | 37.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/reppoints/bbox_r50_grid_center_fpn_gn-neck+head_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/bbox_r50_grid_center_fpn_gn-neck%2Bhead_1x_coco/bbox_r50_grid_center_fpn_gn-neck%2Bhead_1x_coco_20200330-00f73d58.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/bbox_r50_grid_center_fpn_gn-neck%2Bhead_1x_coco/bbox_r50_grid_center_fpn_gn-neck%2Bhead_1x_coco_20200330_233609.log.json) | +| RepPoints | R-50-FPN | N | none | moment | 1x | 3.3 | 18.5 | 37.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/reppoints/reppoints_moment_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_1x_coco/reppoints_moment_r50_fpn_1x_coco_20200330-b73db8d1.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_1x_coco/reppoints_moment_r50_fpn_1x_coco_20200330_233609.log.json) | +| RepPoints | R-50-FPN | Y | none | moment | 1x | 3.9 | 17.5 | 38.1 | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/reppoints/reppoints_moment_r50_fpn_gn-neck%2Bhead_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_gn-neck%2Bhead_1x_coco/reppoints_moment_r50_fpn_gn-neck%2Bhead_1x_coco_20200329-4b38409a.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_gn-neck%2Bhead_1x_coco/reppoints_moment_r50_fpn_gn-neck%2Bhead_1x_coco_20200329_145952.log.json) | +| RepPoints | R-50-FPN | Y | none | moment | 2x | 3.9 | - | 38.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/reppoints/reppoints_moment_r50_fpn_gn-neck+head_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_gn-neck%2Bhead_2x_coco/reppoints_moment_r50_fpn_gn-neck%2Bhead_2x_coco_20200329-91babaa2.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_gn-neck%2Bhead_2x_coco/reppoints_moment_r50_fpn_gn-neck%2Bhead_2x_coco_20200329_150020.log.json) | +| RepPoints | R-101-FPN | Y | none | moment | 2x | 5.8 | 13.7 | 40.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/reppoints/reppoints_moment_r101_fpn_gn-neck+head_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r101_fpn_gn-neck%2Bhead_2x_coco/reppoints_moment_r101_fpn_gn-neck%2Bhead_2x_coco_20200329-4fbc7310.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r101_fpn_gn-neck%2Bhead_2x_coco/reppoints_moment_r101_fpn_gn-neck%2Bhead_2x_coco_20200329_132205.log.json) | +| RepPoints | R-101-FPN-DCN | Y | none | moment | 2x | 5.9 | 12.1 | 42.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco_20200329-3309fbf2.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco_20200329_132134.log.json) | +| RepPoints | X-101-FPN-DCN | Y | none | moment | 2x | 7.1 | 9.3 | 44.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco_20200329-f87da1ea.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco_20200329_132201.log.json) | + +**Notes:** + +- `R-xx`, `X-xx` denote the ResNet and ResNeXt architectures, respectively. +- `DCN` denotes replacing 3x3 conv with the 3x3 deformable convolution in `c3-c5` stages of backbone. +- `none` in the `anchor` column means 2-d `center point` (x,y) is used to represent the initial object hypothesis. `single` denotes one 4-d anchor box (x,y,w,h) with IoU based label assign criterion is adopted. +- `moment`, `partial MinMax`, `MinMax` in the `convert func` column are three functions to convert a point set to a pseudo box. +- Note the results here are slightly different from those reported in the paper, due to framework change. While the original paper uses an [MXNet](https://mxnet.apache.org/) implementation, we re-implement the method in [PyTorch](https://pytorch.org/) based on mmdetection. 
+ +## Citation + +```latex +@inproceedings{yang2019reppoints, + title={RepPoints: Point Set Representation for Object Detection}, + author={Yang, Ze and Liu, Shaohui and Hu, Han and Wang, Liwei and Lin, Stephen}, + booktitle={The IEEE International Conference on Computer Vision (ICCV)}, + month={Oct}, + year={2019} +} +``` diff --git a/configs/mmdet/reppoints/bbox_r50_grid_center_fpn_gn-neck+head_1x_coco.py b/configs/mmdet/reppoints/bbox_r50_grid_center_fpn_gn-neck+head_1x_coco.py new file mode 100644 index 00000000..b24c8db7 --- /dev/null +++ b/configs/mmdet/reppoints/bbox_r50_grid_center_fpn_gn-neck+head_1x_coco.py @@ -0,0 +1,2 @@ +_base_ = './reppoints_moment_r50_fpn_gn-neck+head_1x_coco.py' +model = dict(bbox_head=dict(transform_method='minmax', use_grid_points=True)) diff --git a/configs/mmdet/reppoints/bbox_r50_grid_fpn_gn-neck+head_1x_coco.py b/configs/mmdet/reppoints/bbox_r50_grid_fpn_gn-neck+head_1x_coco.py new file mode 100644 index 00000000..8d5013d3 --- /dev/null +++ b/configs/mmdet/reppoints/bbox_r50_grid_fpn_gn-neck+head_1x_coco.py @@ -0,0 +1,13 @@ +_base_ = './reppoints_moment_r50_fpn_gn-neck+head_1x_coco.py' +model = dict( + bbox_head=dict(transform_method='minmax', use_grid_points=True), + # training and testing settings + train_cfg=dict( + init=dict( + assigner=dict( + _delete_=True, + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1)))) diff --git a/configs/mmdet/reppoints/metafile.yml b/configs/mmdet/reppoints/metafile.yml new file mode 100644 index 00000000..cd4312c4 --- /dev/null +++ b/configs/mmdet/reppoints/metafile.yml @@ -0,0 +1,181 @@ +Collections: + - Name: RepPoints + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Group Normalization + - FPN + - RepPoints + - ResNet + Paper: + URL: https://arxiv.org/abs/1904.11490 + Title: 'RepPoints: Point Set Representation for Object Detection' + 
README: configs/reppoints/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/reppoints_detector.py#L9 + Version: v2.0.0 + +Models: + - Name: bbox_r50_grid_fpn_gn-neck+head_1x_coco + In Collection: RepPoints + Config: configs/reppoints/bbox_r50_grid_fpn_gn-neck+head_1x_coco.py + Metadata: + Training Memory (GB): 3.9 + inference time (ms/im): + - value: 62.89 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco_20200329-c98bfa96.pth + + - Name: bbox_r50_grid_center_fpn_gn-neck+head_1x_coco + In Collection: RepPoints + Config: configs/reppoints/bbox_r50_grid_center_fpn_gn-neck+head_1x_coco.py + Metadata: + Training Memory (GB): 3.9 + inference time (ms/im): + - value: 64.94 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/bbox_r50_grid_center_fpn_gn-neck%2Bhead_1x_coco/bbox_r50_grid_center_fpn_gn-neck%2Bhead_1x_coco_20200330-00f73d58.pth + + - Name: reppoints_moment_r50_fpn_1x_coco + In Collection: RepPoints + Config: configs/reppoints/reppoints_moment_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.3 + inference time (ms/im): + - value: 54.05 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_1x_coco/reppoints_moment_r50_fpn_1x_coco_20200330-b73db8d1.pth + + - Name: 
reppoints_moment_r50_fpn_gn-neck+head_1x_coco + In Collection: RepPoints + Config: configs/reppoints/reppoints_moment_r50_fpn_gn-neck+head_1x_coco.py + Metadata: + Training Memory (GB): 3.9 + inference time (ms/im): + - value: 57.14 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_gn-neck%2Bhead_1x_coco/reppoints_moment_r50_fpn_gn-neck%2Bhead_1x_coco_20200329-4b38409a.pth + + - Name: reppoints_moment_r50_fpn_gn-neck+head_2x_coco + In Collection: RepPoints + Config: configs/reppoints/reppoints_moment_r50_fpn_gn-neck+head_2x_coco.py + Metadata: + Training Memory (GB): 3.9 + inference time (ms/im): + - value: 57.14 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_gn-neck%2Bhead_2x_coco/reppoints_moment_r50_fpn_gn-neck%2Bhead_2x_coco_20200329-91babaa2.pth + + - Name: reppoints_moment_r101_fpn_gn-neck+head_2x_coco + In Collection: RepPoints + Config: configs/reppoints/reppoints_moment_r101_fpn_gn-neck+head_2x_coco.py + Metadata: + Training Memory (GB): 5.8 + inference time (ms/im): + - value: 72.99 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r101_fpn_gn-neck%2Bhead_2x_coco/reppoints_moment_r101_fpn_gn-neck%2Bhead_2x_coco_20200329-4fbc7310.pth + + - Name: reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck+head_2x_coco + In Collection: RepPoints + Config: 
configs/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py + Metadata: + Training Memory (GB): 5.9 + inference time (ms/im): + - value: 82.64 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco_20200329-3309fbf2.pth + + - Name: reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck+head_2x_coco + In Collection: RepPoints + Config: configs/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py + Metadata: + Training Memory (GB): 7.1 + inference time (ms/im): + - value: 107.53 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco_20200329-f87da1ea.pth diff --git a/configs/mmdet/reppoints/reppoints.png b/configs/mmdet/reppoints/reppoints.png new file mode 100644 index 00000000..a9306d9b Binary files /dev/null and b/configs/mmdet/reppoints/reppoints.png differ diff --git a/configs/mmdet/reppoints/reppoints_minmax_r50_fpn_gn-neck+head_1x_coco.py b/configs/mmdet/reppoints/reppoints_minmax_r50_fpn_gn-neck+head_1x_coco.py new file mode 100644 index 00000000..0f56a46b --- /dev/null +++ b/configs/mmdet/reppoints/reppoints_minmax_r50_fpn_gn-neck+head_1x_coco.py @@ -0,0 +1,2 @@ +_base_ = './reppoints_moment_r50_fpn_gn-neck+head_1x_coco.py' +model = dict(bbox_head=dict(transform_method='minmax')) diff --git a/configs/mmdet/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py 
b/configs/mmdet/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py new file mode 100644 index 00000000..e223d80f --- /dev/null +++ b/configs/mmdet/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py @@ -0,0 +1,8 @@ +_base_ = './reppoints_moment_r50_fpn_gn-neck+head_2x_coco.py' +model = dict( + backbone=dict( + depth=101, + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/reppoints/reppoints_moment_r101_fpn_gn-neck+head_2x_coco.py b/configs/mmdet/reppoints/reppoints_moment_r101_fpn_gn-neck+head_2x_coco.py new file mode 100644 index 00000000..11854709 --- /dev/null +++ b/configs/mmdet/reppoints/reppoints_moment_r101_fpn_gn-neck+head_2x_coco.py @@ -0,0 +1,6 @@ +_base_ = './reppoints_moment_r50_fpn_gn-neck+head_2x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/reppoints/reppoints_moment_r50_fpn_1x_coco.py b/configs/mmdet/reppoints/reppoints_moment_r50_fpn_1x_coco.py new file mode 100644 index 00000000..158a9067 --- /dev/null +++ b/configs/mmdet/reppoints/reppoints_moment_r50_fpn_1x_coco.py @@ -0,0 +1,67 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='RepPointsDetector', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_input', + num_outs=5), + bbox_head=dict( + type='RepPointsHead', + num_classes=80, + in_channels=256, 
+ feat_channels=256, + point_feat_channels=256, + stacked_convs=3, + num_points=9, + gradient_mul=0.1, + point_strides=[8, 16, 32, 64, 128], + point_base_scale=4, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_init=dict(type='SmoothL1Loss', beta=0.11, loss_weight=0.5), + loss_bbox_refine=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0), + transform_method='moment'), + # training and testing settings + train_cfg=dict( + init=dict( + assigner=dict(type='PointAssigner', scale=4, pos_num=1), + allowed_border=-1, + pos_weight=-1, + debug=False), + refine=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100)) +optimizer = dict(lr=0.01) diff --git a/configs/mmdet/reppoints/reppoints_moment_r50_fpn_gn-neck+head_1x_coco.py b/configs/mmdet/reppoints/reppoints_moment_r50_fpn_gn-neck+head_1x_coco.py new file mode 100644 index 00000000..337f167c --- /dev/null +++ b/configs/mmdet/reppoints/reppoints_moment_r50_fpn_gn-neck+head_1x_coco.py @@ -0,0 +1,4 @@ +_base_ = './reppoints_moment_r50_fpn_1x_coco.py' +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict(neck=dict(norm_cfg=norm_cfg), bbox_head=dict(norm_cfg=norm_cfg)) +optimizer = dict(lr=0.01) diff --git a/configs/mmdet/reppoints/reppoints_moment_r50_fpn_gn-neck+head_2x_coco.py b/configs/mmdet/reppoints/reppoints_moment_r50_fpn_gn-neck+head_2x_coco.py new file mode 100644 index 00000000..feca44aa --- /dev/null +++ b/configs/mmdet/reppoints/reppoints_moment_r50_fpn_gn-neck+head_2x_coco.py @@ -0,0 +1,3 @@ +_base_ = './reppoints_moment_r50_fpn_gn-neck+head_1x_coco.py' +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git 
a/configs/mmdet/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py b/configs/mmdet/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py new file mode 100644 index 00000000..c0a12d00 --- /dev/null +++ b/configs/mmdet/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py @@ -0,0 +1,16 @@ +_base_ = './reppoints_moment_r50_fpn_gn-neck+head_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/reppoints/reppoints_partial_minmax_r50_fpn_gn-neck+head_1x_coco.py b/configs/mmdet/reppoints/reppoints_partial_minmax_r50_fpn_gn-neck+head_1x_coco.py new file mode 100644 index 00000000..9a63bd08 --- /dev/null +++ b/configs/mmdet/reppoints/reppoints_partial_minmax_r50_fpn_gn-neck+head_1x_coco.py @@ -0,0 +1,2 @@ +_base_ = './reppoints_moment_r50_fpn_gn-neck+head_1x_coco.py' +model = dict(bbox_head=dict(transform_method='partial_minmax')) diff --git a/configs/mmdet/res2net/README.md b/configs/mmdet/res2net/README.md new file mode 100644 index 00000000..29d1d461 --- /dev/null +++ b/configs/mmdet/res2net/README.md @@ -0,0 +1,77 @@ +# Res2Net + +> [Res2Net: A New Multi-scale Backbone Architecture](https://arxiv.org/abs/1904.01169) + + + +## Abstract + +Representing features at multiple scales is of great importance for numerous vision tasks. Recent advances in backbone convolutional neural networks (CNNs) continually demonstrate stronger multi-scale representation ability, leading to consistent performance gains on a wide range of applications. However, most existing methods represent the multi-scale features in a layer-wise manner. 
In this paper, we propose a novel building block for CNNs, namely Res2Net, by constructing hierarchical residual-like connections within one single residual block. The Res2Net represents multi-scale features at a granular level and increases the range of receptive fields for each network layer. The proposed Res2Net block can be plugged into the state-of-the-art backbone CNN models, e.g., ResNet, ResNeXt, and DLA. We evaluate the Res2Net block on all these models and demonstrate consistent performance gains over baseline models on widely-used datasets, e.g., CIFAR-100 and ImageNet. Further ablation studies and experimental results on representative computer vision tasks, i.e., object detection, class activation mapping, and salient object detection, further verify the superiority of the Res2Net over the state-of-the-art baseline methods. + +
+ +
+ +## Introduction + +We propose a novel building block for CNNs, namely Res2Net, by constructing hierarchical residual-like connections within one single residual block. The Res2Net represents multi-scale features at a granular level and increases the range of receptive fields for each network layer. + +| Backbone |Params. | GFLOPs | top-1 err. | top-5 err. | +| :-------------: |:----: | :-----: | :--------: | :--------: | +| ResNet-101 |44.6 M | 7.8 | 22.63 | 6.44 | +| ResNeXt-101-64x4d |83.5M | 15.5 | 20.40 | - | +| HRNetV2p-W48 | 77.5M | 16.1 | 20.70 | 5.50 | +| Res2Net-101 | 45.2M | 8.3 | 18.77 | 4.64 | + +Compared with other backbone networks, Res2Net requires fewer parameters and FLOPs. + +**Note:** + +- GFLOPs for classification are calculated with image size (224x224). + +## Results and Models + +### Faster R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :------: | :--------: | +|R2-101-FPN | pytorch | 2x | 7.4 | - | 43.0 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/res2net/faster_rcnn_r2_101_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/res2net/faster_rcnn_r2_101_fpn_2x_coco/faster_rcnn_r2_101_fpn_2x_coco-175f1da6.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/res2net/faster_rcnn_r2_101_fpn_2x_coco/faster_rcnn_r2_101_fpn_2x_coco_20200514_231734.log.json) | + +### Mask R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------: | :--------: | +|R2-101-FPN | pytorch | 2x | 7.9 | - | 43.6 | 38.7 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/res2net/mask_rcnn_r2_101_fpn_2x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/res2net/mask_rcnn_r2_101_fpn_2x_coco/mask_rcnn_r2_101_fpn_2x_coco-17f061e8.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/res2net/mask_rcnn_r2_101_fpn_2x_coco/mask_rcnn_r2_101_fpn_2x_coco_20200515_002413.log.json) | + +### Cascade R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :------: | :--------: | +|R2-101-FPN | pytorch | 20e | 7.8 | - | 45.7 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/res2net/cascade_rcnn_r2_101_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_rcnn_r2_101_fpn_20e_coco/cascade_rcnn_r2_101_fpn_20e_coco-f4b7b7db.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_rcnn_r2_101_fpn_20e_coco/cascade_rcnn_r2_101_fpn_20e_coco_20200515_091644.log.json) | + +### Cascade Mask R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------: | :--------: | +R2-101-FPN | pytorch | 20e | 9.5 | - | 46.4 | 40.0 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco/cascade_mask_rcnn_r2_101_fpn_20e_coco-8a7b41e1.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco/cascade_mask_rcnn_r2_101_fpn_20e_coco_20200515_091645.log.json) | + +### Hybrid Task Cascade (HTC) + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------: | :--------: | +| R2-101-FPN | pytorch | 20e | - | - | 47.5 | 
41.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/res2net/htc_r2_101_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/res2net/htc_r2_101_fpn_20e_coco/htc_r2_101_fpn_20e_coco-3a8d2112.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/res2net/htc_r2_101_fpn_20e_coco/htc_r2_101_fpn_20e_coco_20200515_150029.log.json) | + +- Res2Net ImageNet pretrained models are in [Res2Net-PretrainedModels](https://github.com/Res2Net/Res2Net-PretrainedModels). +- More applications of Res2Net are in [Res2Net-Github](https://github.com/Res2Net/). + +## Citation + +```latex +@article{gao2019res2net, + title={Res2Net: A New Multi-scale Backbone Architecture}, + author={Gao, Shang-Hua and Cheng, Ming-Ming and Zhao, Kai and Zhang, Xin-Yu and Yang, Ming-Hsuan and Torr, Philip}, + journal={IEEE TPAMI}, + year={2020}, + doi={10.1109/TPAMI.2019.2938758}, +} +``` diff --git a/configs/mmdet/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco.py b/configs/mmdet/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco.py new file mode 100644 index 00000000..6b6c0010 --- /dev/null +++ b/configs/mmdet/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco.py @@ -0,0 +1,10 @@ +_base_ = '../cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + type='Res2Net', + depth=101, + scales=4, + base_width=26, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://res2net101_v1d_26w_4s'))) diff --git a/configs/mmdet/res2net/cascade_rcnn_r2_101_fpn_20e_coco.py b/configs/mmdet/res2net/cascade_rcnn_r2_101_fpn_20e_coco.py new file mode 100644 index 00000000..10dddbb4 --- /dev/null +++ b/configs/mmdet/res2net/cascade_rcnn_r2_101_fpn_20e_coco.py @@ -0,0 +1,10 @@ +_base_ = '../cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + type='Res2Net', + depth=101, + scales=4, + base_width=26, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://res2net101_v1d_26w_4s'))) diff --git 
a/configs/mmdet/res2net/faster_rcnn_r2_101_fpn_2x_coco.py b/configs/mmdet/res2net/faster_rcnn_r2_101_fpn_2x_coco.py new file mode 100644 index 00000000..fc2221cb --- /dev/null +++ b/configs/mmdet/res2net/faster_rcnn_r2_101_fpn_2x_coco.py @@ -0,0 +1,10 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + type='Res2Net', + depth=101, + scales=4, + base_width=26, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://res2net101_v1d_26w_4s'))) diff --git a/configs/mmdet/res2net/htc_r2_101_fpn_20e_coco.py b/configs/mmdet/res2net/htc_r2_101_fpn_20e_coco.py new file mode 100644 index 00000000..22d0c5da --- /dev/null +++ b/configs/mmdet/res2net/htc_r2_101_fpn_20e_coco.py @@ -0,0 +1,13 @@ +_base_ = '../htc/htc_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='Res2Net', + depth=101, + scales=4, + base_width=26, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://res2net101_v1d_26w_4s'))) +# learning policy +lr_config = dict(step=[16, 19]) +runner = dict(type='EpochBasedRunner', max_epochs=20) diff --git a/configs/mmdet/res2net/mask_rcnn_r2_101_fpn_2x_coco.py b/configs/mmdet/res2net/mask_rcnn_r2_101_fpn_2x_coco.py new file mode 100644 index 00000000..33aef1a5 --- /dev/null +++ b/configs/mmdet/res2net/mask_rcnn_r2_101_fpn_2x_coco.py @@ -0,0 +1,10 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + type='Res2Net', + depth=101, + scales=4, + base_width=26, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://res2net101_v1d_26w_4s'))) diff --git a/configs/mmdet/res2net/metafile.yml b/configs/mmdet/res2net/metafile.yml new file mode 100644 index 00000000..27bac8c1 --- /dev/null +++ b/configs/mmdet/res2net/metafile.yml @@ -0,0 +1,146 @@ +Models: + - Name: faster_rcnn_r2_101_fpn_2x_coco + In Collection: Faster R-CNN + Config: configs/res2net/faster_rcnn_r2_101_fpn_2x_coco.py + Metadata: + Training Memory (GB): 7.4 + Epochs: 24 + Training Data: COCO + Training 
Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Res2Net + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/res2net/faster_rcnn_r2_101_fpn_2x_coco/faster_rcnn_r2_101_fpn_2x_coco-175f1da6.pth + Paper: + URL: https://arxiv.org/abs/1904.01169 + Title: 'Res2Net for object detection and instance segmentation' + README: configs/res2net/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/res2net.py#L239 + Version: v2.1.0 + + - Name: mask_rcnn_r2_101_fpn_2x_coco + In Collection: Mask R-CNN + Config: configs/res2net/mask_rcnn_r2_101_fpn_2x_coco.py + Metadata: + Training Memory (GB): 7.9 + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Res2Net + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/res2net/mask_rcnn_r2_101_fpn_2x_coco/mask_rcnn_r2_101_fpn_2x_coco-17f061e8.pth + Paper: + URL: https://arxiv.org/abs/1904.01169 + Title: 'Res2Net for object detection and instance segmentation' + README: configs/res2net/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/res2net.py#L239 + Version: v2.1.0 + + - Name: cascade_rcnn_r2_101_fpn_20e_coco + In Collection: Cascade R-CNN + Config: configs/res2net/cascade_rcnn_r2_101_fpn_20e_coco.py + Metadata: + Training Memory (GB): 7.8 + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Res2Net + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.7 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_rcnn_r2_101_fpn_20e_coco/cascade_rcnn_r2_101_fpn_20e_coco-f4b7b7db.pth + Paper: + URL: https://arxiv.org/abs/1904.01169 + Title: 'Res2Net for object detection and instance segmentation' + README: configs/res2net/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/res2net.py#L239 + Version: v2.1.0 + + - Name: cascade_mask_rcnn_r2_101_fpn_20e_coco + In Collection: Cascade R-CNN + Config: configs/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco.py + Metadata: + Training Memory (GB): 9.5 + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Res2Net + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco/cascade_mask_rcnn_r2_101_fpn_20e_coco-8a7b41e1.pth + Paper: + URL: https://arxiv.org/abs/1904.01169 + Title: 'Res2Net for object detection and instance segmentation' + README: configs/res2net/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/res2net.py#L239 + Version: v2.1.0 + + - Name: htc_r2_101_fpn_20e_coco + In Collection: HTC + Config: configs/res2net/htc_r2_101_fpn_20e_coco.py + Metadata: + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Res2Net + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/res2net/htc_r2_101_fpn_20e_coco/htc_r2_101_fpn_20e_coco-3a8d2112.pth + Paper: + URL: https://arxiv.org/abs/1904.01169 + Title: 'Res2Net for object 
detection and instance segmentation' + README: configs/res2net/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/res2net.py#L239 + Version: v2.1.0 diff --git a/configs/mmdet/resnest/README.md b/configs/mmdet/resnest/README.md new file mode 100644 index 00000000..02c0cad5 --- /dev/null +++ b/configs/mmdet/resnest/README.md @@ -0,0 +1,54 @@ +# ResNeSt + +> [ResNeSt: Split-Attention Networks](https://arxiv.org/abs/2004.08955) + + + +## Abstract + +It is well known that featuremap attention and multi-path representation are important for visual recognition. In this paper, we present a modularized architecture, which applies the channel-wise attention on different network branches to leverage their success in capturing cross-feature interactions and learning diverse representations. Our design results in a simple and unified computation block, which can be parameterized using only a few variables. Our model, named ResNeSt, outperforms EfficientNet in accuracy and latency trade-off on image classification. In addition, ResNeSt has achieved superior transfer learning results on several public benchmarks serving as the backbone, and has been adopted by the winning entries of COCO-LVIS challenge. + +
+ +
+ +## Results and Models + +### Faster R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :------: | :--------: | +|S-50-FPN | pytorch | 1x | 4.8 | - | 42.0 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/resnest/faster_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20200926_125502-20289c16.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco-20200926_125502.log.json) | +|S-101-FPN | pytorch | 1x | 7.1 | - | 44.5 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/resnest/faster_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/faster_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/faster_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20201006_021058-421517f1.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/faster_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/faster_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco-20201006_021058.log.json) | + +### Mask R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------: | :--------: | +|S-50-FPN | pytorch | 1x | 5.5 | - | 42.6 | 38.1 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/resnest/mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/resnest/mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20200926_125503-8a2c3d47.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco-20200926_125503.log.json) | +|S-101-FPN | pytorch | 1x | 7.8 | - | 45.2 | 40.2 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/resnest/mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20201005_215831-af60cdf9.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco-20201005_215831.log.json) | + +### Cascade R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :------: | :--------: | +|S-50-FPN | pytorch | 1x | - | - | 44.5 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/resnest/cascade_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/cascade_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20201122_213640-763cc7b5.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/cascade_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco-20201122_213640.log.json) | +|S-101-FPN | pytorch | 1x | 8.4 | - | 46.8 
|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/resnest/cascade_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/cascade_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20201005_113242-b9459f8f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/cascade_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco-20201005_113242.log.json) | + +### Cascade Mask R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------: | :--------: | +|S-50-FPN | pytorch | 1x | - | - | 45.4 | 39.5 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/resnest/cascade_mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/cascade_mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20201122_104428-99eca4c7.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/cascade_mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco-20201122_104428.log.json) | +|S-101-FPN | pytorch | 1x | 10.5 | - | 47.7 | 41.4 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/resnest/cascade_mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/cascade_mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20201005_113243-42607475.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/cascade_mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco-20201005_113243.log.json) | + +## Citation + +```latex +@article{zhang2020resnest, +title={ResNeSt: Split-Attention Networks}, +author={Zhang, Hang and Wu, Chongruo and Zhang, Zhongyue and Zhu, Yi and Zhang, Zhi and Lin, Haibin and Sun, Yue and He, Tong and Muller, Jonas and Manmatha, R. and Li, Mu and Smola, Alexander}, +journal={arXiv preprint arXiv:2004.08955}, +year={2020} +} +``` diff --git a/configs/mmdet/resnest/cascade_mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco.py b/configs/mmdet/resnest/cascade_mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco.py new file mode 100644 index 00000000..406f39db --- /dev/null +++ b/configs/mmdet/resnest/cascade_mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './cascade_mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py' +model = dict( + backbone=dict( + stem_channels=128, + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='open-mmlab://resnest101'))) diff --git a/configs/mmdet/resnest/cascade_mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py b/configs/mmdet/resnest/cascade_mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py new file mode 100644 index 00000000..83d75372 --- /dev/null +++ b/configs/mmdet/resnest/cascade_mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py @@ -0,0 +1,118 @@ +_base_ = '../cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + backbone=dict( + type='ResNeSt', + stem_channels=64, + depth=50, + radix=2, + reduction_factor=4, + avg_down_stride=True, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://resnest50')), + 
roi_head=dict( + bbox_head=[ + dict( + type='Shared4Conv1FCBBoxHead', + in_channels=256, + conv_out_channels=256, + fc_out_channels=1024, + norm_cfg=norm_cfg, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared4Conv1FCBBoxHead', + in_channels=256, + conv_out_channels=256, + fc_out_channels=1024, + norm_cfg=norm_cfg, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared4Conv1FCBBoxHead', + in_channels=256, + conv_out_channels=256, + fc_out_channels=1024, + norm_cfg=norm_cfg, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_head=dict(norm_cfg=norm_cfg))) +# # use ResNeSt img_norm +img_norm_cfg = dict( + mean=[123.68, 116.779, 103.939], std=[58.393, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + 
dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/resnest/cascade_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py b/configs/mmdet/resnest/cascade_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py new file mode 100644 index 00000000..0a7476a3 --- /dev/null +++ b/configs/mmdet/resnest/cascade_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './cascade_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py' +model = dict( + backbone=dict( + stem_channels=128, + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='open-mmlab://resnest101'))) diff --git a/configs/mmdet/resnest/cascade_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py b/configs/mmdet/resnest/cascade_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py new file mode 100644 index 00000000..6ed77301 --- /dev/null +++ b/configs/mmdet/resnest/cascade_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py @@ -0,0 +1,116 @@ +_base_ = '../cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + backbone=dict( + type='ResNeSt', + stem_channels=64, + depth=50, + radix=2, + reduction_factor=4, + avg_down_stride=True, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + 
init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://resnest50')), + roi_head=dict( + bbox_head=[ + dict( + type='Shared4Conv1FCBBoxHead', + in_channels=256, + conv_out_channels=256, + fc_out_channels=1024, + norm_cfg=norm_cfg, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared4Conv1FCBBoxHead', + in_channels=256, + conv_out_channels=256, + fc_out_channels=1024, + norm_cfg=norm_cfg, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared4Conv1FCBBoxHead', + in_channels=256, + conv_out_channels=256, + fc_out_channels=1024, + norm_cfg=norm_cfg, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], )) +# # use ResNeSt img_norm +img_norm_cfg = dict( + mean=[123.68, 116.779, 103.939], std=[58.393, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=False, + poly2mask=False), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + 
dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/resnest/faster_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py b/configs/mmdet/resnest/faster_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py new file mode 100644 index 00000000..40a2f1f2 --- /dev/null +++ b/configs/mmdet/resnest/faster_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './faster_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py' +model = dict( + backbone=dict( + stem_channels=128, + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='open-mmlab://resnest101'))) diff --git a/configs/mmdet/resnest/faster_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py b/configs/mmdet/resnest/faster_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py new file mode 100644 index 00000000..eb1ecd22 --- /dev/null +++ b/configs/mmdet/resnest/faster_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py @@ -0,0 +1,62 @@ +_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + backbone=dict( + type='ResNeSt', + stem_channels=64, + depth=50, + radix=2, + reduction_factor=4, + avg_down_stride=True, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + init_cfg=dict(type='Pretrained', 
checkpoint='open-mmlab://resnest50')), + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=norm_cfg))) +# # use ResNeSt img_norm +img_norm_cfg = dict( + mean=[123.68, 116.779, 103.939], std=[58.393, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=False, + poly2mask=False), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/resnest/mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco.py b/configs/mmdet/resnest/mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco.py new file mode 100644 index 00000000..c882ba14 --- /dev/null +++ b/configs/mmdet/resnest/mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py' +model = dict( + backbone=dict( + stem_channels=128, + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='open-mmlab://resnest101'))) diff --git a/configs/mmdet/resnest/mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py b/configs/mmdet/resnest/mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py 
new file mode 100644 index 00000000..4e50deac --- /dev/null +++ b/configs/mmdet/resnest/mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py @@ -0,0 +1,64 @@ +_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + backbone=dict( + type='ResNeSt', + stem_channels=64, + depth=50, + radix=2, + reduction_factor=4, + avg_down_stride=True, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://resnest50')), + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=norm_cfg), + mask_head=dict(norm_cfg=norm_cfg))) +# # use ResNeSt img_norm +img_norm_cfg = dict( + mean=[123.68, 116.779, 103.939], std=[58.393, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/resnest/metafile.yml 
b/configs/mmdet/resnest/metafile.yml new file mode 100644 index 00000000..cfeec719 --- /dev/null +++ b/configs/mmdet/resnest/metafile.yml @@ -0,0 +1,230 @@ +Models: + - Name: faster_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco + In Collection: Faster R-CNN + Config: configs/resnest/faster_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py + Metadata: + Training Memory (GB): 4.8 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNeSt + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20200926_125502-20289c16.pth + Paper: + URL: https://arxiv.org/abs/2004.08955 + Title: 'ResNeSt: Split-Attention Networks' + README: configs/resnest/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273 + Version: v2.7.0 + + - Name: faster_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco + In Collection: Faster R-CNN + Config: configs/resnest/faster_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py + Metadata: + Training Memory (GB): 7.1 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNeSt + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/faster_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/faster_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20201006_021058-421517f1.pth + Paper: + URL: https://arxiv.org/abs/2004.08955 + Title: 'ResNeSt: Split-Attention Networks' + README: configs/resnest/README.md + Code: + URL: 
https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273 + Version: v2.7.0 + + - Name: mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco + In Collection: Mask R-CNN + Config: configs/resnest/mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py + Metadata: + Training Memory (GB): 5.5 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNeSt + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20200926_125503-8a2c3d47.pth + Paper: + URL: https://arxiv.org/abs/2004.08955 + Title: 'ResNeSt: Split-Attention Networks' + README: configs/resnest/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273 + Version: v2.7.0 + + - Name: mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco + In Collection: Mask R-CNN + Config: configs/resnest/mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco.py + Metadata: + Training Memory (GB): 7.8 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNeSt + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20201005_215831-af60cdf9.pth + Paper: + URL: https://arxiv.org/abs/2004.08955 + Title: 'ResNeSt: Split-Attention Networks' + README: configs/resnest/README.md + Code: + 
URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273 + Version: v2.7.0 + + - Name: cascade_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco + In Collection: Cascade R-CNN + Config: configs/resnest/cascade_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py + Metadata: + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNeSt + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/cascade_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20201122_213640-763cc7b5.pth + Paper: + URL: https://arxiv.org/abs/2004.08955 + Title: 'ResNeSt: Split-Attention Networks' + README: configs/resnest/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273 + Version: v2.7.0 + + - Name: cascade_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco + In Collection: Cascade R-CNN + Config: configs/resnest/cascade_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py + Metadata: + Training Memory (GB): 8.4 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNeSt + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/cascade_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20201005_113242-b9459f8f.pth + Paper: + URL: https://arxiv.org/abs/2004.08955 + Title: 'ResNeSt: Split-Attention Networks' + README: configs/resnest/README.md + Code: + URL: 
https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273 + Version: v2.7.0 + + - Name: cascade_mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco + In Collection: Cascade R-CNN + Config: configs/resnest/cascade_mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py + Metadata: + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNeSt + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/cascade_mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20201122_104428-99eca4c7.pth + Paper: + URL: https://arxiv.org/abs/2004.08955 + Title: 'ResNeSt: Split-Attention Networks' + README: configs/resnest/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273 + Version: v2.7.0 + + - Name: cascade_mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco + In Collection: Cascade R-CNN + Config: configs/resnest/cascade_mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco.py + Metadata: + Training Memory (GB): 10.5 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNeSt + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/cascade_mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20201005_113243-42607475.pth + Paper: + URL: https://arxiv.org/abs/2004.08955 + Title: 'ResNeSt: Split-Attention Networks' + 
README: configs/resnest/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273 + Version: v2.7.0 diff --git a/configs/mmdet/resnet_strikes_back/README.md b/configs/mmdet/resnet_strikes_back/README.md new file mode 100644 index 00000000..eec03826 --- /dev/null +++ b/configs/mmdet/resnet_strikes_back/README.md @@ -0,0 +1,40 @@ +# ResNet strikes back + +> [ResNet strikes back: An improved training procedure in timm](https://arxiv.org/abs/2110.00476) + + + +## Abstract + +The influential Residual Networks designed by He et al. remain the gold-standard architecture in numerous scientific publications. They typically serve as the default architecture in studies, or as baselines when new architectures are proposed. Yet there has been significant progress on best practices for training neural networks since the inception of the ResNet architecture in 2015. Novel optimization & data augmentation have increased the effectiveness of the training recipes. + +In this paper, we re-evaluate the performance of the vanilla ResNet-50 when trained with a procedure that integrates such advances. We share competitive training settings and pre-trained models in the timm open-source library, with the hope that they will serve as better baselines for future work. For instance, with our more demanding training setting, a vanilla ResNet-50 reaches 80.4% top-1 accuracy at resolution 224×224 on ImageNet-val without extra data or distillation. We also report the performance achieved with popular models with our training procedure. + +
+ +
+ +## Results and Models + +| Method | Backbone | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :----------: | :-------------: | :-----: | :------: | :------------: | :----: | :------:| :------: | :--------: | +| Faster R-CNN | R-50 rsb | 1x | 3.9 | - | 40.8 (+3.4) | - | [Config](./faster_rcnn_r50_fpn_rsb-pretrain_1x_coco.py)| [model](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_162229-32ae82a9.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_162229.log.json)| +| Mask R-CNN | R-50 rsb | 1x | 4.5 | - | 41.2 (+3.0) | 38.2 (+3.0) | [Config](./mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py)| [model](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_174054-06ce8ba0.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_174054.log.json)| +| Cascade Mask R-CNN | R-50 rsb | 1x | 6.2 | - | 44.8 (+3.6) | 39.9 (+3.6) | [Config](./cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py)| [model](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_193636-8b9ad50f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_193636.log.json)| +| RetinaNet | R-50 rsb | 1x | 3.8 | - | 39.0 (+2.5) | - | [Config](./retinanet_r50_fpn_rsb-pretrain_1x_coco.py)| 
[model](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco/retinanet_r50_fpn_rsb-pretrain_1x_coco_20220113_175432-bd24aae9.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco/retinanet_r50_fpn_rsb-pretrain_1x_coco_20220113_175432.log.json)| + +**Notes:** + +- 'rsb' is short for 'resnet strikes back' +- We have done some grid searches on learning rate and weight decay and obtained these optimal hyper-parameters. + +## Citation + +```latex +@article{wightman2021resnet, +title={Resnet strikes back: An improved training procedure in timm}, +author={Ross Wightman and Hugo Touvron and Hervé Jégou}, +journal={arXiv preprint arXiv:2110.00476}, +year={2021} +} +``` diff --git a/configs/mmdet/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py b/configs/mmdet/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py new file mode 100644 index 00000000..8b601f05 --- /dev/null +++ b/configs/mmdet/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py @@ -0,0 +1,18 @@ +_base_ = [ + '../_base_/models/cascade_mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth' # noqa +model = dict( + backbone=dict( + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint))) + +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.0002, + weight_decay=0.05, + paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True)) diff --git a/configs/mmdet/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco.py b/configs/mmdet/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco.py new file mode 100644 index 00000000..fe866843 --- /dev/null +++
b/configs/mmdet/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco.py @@ -0,0 +1,18 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth' # noqa +model = dict( + backbone=dict( + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint))) + +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.0002, + weight_decay=0.05, + paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True)) diff --git a/configs/mmdet/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py b/configs/mmdet/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py new file mode 100644 index 00000000..321d98eb --- /dev/null +++ b/configs/mmdet/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py @@ -0,0 +1,18 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth' # noqa +model = dict( + backbone=dict( + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint))) + +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.0002, + weight_decay=0.05, + paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True)) diff --git a/configs/mmdet/resnet_strikes_back/metafile.yml b/configs/mmdet/resnet_strikes_back/metafile.yml new file mode 100644 index 00000000..4c85a16d --- /dev/null +++ b/configs/mmdet/resnet_strikes_back/metafile.yml @@ -0,0 +1,116 @@ +Models: + - Name: faster_rcnn_r50_fpn_rsb-pretrain_1x_coco + In Collection: Faster R-CNN + Config: configs/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco.py + 
Metadata: + Training Memory (GB): 3.9 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_162229-32ae82a9.pth + Paper: + URL: https://arxiv.org/abs/2110.00476 + Title: 'ResNet strikes back: An improved training procedure in timm' + README: configs/resnet_strikes_back/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/configs/resnet_strikes_back/README.md + Version: v2.22.0 + + - Name: cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco + In Collection: Cascade R-CNN + Config: configs/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py + Metadata: + Training Memory (GB): 6.2 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_193636-8b9ad50f.pth + Paper: + URL: https://arxiv.org/abs/2110.00476 + Title: 'ResNet strikes back: An improved training procedure in timm' + README: configs/resnet_strikes_back/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/configs/resnet_strikes_back/README.md + Version: v2.22.0 + + - Name: retinanet_r50_fpn_rsb-pretrain_1x_coco + In Collection: RetinaNet + Config: configs/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco.py + Metadata: + Training Memory (GB): 3.8 + Epochs: 
12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco/retinanet_r50_fpn_rsb-pretrain_1x_coco_20220113_175432-bd24aae9.pth + Paper: + URL: https://arxiv.org/abs/2110.00476 + Title: 'ResNet strikes back: An improved training procedure in timm' + README: configs/resnet_strikes_back/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/configs/resnet_strikes_back/README.md + Version: v2.22.0 + + - Name: mask_rcnn_r50_fpn_rsb-pretrain_1x_coco + In Collection: Mask R-CNN + Config: configs/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py + Metadata: + Training Memory (GB): 4.5 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_174054-06ce8ba0.pth + Paper: + URL: https://arxiv.org/abs/2110.00476 + Title: 'ResNet strikes back: An improved training procedure in timm' + README: configs/resnet_strikes_back/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/configs/resnet_strikes_back/README.md + Version: v2.22.0 diff --git a/configs/mmdet/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco.py b/configs/mmdet/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco.py new file mode 100644 index 00000000..480697a0 --- /dev/null +++ 
b/configs/mmdet/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco.py @@ -0,0 +1,18 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth' # noqa +model = dict( + backbone=dict( + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint))) + +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.0001, + weight_decay=0.05, + paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True)) diff --git a/configs/mmdet/retinanet/README.md b/configs/mmdet/retinanet/README.md new file mode 100644 index 00000000..50293cf9 --- /dev/null +++ b/configs/mmdet/retinanet/README.md @@ -0,0 +1,53 @@ +# RetinaNet + +> [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002) + + + +## Abstract + +The highest accuracy object detectors to date are based on a two-stage approach popularized by R-CNN, where a classifier is applied to a sparse set of candidate object locations. In contrast, one-stage detectors that are applied over a regular, dense sampling of possible object locations have the potential to be faster and simpler, but have trailed the accuracy of two-stage detectors thus far. In this paper, we investigate why this is the case. We discover that the extreme foreground-background class imbalance encountered during training of dense detectors is the central cause. We propose to address this class imbalance by reshaping the standard cross entropy loss such that it down-weights the loss assigned to well-classified examples. Our novel Focal Loss focuses training on a sparse set of hard examples and prevents the vast number of easy negatives from overwhelming the detector during training. 
To evaluate the effectiveness of our loss, we design and train a simple dense detector we call RetinaNet. Our results show that when trained with the focal loss, RetinaNet is able to match the speed of previous one-stage detectors while surpassing the accuracy of all existing state-of-the-art two-stage detectors. + +
+ +
+ +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :------: | :--------: | +| R-18-FPN | pytorch | 1x | 1.7 | | 31.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r18_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r18_fpn_1x_coco/retinanet_r18_fpn_1x_coco_20220407_171055-614fd399.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r18_fpn_1x_coco/retinanet_r18_fpn_1x_coco_20220407_171055.log.json) | +| R-18-FPN | pytorch | 1x(1 x 8 BS)| 5.0 | | 31.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r18_fpn_1x8_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r18_fpn_1x8_1x_coco/retinanet_r18_fpn_1x8_1x_coco_20220407_171255-4ea310d7.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r18_fpn_1x8_1x_coco/retinanet_r18_fpn_1x8_1x_coco_20220407_171255.log.json) | +| R-50-FPN | caffe | 1x | 3.5 | 18.6 | 36.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r50_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_caffe_fpn_1x_coco/retinanet_r50_caffe_fpn_1x_coco_20200531-f11027c5.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_caffe_fpn_1x_coco/retinanet_r50_caffe_fpn_1x_coco_20200531_012518.log.json) | +| R-50-FPN | pytorch | 1x | 3.8 | 19.0 | 36.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_1x_coco/retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_1x_coco/retinanet_r50_fpn_1x_coco_20200130_002941.log.json) | +| R-50-FPN (FP16) | pytorch | 1x | 2.8 | 31.6 | 36.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r50_fpn_fp16_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fp16/retinanet_r50_fpn_fp16_1x_coco/retinanet_r50_fpn_fp16_1x_coco_20200702-0dbfb212.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/fp16/retinanet_r50_fpn_fp16_1x_coco/retinanet_r50_fpn_fp16_1x_coco_20200702_020127.log.json) | +| R-50-FPN | pytorch | 2x | - | - | 37.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r50_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_2x_coco/retinanet_r50_fpn_2x_coco_20200131-fdb43119.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_2x_coco/retinanet_r50_fpn_2x_coco_20200131_114738.log.json) | +| R-101-FPN | caffe | 1x | 5.5 | 14.7 | 38.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r101_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_caffe_fpn_1x_coco/retinanet_r101_caffe_fpn_1x_coco_20200531-b428fa0f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_caffe_fpn_1x_coco/retinanet_r101_caffe_fpn_1x_coco_20200531_012536.log.json) | +| R-101-FPN | pytorch | 1x | 5.7 | 15.0 | 38.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_1x_coco/retinanet_r101_fpn_1x_coco_20200130-7a93545f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_1x_coco/retinanet_r101_fpn_1x_coco_20200130_003055.log.json) | +| 
R-101-FPN | pytorch | 2x | - | - | 38.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r101_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_2x_coco/retinanet_r101_fpn_2x_coco_20200131-5560aee8.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_2x_coco/retinanet_r101_fpn_2x_coco_20200131_114859.log.json) | +| X-101-32x4d-FPN | pytorch | 1x | 7.0 | 12.1 | 39.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_x101_32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_32x4d_fpn_1x_coco/retinanet_x101_32x4d_fpn_1x_coco_20200130-5c8b7ec4.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_32x4d_fpn_1x_coco/retinanet_x101_32x4d_fpn_1x_coco_20200130_003004.log.json) | +| X-101-32x4d-FPN | pytorch | 2x | - | - | 40.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_x101_32x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_32x4d_fpn_2x_coco/retinanet_x101_32x4d_fpn_2x_coco_20200131-237fc5e1.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_32x4d_fpn_2x_coco/retinanet_x101_32x4d_fpn_2x_coco_20200131_114812.log.json) | +| X-101-64x4d-FPN | pytorch | 1x | 10.0 | 8.7 | 41.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_x101_64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_1x_coco/retinanet_x101_64x4d_fpn_1x_coco_20200130-366f5af1.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_1x_coco/retinanet_x101_64x4d_fpn_1x_coco_20200130_003008.log.json) | +| X-101-64x4d-FPN | pytorch | 2x | - | - | 40.8 | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_x101_64x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_2x_coco/retinanet_x101_64x4d_fpn_2x_coco_20200131-bca068ab.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_2x_coco/retinanet_x101_64x4d_fpn_2x_coco_20200131_114833.log.json) | + +## Pre-trained Models + +We also train some models with longer schedules and multi-scale training. The users could finetune them for downstream tasks. + +| Backbone | Style | Lr schd | Mem (GB) | box AP | Config | Download | +| :----------------: | :-----: | :-----: | :------: | :----: | :------: | :--------: | +| R-50-FPN | pytorch| 3x | 3.5 | 39.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r50_fpn_mstrain_640-800_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_mstrain_3x_coco/retinanet_r50_fpn_mstrain_3x_coco_20210718_220633-88476508.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_mstrain_3x_coco/retinanet_r50_fpn_mstrain_3x_coco_20210718_220633-88476508.log.json) +| R-101-FPN | caffe | 3x | 5.4 | 40.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r101_caffe_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_caffe_fpn_mstrain_3x_coco/retinanet_r101_caffe_fpn_mstrain_3x_coco_20210721_063439-88a8a944.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_caffe_fpn_mstrain_3x_coco/retinanet_r101_caffe_fpn_mstrain_3x_coco_20210721_063439-88a8a944.log.json) +| R-101-FPN | pytorch| 3x | 5.4 | 41 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r101_fpn_mstrain_640-800_3x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_mstrain_3x_coco/retinanet_r101_fpn_mstrain_3x_coco_20210720_214650-7ee888e0.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_mstrain_3x_coco/retinanet_r101_fpn_mstrain_3x_coco_20210720_214650-7ee888e0.log.json) +| X-101-64x4d-FPN | pytorch| 3x | 9.8 | 41.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_x101_64x4d_fpn_mstrain_640-800_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_mstrain_3x_coco/retinanet_x101_64x4d_fpn_mstrain_3x_coco_20210719_051838-022c2187.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_mstrain_3x_coco/retinanet_x101_64x4d_fpn_mstrain_3x_coco_20210719_051838-022c2187.log.json) + +## Citation + +```latex +@inproceedings{lin2017focal, + title={Focal loss for dense object detection}, + author={Lin, Tsung-Yi and Goyal, Priya and Girshick, Ross and He, Kaiming and Doll{\'a}r, Piotr}, + booktitle={Proceedings of the IEEE international conference on computer vision}, + year={2017} +} +``` diff --git a/configs/mmdet/retinanet/metafile.yml b/configs/mmdet/retinanet/metafile.yml new file mode 100644 index 00000000..20807710 --- /dev/null +++ b/configs/mmdet/retinanet/metafile.yml @@ -0,0 +1,312 @@ +Collections: + - Name: RetinaNet + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Focal Loss + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/1708.02002 + Title: "Focal Loss for Dense Object Detection" + README: configs/retinanet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/retinanet.py#L6 + Version: v2.0.0 + +Models: + - Name: retinanet_r18_fpn_1x_coco + In Collection: RetinaNet + Config: 
configs/retinanet/retinanet_r18_fpn_1x_coco.py + Metadata: + Training Memory (GB): 1.7 + Training Resources: 8x V100 GPUs + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 31.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r18_fpn_1x_coco/retinanet_r18_fpn_1x_coco_20220407_171055-614fd399.pth + + - Name: retinanet_r18_fpn_1x8_1x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_r18_fpn_1x8_1x_coco.py + Metadata: + Training Memory (GB): 5.0 + Training Resources: 1x V100 GPUs + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 31.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r18_fpn_1x8_1x_coco/retinanet_r18_fpn_1x8_1x_coco_20220407_171255-4ea310d7.pth + + - Name: retinanet_r50_caffe_fpn_1x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_r50_caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.5 + inference time (ms/im): + - value: 53.76 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_caffe_fpn_1x_coco/retinanet_r50_caffe_fpn_1x_coco_20200531-f11027c5.pth + + - Name: retinanet_r50_fpn_1x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.8 + inference time (ms/im): + - value: 52.63 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_1x_coco/retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth + + - Name: retinanet_r50_fpn_fp16_1x_coco + In Collection: RetinaNet + Config: 
configs/retinanet/retinanet_r50_fpn_fp16_1x_coco.py + Metadata: + Training Memory (GB): 2.8 + Training Techniques: + - SGD with Momentum + - Weight Decay + - Mixed Precision Training + inference time (ms/im): + - value: 31.65 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP16 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fp16/retinanet_r50_fpn_fp16_1x_coco/retinanet_r50_fpn_fp16_1x_coco_20200702-0dbfb212.pth + + - Name: retinanet_r50_fpn_2x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_r50_fpn_2x_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_2x_coco/retinanet_r50_fpn_2x_coco_20200131-fdb43119.pth + + - Name: retinanet_r50_fpn_mstrain_3x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_r50_fpn_mstrain_640-800_3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_mstrain_3x_coco/retinanet_r50_fpn_mstrain_3x_coco_20210718_220633-88476508.pth + + - Name: retinanet_r101_caffe_fpn_1x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_r101_caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.5 + inference time (ms/im): + - value: 68.03 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_caffe_fpn_1x_coco/retinanet_r101_caffe_fpn_1x_coco_20200531-b428fa0f.pth + + - Name: retinanet_r101_caffe_fpn_mstrain_3x_coco + In Collection: RetinaNet + Config: 
configs/retinanet/retinanet_r101_caffe_fpn_mstrain_3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_caffe_fpn_mstrain_3x_coco/retinanet_r101_caffe_fpn_mstrain_3x_coco_20210721_063439-88a8a944.pth + + - Name: retinanet_r101_fpn_1x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.7 + inference time (ms/im): + - value: 66.67 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_1x_coco/retinanet_r101_fpn_1x_coco_20200130-7a93545f.pth + + - Name: retinanet_r101_fpn_2x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_r101_fpn_2x_coco.py + Metadata: + Training Memory (GB): 5.7 + inference time (ms/im): + - value: 66.67 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_2x_coco/retinanet_r101_fpn_2x_coco_20200131-5560aee8.pth + + - Name: retinanet_r101_fpn_mstrain_3x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_r101_fpn_mstrain_640-800_3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_mstrain_3x_coco/retinanet_r101_fpn_mstrain_3x_coco_20210720_214650-7ee888e0.pth + + - Name: retinanet_x101_32x4d_fpn_1x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_x101_32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.0 +
inference time (ms/im): + - value: 82.64 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_32x4d_fpn_1x_coco/retinanet_x101_32x4d_fpn_1x_coco_20200130-5c8b7ec4.pth + + - Name: retinanet_x101_32x4d_fpn_2x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_x101_32x4d_fpn_2x_coco.py + Metadata: + Training Memory (GB): 7.0 + inference time (ms/im): + - value: 82.64 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_32x4d_fpn_2x_coco/retinanet_x101_32x4d_fpn_2x_coco_20200131-237fc5e1.pth + + - Name: retinanet_x101_64x4d_fpn_1x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_x101_64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 10.0 + inference time (ms/im): + - value: 114.94 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_1x_coco/retinanet_x101_64x4d_fpn_1x_coco_20200130-366f5af1.pth + + - Name: retinanet_x101_64x4d_fpn_2x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_x101_64x4d_fpn_2x_coco.py + Metadata: + Training Memory (GB): 10.0 + inference time (ms/im): + - value: 114.94 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.8 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_2x_coco/retinanet_x101_64x4d_fpn_2x_coco_20200131-bca068ab.pth + + - Name: retinanet_x101_64x4d_fpn_mstrain_3x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_x101_64x4d_fpn_mstrain_640-800_3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_mstrain_3x_coco/retinanet_x101_64x4d_fpn_mstrain_3x_coco_20210719_051838-022c2187.pth diff --git a/configs/mmdet/retinanet/retinanet_r101_caffe_fpn_1x_coco.py b/configs/mmdet/retinanet/retinanet_r101_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..56eaae20 --- /dev/null +++ b/configs/mmdet/retinanet/retinanet_r101_caffe_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './retinanet_r50_caffe_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/configs/mmdet/retinanet/retinanet_r101_caffe_fpn_mstrain_3x_coco.py b/configs/mmdet/retinanet/retinanet_r101_caffe_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..b87295e6 --- /dev/null +++ b/configs/mmdet/retinanet/retinanet_r101_caffe_fpn_mstrain_3x_coco.py @@ -0,0 +1,7 @@ +_base_ = './retinanet_r50_caffe_fpn_mstrain_1x_coco.py' +# learning policy +model = dict( + pretrained='open-mmlab://detectron2/resnet101_caffe', + backbone=dict(depth=101)) +lr_config = dict(step=[28, 34]) +runner = dict(type='EpochBasedRunner', max_epochs=36) diff --git a/configs/mmdet/retinanet/retinanet_r101_fpn_1x_coco.py b/configs/mmdet/retinanet/retinanet_r101_fpn_1x_coco.py new file mode 100644 index 00000000..a7f06002 --- /dev/null +++ b/configs/mmdet/retinanet/retinanet_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './retinanet_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + 
init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/retinanet/retinanet_r101_fpn_2x_coco.py b/configs/mmdet/retinanet/retinanet_r101_fpn_2x_coco.py new file mode 100644 index 00000000..721112a2 --- /dev/null +++ b/configs/mmdet/retinanet/retinanet_r101_fpn_2x_coco.py @@ -0,0 +1,6 @@ +_base_ = './retinanet_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/retinanet/retinanet_r101_fpn_mstrain_640-800_3x_coco.py b/configs/mmdet/retinanet/retinanet_r101_fpn_mstrain_640-800_3x_coco.py new file mode 100644 index 00000000..6bbcac4f --- /dev/null +++ b/configs/mmdet/retinanet/retinanet_r101_fpn_mstrain_640-800_3x_coco.py @@ -0,0 +1,6 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', '../common/mstrain_3x_coco.py' +] +# optimizer +model = dict(pretrained='torchvision://resnet101', backbone=dict(depth=101)) +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/mmdet/retinanet/retinanet_r18_fpn_1x8_1x_coco.py b/configs/mmdet/retinanet/retinanet_r18_fpn_1x8_1x_coco.py new file mode 100644 index 00000000..01a35f23 --- /dev/null +++ b/configs/mmdet/retinanet/retinanet_r18_fpn_1x8_1x_coco.py @@ -0,0 +1,23 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# data +data = dict(samples_per_gpu=8) + +# optimizer +model = dict( + backbone=dict( + depth=18, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')), + neck=dict(in_channels=[64, 128, 256, 512])) + +# Note: If the learning rate is set to 0.0025, the mAP will be 32.4. +optimizer = dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (1 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=8) diff --git a/configs/mmdet/retinanet/retinanet_r18_fpn_1x_coco.py b/configs/mmdet/retinanet/retinanet_r18_fpn_1x_coco.py new file mode 100644 index 00000000..6197b32d --- /dev/null +++ b/configs/mmdet/retinanet/retinanet_r18_fpn_1x_coco.py @@ -0,0 +1,18 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# optimizer +model = dict( + backbone=dict( + depth=18, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')), + neck=dict(in_channels=[64, 128, 256, 512])) +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=16) diff --git a/configs/mmdet/retinanet/retinanet_r50_caffe_fpn_1x_coco.py b/configs/mmdet/retinanet/retinanet_r50_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..04c9af58 --- /dev/null +++ b/configs/mmdet/retinanet/retinanet_r50_caffe_fpn_1x_coco.py @@ -0,0 +1,41 @@ +_base_ = './retinanet_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] 
+test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/retinanet/retinanet_r50_caffe_fpn_mstrain_1x_coco.py b/configs/mmdet/retinanet/retinanet_r50_caffe_fpn_mstrain_1x_coco.py new file mode 100644 index 00000000..4d7b8f2b --- /dev/null +++ b/configs/mmdet/retinanet/retinanet_r50_caffe_fpn_mstrain_1x_coco.py @@ -0,0 +1,46 @@ +_base_ = './retinanet_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + 
dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/retinanet/retinanet_r50_caffe_fpn_mstrain_2x_coco.py b/configs/mmdet/retinanet/retinanet_r50_caffe_fpn_mstrain_2x_coco.py new file mode 100644 index 00000000..eea9690e --- /dev/null +++ b/configs/mmdet/retinanet/retinanet_r50_caffe_fpn_mstrain_2x_coco.py @@ -0,0 +1,4 @@ +_base_ = './retinanet_r50_caffe_fpn_mstrain_1x_coco.py' +# learning policy +lr_config = dict(step=[16, 23]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/retinanet/retinanet_r50_caffe_fpn_mstrain_3x_coco.py b/configs/mmdet/retinanet/retinanet_r50_caffe_fpn_mstrain_3x_coco.py new file mode 100644 index 00000000..80576507 --- /dev/null +++ b/configs/mmdet/retinanet/retinanet_r50_caffe_fpn_mstrain_3x_coco.py @@ -0,0 +1,4 @@ +_base_ = './retinanet_r50_caffe_fpn_mstrain_1x_coco.py' +# learning policy +lr_config = dict(step=[28, 34]) +runner = dict(type='EpochBasedRunner', max_epochs=36) diff --git a/configs/mmdet/retinanet/retinanet_r50_fpn_1x_coco.py b/configs/mmdet/retinanet/retinanet_r50_fpn_1x_coco.py new file mode 100644 index 00000000..04bd696b --- /dev/null +++ b/configs/mmdet/retinanet/retinanet_r50_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/mmdet/retinanet/retinanet_r50_fpn_2x_coco.py b/configs/mmdet/retinanet/retinanet_r50_fpn_2x_coco.py new file mode 100644 index 00000000..927915fa --- /dev/null +++ b/configs/mmdet/retinanet/retinanet_r50_fpn_2x_coco.py @@ -0,0 +1,4 @@ +_base_ = './retinanet_r50_fpn_1x_coco.py' +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', 
max_epochs=24) diff --git a/configs/mmdet/retinanet/retinanet_r50_fpn_90k_coco.py b/configs/mmdet/retinanet/retinanet_r50_fpn_90k_coco.py new file mode 100644 index 00000000..ceda3279 --- /dev/null +++ b/configs/mmdet/retinanet/retinanet_r50_fpn_90k_coco.py @@ -0,0 +1,15 @@ +_base_ = 'retinanet_r50_fpn_1x_coco.py' + +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[60000, 80000]) + +# Runner type +runner = dict(_delete_=True, type='IterBasedRunner', max_iters=90000) + +checkpoint_config = dict(interval=10000) +evaluation = dict(interval=10000, metric='bbox') diff --git a/configs/mmdet/retinanet/retinanet_r50_fpn_fp16_1x_coco.py b/configs/mmdet/retinanet/retinanet_r50_fpn_fp16_1x_coco.py new file mode 100644 index 00000000..6b6cebe4 --- /dev/null +++ b/configs/mmdet/retinanet/retinanet_r50_fpn_fp16_1x_coco.py @@ -0,0 +1,3 @@ +_base_ = './retinanet_r50_fpn_1x_coco.py' +# fp16 settings +fp16 = dict(loss_scale=512.) diff --git a/configs/mmdet/retinanet/retinanet_r50_fpn_mstrain_640-800_3x_coco.py b/configs/mmdet/retinanet/retinanet_r50_fpn_mstrain_640-800_3x_coco.py new file mode 100644 index 00000000..02a2c291 --- /dev/null +++ b/configs/mmdet/retinanet/retinanet_r50_fpn_mstrain_640-800_3x_coco.py @@ -0,0 +1,5 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', '../common/mstrain_3x_coco.py' +] +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/mmdet/retinanet/retinanet_x101_32x4d_fpn_1x_coco.py b/configs/mmdet/retinanet/retinanet_x101_32x4d_fpn_1x_coco.py new file mode 100644 index 00000000..765a4c2c --- /dev/null +++ b/configs/mmdet/retinanet/retinanet_x101_32x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './retinanet_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', 
requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/retinanet/retinanet_x101_32x4d_fpn_2x_coco.py b/configs/mmdet/retinanet/retinanet_x101_32x4d_fpn_2x_coco.py new file mode 100644 index 00000000..14de96fa --- /dev/null +++ b/configs/mmdet/retinanet/retinanet_x101_32x4d_fpn_2x_coco.py @@ -0,0 +1,14 @@ +_base_ = './retinanet_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/retinanet/retinanet_x101_64x4d_fpn_1x_coco.py b/configs/mmdet/retinanet/retinanet_x101_64x4d_fpn_1x_coco.py new file mode 100644 index 00000000..948cd18e --- /dev/null +++ b/configs/mmdet/retinanet/retinanet_x101_64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './retinanet_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/retinanet/retinanet_x101_64x4d_fpn_2x_coco.py b/configs/mmdet/retinanet/retinanet_x101_64x4d_fpn_2x_coco.py new file mode 100644 index 00000000..ad04b6ee --- /dev/null +++ b/configs/mmdet/retinanet/retinanet_x101_64x4d_fpn_2x_coco.py @@ -0,0 +1,14 @@ +_base_ = './retinanet_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', 
checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/retinanet/retinanet_x101_64x4d_fpn_mstrain_640-800_3x_coco.py b/configs/mmdet/retinanet/retinanet_x101_64x4d_fpn_mstrain_640-800_3x_coco.py new file mode 100644 index 00000000..f6ab512f --- /dev/null +++ b/configs/mmdet/retinanet/retinanet_x101_64x4d_fpn_mstrain_640-800_3x_coco.py @@ -0,0 +1,8 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', '../common/mstrain_3x_coco.py' +] +# optimizer +model = dict( + pretrained='open-mmlab://resnext101_64x4d', + backbone=dict(type='ResNeXt', depth=101, groups=64, base_width=4)) +optimizer = dict(type='SGD', lr=0.01) diff --git a/configs/mmdet/rpn/README.md b/configs/mmdet/rpn/README.md new file mode 100644 index 00000000..654515cf --- /dev/null +++ b/configs/mmdet/rpn/README.md @@ -0,0 +1,39 @@ +# RPN + +> [Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks](https://arxiv.org/abs/1506.01497) + + + +## Abstract + +State-of-the-art object detection networks depend on region proposal algorithms to hypothesize object locations. Advances like SPPnet and Fast R-CNN have reduced the running time of these detection networks, exposing region proposal computation as a bottleneck. In this work, we introduce a Region Proposal Network (RPN) that shares full-image convolutional features with the detection network, thus enabling nearly cost-free region proposals. An RPN is a fully convolutional network that simultaneously predicts object bounds and objectness scores at each position. The RPN is trained end-to-end to generate high-quality region proposals, which are used by Fast R-CNN for detection. We further merge RPN and Fast R-CNN into a single network by sharing their convolutional features---using the recently popular terminology of neural networks with 'attention' mechanisms, the RPN component tells the unified network where to look. 
For the very deep VGG-16 model, our detection system has a frame rate of 5fps (including all steps) on a GPU, while achieving state-of-the-art object detection accuracy on PASCAL VOC 2007, 2012, and MS COCO datasets with only 300 proposals per image. In ILSVRC and COCO 2015 competitions, Faster R-CNN and RPN are the foundations of the 1st-place winning entries in several tracks. + +
+ +
+ +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | AR1000 | Config | Model | Log | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :------: | :---: | :-: | +| R-50-FPN | caffe | 1x | 3.5 | 22.6 | 58.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/rpn/rpn_r50_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_caffe_fpn_1x_coco/rpn_r50_caffe_fpn_1x_coco_20200531-5b903a37.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_caffe_fpn_1x_coco/rpn_r50_caffe_fpn_1x_coco_20200531_012334.log.json) | +| R-50-FPN | pytorch | 1x | 3.8 | 22.3 | 58.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/rpn/rpn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_fpn_1x_coco/rpn_r50_fpn_1x_coco_20200218-5525fa2e.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_fpn_1x_coco/rpn_r50_fpn_1x_coco_20200218_151240.log.json) | +| R-50-FPN | pytorch | 2x | - | - | 58.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/rpn/rpn_r50_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_fpn_2x_coco/rpn_r50_fpn_2x_coco_20200131-0728c9b3.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_fpn_2x_coco/rpn_r50_fpn_2x_coco_20200131_190631.log.json) | +| R-101-FPN | caffe | 1x | 5.4 | 17.3 | 60.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/rpn/rpn_r101_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r101_caffe_fpn_1x_coco/rpn_r101_caffe_fpn_1x_coco_20200531-0629a2e2.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r101_caffe_fpn_1x_coco/rpn_r101_caffe_fpn_1x_coco_20200531_012345.log.json) | +| R-101-FPN | pytorch | 1x | 5.8 | 16.5 | 59.7 | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/rpn/rpn_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r101_fpn_1x_coco/rpn_r101_fpn_1x_coco_20200131-2ace2249.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r101_fpn_1x_coco/rpn_r101_fpn_1x_coco_20200131_191000.log.json) | +| R-101-FPN | pytorch | 2x | - | - | 60.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/rpn/rpn_r101_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r101_fpn_2x_coco/rpn_r101_fpn_2x_coco_20200131-24e3db1a.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r101_fpn_2x_coco/rpn_r101_fpn_2x_coco_20200131_191106.log.json) | +| X-101-32x4d-FPN | pytorch | 1x | 7.0 | 13.0 | 60.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/rpn/rpn_x101_32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_32x4d_fpn_1x_coco/rpn_x101_32x4d_fpn_1x_coco_20200219-b02646c6.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_32x4d_fpn_1x_coco/rpn_x101_32x4d_fpn_1x_coco_20200219_012037.log.json) | +| X-101-32x4d-FPN | pytorch | 2x | - | - | 61.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/rpn/rpn_x101_32x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_32x4d_fpn_2x_coco/rpn_x101_32x4d_fpn_2x_coco_20200208-d22bd0bb.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_32x4d_fpn_2x_coco/rpn_x101_32x4d_fpn_2x_coco_20200208_200752.log.json) | +| X-101-64x4d-FPN | pytorch | 1x | 10.1 | 9.1 | 61.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/rpn/rpn_x101_64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_64x4d_fpn_1x_coco/rpn_x101_64x4d_fpn_1x_coco_20200208-cde6f7dd.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_64x4d_fpn_1x_coco/rpn_x101_64x4d_fpn_1x_coco_20200208_200752.log.json) | +| X-101-64x4d-FPN | pytorch | 2x | - | - | 61.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/rpn/rpn_x101_64x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_64x4d_fpn_2x_coco/rpn_x101_64x4d_fpn_2x_coco_20200208-c65f524f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_64x4d_fpn_2x_coco/rpn_x101_64x4d_fpn_2x_coco_20200208_200752.log.json) | + +## Citation + +```latex +@inproceedings{ren2015faster, + title={Faster r-cnn: Towards real-time object detection with region proposal networks}, + author={Ren, Shaoqing and He, Kaiming and Girshick, Ross and Sun, Jian}, + booktitle={Advances in neural information processing systems}, + year={2015} +} +``` diff --git a/configs/mmdet/rpn/rpn_r101_caffe_fpn_1x_coco.py b/configs/mmdet/rpn/rpn_r101_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..27be9463 --- /dev/null +++ b/configs/mmdet/rpn/rpn_r101_caffe_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './rpn_r50_caffe_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/configs/mmdet/rpn/rpn_r101_fpn_1x_coco.py b/configs/mmdet/rpn/rpn_r101_fpn_1x_coco.py new file mode 100644 index 00000000..962728ff --- /dev/null +++ b/configs/mmdet/rpn/rpn_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './rpn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/rpn/rpn_r101_fpn_2x_coco.py b/configs/mmdet/rpn/rpn_r101_fpn_2x_coco.py new file mode 100644 index 00000000..ac7671c1 --- /dev/null +++ b/configs/mmdet/rpn/rpn_r101_fpn_2x_coco.py @@ -0,0 +1,6 @@ +_base_ = './rpn_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + depth=101, 
+ init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/rpn/rpn_r50_caffe_c4_1x_coco.py b/configs/mmdet/rpn/rpn_r50_caffe_c4_1x_coco.py new file mode 100644 index 00000000..6da0ee94 --- /dev/null +++ b/configs/mmdet/rpn/rpn_r50_caffe_c4_1x_coco.py @@ -0,0 +1,38 @@ +_base_ = [ + '../_base_/models/rpn_r50_caffe_c4.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# dataset settings +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_label=False), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +evaluation = dict(interval=1, metric='proposal_fast') diff --git a/configs/mmdet/rpn/rpn_r50_caffe_fpn_1x_coco.py b/configs/mmdet/rpn/rpn_r50_caffe_fpn_1x_coco.py new file mode 100644 index 00000000..68c36fa8 --- /dev/null +++ b/configs/mmdet/rpn/rpn_r50_caffe_fpn_1x_coco.py @@ -0,0 +1,41 @@ +_base_ = './rpn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + 
checkpoint='open-mmlab://detectron2/resnet50_caffe'))) +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_label=False), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/rpn/rpn_r50_fpn_1x_coco.py b/configs/mmdet/rpn/rpn_r50_fpn_1x_coco.py new file mode 100644 index 00000000..26f95a34 --- /dev/null +++ b/configs/mmdet/rpn/rpn_r50_fpn_1x_coco.py @@ -0,0 +1,18 @@ +_base_ = [ + '../_base_/models/rpn_r50_fpn.py', '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_label=False), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes']), +] +data = dict(train=dict(pipeline=train_pipeline)) 
+evaluation = dict(interval=1, metric='proposal_fast') diff --git a/configs/mmdet/rpn/rpn_r50_fpn_2x_coco.py b/configs/mmdet/rpn/rpn_r50_fpn_2x_coco.py new file mode 100644 index 00000000..2f264bfe --- /dev/null +++ b/configs/mmdet/rpn/rpn_r50_fpn_2x_coco.py @@ -0,0 +1,5 @@ +_base_ = './rpn_r50_fpn_1x_coco.py' + +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/rpn/rpn_x101_32x4d_fpn_1x_coco.py b/configs/mmdet/rpn/rpn_x101_32x4d_fpn_1x_coco.py new file mode 100644 index 00000000..d0c73948 --- /dev/null +++ b/configs/mmdet/rpn/rpn_x101_32x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './rpn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/rpn/rpn_x101_32x4d_fpn_2x_coco.py b/configs/mmdet/rpn/rpn_x101_32x4d_fpn_2x_coco.py new file mode 100644 index 00000000..c6880b76 --- /dev/null +++ b/configs/mmdet/rpn/rpn_x101_32x4d_fpn_2x_coco.py @@ -0,0 +1,14 @@ +_base_ = './rpn_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/rpn/rpn_x101_64x4d_fpn_1x_coco.py b/configs/mmdet/rpn/rpn_x101_64x4d_fpn_1x_coco.py new file mode 100644 index 00000000..96e691a9 --- /dev/null +++ b/configs/mmdet/rpn/rpn_x101_64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './rpn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 
2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/rpn/rpn_x101_64x4d_fpn_2x_coco.py b/configs/mmdet/rpn/rpn_x101_64x4d_fpn_2x_coco.py new file mode 100644 index 00000000..4182a396 --- /dev/null +++ b/configs/mmdet/rpn/rpn_x101_64x4d_fpn_2x_coco.py @@ -0,0 +1,14 @@ +_base_ = './rpn_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/sabl/README.md b/configs/mmdet/sabl/README.md new file mode 100644 index 00000000..516bd702 --- /dev/null +++ b/configs/mmdet/sabl/README.md @@ -0,0 +1,47 @@ +# SABL + +> [Side-Aware Boundary Localization for More Precise Object Detection](https://arxiv.org/abs/1912.04260) + + + +## Abstract + +Current object detection frameworks mainly rely on bounding box regression to localize objects. Despite the remarkable progress in recent years, the precision of bounding box regression remains unsatisfactory, hence limiting performance in object detection. We observe that precise localization requires careful placement of each side of the bounding box. However, the mainstream approach, which focuses on predicting centers and sizes, is not the most effective way to accomplish this task, especially when there exists displacements with large variance between the anchors and the targets. In this paper, we propose an alternative approach, named as Side-Aware Boundary Localization (SABL), where each side of the bounding box is respectively localized with a dedicated network branch. 
To tackle the difficulty of precise localization in the presence of displacements with large variance, we further propose a two-step localization scheme, which first predicts a range of movement through bucket prediction and then pinpoints the precise position within the predicted bucket. We test the proposed method on both two-stage and single-stage detection frameworks. Replacing the standard bounding box regression branch with the proposed design leads to significant improvements on Faster R-CNN, RetinaNet, and Cascade R-CNN, by 3.0%, 1.7%, and 0.9%, respectively. + +
+ +
+ +## Results and Models + +The results on COCO 2017 val are shown in the tables below (results on test-dev are usually slightly higher than val). +Single-scale testing (1333x800) is adopted in all results. + +| Method | Backbone | Lr schd | ms-train | box AP | Config | Download | +| :----------------: | :-------: | :-----: | :------: | :----: | :----------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| SABL Faster R-CNN | R-50-FPN | 1x | N | 39.9 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl/sabl_faster_rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_faster_rcnn_r50_fpn_1x_coco/sabl_faster_rcnn_r50_fpn_1x_coco-e867595b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_faster_rcnn_r50_fpn_1x_coco/20200830_130324.log.json) | +| SABL Faster R-CNN | R-101-FPN | 1x | N | 41.7 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl/sabl_faster_rcnn_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_faster_rcnn_r101_fpn_1x_coco/sabl_faster_rcnn_r101_fpn_1x_coco-f804c6c1.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_faster_rcnn_r101_fpn_1x_coco/20200830_183949.log.json) | +| SABL Cascade R-CNN | R-50-FPN | 1x | N | 41.6 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl/sabl_cascade_rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_cascade_rcnn_r50_fpn_1x_coco/sabl_cascade_rcnn_r50_fpn_1x_coco-e1748e5e.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_cascade_rcnn_r50_fpn_1x_coco/20200831_033726.log.json) | +| SABL Cascade R-CNN | R-101-FPN | 1x | N | 43.0 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl/sabl_cascade_rcnn_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_cascade_rcnn_r101_fpn_1x_coco/sabl_cascade_rcnn_r101_fpn_1x_coco-2b83e87c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_cascade_rcnn_r101_fpn_1x_coco/20200831_141745.log.json) | + +| Method | Backbone | GN | Lr schd | ms-train | box AP | Config | Download | +| :------------: | :-------: | :---: | :-----: | :---------: | :----: | :---------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| SABL RetinaNet | R-50-FPN | N | 1x | N | 37.7 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl/sabl_retinanet_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_1x_coco/sabl_retinanet_r50_fpn_1x_coco-6c54fd4f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_1x_coco/20200830_053451.log.json) | +| SABL RetinaNet | R-50-FPN | Y | 1x | N | 38.8 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl/sabl_retinanet_r50_fpn_gn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_gn_1x_coco/sabl_retinanet_r50_fpn_gn_1x_coco-e16dfcf1.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_gn_1x_coco/20200831_141955.log.json) | 
+| SABL RetinaNet | R-101-FPN | N | 1x | N | 39.7 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl/sabl_retinanet_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_1x_coco/sabl_retinanet_r101_fpn_1x_coco-42026904.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_1x_coco/20200831_034256.log.json) | +| SABL RetinaNet | R-101-FPN | Y | 1x | N | 40.5 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl/sabl_retinanet_r101_fpn_gn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_1x_coco/sabl_retinanet_r101_fpn_gn_1x_coco-40a893e8.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_1x_coco/20200830_201422.log.json) | +| SABL RetinaNet | R-101-FPN | Y | 2x | Y (640~800) | 42.9 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco-1e63382c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco/20200830_144807.log.json) | +| SABL RetinaNet | R-101-FPN | Y | 2x | Y (480~960) | 43.6 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco-5342f857.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco/20200830_164537.log.json) | + +## Citation + +We provide config files to reproduce the object detection results in the ECCV 2020 Spotlight paper for [Side-Aware Boundary 
Localization for More Precise Object Detection](https://arxiv.org/abs/1912.04260). + +```latex +@inproceedings{Wang_2020_ECCV, + title = {Side-Aware Boundary Localization for More Precise Object Detection}, + author = {Jiaqi Wang and Wenwei Zhang and Yuhang Cao and Kai Chen and Jiangmiao Pang and Tao Gong and Jianping Shi and Chen Change Loy and Dahua Lin}, + booktitle = {ECCV}, + year = {2020} +} +``` diff --git a/configs/mmdet/sabl/metafile.yml b/configs/mmdet/sabl/metafile.yml new file mode 100644 index 00000000..23c51cff --- /dev/null +++ b/configs/mmdet/sabl/metafile.yml @@ -0,0 +1,140 @@ +Collections: + - Name: SABL + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - ResNet + - SABL + Paper: + URL: https://arxiv.org/abs/1912.04260 + Title: 'Side-Aware Boundary Localization for More Precise Object Detection' + README: configs/sabl/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.4.0/mmdet/models/roi_heads/bbox_heads/sabl_head.py#L14 + Version: v2.4.0 + +Models: + - Name: sabl_faster_rcnn_r50_fpn_1x_coco + In Collection: SABL + Config: configs/sabl/sabl_faster_rcnn_r50_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_faster_rcnn_r50_fpn_1x_coco/sabl_faster_rcnn_r50_fpn_1x_coco-e867595b.pth + + - Name: sabl_faster_rcnn_r101_fpn_1x_coco + In Collection: SABL + Config: configs/sabl/sabl_faster_rcnn_r101_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_faster_rcnn_r101_fpn_1x_coco/sabl_faster_rcnn_r101_fpn_1x_coco-f804c6c1.pth + + - Name: sabl_cascade_rcnn_r50_fpn_1x_coco + In Collection: SABL + Config: configs/sabl/sabl_cascade_rcnn_r50_fpn_1x_coco.py + 
Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_cascade_rcnn_r50_fpn_1x_coco/sabl_cascade_rcnn_r50_fpn_1x_coco-e1748e5e.pth + + - Name: sabl_cascade_rcnn_r101_fpn_1x_coco + In Collection: SABL + Config: configs/sabl/sabl_cascade_rcnn_r101_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_cascade_rcnn_r101_fpn_1x_coco/sabl_cascade_rcnn_r101_fpn_1x_coco-2b83e87c.pth + + - Name: sabl_retinanet_r50_fpn_1x_coco + In Collection: SABL + Config: configs/sabl/sabl_retinanet_r50_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_1x_coco/sabl_retinanet_r50_fpn_1x_coco-6c54fd4f.pth + + - Name: sabl_retinanet_r50_fpn_gn_1x_coco + In Collection: SABL + Config: configs/sabl/sabl_retinanet_r50_fpn_gn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_gn_1x_coco/sabl_retinanet_r50_fpn_gn_1x_coco-e16dfcf1.pth + + - Name: sabl_retinanet_r101_fpn_1x_coco + In Collection: SABL + Config: configs/sabl/sabl_retinanet_r101_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_1x_coco/sabl_retinanet_r101_fpn_1x_coco-42026904.pth + + - Name: sabl_retinanet_r101_fpn_gn_1x_coco + In Collection: SABL + Config: configs/sabl/sabl_retinanet_r101_fpn_gn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.5 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_1x_coco/sabl_retinanet_r101_fpn_gn_1x_coco-40a893e8.pth + + - Name: sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco + In Collection: SABL + Config: configs/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco-1e63382c.pth + + - Name: sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco + In Collection: SABL + Config: configs/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco-5342f857.pth diff --git a/configs/mmdet/sabl/sabl_cascade_rcnn_r101_fpn_1x_coco.py b/configs/mmdet/sabl/sabl_cascade_rcnn_r101_fpn_1x_coco.py new file mode 100644 index 00000000..64fe2304 --- /dev/null +++ b/configs/mmdet/sabl/sabl_cascade_rcnn_r101_fpn_1x_coco.py @@ -0,0 +1,90 @@ +_base_ = [ + '../_base_/models/cascade_rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + roi_head=dict(bbox_head=[ + dict( + type='SABLHead', + num_classes=80, + cls_in_channels=256, + reg_in_channels=256, + roi_feat_size=7, + reg_feat_up_ratio=2, + reg_pre_kernel=3, + reg_post_kernel=3, + reg_pre_num=2, + reg_post_num=1, + cls_out_channels=1024, + reg_offset_out_channels=256, + reg_cls_out_channels=256, + num_cls_fcs=1, + num_reg_fcs=0, + reg_class_agnostic=True, + norm_cfg=None, + bbox_coder=dict( + 
type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.7), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1, + loss_weight=1.0)), + dict( + type='SABLHead', + num_classes=80, + cls_in_channels=256, + reg_in_channels=256, + roi_feat_size=7, + reg_feat_up_ratio=2, + reg_pre_kernel=3, + reg_post_kernel=3, + reg_pre_num=2, + reg_post_num=1, + cls_out_channels=1024, + reg_offset_out_channels=256, + reg_cls_out_channels=256, + num_cls_fcs=1, + num_reg_fcs=0, + reg_class_agnostic=True, + norm_cfg=None, + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.5), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1, + loss_weight=1.0)), + dict( + type='SABLHead', + num_classes=80, + cls_in_channels=256, + reg_in_channels=256, + roi_feat_size=7, + reg_feat_up_ratio=2, + reg_pre_kernel=3, + reg_post_kernel=3, + reg_pre_num=2, + reg_post_num=1, + cls_out_channels=1024, + reg_offset_out_channels=256, + reg_cls_out_channels=256, + num_cls_fcs=1, + num_reg_fcs=0, + reg_class_agnostic=True, + norm_cfg=None, + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.3), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1, loss_weight=1.0)) + ])) diff --git a/configs/mmdet/sabl/sabl_cascade_rcnn_r50_fpn_1x_coco.py b/configs/mmdet/sabl/sabl_cascade_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 00000000..4b28a592 --- /dev/null +++ b/configs/mmdet/sabl/sabl_cascade_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,86 @@ +_base_ = [ + 
'../_base_/models/cascade_rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + roi_head=dict(bbox_head=[ + dict( + type='SABLHead', + num_classes=80, + cls_in_channels=256, + reg_in_channels=256, + roi_feat_size=7, + reg_feat_up_ratio=2, + reg_pre_kernel=3, + reg_post_kernel=3, + reg_pre_num=2, + reg_post_num=1, + cls_out_channels=1024, + reg_offset_out_channels=256, + reg_cls_out_channels=256, + num_cls_fcs=1, + num_reg_fcs=0, + reg_class_agnostic=True, + norm_cfg=None, + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.7), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1, + loss_weight=1.0)), + dict( + type='SABLHead', + num_classes=80, + cls_in_channels=256, + reg_in_channels=256, + roi_feat_size=7, + reg_feat_up_ratio=2, + reg_pre_kernel=3, + reg_post_kernel=3, + reg_pre_num=2, + reg_post_num=1, + cls_out_channels=1024, + reg_offset_out_channels=256, + reg_cls_out_channels=256, + num_cls_fcs=1, + num_reg_fcs=0, + reg_class_agnostic=True, + norm_cfg=None, + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.5), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1, + loss_weight=1.0)), + dict( + type='SABLHead', + num_classes=80, + cls_in_channels=256, + reg_in_channels=256, + roi_feat_size=7, + reg_feat_up_ratio=2, + reg_pre_kernel=3, + reg_post_kernel=3, + reg_pre_num=2, + reg_post_num=1, + cls_out_channels=1024, + reg_offset_out_channels=256, + reg_cls_out_channels=256, + num_cls_fcs=1, + num_reg_fcs=0, + reg_class_agnostic=True, + norm_cfg=None, + bbox_coder=dict( + 
type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.3), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1, loss_weight=1.0)) + ])) diff --git a/configs/mmdet/sabl/sabl_faster_rcnn_r101_fpn_1x_coco.py b/configs/mmdet/sabl/sabl_faster_rcnn_r101_fpn_1x_coco.py new file mode 100644 index 00000000..e48d4259 --- /dev/null +++ b/configs/mmdet/sabl/sabl_faster_rcnn_r101_fpn_1x_coco.py @@ -0,0 +1,38 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + roi_head=dict( + bbox_head=dict( + _delete_=True, + type='SABLHead', + num_classes=80, + cls_in_channels=256, + reg_in_channels=256, + roi_feat_size=7, + reg_feat_up_ratio=2, + reg_pre_kernel=3, + reg_post_kernel=3, + reg_pre_num=2, + reg_post_num=1, + cls_out_channels=1024, + reg_offset_out_channels=256, + reg_cls_out_channels=256, + num_cls_fcs=1, + num_reg_fcs=0, + reg_class_agnostic=True, + norm_cfg=None, + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.7), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1, + loss_weight=1.0)))) diff --git a/configs/mmdet/sabl/sabl_faster_rcnn_r50_fpn_1x_coco.py b/configs/mmdet/sabl/sabl_faster_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 00000000..732c7ba3 --- /dev/null +++ b/configs/mmdet/sabl/sabl_faster_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,34 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + 
'../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + roi_head=dict( + bbox_head=dict( + _delete_=True, + type='SABLHead', + num_classes=80, + cls_in_channels=256, + reg_in_channels=256, + roi_feat_size=7, + reg_feat_up_ratio=2, + reg_pre_kernel=3, + reg_post_kernel=3, + reg_pre_num=2, + reg_post_num=1, + cls_out_channels=1024, + reg_offset_out_channels=256, + reg_cls_out_channels=256, + num_cls_fcs=1, + num_reg_fcs=0, + reg_class_agnostic=True, + norm_cfg=None, + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.7), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1, + loss_weight=1.0)))) diff --git a/configs/mmdet/sabl/sabl_retinanet_r101_fpn_1x_coco.py b/configs/mmdet/sabl/sabl_retinanet_r101_fpn_1x_coco.py new file mode 100644 index 00000000..b08e916c --- /dev/null +++ b/configs/mmdet/sabl/sabl_retinanet_r101_fpn_1x_coco.py @@ -0,0 +1,54 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + bbox_head=dict( + _delete_=True, + type='SABLRetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[4], + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + 
loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5), + loss_bbox_reg=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0.0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/mmdet/sabl/sabl_retinanet_r101_fpn_gn_1x_coco.py b/configs/mmdet/sabl/sabl_retinanet_r101_fpn_gn_1x_coco.py new file mode 100644 index 00000000..fc30d63d --- /dev/null +++ b/configs/mmdet/sabl/sabl_retinanet_r101_fpn_gn_1x_coco.py @@ -0,0 +1,56 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + bbox_head=dict( + _delete_=True, + type='SABLRetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[4], + strides=[8, 16, 32, 64, 128]), + norm_cfg=norm_cfg, + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5), + loss_bbox_reg=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)), + # training and testing settings + train_cfg=dict( + 
assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0.0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/mmdet/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco.py b/configs/mmdet/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco.py new file mode 100644 index 00000000..e8fe1664 --- /dev/null +++ b/configs/mmdet/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco.py @@ -0,0 +1,73 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] +# model settings +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + bbox_head=dict( + _delete_=True, + type='SABLRetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[4], + strides=[8, 16, 32, 64, 128]), + norm_cfg=norm_cfg, + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5), + loss_bbox_reg=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0.0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +img_norm_cfg = dict( + 
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 480), (1333, 960)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +data = dict(train=dict(pipeline=train_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/mmdet/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco.py b/configs/mmdet/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco.py new file mode 100644 index 00000000..30c43399 --- /dev/null +++ b/configs/mmdet/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco.py @@ -0,0 +1,73 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] +# model settings +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + bbox_head=dict( + _delete_=True, + type='SABLRetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[4], + strides=[8, 16, 32, 64, 128]), + norm_cfg=norm_cfg, + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_cls=dict( + 
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5), + loss_bbox_reg=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0.0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +data = dict(train=dict(pipeline=train_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/mmdet/sabl/sabl_retinanet_r50_fpn_1x_coco.py b/configs/mmdet/sabl/sabl_retinanet_r50_fpn_1x_coco.py new file mode 100644 index 00000000..6fe6bd66 --- /dev/null +++ b/configs/mmdet/sabl/sabl_retinanet_r50_fpn_1x_coco.py @@ -0,0 +1,50 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + bbox_head=dict( + _delete_=True, + type='SABLRetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[4], + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, 
scale_factor=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5), + loss_bbox_reg=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0.0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/mmdet/sabl/sabl_retinanet_r50_fpn_gn_1x_coco.py b/configs/mmdet/sabl/sabl_retinanet_r50_fpn_gn_1x_coco.py new file mode 100644 index 00000000..6acf080a --- /dev/null +++ b/configs/mmdet/sabl/sabl_retinanet_r50_fpn_gn_1x_coco.py @@ -0,0 +1,52 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + bbox_head=dict( + _delete_=True, + type='SABLRetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[4], + strides=[8, 16, 32, 64, 128]), + norm_cfg=norm_cfg, + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5), + loss_bbox_reg=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)), + # training and testing settings + train_cfg=dict( + 
assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0.0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/mmdet/scnet/README.md b/configs/mmdet/scnet/README.md new file mode 100644 index 00000000..3769d808 --- /dev/null +++ b/configs/mmdet/scnet/README.md @@ -0,0 +1,63 @@ +# SCNet + +> [SCNet: Training Inference Sample Consistency for Instance Segmentation](https://arxiv.org/abs/2012.10150) + + + +## Abstract + + + +Cascaded architectures have brought significant performance improvement in object detection and instance segmentation. However, there are lingering issues regarding the disparity in the Intersection-over-Union (IoU) distribution of the samples between training and inference. This disparity can potentially exacerbate detection accuracy. This paper proposes an architecture referred to as Sample Consistency Network (SCNet) to ensure that the IoU distribution of the samples at training time is close to that at inference time. Furthermore, SCNet incorporates feature relay and utilizes global contextual information to further reinforce the reciprocal relationships among classifying, detecting, and segmenting sub-tasks. Extensive experiments on the standard COCO dataset reveal the effectiveness of the proposed method over multiple evaluation metrics, including box AP, mask AP, and inference speed. In particular, while running 38\% faster, the proposed SCNet improves the AP of the box and mask predictions by respectively 1.3 and 2.3 points compared to the strong Cascade Mask R-CNN baseline. + +
+ +
+ +## Dataset + +SCNet requires the COCO and [COCO-stuff](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip) datasets for training. You need to download COCO-stuff and extract it into the COCO dataset path. +The directory layout should look like this. + +```none +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +│ │ ├── stuffthingmaps +``` + +## Results and Models + +The results on COCO 2017 val are shown in the table below. (results on test-dev are usually slightly higher than val) + +| Backbone | Style | Lr schd | Mem (GB) | Inf speed (fps) | box AP | mask AP | TTA box AP | TTA mask AP | Config | Download | +|:---------------:|:-------:|:-------:|:--------:|:---------------:|:------:|:-------:|:----------:|:-----------:|:------:|:------------:| +| R-50-FPN | pytorch | 1x | 7.0 | 6.2 | 43.5 | 39.2 | 44.8 | 40.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/scnet/scnet_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r50_fpn_1x_coco/scnet_r50_fpn_1x_coco-c3f09857.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r50_fpn_1x_coco/scnet_r50_fpn_1x_coco_20210117_192725.log.json) | +| R-50-FPN | pytorch | 20e | 7.0 | 6.2 | 44.5 | 40.0 | 45.8 | 41.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/scnet/scnet_r50_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r50_fpn_20e_coco/scnet_r50_fpn_20e_coco-a569f645.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r50_fpn_20e_coco/scnet_r50_fpn_20e_coco_20210116_060148.log.json) | +| R-101-FPN | pytorch | 20e | 8.9 | 5.8 | 45.8 | 40.9 | 47.3 | 42.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/scnet/scnet_r101_fpn_20e_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r101_fpn_20e_coco/scnet_r101_fpn_20e_coco-294e312c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r101_fpn_20e_coco/scnet_r101_fpn_20e_coco_20210118_175824.log.json) | +| X-101-64x4d-FPN | pytorch | 20e | 13.2 | 4.9 | 47.5 | 42.3 | 48.9 | 44.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/scnet/scnet_x101_64x4d_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_x101_64x4d_fpn_20e_coco/scnet_x101_64x4d_fpn_20e_coco-fb09dec9.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_x101_64x4d_fpn_20e_coco/scnet_x101_64x4d_fpn_20e_coco_20210120_045959.log.json) | + +### Notes + +- Training hyper-parameters are identical to those of [HTC](https://github.com/open-mmlab/mmdetection/tree/master/configs/htc). +- TTA means Test Time Augmentation, which applies horizontal flip and multi-scale testing. Refer to [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/scnet/scnet_r50_fpn_1x_coco.py). + +## Citation + +We provide the code for reproducing experiment results of [SCNet](https://arxiv.org/abs/2012.10150). 
+ +```latex +@inproceedings{vu2019cascade, + title={SCNet: Training Inference Sample Consistency for Instance Segmentation}, + author={Vu, Thang and Haeyong, Kang and Yoo, Chang D}, + booktitle={AAAI}, + year={2021} +} +``` diff --git a/configs/mmdet/scnet/metafile.yml b/configs/mmdet/scnet/metafile.yml new file mode 100644 index 00000000..15eaebfa --- /dev/null +++ b/configs/mmdet/scnet/metafile.yml @@ -0,0 +1,116 @@ +Collections: + - Name: SCNet + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - ResNet + - SCNet + Paper: + URL: https://arxiv.org/abs/2012.10150 + Title: 'SCNet: Training Inference Sample Consistency for Instance Segmentation' + README: configs/scnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.9.0/mmdet/models/detectors/scnet.py#L6 + Version: v2.9.0 + +Models: + - Name: scnet_r50_fpn_1x_coco + In Collection: SCNet + Config: configs/scnet/scnet_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.0 + inference time (ms/im): + - value: 161.29 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r50_fpn_1x_coco/scnet_r50_fpn_1x_coco-c3f09857.pth + + - Name: scnet_r50_fpn_20e_coco + In Collection: SCNet + Config: configs/scnet/scnet_r50_fpn_20e_coco.py + Metadata: + Training Memory (GB): 7.0 + inference time (ms/im): + - value: 161.29 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.0 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r50_fpn_20e_coco/scnet_r50_fpn_20e_coco-a569f645.pth + + - Name: scnet_r101_fpn_20e_coco + In Collection: SCNet + Config: configs/scnet/scnet_r101_fpn_20e_coco.py + Metadata: + Training Memory (GB): 8.9 + inference time (ms/im): + - value: 172.41 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r101_fpn_20e_coco/scnet_r101_fpn_20e_coco-294e312c.pth + + - Name: scnet_x101_64x4d_fpn_20e_coco + In Collection: SCNet + Config: configs/scnet/scnet_x101_64x4d_fpn_20e_coco.py + Metadata: + Training Memory (GB): 13.2 + inference time (ms/im): + - value: 204.08 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 42.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_x101_64x4d_fpn_20e_coco/scnet_x101_64x4d_fpn_20e_coco-fb09dec9.pth diff --git a/configs/mmdet/scnet/scnet_r101_fpn_20e_coco.py b/configs/mmdet/scnet/scnet_r101_fpn_20e_coco.py new file mode 100644 index 00000000..ebba5297 --- /dev/null +++ b/configs/mmdet/scnet/scnet_r101_fpn_20e_coco.py @@ -0,0 +1,6 @@ +_base_ = './scnet_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/scnet/scnet_r50_fpn_1x_coco.py b/configs/mmdet/scnet/scnet_r50_fpn_1x_coco.py new file mode 100644 index 00000000..fe03b0d4 --- /dev/null +++ b/configs/mmdet/scnet/scnet_r50_fpn_1x_coco.py @@ -0,0 +1,136 @@ +_base_ = '../htc/htc_r50_fpn_1x_coco.py' +# model settings +model = 
dict( + type='SCNet', + roi_head=dict( + _delete_=True, + type='SCNetRoIHead', + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='SCNetBBoxHead', + num_shared_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='SCNetBBoxHead', + num_shared_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='SCNetBBoxHead', + num_shared_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='SCNetMaskHead', + num_convs=12, + in_channels=256, + conv_out_channels=256, + num_classes=80, + conv_to_res=True, + loss_mask=dict( + type='CrossEntropyLoss', 
use_mask=True, loss_weight=1.0)), + semantic_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[8]), + semantic_head=dict( + type='SCNetSemanticHead', + num_ins=5, + fusion_level=1, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=183, + loss_seg=dict( + type='CrossEntropyLoss', ignore_index=255, loss_weight=0.2), + conv_to_res=True), + glbctx_head=dict( + type='GlobalContextHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_weight=3.0, + conv_to_res=True), + feat_relay_head=dict( + type='FeatureRelayHead', + in_channels=1024, + out_conv_channels=256, + roi_feat_size=7, + scale_factor=2))) + +# uncomment below code to enable test time augmentations +# img_norm_cfg = dict( +# mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +# test_pipeline = [ +# dict(type='LoadImageFromFile'), +# dict( +# type='MultiScaleFlipAug', +# img_scale=[(600, 900), (800, 1200), (1000, 1500), (1200, 1800), +# (1400, 2100)], +# flip=True, +# transforms=[ +# dict(type='Resize', keep_ratio=True), +# dict(type='RandomFlip', flip_ratio=0.5), +# dict(type='Normalize', **img_norm_cfg), +# dict(type='Pad', size_divisor=32), +# dict(type='ImageToTensor', keys=['img']), +# dict(type='Collect', keys=['img']), +# ]) +# ] +# data = dict( +# val=dict(pipeline=test_pipeline), +# test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/scnet/scnet_r50_fpn_20e_coco.py b/configs/mmdet/scnet/scnet_r50_fpn_20e_coco.py new file mode 100644 index 00000000..3b121a6a --- /dev/null +++ b/configs/mmdet/scnet/scnet_r50_fpn_20e_coco.py @@ -0,0 +1,4 @@ +_base_ = './scnet_r50_fpn_1x_coco.py' +# learning policy +lr_config = dict(step=[16, 19]) +runner = dict(type='EpochBasedRunner', max_epochs=20) diff --git a/configs/mmdet/scnet/scnet_x101_64x4d_fpn_20e_coco.py b/configs/mmdet/scnet/scnet_x101_64x4d_fpn_20e_coco.py new file 
mode 100644 index 00000000..1e54b030 --- /dev/null +++ b/configs/mmdet/scnet/scnet_x101_64x4d_fpn_20e_coco.py @@ -0,0 +1,15 @@ +_base_ = './scnet_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/scnet/scnet_x101_64x4d_fpn_8x1_20e_coco.py b/configs/mmdet/scnet/scnet_x101_64x4d_fpn_8x1_20e_coco.py new file mode 100644 index 00000000..be8ddc51 --- /dev/null +++ b/configs/mmdet/scnet/scnet_x101_64x4d_fpn_8x1_20e_coco.py @@ -0,0 +1,8 @@ +_base_ = './scnet_x101_64x4d_fpn_20e_coco.py' +data = dict(samples_per_gpu=1, workers_per_gpu=1) +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (1 samples per GPU) +auto_scale_lr = dict(base_batch_size=8) diff --git a/configs/mmdet/scratch/README.md b/configs/mmdet/scratch/README.md new file mode 100644 index 00000000..52239030 --- /dev/null +++ b/configs/mmdet/scratch/README.md @@ -0,0 +1,35 @@ +# Scratch + +> [Rethinking ImageNet Pre-training](https://arxiv.org/abs/1811.08883) + + + +## Abstract + +We report competitive results on object detection and instance segmentation on the COCO dataset using standard models trained from random initialization. The results are no worse than their ImageNet pre-training counterparts even when using the hyper-parameters of the baseline system (Mask R-CNN) that were optimized for fine-tuning pre-trained models, with the sole exception of increasing the number of training iterations so the randomly initialized models may converge. 
Training from random initialization is surprisingly robust; our results hold even when: (i) using only 10% of the training data, (ii) for deeper and wider models, and (iii) for multiple tasks and metrics. Experiments show that ImageNet pre-training speeds up convergence early in training, but does not necessarily provide regularization or improve final target task accuracy. To push the envelope we demonstrate 50.9 AP on COCO object detection without using any external data---a result on par with the top COCO 2017 competition results that used ImageNet pre-training. These observations challenge the conventional wisdom of ImageNet pre-training for dependent tasks and we expect these discoveries will encourage people to rethink the current de facto paradigm of `pre-training and fine-tuning' in computer vision. + +
+ +
+ +## Results and Models + +| Model | Backbone | Style | Lr schd | box AP | mask AP | Config | Download | +|:------------:|:---------:|:-------:|:-------:|:------:|:-------:|:------:|:--------:| +| Faster R-CNN | R-50-FPN | pytorch | 6x | 40.7 | | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco/scratch_faster_rcnn_r50_fpn_gn_6x_bbox_mAP-0.407_20200201_193013-90813d01.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco/scratch_faster_rcnn_r50_fpn_gn_6x_20200201_193013.log.json) | +| Mask R-CNN | R-50-FPN | pytorch | 6x | 41.2 | 37.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/scratch/mask_rcnn_r50_fpn_gn-all_scratch_6x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/scratch/mask_rcnn_r50_fpn_gn-all_scratch_6x_coco/scratch_mask_rcnn_r50_fpn_gn_6x_bbox_mAP-0.412__segm_mAP-0.374_20200201_193051-1e190a40.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/scratch/mask_rcnn_r50_fpn_gn-all_scratch_6x_coco/scratch_mask_rcnn_r50_fpn_gn_6x_20200201_193051.log.json) | + +Note: + +- The above models are trained with 16 GPUs. 
+ +## Citation + +```latex +@article{he2018rethinking, + title={Rethinking imagenet pre-training}, + author={He, Kaiming and Girshick, Ross and Doll{\'a}r, Piotr}, + journal={arXiv preprint arXiv:1811.08883}, + year={2018} +} +``` diff --git a/configs/mmdet/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco.py b/configs/mmdet/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco.py new file mode 100644 index 00000000..55aa3a6e --- /dev/null +++ b/configs/mmdet/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco.py @@ -0,0 +1,24 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + frozen_stages=-1, + zero_init_residual=False, + norm_cfg=norm_cfg, + init_cfg=None), + neck=dict(norm_cfg=norm_cfg), + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=norm_cfg))) +# optimizer +optimizer = dict(paramwise_cfg=dict(norm_decay_mult=0)) +optimizer_config = dict(_delete_=True, grad_clip=None) +# learning policy +lr_config = dict(warmup_ratio=0.1, step=[65, 71]) +runner = dict(type='EpochBasedRunner', max_epochs=73) diff --git a/configs/mmdet/scratch/mask_rcnn_r50_fpn_gn-all_scratch_6x_coco.py b/configs/mmdet/scratch/mask_rcnn_r50_fpn_gn-all_scratch_6x_coco.py new file mode 100644 index 00000000..cc52cb8f --- /dev/null +++ b/configs/mmdet/scratch/mask_rcnn_r50_fpn_gn-all_scratch_6x_coco.py @@ -0,0 +1,25 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + frozen_stages=-1, + zero_init_residual=False, + norm_cfg=norm_cfg, + init_cfg=None), + neck=dict(norm_cfg=norm_cfg), + roi_head=dict( + 
bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=norm_cfg), + mask_head=dict(norm_cfg=norm_cfg))) +# optimizer +optimizer = dict(paramwise_cfg=dict(norm_decay_mult=0)) +optimizer_config = dict(_delete_=True, grad_clip=None) +# learning policy +lr_config = dict(warmup_ratio=0.1, step=[65, 71]) +runner = dict(type='EpochBasedRunner', max_epochs=73) diff --git a/configs/mmdet/scratch/metafile.yml b/configs/mmdet/scratch/metafile.yml new file mode 100644 index 00000000..65025fac --- /dev/null +++ b/configs/mmdet/scratch/metafile.yml @@ -0,0 +1,48 @@ +Collections: + - Name: Rethinking ImageNet Pre-training + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - RPN + - ResNet + Paper: + URL: https://arxiv.org/abs/1811.08883 + Title: 'Rethinking ImageNet Pre-training' + README: configs/scratch/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/configs/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco.py + Version: v2.0.0 + +Models: + - Name: faster_rcnn_r50_fpn_gn-all_scratch_6x_coco + In Collection: Rethinking ImageNet Pre-training + Config: configs/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco.py + Metadata: + Epochs: 72 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco/scratch_faster_rcnn_r50_fpn_gn_6x_bbox_mAP-0.407_20200201_193013-90813d01.pth + + - Name: mask_rcnn_r50_fpn_gn-all_scratch_6x_coco + In Collection: Rethinking ImageNet Pre-training + Config: configs/scratch/mask_rcnn_r50_fpn_gn-all_scratch_6x_coco.py + Metadata: + Epochs: 72 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.4 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/scratch/mask_rcnn_r50_fpn_gn-all_scratch_6x_coco/scratch_mask_rcnn_r50_fpn_gn_6x_bbox_mAP-0.412__segm_mAP-0.374_20200201_193051-1e190a40.pth diff --git a/configs/mmdet/seesaw_loss/README.md b/configs/mmdet/seesaw_loss/README.md new file mode 100644 index 00000000..c1c00ccd --- /dev/null +++ b/configs/mmdet/seesaw_loss/README.md @@ -0,0 +1,48 @@ +# Seesaw Loss + +> [Seesaw Loss for Long-Tailed Instance Segmentation](https://arxiv.org/abs/2008.10032) + + + +## Abstract + +Instance segmentation has witnessed a remarkable progress on class-balanced benchmarks. However, they fail to perform as accurately in real-world scenarios, where the category distribution of objects naturally comes with a long tail. Instances of head classes dominate a long-tailed dataset and they serve as negative samples of tail categories. The overwhelming gradients of negative samples on tail classes lead to a biased learning process for classifiers. Consequently, objects of tail categories are more likely to be misclassified as backgrounds or head categories. To tackle this problem, we propose Seesaw Loss to dynamically re-balance gradients of positive and negative samples for each category, with two complementary factors, i.e., mitigation factor and compensation factor. The mitigation factor reduces punishments to tail categories w.r.t. the ratio of cumulative training instances between different categories. Meanwhile, the compensation factor increases the penalty of misclassified instances to avoid false positives of tail categories. We conduct extensive experiments on Seesaw Loss with mainstream frameworks and different data sampling strategies. With a simple end-to-end training pipeline, Seesaw Loss obtains significant gains over Cross-Entropy Loss, and achieves state-of-the-art performance on LVIS dataset without bells and whistles. + +
+ +
+ +* Please setup [LVIS dataset](../lvis/README.md) for MMDetection. + +* RFS indicates to use oversample strategy [here](../../docs/tutorials/customize_dataset.md#class-balanced-dataset) with oversample threshold `1e-3`. + +## Results and models of Seasaw Loss on LVIS v1 dataset + + +| Method | Backbone | Style | Lr schd | Data Sampler | Norm Mask | box AP | mask AP | Config | Download | +| :----------------: | :-------: | :-----: | :-----: | :----------: | :-------: | :----: | :-----: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Mask R-CNN | R-50-FPN | pytorch | 2x | random | N | 25.6 | 25.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1-a698dd3d.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.log.json) | +| Mask R-CNN | R-50-FPN | pytorch | 2x | random | Y | 25.6 | 25.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-a1c11314.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.log.json) | +| Mask R-CNN | R-101-FPN | pytorch | 2x | random | N | 27.4 | 26.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1-8e6e6dd5.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.log.json) | +| Mask R-CNN | R-101-FPN | pytorch | 2x | random | Y | 27.2 | 27.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-a0b59c42.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.log.json) | +| Mask R-CNN | R-50-FPN | pytorch | 2x | RFS | N | 27.6 | 26.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1-392a804b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.log.json) | +| Mask R-CNN | R-50-FPN | pytorch | 2x | RFS | Y | 27.6 | 26.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-cd0f6a12.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.log.json) | +| Mask R-CNN | R-101-FPN | pytorch | 2x | RFS | N | 28.9 | 27.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1-e68eb464.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.log.json) | +| Mask R-CNN | R-101-FPN | pytorch | 2x | RFS | Y | 28.9 | 28.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-1d817139.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.log.json) | +| Cascade Mask R-CNN | R-101-FPN | pytorch | 2x | random | N | 33.1 | 29.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1-71e2215e.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.log.json) | +| Cascade Mask R-CNN | R-101-FPN | pytorch | 2x | random | Y | 33.0 | 30.0 | 
[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-8b5a6745.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.log.json) | +| Cascade Mask R-CNN | R-101-FPN | pytorch | 2x | RFS | N | 30.0 | 29.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1-5d8ca2a4.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.log.json) | +| Cascade Mask R-CNN | R-101-FPN | pytorch | 2x | RFS | Y | 32.8 | 30.1 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-c8551505.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.log.json) | + +## Citation + +We provide config files to reproduce the instance segmentation performance in the CVPR 2021 paper for [Seesaw Loss for Long-Tailed Instance Segmentation](https://arxiv.org/abs/2008.10032). 
+ +```latex +@inproceedings{wang2021seesaw, + title={Seesaw Loss for Long-Tailed Instance Segmentation}, + author={Jiaqi Wang and Wenwei Zhang and Yuhang Zang and Yuhang Cao and Jiangmiao Pang and Tao Gong and Kai Chen and Ziwei Liu and Chen Change Loy and Dahua Lin}, + booktitle={Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition}, + year={2021} +} +``` diff --git a/configs/mmdet/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py b/configs/mmdet/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py new file mode 100644 index 00000000..beeb0d1e --- /dev/null +++ b/configs/mmdet/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py @@ -0,0 +1,132 @@ +_base_ = [ + '../_base_/models/cascade_mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + roi_head=dict( + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1203, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + cls_predictor_cfg=dict(type='NormedLinear', tempearture=20), + loss_cls=dict( + type='SeesawLoss', + p=0.8, + q=2.0, + num_classes=1203, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1203, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + cls_predictor_cfg=dict(type='NormedLinear', tempearture=20), + loss_cls=dict( + type='SeesawLoss', + p=0.8, + q=2.0, + num_classes=1203, + 
loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1203, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + cls_predictor_cfg=dict(type='NormedLinear', tempearture=20), + loss_cls=dict( + type='SeesawLoss', + p=0.8, + q=2.0, + num_classes=1203, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_head=dict(num_classes=1203)), + test_cfg=dict( + rcnn=dict( + score_thr=0.0001, + # LVIS allows up to 300 + max_per_img=300))) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +dataset_type = 'LVISV1Dataset' +data_root = 'data/lvis_v1/' +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/lvis_v1_train.json', + img_prefix=data_root, + pipeline=train_pipeline), + val=dict( + 
type=dataset_type, + ann_file=data_root + 'annotations/lvis_v1_val.json', + img_prefix=data_root, + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/lvis_v1_val.json', + img_prefix=data_root, + pipeline=test_pipeline)) +evaluation = dict(interval=24, metric=['bbox', 'segm']) diff --git a/configs/mmdet/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py b/configs/mmdet/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py new file mode 100644 index 00000000..0f299484 --- /dev/null +++ b/configs/mmdet/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py @@ -0,0 +1,5 @@ +_base_ = './cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py' # noqa: E501 +model = dict( + roi_head=dict( + mask_head=dict( + predictor_cfg=dict(type='NormedConv2d', tempearture=20)))) diff --git a/configs/mmdet/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py b/configs/mmdet/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py new file mode 100644 index 00000000..bb88750f --- /dev/null +++ b/configs/mmdet/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py @@ -0,0 +1,98 @@ +_base_ = [ + '../_base_/models/cascade_mask_rcnn_r50_fpn.py', + '../_base_/datasets/lvis_v1_instance.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + roi_head=dict( + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1203, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + cls_predictor_cfg=dict(type='NormedLinear', tempearture=20), + 
loss_cls=dict( + type='SeesawLoss', + p=0.8, + q=2.0, + num_classes=1203, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1203, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + cls_predictor_cfg=dict(type='NormedLinear', tempearture=20), + loss_cls=dict( + type='SeesawLoss', + p=0.8, + q=2.0, + num_classes=1203, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1203, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + cls_predictor_cfg=dict(type='NormedLinear', tempearture=20), + loss_cls=dict( + type='SeesawLoss', + p=0.8, + q=2.0, + num_classes=1203, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_head=dict(num_classes=1203)), + test_cfg=dict( + rcnn=dict( + score_thr=0.0001, + # LVIS allows up to 300 + max_per_img=300))) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +data = dict(train=dict(dataset=dict(pipeline=train_pipeline))) +evaluation = 
dict(interval=24, metric=['bbox', 'segm']) diff --git a/configs/mmdet/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py b/configs/mmdet/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py new file mode 100644 index 00000000..262e76bd --- /dev/null +++ b/configs/mmdet/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py @@ -0,0 +1,5 @@ +_base_ = './cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py' # noqa: E501 +model = dict( + roi_head=dict( + mask_head=dict( + predictor_cfg=dict(type='NormedConv2d', tempearture=20)))) diff --git a/configs/mmdet/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py b/configs/mmdet/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py new file mode 100644 index 00000000..57deab10 --- /dev/null +++ b/configs/mmdet/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py @@ -0,0 +1,6 @@ +_base_ = './mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py b/configs/mmdet/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py new file mode 100644 index 00000000..a5399292 --- /dev/null +++ b/configs/mmdet/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py @@ -0,0 +1,6 @@ +_base_ = './mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py' # noqa: E501 +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py 
b/configs/mmdet/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py new file mode 100644 index 00000000..1f5065e7 --- /dev/null +++ b/configs/mmdet/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py @@ -0,0 +1,6 @@ +_base_ = './mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py b/configs/mmdet/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py new file mode 100644 index 00000000..13d0b5f2 --- /dev/null +++ b/configs/mmdet/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py @@ -0,0 +1,6 @@ +_base_ = './mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py' # noqa: E501 +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py b/configs/mmdet/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py new file mode 100644 index 00000000..743f5f26 --- /dev/null +++ b/configs/mmdet/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py @@ -0,0 +1,75 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] +model = dict( + roi_head=dict( + bbox_head=dict( + num_classes=1203, + cls_predictor_cfg=dict(type='NormedLinear', tempearture=20), + loss_cls=dict( + type='SeesawLoss', + p=0.8, + q=2.0, + num_classes=1203, + loss_weight=1.0)), + mask_head=dict(num_classes=1203)), + test_cfg=dict( + rcnn=dict( + score_thr=0.0001, + # LVIS allows up to 300 + max_per_img=300))) +img_norm_cfg = dict( 
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +dataset_type = 'LVISV1Dataset' +data_root = 'data/lvis_v1/' +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/lvis_v1_train.json', + img_prefix=data_root, + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/lvis_v1_val.json', + img_prefix=data_root, + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/lvis_v1_val.json', + img_prefix=data_root, + pipeline=test_pipeline)) +evaluation = dict(interval=24, metric=['bbox', 'segm']) diff --git a/configs/mmdet/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py b/configs/mmdet/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py new file mode 100644 index 00000000..0af89210 --- /dev/null +++ b/configs/mmdet/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py @@ -0,0 +1,5 @@ +_base_ = 
'./mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py' +model = dict( + roi_head=dict( + mask_head=dict( + predictor_cfg=dict(type='NormedConv2d', tempearture=20)))) diff --git a/configs/mmdet/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py b/configs/mmdet/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py new file mode 100644 index 00000000..4fc15049 --- /dev/null +++ b/configs/mmdet/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py @@ -0,0 +1,41 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/lvis_v1_instance.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] +model = dict( + roi_head=dict( + bbox_head=dict( + num_classes=1203, + cls_predictor_cfg=dict(type='NormedLinear', tempearture=20), + loss_cls=dict( + type='SeesawLoss', + p=0.8, + q=2.0, + num_classes=1203, + loss_weight=1.0)), + mask_head=dict(num_classes=1203)), + test_cfg=dict( + rcnn=dict( + score_thr=0.0001, + # LVIS allows up to 300 + max_per_img=300))) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +data = dict(train=dict(dataset=dict(pipeline=train_pipeline))) +evaluation = dict(interval=12, metric=['bbox', 'segm']) diff --git a/configs/mmdet/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py 
b/configs/mmdet/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py new file mode 100644 index 00000000..0ef6bd2c --- /dev/null +++ b/configs/mmdet/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py @@ -0,0 +1,5 @@ +_base_ = './mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py' +model = dict( + roi_head=dict( + mask_head=dict( + predictor_cfg=dict(type='NormedConv2d', tempearture=20)))) diff --git a/configs/mmdet/seesaw_loss/metafile.yml b/configs/mmdet/seesaw_loss/metafile.yml new file mode 100644 index 00000000..70dd2fe6 --- /dev/null +++ b/configs/mmdet/seesaw_loss/metafile.yml @@ -0,0 +1,203 @@ +Collections: + - Name: Seesaw Loss + Metadata: + Training Data: LVIS + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Softmax + - RPN + - Convolution + - Dense Connections + - FPN + - ResNet + - RoIAlign + - Seesaw Loss + Paper: + URL: https://arxiv.org/abs/2008.10032 + Title: 'Seesaw Loss for Long-Tailed Instance Segmentation' + README: configs/seesaw_loss/README.md + +Models: + - Name: mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1 + In Collection: Seesaw Loss + Config: seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 25.6 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 25.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1-a698dd3d.pth + - Name: mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1 + In Collection: Seesaw Loss + Config: seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 25.6 + - Task: Instance Segmentation + Dataset: LVIS 
v1 + Metrics: + mask AP: 25.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-a1c11314.pth + - Name: mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1 + In Collection: Seesaw Loss + Config: seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 27.4 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 26.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1-8e6e6dd5.pth + - Name: mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1 + In Collection: Seesaw Loss + Config: seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 27.2 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 27.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-a0b59c42.pth + - Name: mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1 + In Collection: Seesaw Loss + Config: configs/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 27.6 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 26.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1-392a804b.pth + - Name: mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1 + In Collection: Seesaw Loss + Config: configs/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py + Metadata: + Epochs: 24 + 
Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 27.6 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 26.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-cd0f6a12.pth + - Name: mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1 + In Collection: Seesaw Loss + Config: configs/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 28.9 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 27.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1-e68eb464.pth + - Name: mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1 + In Collection: Seesaw Loss + Config: configs/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 28.9 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 28.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-1d817139.pth + - Name: cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1 + In Collection: Seesaw Loss + Config: configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 33.1 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 29.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1-71e2215e.pth + - Name: 
cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1 + In Collection: Seesaw Loss + Config: configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 33.0 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 30.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-8b5a6745.pth + - Name: cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1 + In Collection: Seesaw Loss + Config: configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 30.0 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 29.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1-5d8ca2a4.pth + - Name: cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1 + In Collection: Seesaw Loss + Config: configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 32.8 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 30.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-c8551505.pth diff --git a/configs/mmdet/selfsup_pretrain/README.md b/configs/mmdet/selfsup_pretrain/README.md new file mode 100644 index 00000000..cff4cfc9 --- /dev/null +++ b/configs/mmdet/selfsup_pretrain/README.md @@ -0,0 +1,109 @@ +# Backbones Trained by Self-Supervise Algorithms + + + +## Abstract + 
+Unsupervised image representations have significantly reduced the gap with supervised pretraining, notably with the recent achievements of contrastive learning methods. These contrastive methods typically work online and rely on a large number of explicit pairwise feature comparisons, which is computationally challenging. In this paper, we propose an online algorithm, SwAV, that takes advantage of contrastive methods without requiring to compute pairwise comparisons. Specifically, our method simultaneously clusters the data while enforcing consistency between cluster assignments produced for different augmentations (or views) of the same image, instead of comparing features directly as in contrastive learning. Simply put, we use a swapped prediction mechanism where we predict the cluster assignment of a view from the representation of another view. Our method can be trained with large and small batches and can scale to unlimited amounts of data. Compared to previous contrastive methods, our method is more memory efficient since it does not require a large memory bank or a special momentum network. In addition, we also propose a new data augmentation strategy, multi-crop, that uses a mix of views with different resolutions in place of two full-resolution views, without increasing the memory or compute requirements much. We validate our findings by achieving 75.3% top-1 accuracy on ImageNet with ResNet-50, as well as surpassing supervised pretraining on all the considered transfer tasks. + +
+ +
+ +We present Momentum Contrast (MoCo) for unsupervised visual representation learning. From a perspective on contrastive learning as dictionary look-up, we build a dynamic dictionary with a queue and a moving-averaged encoder. This enables building a large and consistent dictionary on-the-fly that facilitates contrastive unsupervised learning. MoCo provides competitive results under the common linear protocol on ImageNet classification. More importantly, the representations learned by MoCo transfer well to downstream tasks. MoCo can outperform its supervised pre-training counterpart in 7 detection/segmentation tasks on PASCAL VOC, COCO, and other datasets, sometimes surpassing it by large margins. This suggests that the gap between unsupervised and supervised representation learning has been largely closed in many vision tasks. + +
+ +
+## Usage + +To use a self-supervised pretrained backbone, there are two steps: + +1. Download and convert the model to PyTorch-style supported by MMDetection +2. Modify the config and change the training setting accordingly + +### Convert model + +For more general usage, we also provide the script `selfsup2mmdet.py` in the tools directory to convert the keys of models pretrained by different self-supervised methods to PyTorch-style checkpoints used in MMDetection. + +```bash +python -u tools/model_converters/selfsup2mmdet.py ${PRETRAIN_PATH} ${STORE_PATH} --selfsup ${method} +``` + +This script converts the model from `PRETRAIN_PATH` and stores the converted model in `STORE_PATH`. + +For example, to use a ResNet-50 backbone released by MoCo, you can download it from [here](https://dl.fbaipublicfiles.com/moco/moco_checkpoints/moco_v2_800ep/moco_v2_800ep_pretrain.pth.tar) and use the following command + +```bash +python -u tools/model_converters/selfsup2mmdet.py ./moco_v2_800ep_pretrain.pth.tar mocov2_r50_800ep_pretrain.pth --selfsup moco +``` + +To use the ResNet-50 backbone released by SwAV, you can download it from [here](https://dl.fbaipublicfiles.com/deepcluster/swav_800ep_pretrain.pth.tar) + +### Modify config + +The backbone requires SyncBN and the `frozen_stages` setting needs to be changed.
A config that uses the MoCo backbone is shown below + +```python +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + pretrained='./mocov2_r50_800ep_pretrain.pth', + backbone=dict( + frozen_stages=0, + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False)) + +``` + +## Results and Models + +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-----: | :-----: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------: | :--------: | +|Mask RCNN |[R50 by MoCo v2](./mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco.py)| pytorch |1x|| |38.0|34.3|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco/mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco_20210604_114614-a8b63483.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco/mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco_20210604_114614.log.json)| +|Mask RCNN |[R50 by MoCo v2](./mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco.py)| pytorch | multi-scale 2x || |40.8|36.8|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco/mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco_20210605_163717-d95df20a.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco/mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco_20210605_163717.log.json)| +|Mask RCNN |[R50 by SwAV](./mask_rcnn_r50_fpn_swav-pretrain_1x_coco.py)| pytorch | 
1x || |39.1 | 35.7|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_1x_coco/mask_rcnn_r50_fpn_swav-pretrain_1x_coco_20210604_114640-7b9baf28.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_1x_coco/mask_rcnn_r50_fpn_swav-pretrain_1x_coco_20210604_114640.log.json)| +|Mask RCNN |[R50 by SwAV](./mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco.py)| pytorch | multi-scale 2x || |41.3|37.3|[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco/mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco_20210605_163717-08e26fca.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco/mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco_20210605_163717.log.json)| + +### Notice + +1. We only provide single-scale 1x and multi-scale 2x configs as examples to show how to use backbones trained by self-supervised algorithms. We will try to reproduce the results in their corresponding papers using the released backbones in the future. Please stay tuned. + +## Citation + +We support applying backbone models pre-trained by different self-supervised methods in detection systems and provide their results on Mask R-CNN. + +The pre-trained models are converted from [MoCo](https://github.com/facebookresearch/moco) and downloaded from [SwAV](https://github.com/facebookresearch/swav). 
+ +For SwAV, please cite + +```latex +@article{caron2020unsupervised, + title={Unsupervised Learning of Visual Features by Contrasting Cluster Assignments}, + author={Caron, Mathilde and Misra, Ishan and Mairal, Julien and Goyal, Priya and Bojanowski, Piotr and Joulin, Armand}, + booktitle={Proceedings of Advances in Neural Information Processing Systems (NeurIPS)}, + year={2020} +} +``` + +For MoCo, please cite + +```latex +@Article{he2019moco, + author = {Kaiming He and Haoqi Fan and Yuxin Wu and Saining Xie and Ross Girshick}, + title = {Momentum Contrast for Unsupervised Visual Representation Learning}, + journal = {arXiv preprint arXiv:1911.05722}, + year = {2019}, +} +@Article{chen2020mocov2, + author = {Xinlei Chen and Haoqi Fan and Ross Girshick and Kaiming He}, + title = {Improved Baselines with Momentum Contrastive Learning}, + journal = {arXiv preprint arXiv:2003.04297}, + year = {2020}, +} +``` diff --git a/configs/mmdet/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco.py b/configs/mmdet/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco.py new file mode 100644 index 00000000..f1e06152 --- /dev/null +++ b/configs/mmdet/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco.py @@ -0,0 +1,13 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + frozen_stages=0, + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + init_cfg=dict( + type='Pretrained', checkpoint='./mocov2_r50_800ep_pretrain.pth'))) diff --git a/configs/mmdet/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco.py b/configs/mmdet/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco.py new file mode 100644 index 00000000..09aa1560 --- /dev/null +++ b/configs/mmdet/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco.py @@ -0,0 +1,32 @@ +_base_ = [ + 
'../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + frozen_stages=0, + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + init_cfg=dict( + type='Pretrained', checkpoint='./mocov2_r50_800ep_pretrain.pth'))) + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']) +] + +data = dict(train=dict(pipeline=train_pipeline)) diff --git a/configs/mmdet/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_1x_coco.py b/configs/mmdet/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_1x_coco.py new file mode 100644 index 00000000..f92a3453 --- /dev/null +++ b/configs/mmdet/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_1x_coco.py @@ -0,0 +1,13 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + frozen_stages=0, + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + init_cfg=dict( + type='Pretrained', checkpoint='./swav_800ep_pretrain.pth.tar'))) diff --git a/configs/mmdet/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco.py b/configs/mmdet/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco.py new file mode 100644 index 00000000..fe473613 --- /dev/null +++ b/configs/mmdet/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco.py @@ 
-0,0 +1,32 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + frozen_stages=0, + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + init_cfg=dict( + type='Pretrained', checkpoint='./swav_800ep_pretrain.pth.tar'))) + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']) +] + +data = dict(train=dict(pipeline=train_pipeline)) diff --git a/configs/mmdet/simple_copy_paste/README.md b/configs/mmdet/simple_copy_paste/README.md new file mode 100644 index 00000000..a7a6f194 --- /dev/null +++ b/configs/mmdet/simple_copy_paste/README.md @@ -0,0 +1,38 @@ +# SimpleCopyPaste + +> [Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation](https://arxiv.org/abs/2012.07177) + + + +## Abstract + +Building instance segmentation models that are data-efficient and can handle rare object categories is an important challenge in computer vision. Leveraging data augmentations is a promising direction towards addressing this challenge. Here, we perform a systematic study of the Copy-Paste augmentation ([13, 12]) for instance segmentation where we randomly paste objects onto an image. Prior studies on Copy-Paste relied on modeling the surrounding visual context for pasting the objects. 
However, we find that the simple mechanism of pasting objects randomly is good enough and can provide solid gains on top of strong baselines. Furthermore, we show Copy-Paste is additive with semi-supervised methods that leverage extra data through pseudo labeling (e.g. self-training). On COCO instance segmentation, we achieve 49.1 mask AP and 57.3 box AP, an improvement of +0.6 mask AP and +1.5 box AP over the previous state-of-the-art. We further demonstrate that Copy-Paste can lead to significant improvements on the LVIS benchmark. Our baseline model outperforms the LVIS 2020 Challenge winning entry by +3.6 mask AP on rare categories. + +
+ +
+ +## Results and Models + +### Mask R-CNN with Standard Scale Jittering (SSJ) and Simple Copy-Paste(SCP) + +Standard Scale Jittering(SSJ) resizes and crops an image with a resize range of 0.8 to 1.25 of the original image size, and Simple Copy-Paste(SCP) selects a random subset of objects from one of the images and pastes them onto the other image. + +| Backbone | Training schedule | Augmentation | batch size | box AP | mask AP | Config | +|:--------:|:-----------------:|:------------:|:----------:|:------:|:-------:|:---------------------------------------------------------------------------------------------------------------------------------------------------:| +| R-50 | 90k | SSJ | 64 | 43.3 | 39.2 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco.py) | +| R-50 | 90k | SSJ+SCP | 64 | 43.9 | 39.2 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco.py) | +| R-50 | 270k | SSJ | 64 | 43.5 | 39.1 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco.py) | +| R-50 | 270k | SSJ+SCP | 64 | 45.1 | 40.3 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco.py) | + +## Citation + +```latex +@inproceedings{ghiasi2021simple, + title={Simple copy-paste is a strong data augmentation method for instance segmentation}, + author={Ghiasi, Golnaz and Cui, Yin and Srinivas, Aravind and Qian, Rui and Lin, Tsung-Yi and Cubuk, Ekin D and Le, Quoc V and Zoph, Barret}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={2918--2928}, + year={2021} +} +``` diff --git 
a/configs/mmdet/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco.py b/configs/mmdet/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco.py new file mode 100644 index 00000000..d0ce9176 --- /dev/null +++ b/configs/mmdet/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco.py @@ -0,0 +1,20 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + # 270k iterations with batch_size 64 is roughly equivalent to 144 epochs + '../common/ssj_270k_coco_instance.py', +] + +norm_cfg = dict(type='SyncBN', requires_grad=True) +# Use MMSyncBN that handles empty tensor in head. It can be changed to +# SyncBN after https://github.com/pytorch/pytorch/issues/36530 is fixed. +head_norm_cfg = dict(type='MMSyncBN', requires_grad=True) +model = dict( + backbone=dict(frozen_stages=-1, norm_eval=False, norm_cfg=norm_cfg), + neck=dict(norm_cfg=norm_cfg), + rpn_head=dict(num_convs=2), # leads to 0.1+ mAP + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=head_norm_cfg), + mask_head=dict(norm_cfg=head_norm_cfg))) diff --git a/configs/mmdet/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco.py b/configs/mmdet/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco.py new file mode 100644 index 00000000..1eee95fe --- /dev/null +++ b/configs/mmdet/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco.py @@ -0,0 +1,7 @@ +_base_ = 'mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco.py' + +# lr steps at [0.9, 0.95, 0.975] of the maximum iterations +lr_config = dict( + warmup_iters=500, warmup_ratio=0.067, step=[81000, 85500, 87750]) +# 90k iterations with batch_size 64 is roughly equivalent to 48 epochs +runner = dict(type='IterBasedRunner', max_iters=90000) diff --git a/configs/mmdet/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco.py 
b/configs/mmdet/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco.py new file mode 100644 index 00000000..bd28ddda --- /dev/null +++ b/configs/mmdet/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco.py @@ -0,0 +1,20 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + # 270k iterations with batch_size 64 is roughly equivalent to 144 epochs + '../common/ssj_scp_270k_coco_instance.py' +] + +norm_cfg = dict(type='SyncBN', requires_grad=True) +# Use MMSyncBN that handles empty tensor in head. It can be changed to +# SyncBN after https://github.com/pytorch/pytorch/issues/36530 is fixed. +head_norm_cfg = dict(type='MMSyncBN', requires_grad=True) +model = dict( + backbone=dict(frozen_stages=-1, norm_eval=False, norm_cfg=norm_cfg), + neck=dict(norm_cfg=norm_cfg), + rpn_head=dict(num_convs=2), # leads to 0.1+ mAP + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=head_norm_cfg), + mask_head=dict(norm_cfg=head_norm_cfg))) diff --git a/configs/mmdet/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco.py b/configs/mmdet/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco.py new file mode 100644 index 00000000..b632c13a --- /dev/null +++ b/configs/mmdet/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco.py @@ -0,0 +1,7 @@ +_base_ = 'mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco.py' + +# lr steps at [0.9, 0.95, 0.975] of the maximum iterations +lr_config = dict( + warmup_iters=500, warmup_ratio=0.067, step=[81000, 85500, 87750]) +# 90k iterations with batch_size 64 is roughly equivalent to 48 epochs +runner = dict(type='IterBasedRunner', max_iters=90000) diff --git a/configs/mmdet/solo/README.md b/configs/mmdet/solo/README.md new file mode 100644 index 00000000..8bd04325 --- /dev/null +++ b/configs/mmdet/solo/README.md @@ -0,0 +1,54 @@ +# SOLO + +> [SOLO: 
Segmenting Objects by Locations](https://arxiv.org/abs/1912.04488) + + + +## Abstract + +We present a new, embarrassingly simple approach to instance segmentation in images. Compared to many other dense prediction tasks, e.g., semantic segmentation, it is the arbitrary number of instances that have made instance segmentation much more challenging. In order to predict a mask for each instance, mainstream approaches either follow the 'detect-then-segment' strategy as used by Mask R-CNN, or predict category masks first then use clustering techniques to group pixels into individual instances. We view the task of instance segmentation from a completely new perspective by introducing the notion of "instance categories", which assigns categories to each pixel within an instance according to the instance's location and size, thus nicely converting instance mask segmentation into a classification-solvable problem. Now instance segmentation is decomposed into two classification tasks. We demonstrate a much simpler and flexible instance segmentation framework with strong performance, achieving on par accuracy with Mask R-CNN and outperforming recent single-shot instance segmenters in accuracy. We hope that this very simple and strong framework can serve as a baseline for many instance-level recognition tasks besides instance segmentation. + +
+ +
+ +## Results and Models + +### SOLO + +| Backbone | Style | MS train | Lr schd | Mem (GB) | Inf time (fps) | mask AP | Download | +|:---------:|:-------:|:--------:|:-------:|:--------:|:--------------:|:------:|:--------:| +| R-50 | pytorch | N | 1x | 8.0 | 14.0 | 33.1 | [model](https://download.openmmlab.com/mmdetection/v2.0/solo/solo_r50_fpn_1x_coco/solo_r50_fpn_1x_coco_20210821_035055-2290a6b8.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/solo/solo_r50_fpn_1x_coco/solo_r50_fpn_1x_coco_20210821_035055.log.json) | +| R-50 | pytorch | Y | 3x | 7.4 | 14.0 | 35.9 | [model](https://download.openmmlab.com/mmdetection/v2.0/solo/solo_r50_fpn_3x_coco/solo_r50_fpn_3x_coco_20210901_012353-11d224d7.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/solo/solo_r50_fpn_3x_coco/solo_r50_fpn_3x_coco_20210901_012353.log.json) | + +### Decoupled SOLO + +| Backbone | Style | MS train | Lr schd | Mem (GB) | Inf time (fps) | mask AP | Download | +|:---------:|:-------:|:--------:|:-------:|:--------:|:--------------:|:-------:|:--------:| +| R-50 | pytorch | N | 1x | 7.8 | 12.5 | 33.9 | [model](https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_r50_fpn_1x_coco/decoupled_solo_r50_fpn_1x_coco_20210820_233348-6337c589.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_r50_fpn_1x_coco/decoupled_solo_r50_fpn_1x_coco_20210820_233348.log.json) | +| R-50 | pytorch | Y | 3x | 7.9 | 12.5 | 36.7 | [model](https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_r50_fpn_3x_coco/decoupled_solo_r50_fpn_3x_coco_20210821_042504-7b3301ec.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_r50_fpn_3x_coco/decoupled_solo_r50_fpn_3x_coco_20210821_042504.log.json) | + +- Decoupled SOLO has a decoupled head, which differs from the SOLO head. +It serves as an efficient variant of SOLO with equivalent accuracy. +Please refer to the corresponding config files for details.
+ +### Decoupled Light SOLO + +| Backbone | Style | MS train | Lr schd | Mem (GB) | Inf time (fps) | mask AP | Download | +|:---------:|:-------:|:--------:|:-------:|:--------:|:--------------:|:------:|:--------:| +| R-50 | pytorch | Y | 3x | 2.2 | 31.2 | 32.9 | [model](https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_light_r50_fpn_3x_coco/decoupled_solo_light_r50_fpn_3x_coco_20210906_142703-e70e226f.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_light_r50_fpn_3x_coco/decoupled_solo_light_r50_fpn_3x_coco_20210906_142703.log.json) | + +- Decoupled Light SOLO uses a decoupled structure similar to the Decoupled +SOLO head, with a light-weight head and a smaller input size. Please refer +to the corresponding config files for details. + +## Citation + +```latex +@inproceedings{wang2020solo, + title = {{SOLO}: Segmenting Objects by Locations}, + author = {Wang, Xinlong and Kong, Tao and Shen, Chunhua and Jiang, Yuning and Li, Lei}, + booktitle = {Proc. Eur. Conf.
Computer Vision (ECCV)}, + year = {2020} +} +``` diff --git a/configs/mmdet/solo/decoupled_solo_light_r50_fpn_3x_coco.py b/configs/mmdet/solo/decoupled_solo_light_r50_fpn_3x_coco.py new file mode 100644 index 00000000..101f8f1d --- /dev/null +++ b/configs/mmdet/solo/decoupled_solo_light_r50_fpn_3x_coco.py @@ -0,0 +1,63 @@ +_base_ = './decoupled_solo_r50_fpn_3x_coco.py' + +# model settings +model = dict( + mask_head=dict( + type='DecoupledSOLOLightHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 8, 16, 32, 32], + scale_ranges=((1, 64), (32, 128), (64, 256), (128, 512), (256, 2048)), + pos_scale=0.2, + num_grids=[40, 36, 24, 16, 12], + cls_down_index=0, + loss_mask=dict( + type='DiceLoss', use_sigmoid=True, activate=False, + loss_weight=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True))) + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(852, 512), (852, 480), (852, 448), (852, 416), (852, 384), + (852, 352)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(852, 512), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +data = dict( + 
train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/solo/decoupled_solo_r50_fpn_1x_coco.py b/configs/mmdet/solo/decoupled_solo_r50_fpn_1x_coco.py new file mode 100644 index 00000000..b611cdf4 --- /dev/null +++ b/configs/mmdet/solo/decoupled_solo_r50_fpn_1x_coco.py @@ -0,0 +1,28 @@ +_base_ = [ + './solo_r50_fpn_1x_coco.py', +] +# model settings +model = dict( + mask_head=dict( + type='DecoupledSOLOHead', + num_classes=80, + in_channels=256, + stacked_convs=7, + feat_channels=256, + strides=[8, 8, 16, 32, 32], + scale_ranges=((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048)), + pos_scale=0.2, + num_grids=[40, 36, 24, 16, 12], + cls_down_index=0, + loss_mask=dict( + type='DiceLoss', use_sigmoid=True, activate=False, + loss_weight=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True))) + +optimizer = dict(type='SGD', lr=0.01) diff --git a/configs/mmdet/solo/decoupled_solo_r50_fpn_3x_coco.py b/configs/mmdet/solo/decoupled_solo_r50_fpn_3x_coco.py new file mode 100644 index 00000000..4a8c19de --- /dev/null +++ b/configs/mmdet/solo/decoupled_solo_r50_fpn_3x_coco.py @@ -0,0 +1,25 @@ +_base_ = './solo_r50_fpn_3x_coco.py' + +# model settings +model = dict( + mask_head=dict( + type='DecoupledSOLOHead', + num_classes=80, + in_channels=256, + stacked_convs=7, + feat_channels=256, + strides=[8, 8, 16, 32, 32], + scale_ranges=((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048)), + pos_scale=0.2, + num_grids=[40, 36, 24, 16, 12], + cls_down_index=0, + loss_mask=dict( + type='DiceLoss', use_sigmoid=True, activate=False, + loss_weight=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True))) diff --git a/configs/mmdet/solo/metafile.yml 
b/configs/mmdet/solo/metafile.yml new file mode 100644 index 00000000..b6244e80 --- /dev/null +++ b/configs/mmdet/solo/metafile.yml @@ -0,0 +1,115 @@ +Collections: + - Name: SOLO + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - Convolution + - ResNet + Paper: https://arxiv.org/abs/1912.04488 + README: configs/solo/README.md + +Models: + - Name: decoupled_solo_r50_fpn_1x_coco + In Collection: SOLO + Config: configs/solo/decoupled_solo_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.8 + Epochs: 12 + inference time (ms/im): + - value: 116.4 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (1333, 800) + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 33.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_r50_fpn_1x_coco/decoupled_solo_r50_fpn_1x_coco_20210820_233348-6337c589.pth + + - Name: decoupled_solo_r50_fpn_3x_coco + In Collection: SOLO + Config: configs/solo/decoupled_solo_r50_fpn_3x_coco.py + Metadata: + Training Memory (GB): 7.9 + Epochs: 36 + inference time (ms/im): + - value: 117.2 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (1333, 800) + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_r50_fpn_3x_coco/decoupled_solo_r50_fpn_3x_coco_20210821_042504-7b3301ec.pth + + - Name: decoupled_solo_light_r50_fpn_3x_coco + In Collection: SOLO + Config: configs/solo/decoupled_solo_light_r50_fpn_3x_coco.py + Metadata: + Training Memory (GB): 2.2 + Epochs: 36 + inference time (ms/im): + - value: 35.0 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (852, 512) + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 32.9 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_light_r50_fpn_3x_coco/decoupled_solo_light_r50_fpn_3x_coco_20210906_142703-e70e226f.pth + + - Name: solo_r50_fpn_3x_coco + In Collection: SOLO + Config: configs/solo/solo_r50_fpn_3x_coco.py + Metadata: + Training Memory (GB): 7.4 + Epochs: 36 + inference time (ms/im): + - value: 94.2 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (1333, 800) + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 35.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/solo/solo_r50_fpn_3x_coco/solo_r50_fpn_3x_coco_20210901_012353-11d224d7.pth + + - Name: solo_r50_fpn_1x_coco + In Collection: SOLO + Config: configs/solo/solo_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 8.0 + Epochs: 12 + inference time (ms/im): + - value: 95.1 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (1333, 800) + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 33.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/solo/solo_r50_fpn_1x_coco/solo_r50_fpn_1x_coco_20210821_035055-2290a6b8.pth diff --git a/configs/mmdet/solo/solo_r50_fpn_1x_coco.py b/configs/mmdet/solo/solo_r50_fpn_1x_coco.py new file mode 100644 index 00000000..9093a504 --- /dev/null +++ b/configs/mmdet/solo/solo_r50_fpn_1x_coco.py @@ -0,0 +1,53 @@ +_base_ = [ + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# model settings +model = dict( + type='SOLO', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=0, + num_outs=5), + mask_head=dict( + type='SOLOHead', + num_classes=80, + in_channels=256, + stacked_convs=7, + 
feat_channels=256, + strides=[8, 8, 16, 32, 32], + scale_ranges=((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048)), + pos_scale=0.2, + num_grids=[40, 36, 24, 16, 12], + cls_down_index=0, + loss_mask=dict(type='DiceLoss', use_sigmoid=True, loss_weight=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)), + # model training and testing settings + test_cfg=dict( + nms_pre=500, + score_thr=0.1, + mask_thr=0.5, + filter_thr=0.05, + kernel='gaussian', # gaussian/linear + sigma=2.0, + max_per_img=100)) + +# optimizer +optimizer = dict(type='SGD', lr=0.01) diff --git a/configs/mmdet/solo/solo_r50_fpn_3x_coco.py b/configs/mmdet/solo/solo_r50_fpn_3x_coco.py new file mode 100644 index 00000000..52302cdf --- /dev/null +++ b/configs/mmdet/solo/solo_r50_fpn_3x_coco.py @@ -0,0 +1,28 @@ +_base_ = './solo_r50_fpn_1x_coco.py' + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(1333, 800), (1333, 768), (1333, 736), (1333, 704), + (1333, 672), (1333, 640)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +data = dict(train=dict(pipeline=train_pipeline)) + +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[27, 33]) +runner = dict(type='EpochBasedRunner', max_epochs=36) diff --git a/configs/mmdet/sparse_rcnn/README.md b/configs/mmdet/sparse_rcnn/README.md new file mode 100644 index 00000000..8aa50f4b --- /dev/null +++ b/configs/mmdet/sparse_rcnn/README.md @@ -0,0 +1,38 @@ +# 
Sparse R-CNN + +> [Sparse R-CNN: End-to-End Object Detection with Learnable Proposals](https://arxiv.org/abs/2011.12450) + + + +## Abstract + +We present Sparse R-CNN, a purely sparse method for object detection in images. Existing works on object detection heavily rely on dense object candidates, such as k anchor boxes pre-defined on all grids of image feature map of size H×W. In our method, however, a fixed sparse set of learned object proposals, total length of N, are provided to object recognition head to perform classification and location. By eliminating HWk (up to hundreds of thousands) hand-designed object candidates to N (e.g. 100) learnable proposals, Sparse R-CNN completely avoids all efforts related to object candidates design and many-to-one label assignment. More importantly, final predictions are directly output without non-maximum suppression post-procedure. Sparse R-CNN demonstrates accuracy, run-time and training convergence performance on par with the well-established detector baselines on the challenging COCO dataset, e.g., achieving 45.0 AP in standard 3× training schedule and running at 22 fps using ResNet-50 FPN model. We hope our work could inspire re-thinking the convention of dense prior in object detectors. + +
+ +
+ +## Results and Models + +| Model | Backbone | Style | Lr schd | Number of Proposals |Multi-Scale| RandomCrop | box AP | Config | Download | +|:------------:|:---------:|:-------:|:-------:|:-------: |:-------: |:---------:|:------:|:------:|:--------:| +| Sparse R-CNN | R-50-FPN | pytorch | 1x | 100 | False | False | 37.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco/sparse_rcnn_r50_fpn_1x_coco_20201222_214453-dc79b137.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco/sparse_rcnn_r50_fpn_1x_coco_20201222_214453-dc79b137.log.json) | +| Sparse R-CNN | R-50-FPN | pytorch | 3x | 100 | True | False | 42.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco_20201218_154234-7bc5c054.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco_20201218_154234-7bc5c054.log.json) | +| Sparse R-CNN | R-50-FPN | pytorch | 3x | 300 | True | True | 45.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20201223_024605-9fe92701.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20201223_024605-9fe92701.log.json) | +| Sparse R-CNN | R-101-FPN | pytorch | 3x | 100 | True | False | 44.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/sparse_rcnn/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco_20201223_121552-6c46c9d6.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco_20201223_121552-6c46c9d6.log.json) | +| Sparse R-CNN | R-101-FPN | pytorch | 3x | 300 | True | True | 46.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/sparse_rcnn/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20201223_023452-c23c3564.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20201223_023452-c23c3564.log.json) | + +### Notes + +We observe about 0.3 AP noise especially when using ResNet-101 as the backbone. 
+ +## Citation + +```latex +@article{peize2020sparse, + title = {{SparseR-CNN}: End-to-End Object Detection with Learnable Proposals}, + author = {Peize Sun and Rufeng Zhang and Yi Jiang and Tao Kong and Chenfeng Xu and Wei Zhan and Masayoshi Tomizuka and Lei Li and Zehuan Yuan and Changhu Wang and Ping Luo}, + journal = {arXiv preprint arXiv:2011.12450}, + year = {2020} +} +``` diff --git a/configs/mmdet/sparse_rcnn/metafile.yml b/configs/mmdet/sparse_rcnn/metafile.yml new file mode 100644 index 00000000..bb1273ec --- /dev/null +++ b/configs/mmdet/sparse_rcnn/metafile.yml @@ -0,0 +1,80 @@ +Collections: + - Name: Sparse R-CNN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - ResNet + - Sparse R-CNN + Paper: + URL: https://arxiv.org/abs/2011.12450 + Title: 'Sparse R-CNN: End-to-End Object Detection with Learnable Proposals' + README: configs/sparse_rcnn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.9.0/mmdet/models/detectors/sparse_rcnn.py#L6 + Version: v2.9.0 + +Models: + - Name: sparse_rcnn_r50_fpn_1x_coco + In Collection: Sparse R-CNN + Config: configs/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco/sparse_rcnn_r50_fpn_1x_coco_20201222_214453-dc79b137.pth + + - Name: sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco + In Collection: Sparse R-CNN + Config: configs/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco_20201218_154234-7bc5c054.pth + + - Name: 
sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco + In Collection: Sparse R-CNN + Config: configs/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20201223_024605-9fe92701.pth + + - Name: sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco + In Collection: Sparse R-CNN + Config: configs/sparse_rcnn/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco_20201223_121552-6c46c9d6.pth + + - Name: sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco + In Collection: Sparse R-CNN + Config: configs/sparse_rcnn/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20201223_023452-c23c3564.pth diff --git a/configs/mmdet/sparse_rcnn/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py b/configs/mmdet/sparse_rcnn/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py new file mode 100644 index 00000000..de323bdf --- /dev/null +++ b/configs/mmdet/sparse_rcnn/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py @@ -0,0 +1,7 @@ +_base_ = './sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py' + +model = dict( + 
backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/sparse_rcnn/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco.py b/configs/mmdet/sparse_rcnn/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco.py new file mode 100644 index 00000000..ab4c5f68 --- /dev/null +++ b/configs/mmdet/sparse_rcnn/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco.py @@ -0,0 +1,7 @@ +_base_ = './sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco.py b/configs/mmdet/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 00000000..b383ee48 --- /dev/null +++ b/configs/mmdet/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,95 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +num_stages = 6 +num_proposals = 100 +model = dict( + type='SparseRCNN', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=0, + add_extra_convs='on_input', + num_outs=4), + rpn_head=dict( + type='EmbeddingRPNHead', + num_proposals=num_proposals, + proposal_feature_channel=256), + roi_head=dict( + type='SparseRoIHead', + num_stages=num_stages, + stage_loss_weights=[1] * num_stages, + proposal_feature_channel=256, + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='DIIHead', + num_classes=80, + num_ffn_fcs=2, + 
num_heads=8, + num_cls_fcs=1, + num_reg_fcs=3, + feedforward_channels=2048, + in_channels=256, + dropout=0.0, + ffn_act_cfg=dict(type='ReLU', inplace=True), + dynamic_conv_cfg=dict( + type='DynamicConv', + in_channels=256, + feat_channels=64, + out_channels=256, + input_feat_shape=7, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN')), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + clip_border=False, + target_means=[0., 0., 0., 0.], + target_stds=[0.5, 0.5, 1., 1.])) for _ in range(num_stages) + ]), + # training and testing settings + train_cfg=dict( + rpn=None, + rcnn=[ + dict( + assigner=dict( + type='HungarianAssigner', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=5.0), + iou_cost=dict(type='IoUCost', iou_mode='giou', + weight=2.0)), + sampler=dict(type='PseudoSampler'), + pos_weight=1) for _ in range(num_stages) + ]), + test_cfg=dict(rpn=None, rcnn=dict(max_per_img=num_proposals))) + +# optimizer +optimizer = dict(_delete_=True, type='AdamW', lr=0.000025, weight_decay=0.0001) +optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=1, norm_type=2)) +# learning policy +lr_config = dict(policy='step', step=[8, 11]) +runner = dict(type='EpochBasedRunner', max_epochs=12) diff --git a/configs/mmdet/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py b/configs/mmdet/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py new file mode 100644 index 00000000..36f1d62e --- /dev/null +++ b/configs/mmdet/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py @@ -0,0 +1,52 @@ +_base_ = './sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco.py' +num_proposals = 300 +model = dict( + rpn_head=dict(num_proposals=num_proposals), + 
test_cfg=dict( + _delete_=True, rpn=None, rcnn=dict(max_per_img=num_proposals))) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# augmentation strategy originates from DETR. +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='AutoAugment', + policies=[[ + dict( + type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + multiscale_mode='value', + keep_ratio=True) + ], + [ + dict( + type='Resize', + img_scale=[(400, 1333), (500, 1333), (600, 1333)], + multiscale_mode='value', + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + multiscale_mode='value', + override=True, + keep_ratio=True) + ]]), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) +] +data = dict(train=dict(pipeline=train_pipeline)) diff --git a/configs/mmdet/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco.py b/configs/mmdet/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco.py new file mode 100644 index 00000000..2fa2a807 --- /dev/null +++ b/configs/mmdet/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco.py @@ -0,0 +1,23 @@ +_base_ = './sparse_rcnn_r50_fpn_1x_coco.py' + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +min_values = (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) +train_pipeline = [ + dict(type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, value) for value in min_values], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) +] + +data = dict(train=dict(pipeline=train_pipeline)) +lr_config = dict(policy='step', step=[27, 33]) +runner = dict(type='EpochBasedRunner', max_epochs=36) diff --git a/configs/mmdet/ssd/README.md b/configs/mmdet/ssd/README.md new file mode 100644 index 00000000..917f691a --- /dev/null +++ b/configs/mmdet/ssd/README.md @@ -0,0 +1,62 @@ +# SSD + +> [SSD: Single Shot MultiBox Detector](https://arxiv.org/abs/1512.02325) + + + +## Abstract + +We present a method for detecting objects in images using a single deep neural network. Our approach, named SSD, discretizes the output space of bounding boxes into a set of default boxes over different aspect ratios and scales per feature map location. At prediction time, the network generates scores for the presence of each object category in each default box and produces adjustments to the box to better match the object shape. Additionally, the network combines predictions from multiple feature maps with different resolutions to naturally handle objects of various sizes. Our SSD model is simple relative to methods that require object proposals because it completely eliminates proposal generation and subsequent pixel or feature resampling stage and encapsulates all computation in a single network. This makes SSD easy to train and straightforward to integrate into systems that require a detection component. 
Experimental results on the PASCAL VOC, MS COCO, and ILSVRC datasets confirm that SSD has comparable accuracy to methods that utilize an additional object proposal step and is much faster, while providing a unified framework for both training and inference. Compared to other single stage methods, SSD has much better accuracy, even with a smaller input image size. For 300×300 input, SSD achieves 72.1% mAP on VOC2007 test at 58 FPS on a Nvidia Titan X and for 500×500 input, SSD achieves 75.1% mAP, outperforming a comparable state of the art Faster R-CNN model. + +
+ +
+ +## Results and models of SSD + +| Backbone | Size | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :------: | :---: | :---: | :-----: | :------: | :------------: | :----: | :------: | :--------: | +| VGG16 | 300 | caffe | 120e | 9.9 | 43.7 | 25.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ssd/ssd300_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ssd/ssd300_coco/ssd300_coco_20210803_015428-d231a06e.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/ssd/ssd300_coco/ssd300_coco_20210803_015428.log.json) | +| VGG16 | 512 | caffe | 120e | 19.4 | 30.7 | 29.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ssd/ssd512_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ssd/ssd512_coco/ssd512_coco_20210803_022849-0a47a1ca.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/ssd/ssd512_coco/ssd512_coco_20210803_022849.log.json) | + +## Results and models of SSD-Lite + +| Backbone | Size | Training from scratch | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :------------: | :---: | :-------------------: | :-----: | :------: | :------------: | :----: | :------: | :--------: | +| MobileNetV2 | 320 | yes | 600e | 4.0 | 69.9 | 21.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ssd/ssdlite_mobilenetv2_scratch_600e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ssd/ssdlite_mobilenetv2_scratch_600e_coco/ssdlite_mobilenetv2_scratch_600e_coco_20210629_110627-974d9307.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/ssd/ssdlite_mobilenetv2_scratch_600e_coco/ssdlite_mobilenetv2_scratch_600e_coco_20210629_110627.log.json) | + +## Notice + +### Compatibility + +In v2.14.0, [PR5291](https://github.com/open-mmlab/mmdetection/pull/5291) refactored SSD neck and head for more +flexible usage. 
If users want to use the SSD checkpoint trained in the older versions, we provide a script +`tools/model_converters/upgrade_ssd_version.py` to convert the model weights. + +```bash +python tools/model_converters/upgrade_ssd_version.py ${OLD_MODEL_PATH} ${NEW_MODEL_PATH} + +``` + +- OLD_MODEL_PATH: the path to load the old version SSD model. +- NEW_MODEL_PATH: the path to save the converted model weights. + +### SSD-Lite training settings + +There are some differences between our implementation of MobileNetV2 SSD-Lite and the one in [TensorFlow 1.x detection model zoo](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/tf1_detection_zoo.md). + +1. Use 320x320 as input size instead of 300x300. +2. The anchor sizes are different. +3. The C4 feature map is taken from the last layer of stage 4 instead of the middle of the block. +4. The model in TensorFlow1.x is trained on coco 2014 and validated on coco minival2014, but we trained and validated the model on coco 2017. The mAP on val2017 is usually a little lower than minival2014 (refer to the results in TensorFlow Object Detection API, e.g., MobileNetV2 SSD gets 22 mAP on minival2014 but 20.2 mAP on val2017). 
+ +## Citation + +```latex +@article{Liu_2016, + title={SSD: Single Shot MultiBox Detector}, + journal={ECCV}, + author={Liu, Wei and Anguelov, Dragomir and Erhan, Dumitru and Szegedy, Christian and Reed, Scott and Fu, Cheng-Yang and Berg, Alexander C.}, + year={2016}, +} +``` diff --git a/configs/mmdet/ssd/metafile.yml b/configs/mmdet/ssd/metafile.yml new file mode 100644 index 00000000..b9ee79cd --- /dev/null +++ b/configs/mmdet/ssd/metafile.yml @@ -0,0 +1,78 @@ +Collections: + - Name: SSD + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - VGG + Paper: + URL: https://arxiv.org/abs/1512.02325 + Title: 'SSD: Single Shot MultiBox Detector' + README: configs/ssd/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.14.0/mmdet/models/dense_heads/ssd_head.py#L16 + Version: v2.14.0 + +Models: + - Name: ssd300_coco + In Collection: SSD + Config: configs/ssd/ssd300_coco.py + Metadata: + Training Memory (GB): 9.9 + inference time (ms/im): + - value: 22.88 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (300, 300) + Epochs: 120 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 25.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ssd/ssd300_coco/ssd300_coco_20210803_015428-d231a06e.pth + + - Name: ssd512_coco + In Collection: SSD + Config: configs/ssd/ssd512_coco.py + Metadata: + Training Memory (GB): 19.4 + inference time (ms/im): + - value: 32.57 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (512, 512) + Epochs: 120 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 29.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ssd/ssd512_coco/ssd512_coco_20210803_022849-0a47a1ca.pth + + - Name: ssdlite_mobilenetv2_scratch_600e_coco + In Collection: SSD + Config: configs/ssd/ssdlite_mobilenetv2_scratch_600e_coco.py + Metadata: + 
Training Memory (GB): 4.0 + inference time (ms/im): + - value: 14.3 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (320, 320) + Epochs: 600 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 21.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ssd/ssdlite_mobilenetv2_scratch_600e_coco/ssdlite_mobilenetv2_scratch_600e_coco_20210629_110627-974d9307.pth diff --git a/configs/mmdet/ssd/ssd300_coco.py b/configs/mmdet/ssd/ssd300_coco.py new file mode 100644 index 00000000..1891bade --- /dev/null +++ b/configs/mmdet/ssd/ssd300_coco.py @@ -0,0 +1,71 @@ +_base_ = [ + '../_base_/models/ssd300.py', '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Expand', + mean=img_norm_cfg['mean'], + to_rgb=img_norm_cfg['to_rgb'], + ratio_range=(1, 4)), + dict( + type='MinIoURandomCrop', + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3), + dict(type='Resize', img_scale=(300, 300), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(300, 300), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=False), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=8, + 
workers_per_gpu=3, + train=dict( + _delete_=True, + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline)), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=2e-3, momentum=0.9, weight_decay=5e-4) +optimizer_config = dict(_delete_=True) +custom_hooks = [ + dict(type='NumClassCheckHook'), + dict(type='CheckInvalidLossHook', interval=50, priority='VERY_LOW') +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/mmdet/ssd/ssd512_coco.py b/configs/mmdet/ssd/ssd512_coco.py new file mode 100644 index 00000000..117777ff --- /dev/null +++ b/configs/mmdet/ssd/ssd512_coco.py @@ -0,0 +1,84 @@ +_base_ = 'ssd300_coco.py' +input_size = 512 +model = dict( + neck=dict( + out_channels=(512, 1024, 512, 256, 256, 256, 256), + level_strides=(2, 2, 2, 2, 1), + level_paddings=(1, 1, 1, 1, 1), + last_kernel_size=4), + bbox_head=dict( + in_channels=(512, 1024, 512, 256, 256, 256, 256), + anchor_generator=dict( + type='SSDAnchorGenerator', + scale_major=False, + input_size=input_size, + basesize_ratio_range=(0.1, 0.9), + strides=[8, 16, 32, 64, 128, 256, 512], + ratios=[[2], [2, 3], [2, 3], [2, 3], [2, 3], [2], [2]]))) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Expand', + mean=img_norm_cfg['mean'], + to_rgb=img_norm_cfg['to_rgb'], + ratio_range=(1, 4)), + dict( + type='MinIoURandomCrop', + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3), + dict(type='Resize', img_scale=(512, 512), 
keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(512, 512), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=False), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=3, + train=dict( + _delete_=True, + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline)), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=2e-3, momentum=0.9, weight_decay=5e-4) +optimizer_config = dict(_delete_=True) +custom_hooks = [ + dict(type='NumClassCheckHook'), + dict(type='CheckInvalidLossHook', interval=50, priority='VERY_LOW') +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/mmdet/ssd/ssdlite_mobilenetv2_scratch_600e_coco.py b/configs/mmdet/ssd/ssdlite_mobilenetv2_scratch_600e_coco.py new file mode 100644 index 00000000..929eb6c6 --- /dev/null +++ b/configs/mmdet/ssd/ssdlite_mobilenetv2_scratch_600e_coco.py @@ -0,0 +1,150 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py' +] + +model = dict( + type='SingleStageDetector', + backbone=dict( + type='MobileNetV2', + out_indices=(4, 7), + norm_cfg=dict(type='BN', eps=0.001, momentum=0.03), + init_cfg=dict(type='TruncNormal', layer='Conv2d', std=0.03)), + neck=dict( + type='SSDNeck', + in_channels=(96, 1280), + out_channels=(96, 1280, 512, 256, 256, 128), + level_strides=(2, 2, 2, 2), + level_paddings=(1, 1, 1, 1), + l2_norm_scale=None, + use_depthwise=True, + norm_cfg=dict(type='BN', eps=0.001, momentum=0.03), + act_cfg=dict(type='ReLU6'), + init_cfg=dict(type='TruncNormal', layer='Conv2d', std=0.03)), + bbox_head=dict( + type='SSDHead', + in_channels=(96, 1280, 512, 256, 256, 128), + num_classes=80, + use_depthwise=True, + norm_cfg=dict(type='BN', eps=0.001, momentum=0.03), + act_cfg=dict(type='ReLU6'), + init_cfg=dict(type='Normal', layer='Conv2d', std=0.001), + + # set anchor size manually instead of using the predefined + # SSD300 setting. 
+ anchor_generator=dict( + type='SSDAnchorGenerator', + scale_major=False, + strides=[16, 32, 64, 107, 160, 320], + ratios=[[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]], + min_sizes=[48, 100, 150, 202, 253, 304], + max_sizes=[100, 150, 202, 253, 304, 320]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2])), + # model training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0., + ignore_iof_thr=-1, + gt_max_assign_all=False), + smoothl1_beta=1., + allowed_border=-1, + pos_weight=-1, + neg_pos_ratio=3, + debug=False), + test_cfg=dict( + nms_pre=1000, + nms=dict(type='nms', iou_threshold=0.45), + min_bbox_size=0, + score_thr=0.02, + max_per_img=200)) +cudnn_benchmark = True + +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Expand', + mean=img_norm_cfg['mean'], + to_rgb=img_norm_cfg['to_rgb'], + ratio_range=(1, 4)), + dict( + type='MinIoURandomCrop', + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3), + dict(type='Resize', img_scale=(320, 320), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=320), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(320, 320), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=False), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', 
size_divisor=320), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=24, + workers_per_gpu=4, + train=dict( + _delete_=True, + type='RepeatDataset', # use RepeatDataset to speed up training + times=5, + dataset=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline)), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) + +# optimizer +optimizer = dict(type='SGD', lr=0.015, momentum=0.9, weight_decay=4.0e-5) +optimizer_config = dict(grad_clip=None) + +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + min_lr=0) +runner = dict(type='EpochBasedRunner', max_epochs=120) + +# Avoid evaluation and saving weights too frequently +evaluation = dict(interval=5, metric='bbox') +checkpoint_config = dict(interval=5) +custom_hooks = [ + dict(type='NumClassCheckHook'), + dict(type='CheckInvalidLossHook', interval=50, priority='VERY_LOW') +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (24 samples per GPU) +auto_scale_lr = dict(base_batch_size=192) diff --git a/configs/mmdet/strong_baselines/README.md b/configs/mmdet/strong_baselines/README.md new file mode 100644 index 00000000..7c1be045 --- /dev/null +++ b/configs/mmdet/strong_baselines/README.md @@ -0,0 +1,20 @@ +# Strong Baselines + + + +We train Mask R-CNN with large-scale jitter and longer schedule as strong baselines. +The modifications follow those in [Detectron2](https://github.com/facebookresearch/detectron2/tree/master/configs/new_baselines). 
+ +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------: | :--------: | +| R-50-FPN | pytorch | 50e | | | | | [config](./mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_50e_coco.py) | [model]() &#124; [log]() | +| R-50-FPN | pytorch | 100e | | | | | [config](./mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py) | [model]() &#124; [log]() | +| R-50-FPN | caffe | 100e | | | 44.7 | 40.4 | [config](./mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py) | [model]() &#124; [log]() | +| R-50-FPN | caffe | 400e | | | | | [config](./mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_400e_coco.py) | [model]() &#124; [log]() | + +## Notice + +When using large-scale jittering, there are sometimes empty proposals in the box and mask heads during training. +This requires MMSyncBN that allows empty tensors. Therefore, please use mmcv-full>=1.3.14 to train models supported in this directory. diff --git a/configs/mmdet/strong_baselines/mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py b/configs/mmdet/strong_baselines/mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py new file mode 100644 index 00000000..a40d6a03 --- /dev/null +++ b/configs/mmdet/strong_baselines/mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py @@ -0,0 +1,80 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../common/lsj_100e_coco_instance.py' +] + +norm_cfg = dict(type='SyncBN', requires_grad=True) +# Use MMSyncBN that handles empty tensor in head. It can be changed to +# SyncBN after https://github.com/pytorch/pytorch/issues/36530 is fixed +# Requires MMCV-full after https://github.com/open-mmlab/mmcv/pull/1205. 
+head_norm_cfg = dict(type='MMSyncBN', requires_grad=True) +model = dict( + backbone=dict( + frozen_stages=-1, + norm_eval=False, + norm_cfg=norm_cfg, + init_cfg=None, + style='caffe'), + neck=dict(norm_cfg=norm_cfg), + rpn_head=dict(num_convs=2), + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=head_norm_cfg), + mask_head=dict(norm_cfg=head_norm_cfg))) + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) + +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +image_size = (1024, 1024) +train_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=image_size, + ratio_range=(0.1, 2.0), + multiscale_mode='range', + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size, + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=image_size), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +# Use RepeatDataset to speed up training +data = dict( + train=dict(dataset=dict(pipeline=train_pipeline)), + 
val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/strong_baselines/mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_100e_fp16_coco.py b/configs/mmdet/strong_baselines/mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_100e_fp16_coco.py new file mode 100644 index 00000000..31824eb5 --- /dev/null +++ b/configs/mmdet/strong_baselines/mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_100e_fp16_coco.py @@ -0,0 +1,2 @@ +_base_ = 'mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py' +fp16 = dict(loss_scale=512.) diff --git a/configs/mmdet/strong_baselines/mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_400e_coco.py b/configs/mmdet/strong_baselines/mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_400e_coco.py new file mode 100644 index 00000000..1211925d --- /dev/null +++ b/configs/mmdet/strong_baselines/mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_400e_coco.py @@ -0,0 +1,6 @@ +_base_ = './mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py' + +# Use RepeatDataset to speed up training +# change repeat time from 4 (for 100 epochs) to 16 (for 400 epochs) +data = dict(train=dict(times=4 * 4)) +lr_config = dict(warmup_iters=500 * 4) diff --git a/configs/mmdet/strong_baselines/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py b/configs/mmdet/strong_baselines/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py new file mode 100644 index 00000000..4a15d698 --- /dev/null +++ b/configs/mmdet/strong_baselines/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py @@ -0,0 +1,22 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../common/lsj_100e_coco_instance.py' +] + +norm_cfg = dict(type='SyncBN', requires_grad=True) +# Use MMSyncBN that handles empty tensor in head. It can be changed to +# SyncBN after https://github.com/pytorch/pytorch/issues/36530 is fixed +# Requires MMCV-full after https://github.com/open-mmlab/mmcv/pull/1205. 
+head_norm_cfg = dict(type='MMSyncBN', requires_grad=True) +model = dict( + # the model is trained from scratch, so init_cfg is None + backbone=dict( + frozen_stages=-1, norm_eval=False, norm_cfg=norm_cfg, init_cfg=None), + neck=dict(norm_cfg=norm_cfg), + rpn_head=dict(num_convs=2), # leads to 0.1+ mAP + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=head_norm_cfg), + mask_head=dict(norm_cfg=head_norm_cfg))) diff --git a/configs/mmdet/strong_baselines/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_100e_fp16_coco.py b/configs/mmdet/strong_baselines/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_100e_fp16_coco.py new file mode 100644 index 00000000..7b97960a --- /dev/null +++ b/configs/mmdet/strong_baselines/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_100e_fp16_coco.py @@ -0,0 +1,3 @@ +_base_ = 'mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py' +# use FP16 +fp16 = dict(loss_scale=512.) diff --git a/configs/mmdet/strong_baselines/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_50e_coco.py b/configs/mmdet/strong_baselines/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_50e_coco.py new file mode 100644 index 00000000..922579a1 --- /dev/null +++ b/configs/mmdet/strong_baselines/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_50e_coco.py @@ -0,0 +1,5 @@ +_base_ = 'mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py' + +# Use RepeatDataset to speed up training +# change repeat time from 4 (for 100 epochs) to 2 (for 50 epochs) +data = dict(train=dict(times=2)) diff --git a/configs/mmdet/swin/README.md b/configs/mmdet/swin/README.md new file mode 100644 index 00000000..abab315a --- /dev/null +++ b/configs/mmdet/swin/README.md @@ -0,0 +1,40 @@ +# Swin + +> [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) + + + +## Abstract + +This paper presents a new vision Transformer, called Swin Transformer, that capably serves as a general-purpose backbone for computer vision. 
Challenges in adapting Transformer from language to vision arise from differences between the two domains, such as large variations in the scale of visual entities and the high resolution of pixels in images compared to words in text. To address these differences, we propose a hierarchical Transformer whose representation is computed with Shifted windows. The shifted windowing scheme brings greater efficiency by limiting self-attention computation to non-overlapping local windows while also allowing for cross-window connection. This hierarchical architecture has the flexibility to model at various scales and has linear computational complexity with respect to image size. These qualities of Swin Transformer make it compatible with a broad range of vision tasks, including image classification (87.3 top-1 accuracy on ImageNet-1K) and dense prediction tasks such as object detection (58.7 box AP and 51.1 mask AP on COCO test-dev) and semantic segmentation (53.5 mIoU on ADE20K val). Its performance surpasses the previous state-of-the-art by a large margin of +2.7 box AP and +2.6 mask AP on COCO, and +3.2 mIoU on ADE20K, demonstrating the potential of Transformer-based models as vision backbones. The hierarchical design and the shifted window approach also prove beneficial for all-MLP architectures. + +
+ +
+ +## Results and Models + +### Mask R-CNN + +| Backbone | Pretrain | Lr schd | Multi-scale crop | FP16 |Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :------: | :---------: | :-----: | :-------------------:| :------: |:------: | :------------: | :----: | :-----: | :------: | :--------: | +| Swin-T | ImageNet-1K | 1x | no | no | 7.6 | | 42.7 | 39.3 | [config](./mask_rcnn_swin-t-p4-w7_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_1x_coco/mask_rcnn_swin-t-p4-w7_fpn_1x_coco_20210902_120937-9d6b7cfa.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_1x_coco/mask_rcnn_swin-t-p4-w7_fpn_1x_coco_20210902_120937.log.json) | +| Swin-T | ImageNet-1K | 3x | yes | no | 10.2 | | 46.0 | 41.6 | [config](./mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco_20210906_131725-bacf6f7b.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco_20210906_131725.log.json) | +| Swin-T | ImageNet-1K | 3x | yes | yes | 7.8 | | 46.0 | 41.7 | [config](./mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco.py)| [model](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco_20210908_165006-90a4008c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco_20210908_165006.log.json) | +| Swin-S | ImageNet-1K | 3x | yes | yes | 11.9 | | 48.2 | 43.2 | [config](./mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco.py)| 
[model](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco_20210903_104808-b92c91f1.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco_20210903_104808.log.json) | + +### Notice +Please follow the example +of `retinanet_swin-t-p4-w7_fpn_1x_coco.py` when you want to combine Swin Transformer with +the one-stage detector. Because there is a layer norm at the outs of Swin Transformer, you must set `start_level` as 0 in FPN, so we have to set the `out_indices` of backbone as `[1,2,3]`. + +## Citation + +```latex +@article{liu2021Swin, + title={Swin Transformer: Hierarchical Vision Transformer using Shifted Windows}, + author={Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining}, + journal={arXiv preprint arXiv:2103.14030}, + year={2021} +} +``` diff --git a/configs/mmdet/swin/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco.py b/configs/mmdet/swin/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco.py new file mode 100644 index 00000000..15d50a02 --- /dev/null +++ b/configs/mmdet/swin/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco.py @@ -0,0 +1,6 @@ +_base_ = './mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco.py' +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth' # noqa +model = dict( + backbone=dict( + depths=[2, 2, 18, 2], + init_cfg=dict(type='Pretrained', checkpoint=pretrained))) diff --git a/configs/mmdet/swin/mask_rcnn_swin-t-p4-w7_fpn_1x_coco.py b/configs/mmdet/swin/mask_rcnn_swin-t-p4-w7_fpn_1x_coco.py new file mode 100644 index 00000000..337e8581 --- /dev/null +++ b/configs/mmdet/swin/mask_rcnn_swin-t-p4-w7_fpn_1x_coco.py @@ -0,0 +1,42 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', 
+ '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa +model = dict( + type='MaskRCNN', + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + neck=dict(in_channels=[96, 192, 384, 768])) + +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.0001, + betas=(0.9, 0.999), + weight_decay=0.05, + paramwise_cfg=dict( + custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) + })) +lr_config = dict(warmup_iters=1000, step=[8, 11]) +runner = dict(max_epochs=12) diff --git a/configs/mmdet/swin/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco.py b/configs/mmdet/swin/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco.py new file mode 100644 index 00000000..2be31143 --- /dev/null +++ b/configs/mmdet/swin/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco.py @@ -0,0 +1,3 @@ +_base_ = './mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py' +# you need to set mode='dynamic' if you are using pytorch<=1.5.0 +fp16 = dict(loss_scale=dict(init_scale=512)) diff --git a/configs/mmdet/swin/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py b/configs/mmdet/swin/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py new file mode 100644 index 00000000..2612f6e3 --- /dev/null +++ b/configs/mmdet/swin/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py @@ -0,0 +1,91 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +pretrained 
= 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa + +model = dict( + type='MaskRCNN', + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + neck=dict(in_channels=[96, 192, 384, 768])) + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# augmentation strategy originates from DETR / Sparse RCNN +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='AutoAugment', + policies=[[ + dict( + type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + multiscale_mode='value', + keep_ratio=True) + ], + [ + dict( + type='Resize', + img_scale=[(400, 1333), (500, 1333), (600, 1333)], + multiscale_mode='value', + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + multiscale_mode='value', + override=True, + keep_ratio=True) + ]]), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +data = dict(train=dict(pipeline=train_pipeline)) + +optimizer = dict( + _delete_=True, + type='AdamW', + 
lr=0.0001, + betas=(0.9, 0.999), + weight_decay=0.05, + paramwise_cfg=dict( + custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) + })) +lr_config = dict(warmup_iters=1000, step=[27, 33]) +runner = dict(max_epochs=36) diff --git a/configs/mmdet/swin/metafile.yml b/configs/mmdet/swin/metafile.yml new file mode 100644 index 00000000..6c07f175 --- /dev/null +++ b/configs/mmdet/swin/metafile.yml @@ -0,0 +1,120 @@ +Models: + - Name: mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco + In Collection: Mask R-CNN + Config: configs/swin/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco.py + Metadata: + Training Memory (GB): 11.9 + Epochs: 36 + Training Data: COCO + Training Techniques: + - AdamW + Training Resources: 8x V100 GPUs + Architecture: + - Swin Transformer + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 48.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 43.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco_20210903_104808-b92c91f1.pth + Paper: + URL: https://arxiv.org/abs/2103.14030 + Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows' + README: configs/swin/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465 + Version: v2.16.0 + + - Name: mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco + In Collection: Mask R-CNN + Config: configs/swin/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py + Metadata: + Training Memory (GB): 10.2 + Epochs: 36 + Training Data: COCO + Training Techniques: + - AdamW + Training Resources: 8x V100 GPUs + Architecture: + - Swin Transformer + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.6 + Weights:
https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco_20210906_131725-bacf6f7b.pth + Paper: + URL: https://arxiv.org/abs/2103.14030 + Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows' + README: configs/swin/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465 + Version: v2.16.0 + + - Name: mask_rcnn_swin-t-p4-w7_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/swin/mask_rcnn_swin-t-p4-w7_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.6 + Epochs: 12 + Training Data: COCO + Training Techniques: + - AdamW + Training Resources: 8x V100 GPUs + Architecture: + - Swin Transformer + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_1x_coco/mask_rcnn_swin-t-p4-w7_fpn_1x_coco_20210902_120937-9d6b7cfa.pth + Paper: + URL: https://arxiv.org/abs/2103.14030 + Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows' + README: configs/swin/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465 + Version: v2.16.0 + + - Name: mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco + In Collection: Mask R-CNN + Config: configs/swin/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco.py + Metadata: + Training Memory (GB): 7.8 + Epochs: 36 + Training Data: COCO + Training Techniques: + - AdamW + Training Resources: 8x V100 GPUs + Architecture: + - Swin Transformer + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.7 + Weights:
https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco_20210908_165006-90a4008c.pth + Paper: + URL: https://arxiv.org/abs/2103.14030 + Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows' + README: configs/swin/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465 + Version: v2.16.0 diff --git a/configs/mmdet/swin/retinanet_swin-t-p4-w7_fpn_1x_coco.py b/configs/mmdet/swin/retinanet_swin-t-p4-w7_fpn_1x_coco.py new file mode 100644 index 00000000..33150932 --- /dev/null +++ b/configs/mmdet/swin/retinanet_swin-t-p4-w7_fpn_1x_coco.py @@ -0,0 +1,30 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa +model = dict( + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + patch_norm=True, + out_indices=(1, 2, 3), + # Please only add indices that would be used + # in FPN, otherwise some parameter will not be used + with_cp=False, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + neck=dict(in_channels=[192, 384, 768], start_level=0, num_outs=5)) + +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/mmdet/timm_example/README.md b/configs/mmdet/timm_example/README.md new file mode 100644 index 00000000..0eb30cb5 --- /dev/null +++ b/configs/mmdet/timm_example/README.md @@ -0,0 +1,62 @@ +# Timm Example + +> [PyTorch Image Models](https://github.com/rwightman/pytorch-image-models) + +
+ +## Abstract + +Py**T**orch **Im**age **M**odels (`timm`) is a collection of image models, layers, utilities, optimizers, schedulers, data-loaders / augmentations, and reference training / validation scripts that aim to pull together a wide variety of SOTA models with ability to reproduce ImageNet training results. + + + +## Results and Models + +### RetinaNet + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +|:---------------:|:-------:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:| +| R-50 | pytorch | 1x | | | | [config](./retinanet_timm_tv_resnet50_fpn_1x_coco.py) | | +| EfficientNet-B1 | - | 1x | | | | [config](./retinanet_timm_efficientnet_b1_fpn_1x_coco.py) | | + +## Usage + +### Install additional requirements + +MMDetection supports timm backbones via `TIMMBackbone`, a wrapper class in MMClassification. +Thus, you need to install `mmcls` in addition to timm. +If you have already installed requirements for mmdet, run + +```shell +pip install 'dataclasses; python_version<"3.7"' +pip install timm +pip install 'mmcls>=0.20.0' +``` + +See [this document](https://mmclassification.readthedocs.io/en/latest/install.html) for the details of MMClassification installation. + +### Edit config + +* See example configs for basic usage. +* See the documents of [timm feature extraction](https://rwightman.github.io/pytorch-image-models/feature_extraction/#multi-scale-feature-maps-feature-pyramid) and [TIMMBackbone](https://mmclassification.readthedocs.io/en/latest/api.html#mmcls.models.backbones.TIMMBackbone) for details. +* Which feature map is output depends on the backbone. + Please check `backbone out_channels` and `backbone out_strides` in your log, and modify `model.neck.in_channels` and `model.backbone.out_indices` if necessary. +* If you use Vision Transformer models that do not support `features_only=True`, add `custom_hooks = []` to your config to disable `NumClassCheckHook`. 
+ +## Citation + +```latex +@misc{rw2019timm, + author = {Ross Wightman}, + title = {PyTorch Image Models}, + year = {2019}, + publisher = {GitHub}, + journal = {GitHub repository}, + doi = {10.5281/zenodo.4414861}, + howpublished = {\url{https://github.com/rwightman/pytorch-image-models}} +} +``` diff --git a/configs/mmdet/timm_example/retinanet_timm_efficientnet_b1_fpn_1x_coco.py b/configs/mmdet/timm_example/retinanet_timm_efficientnet_b1_fpn_1x_coco.py new file mode 100644 index 00000000..65001167 --- /dev/null +++ b/configs/mmdet/timm_example/retinanet_timm_efficientnet_b1_fpn_1x_coco.py @@ -0,0 +1,20 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# please install mmcls>=0.20.0 +# import mmcls.models to trigger register_module in mmcls +custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False) +model = dict( + backbone=dict( + _delete_=True, + type='mmcls.TIMMBackbone', + model_name='efficientnet_b1', + features_only=True, + pretrained=True, + out_indices=(1, 2, 3, 4)), + neck=dict(in_channels=[24, 40, 112, 320])) + +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/mmdet/timm_example/retinanet_timm_tv_resnet50_fpn_1x_coco.py b/configs/mmdet/timm_example/retinanet_timm_tv_resnet50_fpn_1x_coco.py new file mode 100644 index 00000000..0c5b7a89 --- /dev/null +++ b/configs/mmdet/timm_example/retinanet_timm_tv_resnet50_fpn_1x_coco.py @@ -0,0 +1,19 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# please install mmcls>=0.20.0 +# import mmcls.models to trigger register_module in mmcls +custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False) +model = dict( + backbone=dict( + _delete_=True, + type='mmcls.TIMMBackbone', + 
model_name='tv_resnet50', # ResNet-50 with torchvision weights + features_only=True, + pretrained=True, + out_indices=(1, 2, 3, 4))) + +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/mmdet/tood/README.md b/configs/mmdet/tood/README.md new file mode 100644 index 00000000..6cfbffcd --- /dev/null +++ b/configs/mmdet/tood/README.md @@ -0,0 +1,40 @@ +# TOOD + +> [TOOD: Task-aligned One-stage Object Detection](https://arxiv.org/abs/2108.07755) + + + +## Abstract + +One-stage object detection is commonly implemented by optimizing two sub-tasks: object classification and localization, using heads with two parallel branches, which might lead to a certain level of spatial misalignment in predictions between the two tasks. In this work, we propose a Task-aligned One-stage Object Detection (TOOD) that explicitly aligns the two tasks in a learning-based manner. First, we design a novel Task-aligned Head (T-Head) which offers a better balance between learning task-interactive and task-specific features, as well as a greater flexibility to learn the alignment via a task-aligned predictor. Second, we propose Task Alignment Learning (TAL) to explicitly pull closer (or even unify) the optimal anchors for the two tasks during training via a designed sample assignment scheme and a task-aligned loss. Extensive experiments are conducted on MS-COCO, where TOOD achieves a 51.1 AP at single-model single-scale testing. This surpasses the recent one-stage detectors by a large margin, such as ATSS (47.7 AP), GFL (48.2 AP), and PAA (49.0 AP), with fewer parameters and FLOPs. Qualitative results also demonstrate the effectiveness of TOOD for better aligning the tasks of object classification and localization. + +
+ +
+ +## Results and Models + +| Backbone | Style | Anchor Type | Lr schd | Multi-scale Training| Mem (GB)| Inf time (fps) | box AP | Config | Download | +|:-----------------:|:-------:|:------------:|:-------:|:-------------------:|:-------:|:--------------:|:------:|:------:|:--------:| +| R-50 | pytorch | Anchor-free | 1x | N | 4.1 | | 42.4 | [config](./tood_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_1x_coco/tood_r50_fpn_1x_coco_20211210_103425-20e20746.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_1x_coco/tood_r50_fpn_1x_coco_20211210_103425.log) | +| R-50 | pytorch | Anchor-based | 1x | N | 4.1 | | 42.4 | [config](./tood_r50_fpn_anchor_based_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_anchor_based_1x_coco/tood_r50_fpn_anchor_based_1x_coco_20211214_100105-b776c134.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_anchor_based_1x_coco/tood_r50_fpn_anchor_based_1x_coco_20211214_100105.log) | +| R-50 | pytorch | Anchor-free | 2x | Y | 4.1 | | 44.5 | [config](./tood_r50_fpn_mstrain_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_mstrain_2x_coco/tood_r50_fpn_mstrain_2x_coco_20211210_144231-3b23174c.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_mstrain_2x_coco/tood_r50_fpn_mstrain_2x_coco_20211210_144231.log) | +| R-101 | pytorch | Anchor-free | 2x | Y | 6.0 | | 46.1 | [config](./tood_r101_fpn_mstrain_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r101_fpn_mstrain_2x_coco/tood_r101_fpn_mstrain_2x_coco_20211210_144232-a18f53c8.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r101_fpn_mstrain_2x_coco/tood_r101_fpn_mstrain_2x_coco_20211210_144232.log) | +| R-101-dcnv2 | pytorch | Anchor-free | 2x | Y | 6.2 | | 49.3 | [config](./tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20211210_213728-4a824142.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20211210_213728.log) | +| X-101-64x4d | pytorch | Anchor-free | 2x | Y | 10.2 | | 47.6 | [config](./tood_x101_64x4d_fpn_mstrain_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_x101_64x4d_fpn_mstrain_2x_coco/tood_x101_64x4d_fpn_mstrain_2x_coco_20211211_003519-a4f36113.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_x101_64x4d_fpn_mstrain_2x_coco/tood_x101_64x4d_fpn_mstrain_2x_coco_20211211_003519.log) | +| X-101-64x4d-dcnv2 | pytorch | Anchor-free | 2x | Y | | | | [config](./tood_x101_64x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py) | [model]() | [log]() | + +[1] *1x and 2x mean the model is trained for 90K and 180K iterations, respectively.* \ +[2] *All results are obtained with a single model and without any test time data augmentation such as multi-scale, flipping, etc.* \ +[3] *`dcnv2` denotes deformable convolutional networks v2.* \ + +## Citation + +```latex +@inproceedings{feng2021tood, + title={TOOD: Task-aligned One-stage Object Detection}, + author={Feng, Chengjian and Zhong, Yujie and Gao, Yu and Scott, Matthew R and Huang, Weilin}, + booktitle={ICCV}, + year={2021} +} +``` diff --git a/configs/mmdet/tood/metafile.yml b/configs/mmdet/tood/metafile.yml new file mode 100644 index 00000000..27a0f8db --- /dev/null +++ b/configs/mmdet/tood/metafile.yml @@ -0,0 +1,95 @@ +Collections: + - Name: TOOD + Metadata: + Training Data: COCO + Training Techniques: + - SGD + Training Resources: 8x V100 GPUs + Architecture: + - TOOD + Paper: + URL: https://arxiv.org/abs/2108.07755 + Title: 'TOOD: Task-aligned One-stage Object Detection' + README: configs/tood/README.md + Code: + URL:
https://github.com/open-mmlab/mmdetection/blob/v2.20.0/mmdet/models/detectors/tood.py#L7 + Version: v2.20.0 + +Models: + - Name: tood_r101_fpn_mstrain_2x_coco + In Collection: TOOD + Config: configs/tood/tood_r101_fpn_mstrain_2x_coco.py + Metadata: + Training Memory (GB): 6.0 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r101_fpn_mstrain_2x_coco/tood_r101_fpn_mstrain_2x_coco_20211210_144232-a18f53c8.pth + + - Name: tood_x101_64x4d_fpn_mstrain_2x_coco + In Collection: TOOD + Config: configs/tood/tood_x101_64x4d_fpn_mstrain_2x_coco.py + Metadata: + Training Memory (GB): 10.2 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_x101_64x4d_fpn_mstrain_2x_coco/tood_x101_64x4d_fpn_mstrain_2x_coco_20211211_003519-a4f36113.pth + + - Name: tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco + In Collection: TOOD + Config: configs/tood/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py + Metadata: + Training Memory (GB): 6.2 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 49.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20211210_213728-4a824142.pth + + - Name: tood_r50_fpn_anchor_based_1x_coco + In Collection: TOOD + Config: configs/tood/tood_r50_fpn_anchor_based_1x_coco.py + Metadata: + Training Memory (GB): 4.1 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_anchor_based_1x_coco/tood_r50_fpn_anchor_based_1x_coco_20211214_100105-b776c134.pth + + - Name: tood_r50_fpn_1x_coco + In Collection: TOOD + Config: configs/tood/tood_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.1 + Epochs: 12 + Results: + - 
Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_1x_coco/tood_r50_fpn_1x_coco_20211210_103425-20e20746.pth + + - Name: tood_r50_fpn_mstrain_2x_coco + In Collection: TOOD + Config: configs/tood/tood_r50_fpn_mstrain_2x_coco.py + Metadata: + Training Memory (GB): 4.1 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_mstrain_2x_coco/tood_r50_fpn_mstrain_2x_coco_20211210_144231-3b23174c.pth diff --git a/configs/mmdet/tood/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py b/configs/mmdet/tood/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py new file mode 100644 index 00000000..c7f1bbcb --- /dev/null +++ b/configs/mmdet/tood/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py @@ -0,0 +1,7 @@ +_base_ = './tood_r101_fpn_mstrain_2x_coco.py' + +model = dict( + backbone=dict( + dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True)), + bbox_head=dict(num_dcn=2)) diff --git a/configs/mmdet/tood/tood_r101_fpn_mstrain_2x_coco.py b/configs/mmdet/tood/tood_r101_fpn_mstrain_2x_coco.py new file mode 100644 index 00000000..d9d2c32d --- /dev/null +++ b/configs/mmdet/tood/tood_r101_fpn_mstrain_2x_coco.py @@ -0,0 +1,7 @@ +_base_ = './tood_r50_fpn_mstrain_2x_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/tood/tood_r50_fpn_1x_coco.py b/configs/mmdet/tood/tood_r50_fpn_1x_coco.py new file mode 100644 index 00000000..35a77a40 --- /dev/null +++ b/configs/mmdet/tood/tood_r50_fpn_1x_coco.py @@ -0,0 +1,74 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='TOOD', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + 
out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + bbox_head=dict( + type='TOODHead', + num_classes=80, + in_channels=256, + stacked_convs=6, + feat_channels=256, + anchor_type='anchor_free', + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + initial_loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + activated=True, # use probability instead of logit as input + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + activated=True, # use probability instead of logit as input + beta=2.0, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0)), + train_cfg=dict( + initial_epoch=4, + initial_assigner=dict(type='ATSSAssigner', topk=9), + assigner=dict(type='TaskAlignedAssigner', topk=13), + alpha=1, + beta=6, + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) + +# custom hooks +custom_hooks = [dict(type='SetEpochInfoHook')] diff --git a/configs/mmdet/tood/tood_r50_fpn_anchor_based_1x_coco.py b/configs/mmdet/tood/tood_r50_fpn_anchor_based_1x_coco.py new file mode 100644 index 00000000..c7fbf6af --- /dev/null +++ b/configs/mmdet/tood/tood_r50_fpn_anchor_based_1x_coco.py @@ -0,0 +1,2 @@ +_base_ = './tood_r50_fpn_1x_coco.py' +model = 
dict(bbox_head=dict(anchor_type='anchor_based')) diff --git a/configs/mmdet/tood/tood_r50_fpn_mstrain_2x_coco.py b/configs/mmdet/tood/tood_r50_fpn_mstrain_2x_coco.py new file mode 100644 index 00000000..157d13a4 --- /dev/null +++ b/configs/mmdet/tood/tood_r50_fpn_mstrain_2x_coco.py @@ -0,0 +1,22 @@ +_base_ = './tood_r50_fpn_1x_coco.py' +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) +# multi-scale training +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 480), (1333, 800)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +data = dict(train=dict(pipeline=train_pipeline)) diff --git a/configs/mmdet/tood/tood_x101_64x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py b/configs/mmdet/tood/tood_x101_64x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py new file mode 100644 index 00000000..47c92695 --- /dev/null +++ b/configs/mmdet/tood/tood_x101_64x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py @@ -0,0 +1,7 @@ +_base_ = './tood_x101_64x4d_fpn_mstrain_2x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, False, True, True), + ), + bbox_head=dict(num_dcn=2)) diff --git a/configs/mmdet/tood/tood_x101_64x4d_fpn_mstrain_2x_coco.py b/configs/mmdet/tood/tood_x101_64x4d_fpn_mstrain_2x_coco.py new file mode 100644 index 00000000..842f320e --- /dev/null +++ b/configs/mmdet/tood/tood_x101_64x4d_fpn_mstrain_2x_coco.py @@ -0,0 +1,16 @@ +_base_ = './tood_r50_fpn_mstrain_2x_coco.py' + +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + 
base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/tridentnet/README.md b/configs/mmdet/tridentnet/README.md new file mode 100644 index 00000000..d35eca01 --- /dev/null +++ b/configs/mmdet/tridentnet/README.md @@ -0,0 +1,38 @@ +# TridentNet + +> [Scale-Aware Trident Networks for Object Detection](https://arxiv.org/abs/1901.01892) + + + +## Abstract + +Scale variation is one of the key challenges in object detection. In this work, we first present a controlled experiment to investigate the effect of receptive fields for scale variation in object detection. Based on the findings from the exploration experiments, we propose a novel Trident Network (TridentNet) aiming to generate scale-specific feature maps with a uniform representational power. We construct a parallel multi-branch architecture in which each branch shares the same transformation parameters but with different receptive fields. Then, we adopt a scale-aware training scheme to specialize each branch by sampling object instances of proper scales for training. As a bonus, a fast approximation version of TridentNet could achieve significant improvements without any additional parameters and computational cost compared with the vanilla detector. On the COCO dataset, our TridentNet with ResNet-101 backbone achieves state-of-the-art single-model results of 48.4 mAP. + +
+ +
+ +## Results and Models + +We report the test results using only one branch for inference. + +| Backbone | Style | mstrain | Lr schd | Mem (GB) | Inf time (fps) | box AP | Download | +| :-------------: | :-----: | :-----: | :-----: | :------: | :------------: | :----: | :------: | +| R-50 | caffe | N | 1x | | | 37.7 |[model](https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_1x_coco/tridentnet_r50_caffe_1x_coco_20201230_141838-2ec0b530.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_1x_coco/tridentnet_r50_caffe_1x_coco_20201230_141838.log.json) | +| R-50 | caffe | Y | 1x | | | 37.6 |[model](https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_mstrain_1x_coco/tridentnet_r50_caffe_mstrain_1x_coco_20201230_141839-6ce55ccb.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_mstrain_1x_coco/tridentnet_r50_caffe_mstrain_1x_coco_20201230_141839.log.json) | +| R-50 | caffe | Y | 3x | | | 40.3 |[model](https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_mstrain_3x_coco/tridentnet_r50_caffe_mstrain_3x_coco_20201130_100539-46d227ba.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_mstrain_3x_coco/tridentnet_r50_caffe_mstrain_3x_coco_20201130_100539.log.json) | + +**Note** + +Similar to [Detectron2](https://github.com/facebookresearch/detectron2/tree/master/projects/TridentNet), we haven't implemented the Scale-aware Training Scheme in section 4.2 of the paper.
+ +## Citation + +```latex +@InProceedings{li2019scale, + title={Scale-Aware Trident Networks for Object Detection}, + author={Li, Yanghao and Chen, Yuntao and Wang, Naiyan and Zhang, Zhaoxiang}, + booktitle={The International Conference on Computer Vision (ICCV)}, + year={2019} +} +``` diff --git a/configs/mmdet/tridentnet/metafile.yml b/configs/mmdet/tridentnet/metafile.yml new file mode 100644 index 00000000..2536f976 --- /dev/null +++ b/configs/mmdet/tridentnet/metafile.yml @@ -0,0 +1,55 @@ +Collections: + - Name: TridentNet + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + - TridentNet Block + Paper: + URL: https://arxiv.org/abs/1901.01892 + Title: 'Scale-Aware Trident Networks for Object Detection' + README: configs/tridentnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.8.0/mmdet/models/detectors/trident_faster_rcnn.py#L6 + Version: v2.8.0 + +Models: + - Name: tridentnet_r50_caffe_1x_coco + In Collection: TridentNet + Config: configs/tridentnet/tridentnet_r50_caffe_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_1x_coco/tridentnet_r50_caffe_1x_coco_20201230_141838-2ec0b530.pth + + - Name: tridentnet_r50_caffe_mstrain_1x_coco + In Collection: TridentNet + Config: configs/tridentnet/tridentnet_r50_caffe_mstrain_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_mstrain_1x_coco/tridentnet_r50_caffe_mstrain_1x_coco_20201230_141839-6ce55ccb.pth + + - Name: tridentnet_r50_caffe_mstrain_3x_coco + In Collection: TridentNet + Config: configs/tridentnet/tridentnet_r50_caffe_mstrain_3x_coco.py + Metadata: + Epochs: 36 +
Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_mstrain_3x_coco/tridentnet_r50_caffe_mstrain_3x_coco_20201130_100539-46d227ba.pth diff --git a/configs/mmdet/tridentnet/tridentnet_r50_caffe_1x_coco.py b/configs/mmdet/tridentnet/tridentnet_r50_caffe_1x_coco.py new file mode 100644 index 00000000..d779f75f --- /dev/null +++ b/configs/mmdet/tridentnet/tridentnet_r50_caffe_1x_coco.py @@ -0,0 +1,55 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_caffe_c4.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + type='TridentFasterRCNN', + backbone=dict( + type='TridentResNet', + trident_dilations=(1, 2, 3), + num_branch=3, + test_branch_idx=1, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + roi_head=dict(type='TridentRoIHead', num_branch=3, test_branch_idx=1), + train_cfg=dict( + rpn_proposal=dict(max_per_img=500), + rcnn=dict( + sampler=dict(num=128, pos_fraction=0.5, + add_gt_as_proposals=False)))) + +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', 
keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/tridentnet/tridentnet_r50_caffe_mstrain_1x_coco.py b/configs/mmdet/tridentnet/tridentnet_r50_caffe_mstrain_1x_coco.py new file mode 100644 index 00000000..c73d9eaa --- /dev/null +++ b/configs/mmdet/tridentnet/tridentnet_r50_caffe_mstrain_1x_coco.py @@ -0,0 +1,22 @@ +_base_ = 'tridentnet_r50_caffe_1x_coco.py' + +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) +] + +data = dict(train=dict(pipeline=train_pipeline)) diff --git a/configs/mmdet/tridentnet/tridentnet_r50_caffe_mstrain_3x_coco.py b/configs/mmdet/tridentnet/tridentnet_r50_caffe_mstrain_3x_coco.py new file mode 100644 index 00000000..0f402826 --- /dev/null +++ b/configs/mmdet/tridentnet/tridentnet_r50_caffe_mstrain_3x_coco.py @@ -0,0 +1,4 @@ +_base_ = 'tridentnet_r50_caffe_mstrain_1x_coco.py' + +lr_config = dict(step=[28, 34]) +runner = dict(type='EpochBasedRunner', max_epochs=36) diff --git a/configs/mmdet/vfnet/README.md b/configs/mmdet/vfnet/README.md new file mode 100644 index 00000000..43ade0e7 --- /dev/null +++ b/configs/mmdet/vfnet/README.md @@ -0,0 +1,48 @@ +# VarifocalNet + +> [VarifocalNet: An IoU-aware Dense Object Detector](https://arxiv.org/abs/2008.13367) + + + +## Abstract + +Accurately ranking the vast number of candidate detections is 
crucial for dense object detectors to achieve high performance. Prior work uses the classification score or a combination of classification and predicted localization scores to rank candidates. However, neither option results in a reliable ranking, thus degrading detection performance. In this paper, we propose to learn an IoU-aware Classification Score (IACS) as a joint representation of object presence confidence and localization accuracy. We show that dense object detectors can achieve a more accurate ranking of candidate detections based on the IACS. We design a new loss function, named Varifocal Loss, to train a dense object detector to predict the IACS, and propose a new star-shaped bounding box feature representation for IACS prediction and bounding box refinement. Combining these two new components and a bounding box refinement branch, we build an IoU-aware dense object detector based on the FCOS+ATSS architecture, that we call VarifocalNet or VFNet for short. Extensive experiments on MS COCO show that our VFNet consistently surpasses the strong baseline by ∼2.0 AP with different backbones. Our best model VFNet-X-1200 with Res2Net-101-DCN achieves a single-model single-scale AP of 55.1 on COCO test-dev, which is state-of-the-art among various object detectors. + +
+ +
+ +## Introduction + +**VarifocalNet (VFNet)** learns to predict the IoU-aware classification score which mixes the object presence confidence and localization accuracy together as the detection score for a bounding box. The learning is supervised by the proposed Varifocal Loss (VFL), based on a new star-shaped bounding box feature representation (the features at nine yellow sampling points). Given the new representation, the object localization accuracy is further improved by refining the initially regressed bounding box. The full paper is available at: [https://arxiv.org/abs/2008.13367](https://arxiv.org/abs/2008.13367). + +## Results and Models + +| Backbone | Style | DCN | MS train | Lr schd |Inf time (fps) | box AP (val) | box AP (test-dev) | Config | Download | +|:------------:|:---------:|:-------:|:--------:|:-------:|:-------------:|:------------:|:-----------------:|:------:|:--------:| +| R-50 | pytorch | N | N | 1x | - | 41.6 | 41.6 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/vfnet/vfnet_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_1x_coco/vfnet_r50_fpn_1x_coco_20201027-38db6f58.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_1x_coco/vfnet_r50_fpn_1x_coco.json)| +| R-50 | pytorch | N | Y | 2x | - | 44.5 | 44.8 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/vfnet/vfnet_r50_fpn_mstrain_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_mstrain_2x_coco/vfnet_r50_fpn_mstrain_2x_coco_20201027-7cc75bd2.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_mstrain_2x_coco/vfnet_r50_fpn_mstrain_2x_coco.json)| +| R-50 | pytorch | Y | Y | 2x | - | 47.8 | 48.0 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/vfnet/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-6879c318.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.json)| +| R-101 | pytorch | N | N | 1x | - | 43.0 | 43.6 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/vfnet/vfnet_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_1x_coco/vfnet_r101_fpn_1x_coco_20201027pth-c831ece7.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_1x_coco/vfnet_r101_fpn_1x_coco.json)| +| R-101 | pytorch | N | Y | 2x | - | 46.2 | 46.7 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/vfnet/vfnet_r101_fpn_mstrain_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_mstrain_2x_coco/vfnet_r101_fpn_mstrain_2x_coco_20201027pth-4a5d53f1.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_mstrain_2x_coco/vfnet_r101_fpn_mstrain_2x_coco.json)| +| R-101 | pytorch | Y | Y | 2x | - | 49.0 | 49.2 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/vfnet/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-7729adb5.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco.json)| +| X-101-32x4d | pytorch | Y | Y | 2x | - | 49.7 | 50.0 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/vfnet/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-d300a6fc.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.json)| +| X-101-64x4d | pytorch | Y | Y | 2x | - | 50.4 | 50.8 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/vfnet/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-b5f6da5e.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.json)| + +**Notes:** + +- The MS-train scale range is 1333x[480:960] (`range` mode) and the inference scale keeps 1333x800. +- DCN means using `DCNv2` in both backbone and head. +- Inference time will be updated soon. 
+- More results and pre-trained models can be found in [VarifocalNet-Github](https://github.com/hyz-xmaster/VarifocalNet) + +## Citation + +```latex +@article{zhang2020varifocalnet, + title={VarifocalNet: An IoU-aware Dense Object Detector}, + author={Zhang, Haoyang and Wang, Ying and Dayoub, Feras and S{\"u}nderhauf, Niko}, + journal={arXiv preprint arXiv:2008.13367}, + year={2020} +} +``` diff --git a/configs/mmdet/vfnet/metafile.yml b/configs/mmdet/vfnet/metafile.yml new file mode 100644 index 00000000..bcbe576f --- /dev/null +++ b/configs/mmdet/vfnet/metafile.yml @@ -0,0 +1,116 @@ +Collections: + - Name: VFNet + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - ResNet + - Varifocal Loss + Paper: + URL: https://arxiv.org/abs/2008.13367 + Title: 'VarifocalNet: An IoU-aware Dense Object Detector' + README: configs/vfnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.6.0/mmdet/models/detectors/vfnet.py#L6 + Version: v2.6.0 + +Models: + - Name: vfnet_r50_fpn_1x_coco + In Collection: VFNet + Config: configs/vfnet/vfnet_r50_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_1x_coco/vfnet_r50_fpn_1x_coco_20201027-38db6f58.pth + + - Name: vfnet_r50_fpn_mstrain_2x_coco + In Collection: VFNet + Config: configs/vfnet/vfnet_r50_fpn_mstrain_2x_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_mstrain_2x_coco/vfnet_r50_fpn_mstrain_2x_coco_20201027-7cc75bd2.pth + + - Name: vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco + In Collection: VFNet + Config: configs/vfnet/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object 
Detection + Dataset: COCO + Metrics: + box AP: 48.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-6879c318.pth + + - Name: vfnet_r101_fpn_1x_coco + In Collection: VFNet + Config: configs/vfnet/vfnet_r101_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_1x_coco/vfnet_r101_fpn_1x_coco_20201027pth-c831ece7.pth + + - Name: vfnet_r101_fpn_mstrain_2x_coco + In Collection: VFNet + Config: configs/vfnet/vfnet_r101_fpn_mstrain_2x_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_mstrain_2x_coco/vfnet_r101_fpn_mstrain_2x_coco_20201027pth-4a5d53f1.pth + + - Name: vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco + In Collection: VFNet + Config: configs/vfnet/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 49.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-7729adb5.pth + + - Name: vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco + In Collection: VFNet + Config: configs/vfnet/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-d300a6fc.pth + + - Name: vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco + In Collection: VFNet + Config: 
configs/vfnet/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-b5f6da5e.pth diff --git a/configs/mmdet/vfnet/vfnet_r101_fpn_1x_coco.py b/configs/mmdet/vfnet/vfnet_r101_fpn_1x_coco.py new file mode 100644 index 00000000..b296a079 --- /dev/null +++ b/configs/mmdet/vfnet/vfnet_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './vfnet_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/vfnet/vfnet_r101_fpn_2x_coco.py b/configs/mmdet/vfnet/vfnet_r101_fpn_2x_coco.py new file mode 100644 index 00000000..27962f3a --- /dev/null +++ b/configs/mmdet/vfnet/vfnet_r101_fpn_2x_coco.py @@ -0,0 +1,8 @@ +_base_ = './vfnet_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/vfnet/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco.py b/configs/mmdet/vfnet/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco.py new file mode 100644 index 00000000..e438c247 --- /dev/null +++ b/configs/mmdet/vfnet/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco.py @@ -0,0 +1,15 @@ +_base_ = './vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.py' +model = dict( + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + init_cfg=dict(type='Pretrained', + 
checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/vfnet/vfnet_r101_fpn_mstrain_2x_coco.py b/configs/mmdet/vfnet/vfnet_r101_fpn_mstrain_2x_coco.py new file mode 100644 index 00000000..eae69a01 --- /dev/null +++ b/configs/mmdet/vfnet/vfnet_r101_fpn_mstrain_2x_coco.py @@ -0,0 +1,6 @@ +_base_ = './vfnet_r50_fpn_mstrain_2x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/vfnet/vfnet_r2_101_fpn_mdconv_c3-c5_mstrain_2x_coco.py b/configs/mmdet/vfnet/vfnet_r2_101_fpn_mdconv_c3-c5_mstrain_2x_coco.py new file mode 100644 index 00000000..815a36e0 --- /dev/null +++ b/configs/mmdet/vfnet/vfnet_r2_101_fpn_mdconv_c3-c5_mstrain_2x_coco.py @@ -0,0 +1,18 @@ +_base_ = './vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.py' +model = dict( + backbone=dict( + type='Res2Net', + depth=101, + scales=4, + base_width=26, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://res2net101_v1d_26w_4s'))) diff --git a/configs/mmdet/vfnet/vfnet_r2_101_fpn_mstrain_2x_coco.py b/configs/mmdet/vfnet/vfnet_r2_101_fpn_mstrain_2x_coco.py new file mode 100644 index 00000000..58022e0e --- /dev/null +++ b/configs/mmdet/vfnet/vfnet_r2_101_fpn_mstrain_2x_coco.py @@ -0,0 +1,16 @@ +_base_ = './vfnet_r50_fpn_mstrain_2x_coco.py' +model = dict( + backbone=dict( + type='Res2Net', + depth=101, + scales=4, + base_width=26, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://res2net101_v1d_26w_4s'))) diff --git a/configs/mmdet/vfnet/vfnet_r50_fpn_1x_coco.py 
b/configs/mmdet/vfnet/vfnet_r50_fpn_1x_coco.py new file mode 100644 index 00000000..7de64296 --- /dev/null +++ b/configs/mmdet/vfnet/vfnet_r50_fpn_1x_coco.py @@ -0,0 +1,107 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + type='VFNet', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', # use P5 + num_outs=5, + relu_before_extra_convs=True), + bbox_head=dict( + type='VFNetHead', + num_classes=80, + in_channels=256, + stacked_convs=3, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + center_sampling=False, + dcn_on_last_conv=False, + use_atss=True, + use_vfl=True, + loss_cls=dict( + type='VarifocalLoss', + use_sigmoid=True, + alpha=0.75, + gamma=2.0, + iou_weighted=True, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=1.5), + loss_bbox_refine=dict(type='GIoULoss', loss_weight=2.0)), + # training and testing settings + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +# data setting +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + 
dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) + +# optimizer +optimizer = dict( + lr=0.01, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.)) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.1, + step=[8, 11]) +runner = dict(type='EpochBasedRunner', max_epochs=12) diff --git a/configs/mmdet/vfnet/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.py b/configs/mmdet/vfnet/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.py new file mode 100644 index 00000000..24d2093b --- /dev/null +++ b/configs/mmdet/vfnet/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.py @@ -0,0 +1,6 @@ +_base_ = './vfnet_r50_fpn_mstrain_2x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True)), + bbox_head=dict(dcn_on_last_conv=True)) diff --git a/configs/mmdet/vfnet/vfnet_r50_fpn_mstrain_2x_coco.py b/configs/mmdet/vfnet/vfnet_r50_fpn_mstrain_2x_coco.py new file mode 100644 index 00000000..6078bb98 --- /dev/null +++ b/configs/mmdet/vfnet/vfnet_r50_fpn_mstrain_2x_coco.py @@ -0,0 +1,39 @@ +_base_ = './vfnet_r50_fpn_1x_coco.py' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 480), (1333, 960)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# learning policy +lr_config = dict(step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/configs/mmdet/vfnet/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.py b/configs/mmdet/vfnet/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.py new file mode 100644 index 00000000..7efa0517 --- /dev/null +++ b/configs/mmdet/vfnet/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.py @@ -0,0 +1,17 @@ +_base_ = './vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/vfnet/vfnet_x101_32x4d_fpn_mstrain_2x_coco.py b/configs/mmdet/vfnet/vfnet_x101_32x4d_fpn_mstrain_2x_coco.py new file mode 100644 index 00000000..49a43121 --- /dev/null +++ 
b/configs/mmdet/vfnet/vfnet_x101_32x4d_fpn_mstrain_2x_coco.py @@ -0,0 +1,15 @@ +_base_ = './vfnet_r50_fpn_mstrain_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/configs/mmdet/vfnet/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.py b/configs/mmdet/vfnet/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.py new file mode 100644 index 00000000..7e1ee429 --- /dev/null +++ b/configs/mmdet/vfnet/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.py @@ -0,0 +1,17 @@ +_base_ = './vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/vfnet/vfnet_x101_64x4d_fpn_mstrain_2x_coco.py b/configs/mmdet/vfnet/vfnet_x101_64x4d_fpn_mstrain_2x_coco.py new file mode 100644 index 00000000..e51064e7 --- /dev/null +++ b/configs/mmdet/vfnet/vfnet_x101_64x4d_fpn_mstrain_2x_coco.py @@ -0,0 +1,15 @@ +_base_ = './vfnet_r50_fpn_mstrain_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/configs/mmdet/wider_face/README.md b/configs/mmdet/wider_face/README.md 
new file mode 100644 index 00000000..1904506c --- /dev/null +++ b/configs/mmdet/wider_face/README.md @@ -0,0 +1,57 @@ +# WIDER FACE + +> [WIDER FACE: A Face Detection Benchmark](https://arxiv.org/abs/1511.06523) + + + +## Abstract + +Face detection is one of the most studied topics in the computer vision community. Much of the progresses have been made by the availability of face detection benchmark datasets. We show that there is a gap between current face detection performance and the real world requirements. To facilitate future face detection research, we introduce the WIDER FACE dataset, which is 10 times larger than existing datasets. The dataset contains rich annotations, including occlusions, poses, event categories, and face bounding boxes. Faces in the proposed dataset are extremely challenging due to large variations in scale, pose and occlusion, as shown in Fig. 1. Furthermore, we show that WIDER FACE dataset is an effective training source for face detection. We benchmark several representative detection systems, providing an overview of state-of-the-art performance and propose a solution to deal with large scale variation. Finally, we discuss common failure cases that worth to be further investigated. + +
+ +
+ +## Introduction + +To use the WIDER Face dataset you need to download it +and extract to the `data/WIDERFace` folder. Annotation in the VOC format +can be found in this [repo](https://github.com/sovrasov/wider-face-pascal-voc-annotations.git). +You should move the annotation files from `WIDER_train_annotations` and `WIDER_val_annotations` folders +to the `Annotations` folders inside the corresponding directories `WIDER_train` and `WIDER_val`. +Also annotation lists `val.txt` and `train.txt` should be copied to `data/WIDERFace` from `WIDER_train_annotations` and `WIDER_val_annotations`. +The directory should be like this: + +``` +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── WIDERFace +│ │ ├── WIDER_train +│ | │ ├──0--Parade +│ | │ ├── ... +│ | │ ├── Annotations +│ │ ├── WIDER_val +│ | │ ├──0--Parade +│ | │ ├── ... +│ | │ ├── Annotations +│ │ ├── val.txt +│ │ ├── train.txt + +``` + +After that you can train the SSD300 on WIDER by launching training with the `ssd300_wider_face.py` config or +create your own config based on the presented one. 
+ +## Citation + +```latex +@inproceedings{yang2016wider, + Author = {Yang, Shuo and Luo, Ping and Loy, Chen Change and Tang, Xiaoou}, + Booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + Title = {WIDER FACE: A Face Detection Benchmark}, + Year = {2016} +} +``` diff --git a/configs/mmdet/wider_face/ssd300_wider_face.py b/configs/mmdet/wider_face/ssd300_wider_face.py new file mode 100644 index 00000000..5a3eb38d --- /dev/null +++ b/configs/mmdet/wider_face/ssd300_wider_face.py @@ -0,0 +1,18 @@ +_base_ = [ + '../_base_/models/ssd300.py', '../_base_/datasets/wider_face.py', + '../_base_/default_runtime.py' +] +model = dict(bbox_head=dict(num_classes=1)) +# optimizer +optimizer = dict(type='SGD', lr=0.012, momentum=0.9, weight_decay=5e-4) +optimizer_config = dict() +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[16, 20]) +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=24) +log_config = dict(interval=1) diff --git a/configs/mmdet/yolact/README.md b/configs/mmdet/yolact/README.md new file mode 100644 index 00000000..9e84de19 --- /dev/null +++ b/configs/mmdet/yolact/README.md @@ -0,0 +1,74 @@ +# YOLACT + +> [YOLACT: Real-time Instance Segmentation](https://arxiv.org/abs/1904.02689) + + + +## Abstract + +We present a simple, fully-convolutional model for real-time instance segmentation that achieves 29.8 mAP on MS COCO at 33.5 fps evaluated on a single Titan Xp, which is significantly faster than any previous competitive approach. Moreover, we obtain this result after training on only one GPU. We accomplish this by breaking instance segmentation into two parallel subtasks: (1) generating a set of prototype masks and (2) predicting per-instance mask coefficients. Then we produce instance masks by linearly combining the prototypes with the mask coefficients. 
We find that because this process doesn't depend on repooling, this approach produces very high-quality masks and exhibits temporal stability for free. Furthermore, we analyze the emergent behavior of our prototypes and show they learn to localize instances on their own in a translation variant manner, despite being fully-convolutional. Finally, we also propose Fast NMS, a drop-in 12 ms faster replacement for standard NMS that only has a marginal performance penalty. + +
+ +
+ +## Introduction + +A simple, fully convolutional model for real-time instance segmentation. This is the code for our paper: + +- [YOLACT: Real-time Instance Segmentation](https://arxiv.org/abs/1904.02689) + + +For a real-time demo, check out our ICCV video: +[![IMAGE ALT TEXT HERE](https://img.youtube.com/vi/0pMfmo8qfpQ/0.jpg)](https://www.youtube.com/watch?v=0pMfmo8qfpQ) + +## Evaluation + +Here are our YOLACT models along with their FPS on a Titan Xp and mAP on COCO's `val`: + +| Image Size | GPU x BS | Backbone | *FPS | mAP | Weights | Configs | Download | +|:----------:|:--------:|:-------------:|:-----:|:----:|:-------:|:------:|:--------:| +| 550 | 1x8 | Resnet50-FPN | 42.5 | 29.0 | | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/yolact/yolact_r50_1x8_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r50_1x8_coco/yolact_r50_1x8_coco_20200908-f38d58df.pth) | +| 550 | 8x8 | Resnet50-FPN | 42.5 | 28.4 | | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/yolact/yolact_r50_8x8_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r50_8x8_coco/yolact_r50_8x8_coco_20200908-ca34f5db.pth) | +| 550 | 1x8 | Resnet101-FPN | 33.5 | 30.4 | | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/yolact/yolact_r101_1x8_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r101_1x8_coco/yolact_r101_1x8_coco_20200908-4cbe9101.pth) | + +*Note: The FPS is evaluated by the [original implementation](https://github.com/dbolya/yolact). When calculating FPS, only the model inference time is taken into account. Data loading and post-processing operations such as converting masks to RLE code, generating COCO JSON results, image rendering are not included. + +## Training + +All the aforementioned models are trained with a single GPU. It typically takes ~12GB VRAM when using resnet-101 as the backbone. 
If you want to try multi-GPU training, you may have to modify the configuration files accordingly, such as adjusting the training schedule and freezing batch norm. + +```Shell +# Trains using the resnet-101 backbone with a batch size of 8 on a single GPU. +./tools/dist_train.sh configs/yolact/yolact_r101_1x8_coco.py 1 +``` + +## Testing + +Please refer to [mmdetection/docs/getting_started.md](https://mmdetection.readthedocs.io/en/latest/1_exist_data_model.html#test-existing-models). + +## Citation + +If you use YOLACT or this code base in your work, please cite + +```latex +@inproceedings{yolact-iccv2019, + author = {Daniel Bolya and Chong Zhou and Fanyi Xiao and Yong Jae Lee}, + title = {YOLACT: {Real-time} Instance Segmentation}, + booktitle = {ICCV}, + year = {2019}, +} +``` + + diff --git a/configs/mmdet/yolact/metafile.yml b/configs/mmdet/yolact/metafile.yml new file mode 100644 index 00000000..e7019ae6 --- /dev/null +++ b/configs/mmdet/yolact/metafile.yml @@ -0,0 +1,78 @@ +Collections: + - Name: YOLACT + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/1904.02689 + Title: 'YOLACT: Real-time Instance Segmentation' + README: configs/yolact/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.5.0/mmdet/models/detectors/yolact.py#L9 + Version: v2.5.0 + +Models: + - Name: yolact_r50_1x8_coco + In Collection: YOLACT + Config: configs/yolact/yolact_r50_1x8_coco.py + Metadata: + Training Resources: 1x V100 GPU + Batch Size: 8 + inference time (ms/im): + - value: 23.53 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (550, 550) + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 29.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r50_1x8_coco/yolact_r50_1x8_coco_20200908-f38d58df.pth + + - Name: yolact_r50_8x8_coco +
In Collection: YOLACT + Config: configs/yolact/yolact_r50_8x8_coco.py + Metadata: + Batch Size: 64 + inference time (ms/im): + - value: 23.53 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (550, 550) + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 28.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r50_8x8_coco/yolact_r50_8x8_coco_20200908-ca34f5db.pth + + - Name: yolact_r101_1x8_coco + In Collection: YOLACT + Config: configs/yolact/yolact_r101_1x8_coco.py + Metadata: + Training Resources: 1x V100 GPU + Batch Size: 8 + inference time (ms/im): + - value: 29.85 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (550, 550) + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 30.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r101_1x8_coco/yolact_r101_1x8_coco_20200908-4cbe9101.pth diff --git a/configs/mmdet/yolact/yolact_r101_1x8_coco.py b/configs/mmdet/yolact/yolact_r101_1x8_coco.py new file mode 100644 index 00000000..532631dd --- /dev/null +++ b/configs/mmdet/yolact/yolact_r101_1x8_coco.py @@ -0,0 +1,7 @@ +_base_ = './yolact_r50_1x8_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mmdet/yolact/yolact_r50_1x8_coco.py b/configs/mmdet/yolact/yolact_r50_1x8_coco.py new file mode 100644 index 00000000..dbced5a1 --- /dev/null +++ b/configs/mmdet/yolact/yolact_r50_1x8_coco.py @@ -0,0 +1,165 @@ +_base_ = '../_base_/default_runtime.py' + +# model settings +img_size = 550 +model = dict( + type='YOLACT', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, # do not freeze stem + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, # update the statistics of bn + zero_init_residual=False, + style='pytorch', + 
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_input', + num_outs=5, + upsample_cfg=dict(mode='bilinear')), + bbox_head=dict( + type='YOLACTHead', + num_classes=80, + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=3, + scales_per_octave=1, + base_sizes=[8, 16, 32, 64, 128], + ratios=[0.5, 1.0, 2.0], + strides=[550.0 / x for x in [69, 35, 18, 9, 5]], + centers=[(550 * 0.5 / x, 550 * 0.5 / x) + for x in [69, 35, 18, 9, 5]]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + reduction='none', + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.5), + num_head_convs=1, + num_protos=32, + use_ohem=True), + mask_head=dict( + type='YOLACTProtonet', + in_channels=256, + num_protos=32, + num_classes=80, + max_masks_to_train=100, + loss_mask_weight=6.125), + segm_head=dict( + type='YOLACTSegmHead', + num_classes=80, + in_channels=256, + loss_segm=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0., + ignore_iof_thr=-1, + gt_max_assign_all=False), + # smoothl1_beta=1., + allowed_border=-1, + pos_weight=-1, + neg_pos_ratio=3, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + iou_thr=0.5, + top_k=200, + max_per_img=100)) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.68, 116.78, 103.94], std=[58.40, 57.12, 57.38], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + 
dict(type='FilterAnnotations', min_gt_bbox_wh=(4.0, 4.0)), + dict( + type='Expand', + mean=img_norm_cfg['mean'], + to_rgb=img_norm_cfg['to_rgb'], + ratio_range=(1, 4)), + dict( + type='MinIoURandomCrop', + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3), + dict(type='Resize', img_scale=(img_size, img_size), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(img_size, img_size), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=False), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=1e-3, momentum=0.9, weight_decay=5e-4) +optimizer_config = dict() +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.1, + step=[20, 42, 49, 52]) +runner = dict(type='EpochBasedRunner', max_epochs=55) +cudnn_benchmark = True +evaluation = dict(metric=['bbox', 'segm']) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE 
ITS VALUES. +# base_batch_size = (1 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=8) diff --git a/configs/mmdet/yolact/yolact_r50_8x8_coco.py b/configs/mmdet/yolact/yolact_r50_8x8_coco.py new file mode 100644 index 00000000..41003ab4 --- /dev/null +++ b/configs/mmdet/yolact/yolact_r50_8x8_coco.py @@ -0,0 +1,16 @@ +_base_ = 'yolact_r50_1x8_coco.py' + +optimizer = dict(type='SGD', lr=8e-3, momentum=0.9, weight_decay=5e-4) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.1, + step=[20, 42, 49, 52]) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/mmdet/yolo/README.md b/configs/mmdet/yolo/README.md new file mode 100644 index 00000000..57b8f534 --- /dev/null +++ b/configs/mmdet/yolo/README.md @@ -0,0 +1,55 @@ +# YOLOv3 + +> [YOLOv3: An Incremental Improvement](https://arxiv.org/abs/1804.02767) + + + +## Abstract + +We present some updates to YOLO! We made a bunch of little design changes to make it better. We also trained this new network that's pretty swell. It's a little bigger than last time but more accurate. It's still fast though, don't worry. At 320x320 YOLOv3 runs in 22 ms at 28.2 mAP, as accurate as SSD but three times faster. When we look at the old .5 IOU mAP detection metric YOLOv3 is quite good. It achieves 57.9 mAP@50 in 51 ms on a Titan X, compared to 57.5 mAP@50 in 198 ms by RetinaNet, similar performance but 3.8x faster. + +
+ +
+ +## Results and Models + +| Backbone | Scale | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :------: | :--------: | +| DarkNet-53 | 320 | 273e | 2.7 | 63.9 | 27.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolo/yolov3_d53_320_273e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_320_273e_coco/yolov3_d53_320_273e_coco-421362b6.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_320_273e_coco/yolov3_d53_320_273e_coco-20200819_172101.log.json) | +| DarkNet-53 | 416 | 273e | 3.8 | 61.2 | 30.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolo/yolov3_d53_mstrain-416_273e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_mstrain-416_273e_coco/yolov3_d53_mstrain-416_273e_coco-2b60fcd9.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_mstrain-416_273e_coco/yolov3_d53_mstrain-416_273e_coco-20200819_173424.log.json) | +| DarkNet-53 | 608 | 273e | 7.4 | 48.1 | 33.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolo/yolov3_d53_mstrain-608_273e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_mstrain-608_273e_coco/yolov3_d53_mstrain-608_273e_coco_20210518_115020-a2c3acb8.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_mstrain-608_273e_coco/yolov3_d53_mstrain-608_273e_coco_20210518_115020.log.json) | + +## Mixed Precision Training + +We also train YOLOv3 with mixed precision training. 
+ +| Backbone | Scale | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :------: | :--------: | +| DarkNet-53 | 608 | 273e | 4.7 | 48.1 | 33.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolo/yolov3_d53_fp16_mstrain-608_273e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_fp16_mstrain-608_273e_coco/yolov3_d53_fp16_mstrain-608_273e_coco_20210517_213542-4bc34944.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_fp16_mstrain-608_273e_coco/yolov3_d53_fp16_mstrain-608_273e_coco_20210517_213542.log.json) | + +## Lightweight models + +| Backbone | Scale | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :------: | :--------: | +| MobileNetV2 | 416 | 300e | 5.3 | | 23.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolo/yolov3_mobilenetv2_mstrain-416_300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_mobilenetv2_mstrain-416_300e_coco/yolov3_mobilenetv2_mstrain-416_300e_coco_20210718_010823-f68a07b3.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_mobilenetv2_mstrain-416_300e_coco/yolov3_mobilenetv2_mstrain-416_300e_coco_20210718_010823.log.json) | +| MobileNetV2 | 320 | 300e | 3.2 | | 22.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolo/yolov3_mobilenetv2_320_300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_mobilenetv2_320_300e_coco/yolov3_mobilenetv2_320_300e_coco_20210719_215349-d18dff72.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_mobilenetv2_320_300e_coco/yolov3_mobilenetv2_320_300e_coco_20210719_215349.log.json) | + +Notice: We reduce the number of channels to 96 in both head and neck. 
It can reduce the flops and parameters, which makes these models more suitable for edge devices. + +## Credit + +This implementation originates from the project of Haoyu Wu(@wuhy08) at Western Digital. + +## Citation + +```latex +@misc{redmon2018yolov3, + title={YOLOv3: An Incremental Improvement}, + author={Joseph Redmon and Ali Farhadi}, + year={2018}, + eprint={1804.02767}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/configs/mmdet/yolo/metafile.yml b/configs/mmdet/yolo/metafile.yml new file mode 100644 index 00000000..22c35da5 --- /dev/null +++ b/configs/mmdet/yolo/metafile.yml @@ -0,0 +1,124 @@ +Collections: + - Name: YOLOv3 + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - DarkNet + Paper: + URL: https://arxiv.org/abs/1804.02767 + Title: 'YOLOv3: An Incremental Improvement' + README: configs/yolo/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.4.0/mmdet/models/detectors/yolo.py#L8 + Version: v2.4.0 + +Models: + - Name: yolov3_d53_320_273e_coco + In Collection: YOLOv3 + Config: configs/yolo/yolov3_d53_320_273e_coco.py + Metadata: + Training Memory (GB): 2.7 + inference time (ms/im): + - value: 15.65 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (320, 320) + Epochs: 273 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 27.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_320_273e_coco/yolov3_d53_320_273e_coco-421362b6.pth + + - Name: yolov3_d53_mstrain-416_273e_coco + In Collection: YOLOv3 + Config: configs/yolo/yolov3_d53_mstrain-416_273e_coco.py + Metadata: + Training Memory (GB): 3.8 + inference time (ms/im): + - value: 16.34 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (416, 416) + Epochs: 273 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 30.9 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_mstrain-416_273e_coco/yolov3_d53_mstrain-416_273e_coco-2b60fcd9.pth + + - Name: yolov3_d53_mstrain-608_273e_coco + In Collection: YOLOv3 + Config: configs/yolo/yolov3_d53_mstrain-608_273e_coco.py + Metadata: + Training Memory (GB): 7.4 + inference time (ms/im): + - value: 20.79 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (608, 608) + Epochs: 273 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 33.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_mstrain-608_273e_coco/yolov3_d53_mstrain-608_273e_coco_20210518_115020-a2c3acb8.pth + + - Name: yolov3_d53_fp16_mstrain-608_273e_coco + In Collection: YOLOv3 + Config: configs/yolo/yolov3_d53_fp16_mstrain-608_273e_coco.py + Metadata: + Training Memory (GB): 4.7 + inference time (ms/im): + - value: 20.79 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP16 + resolution: (608, 608) + Epochs: 273 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 33.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_fp16_mstrain-608_273e_coco/yolov3_d53_fp16_mstrain-608_273e_coco_20210517_213542-4bc34944.pth + + - Name: yolov3_mobilenetv2_320_300e_coco + In Collection: YOLOv3 + Config: configs/yolo/yolov3_mobilenetv2_320_300e_coco.py + Metadata: + Training Memory (GB): 3.2 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 22.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_mobilenetv2_320_300e_coco/yolov3_mobilenetv2_320_300e_coco_20210719_215349-d18dff72.pth + + - Name: yolov3_mobilenetv2_mstrain-416_300e_coco + In Collection: YOLOv3 + Config: configs/yolo/yolov3_mobilenetv2_mstrain-416_300e_coco.py + Metadata: + Training Memory (GB): 5.3 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 23.9 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_mobilenetv2_mstrain-416_300e_coco/yolov3_mobilenetv2_mstrain-416_300e_coco_20210718_010823-f68a07b3.pth diff --git a/configs/mmdet/yolo/yolov3_d53_320_273e_coco.py b/configs/mmdet/yolo/yolov3_d53_320_273e_coco.py new file mode 100644 index 00000000..d4785e31 --- /dev/null +++ b/configs/mmdet/yolo/yolov3_d53_320_273e_coco.py @@ -0,0 +1,42 @@ +_base_ = './yolov3_d53_mstrain-608_273e_coco.py' +# dataset settings +img_norm_cfg = dict(mean=[0, 0, 0], std=[255., 255., 255.], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Expand', + mean=img_norm_cfg['mean'], + to_rgb=img_norm_cfg['to_rgb'], + ratio_range=(1, 2)), + dict( + type='MinIoURandomCrop', + min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9), + min_crop_size=0.3), + dict(type='Resize', img_scale=(320, 320), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(320, 320), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/yolo/yolov3_d53_fp16_mstrain-608_273e_coco.py b/configs/mmdet/yolo/yolov3_d53_fp16_mstrain-608_273e_coco.py new file mode 100644 index 00000000..4ef2422d --- /dev/null +++ b/configs/mmdet/yolo/yolov3_d53_fp16_mstrain-608_273e_coco.py @@ -0,0 +1,3 @@ +_base_ = 
'./yolov3_d53_mstrain-608_273e_coco.py' +# fp16 settings +fp16 = dict(loss_scale='dynamic') diff --git a/configs/mmdet/yolo/yolov3_d53_mstrain-416_273e_coco.py b/configs/mmdet/yolo/yolov3_d53_mstrain-416_273e_coco.py new file mode 100644 index 00000000..94325c5a --- /dev/null +++ b/configs/mmdet/yolo/yolov3_d53_mstrain-416_273e_coco.py @@ -0,0 +1,42 @@ +_base_ = './yolov3_d53_mstrain-608_273e_coco.py' +# dataset settings +img_norm_cfg = dict(mean=[0, 0, 0], std=[255., 255., 255.], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Expand', + mean=img_norm_cfg['mean'], + to_rgb=img_norm_cfg['to_rgb'], + ratio_range=(1, 2)), + dict( + type='MinIoURandomCrop', + min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9), + min_crop_size=0.3), + dict(type='Resize', img_scale=[(320, 320), (416, 416)], keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(416, 416), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/yolo/yolov3_d53_mstrain-608_273e_coco.py b/configs/mmdet/yolo/yolov3_d53_mstrain-608_273e_coco.py new file mode 100644 index 00000000..43aa2f03 --- /dev/null +++ b/configs/mmdet/yolo/yolov3_d53_mstrain-608_273e_coco.py @@ -0,0 +1,132 @@ +_base_ = '../_base_/default_runtime.py' +# model settings +model = dict( + 
type='YOLOV3', + backbone=dict( + type='Darknet', + depth=53, + out_indices=(3, 4, 5), + init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://darknet53')), + neck=dict( + type='YOLOV3Neck', + num_scales=3, + in_channels=[1024, 512, 256], + out_channels=[512, 256, 128]), + bbox_head=dict( + type='YOLOV3Head', + num_classes=80, + in_channels=[512, 256, 128], + out_channels=[1024, 512, 256], + anchor_generator=dict( + type='YOLOAnchorGenerator', + base_sizes=[[(116, 90), (156, 198), (373, 326)], + [(30, 61), (62, 45), (59, 119)], + [(10, 13), (16, 30), (33, 23)]], + strides=[32, 16, 8]), + bbox_coder=dict(type='YOLOBBoxCoder'), + featmap_strides=[32, 16, 8], + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0, + reduction='sum'), + loss_conf=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0, + reduction='sum'), + loss_xy=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=2.0, + reduction='sum'), + loss_wh=dict(type='MSELoss', loss_weight=2.0, reduction='sum')), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='GridAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0)), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + conf_thr=0.005, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=100)) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict(mean=[0, 0, 0], std=[255., 255., 255.], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Expand', + mean=img_norm_cfg['mean'], + to_rgb=img_norm_cfg['to_rgb'], + ratio_range=(1, 2)), + dict( + type='MinIoURandomCrop', + min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9), + min_crop_size=0.3), + dict(type='Resize', img_scale=[(320, 320), (608, 608)], keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='PhotoMetricDistortion'), + 
dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(608, 608), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0005) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=2000, # same as burn-in in darknet + warmup_ratio=0.1, + step=[218, 246]) +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=273) +evaluation = dict(interval=1, metric=['bbox']) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/mmdet/yolo/yolov3_mobilenetv2_320_300e_coco.py b/configs/mmdet/yolo/yolov3_mobilenetv2_320_300e_coco.py new file mode 100644 index 00000000..477d2530 --- /dev/null +++ b/configs/mmdet/yolo/yolov3_mobilenetv2_320_300e_coco.py @@ -0,0 +1,53 @@ +_base_ = ['./yolov3_mobilenetv2_mstrain-416_300e_coco.py'] + +# yapf:disable +model = dict( + bbox_head=dict( + anchor_generator=dict( + base_sizes=[[(220, 125), (128, 222), (264, 266)], + [(35, 87), (102, 96), (60, 170)], + [(10, 15), (24, 36), (72, 42)]]))) +# yapf:enable + +# dataset settings +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Expand', + mean=img_norm_cfg['mean'], + to_rgb=img_norm_cfg['to_rgb'], + ratio_range=(1, 2)), + dict( + type='MinIoURandomCrop', + min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9), + min_crop_size=0.3), + dict(type='Resize', img_scale=(320, 320), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(320, 320), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + train=dict(dataset=dict(pipeline=train_pipeline)), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/mmdet/yolo/yolov3_mobilenetv2_mstrain-416_300e_coco.py 
b/configs/mmdet/yolo/yolov3_mobilenetv2_mstrain-416_300e_coco.py new file mode 100644 index 00000000..18e0622e --- /dev/null +++ b/configs/mmdet/yolo/yolov3_mobilenetv2_mstrain-416_300e_coco.py @@ -0,0 +1,142 @@ +_base_ = '../_base_/default_runtime.py' +# model settings +model = dict( + type='YOLOV3', + backbone=dict( + type='MobileNetV2', + out_indices=(2, 4, 6), + act_cfg=dict(type='LeakyReLU', negative_slope=0.1), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://mmdet/mobilenet_v2')), + neck=dict( + type='YOLOV3Neck', + num_scales=3, + in_channels=[320, 96, 32], + out_channels=[96, 96, 96]), + bbox_head=dict( + type='YOLOV3Head', + num_classes=80, + in_channels=[96, 96, 96], + out_channels=[96, 96, 96], + anchor_generator=dict( + type='YOLOAnchorGenerator', + base_sizes=[[(116, 90), (156, 198), (373, 326)], + [(30, 61), (62, 45), (59, 119)], + [(10, 13), (16, 30), (33, 23)]], + strides=[32, 16, 8]), + bbox_coder=dict(type='YOLOBBoxCoder'), + featmap_strides=[32, 16, 8], + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0, + reduction='sum'), + loss_conf=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0, + reduction='sum'), + loss_xy=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=2.0, + reduction='sum'), + loss_wh=dict(type='MSELoss', loss_weight=2.0, reduction='sum')), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='GridAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0)), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + conf_thr=0.005, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=100)) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Expand', + 
mean=img_norm_cfg['mean'], + to_rgb=img_norm_cfg['to_rgb'], + ratio_range=(1, 2)), + dict( + type='MinIoURandomCrop', + min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9), + min_crop_size=0.3), + dict( + type='Resize', + img_scale=[(320, 320), (416, 416)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(416, 416), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=24, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', # use RepeatDataset to speed up training + times=10, + dataset=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline)), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.003, momentum=0.9, weight_decay=0.0005) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=4000, + warmup_ratio=0.0001, + step=[24, 28]) +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=30) +evaluation = dict(interval=1, metric=['bbox']) +find_unused_parameters = 
True + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (24 samples per GPU) +auto_scale_lr = dict(base_batch_size=192) diff --git a/configs/mmdet/yolof/README.md b/configs/mmdet/yolof/README.md new file mode 100644 index 00000000..9aa6001d --- /dev/null +++ b/configs/mmdet/yolof/README.md @@ -0,0 +1,35 @@ +# YOLOF + +> [You Only Look One-level Feature](https://arxiv.org/abs/2103.09460) + + + +## Abstract + +This paper revisits feature pyramids networks (FPN) for one-stage detectors and points out that the success of FPN is due to its divide-and-conquer solution to the optimization problem in object detection rather than multi-scale feature fusion. From the perspective of optimization, we introduce an alternative way to address the problem instead of adopting the complex feature pyramids - {\em utilizing only one-level feature for detection}. Based on the simple and efficient solution, we present You Only Look One-level Feature (YOLOF). In our method, two key components, Dilated Encoder and Uniform Matching, are proposed and bring considerable improvements. Extensive experiments on the COCO benchmark prove the effectiveness of the proposed model. Our YOLOF achieves comparable results with its feature pyramids counterpart RetinaNet while being 2.5× faster. Without transformer layers, YOLOF can match the performance of DETR in a single-level feature manner with 7× less training epochs. With an image size of 608×608, YOLOF achieves 44.3 mAP running at 60 fps on 2080Ti, which is 13% faster than YOLOv4. + +
+ +
+ +## Results and Models + +| Backbone | Style | Epoch | Lr schd | Mem (GB) | box AP | Config | Download | +|:---------:|:-------:|:-------:|:-------:|:--------:|:------:|:------:|:--------:| +| R-50-C5 | caffe | Y | 1x | 8.3 | 37.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolof/yolof_r50_c5_8x8_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolof/yolof_r50_c5_8x8_1x_coco/yolof_r50_c5_8x8_1x_coco_20210425_024427-8e864411.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/yolof/yolof_r50_c5_8x8_1x_coco/yolof_r50_c5_8x8_1x_coco_20210425_024427.log.json) | + +**Note**: + +1. We find that the performance is unstable and may fluctuate by about 0.3 mAP. mAP 37.4 ~ 37.7 is acceptable in YOLOF_R_50_C5_1x. Such fluctuation can also be found in the [original implementation](https://github.com/chensnathan/YOLOF). +2. In addition to instability issues, sometimes there are large loss fluctuations and NaN values, so there may still be problems with this project, which will be improved subsequently. 
+ +## Citation + +```latex +@inproceedings{chen2021you, + title={You Only Look One-level Feature}, + author={Chen, Qiang and Wang, Yingming and Yang, Tong and Zhang, Xiangyu and Cheng, Jian and Sun, Jian}, + booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, + year={2021} +} +``` diff --git a/configs/mmdet/yolof/metafile.yml b/configs/mmdet/yolof/metafile.yml new file mode 100644 index 00000000..9436fee2 --- /dev/null +++ b/configs/mmdet/yolof/metafile.yml @@ -0,0 +1,32 @@ +Collections: + - Name: YOLOF + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Dilated Encoder + - ResNet + Paper: + URL: https://arxiv.org/abs/2103.09460 + Title: 'You Only Look One-level Feature' + README: configs/yolof/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.12.0/mmdet/models/detectors/yolof.py#L6 + Version: v2.12.0 + +Models: + - Name: yolof_r50_c5_8x8_1x_coco + In Collection: YOLOF + Config: configs/yolof/yolof_r50_c5_8x8_1x_coco.py + Metadata: + Training Memory (GB): 8.3 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolof/yolof_r50_c5_8x8_1x_coco/yolof_r50_c5_8x8_1x_coco_20210425_024427-8e864411.pth diff --git a/configs/mmdet/yolof/yolof_r50_c5_8x8_1x_coco.py b/configs/mmdet/yolof/yolof_r50_c5_8x8_1x_coco.py new file mode 100644 index 00000000..1b29b7fe --- /dev/null +++ b/configs/mmdet/yolof/yolof_r50_c5_8x8_1x_coco.py @@ -0,0 +1,110 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='YOLOF', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3, ), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + 
checkpoint='open-mmlab://detectron/resnet50_caffe')), + neck=dict( + type='DilatedEncoder', + in_channels=2048, + out_channels=512, + block_mid_channels=128, + num_residual_blocks=4), + bbox_head=dict( + type='YOLOFHead', + num_classes=80, + in_channels=512, + reg_decoded_bbox=True, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[1, 2, 4, 8, 16], + strides=[32]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1., 1., 1., 1.], + add_ctr_clamp=True, + ctr_clamp=32), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=1.0)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='UniformAssigner', pos_ignore_thr=0.15, neg_ignore_thr=0.7), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) +# optimizer +optimizer = dict( + type='SGD', + lr=0.12, + momentum=0.9, + weight_decay=0.0001, + paramwise_cfg=dict( + norm_decay_mult=0., custom_keys={'backbone': dict(lr_mult=1. 
/ 3)})) +lr_config = dict(warmup_iters=1500, warmup_ratio=0.00066667) + +# use caffe img_norm +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='RandomShift', shift_ratio=0.5, max_shift_px=32), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=8, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/mmdet/yolof/yolof_r50_c5_8x8_iter-1x_coco.py b/configs/mmdet/yolof/yolof_r50_c5_8x8_iter-1x_coco.py new file mode 100644 index 00000000..c95c02da --- /dev/null +++ b/configs/mmdet/yolof/yolof_r50_c5_8x8_iter-1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './yolof_r50_c5_8x8_1x_coco.py' + +# We implemented the iter-based config according to the source code. +# COCO dataset has 117266 images after filtering. We use 8 gpu and +# 8 batch size training, so 22500 is equivalent to +# 22500/(117266/(8x8))=12.3 epoch, 15000 is equivalent to 8.2 epoch, +# 20000 is equivalent to 10.9 epoch. 
Due to lr(0.12) is large, +# the iter-based and epoch-based setting have about 0.2 difference on +# the mAP evaluation value. +lr_config = dict(step=[15000, 20000]) +runner = dict(_delete_=True, type='IterBasedRunner', max_iters=22500) +checkpoint_config = dict(interval=2500) +evaluation = dict(interval=4500) +log_config = dict(interval=20) diff --git a/configs/mmdet/yolox/README.md b/configs/mmdet/yolox/README.md new file mode 100644 index 00000000..165045e5 --- /dev/null +++ b/configs/mmdet/yolox/README.md @@ -0,0 +1,39 @@ +# YOLOX + +> [YOLOX: Exceeding YOLO Series in 2021](https://arxiv.org/abs/2107.08430) + + + +## Abstract + +In this report, we present some experienced improvements to YOLO series, forming a new high-performance detector -- YOLOX. We switch the YOLO detector to an anchor-free manner and conduct other advanced detection techniques, i.e., a decoupled head and the leading label assignment strategy SimOTA to achieve state-of-the-art results across a large scale range of models: For YOLO-Nano with only 0.91M parameters and 1.08G FLOPs, we get 25.3% AP on COCO, surpassing NanoDet by 1.8% AP; for YOLOv3, one of the most widely used detectors in industry, we boost it to 47.3% AP on COCO, outperforming the current best practice by 3.0% AP; for YOLOX-L with roughly the same amount of parameters as YOLOv4-CSP, YOLOv5-L, we achieve 50.0% AP on COCO at a speed of 68.9 FPS on Tesla V100, exceeding YOLOv5-L by 1.8% AP. Further, we won the 1st Place on Streaming Perception Challenge (Workshop on Autonomous Driving at CVPR 2021) using a single YOLOX-L model. We hope this report can provide useful experience for developers and researchers in practical scenes, and we also provide deploy versions with ONNX, TensorRT, NCNN, and Openvino supported. + +
+ +
+ +## Results and Models + +| Backbone | size | Mem (GB) | box AP | Config | Download | +|:---------:|:-------:|:-------:|:-------:|:--------:|:------:| +| YOLOX-tiny | 416 | 3.5 | 32.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolox/yolox_tiny_8x8_300e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_20211124_171234-b4047906.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_20211124_171234.log.json) | +| YOLOX-s | 640 | 7.6 | 40.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolox/yolox_s_8x8_300e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_20211121_095711-4592a793.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_20211121_095711.log.json) | +| YOLOX-l | 640 | 19.9 | 49.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolox/yolox_l_8x8_300e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236.log.json) | +| YOLOX-x | 640 | 28.1 | 50.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolox/yolox_x_8x8_300e_coco.py) |[model](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth) &#124; [log](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254.log.json) | + +**Note**: + +1. The test score threshold is 0.001, and the box AP indicates the best AP. +2. Due to the need for pre-training weights, we cannot reproduce the performance of the `yolox-nano` model.
Please refer to https://github.com/Megvii-BaseDetection/YOLOX/issues/674 for more information. +3. We also trained the model by the official release of YOLOX based on [Megvii-BaseDetection/YOLOX#735](https://github.com/Megvii-BaseDetection/YOLOX/issues/735) with commit ID [38c633](https://github.com/Megvii-BaseDetection/YOLOX/tree/38c633bf176462ee42b110c70e4ffe17b5753208). We found that the best AP of `YOLOX-tiny`, `YOLOX-s`, `YOLOX-l`, and `YOLOX-x` is 31.8, 40.3, 49.2, and 50.9, respectively. The performance is consistent with that of our re-implementation (see Table above) but still has a gap (0.3~0.8 AP) in comparison with the reported performance in their [README](https://github.com/Megvii-BaseDetection/YOLOX/blob/38c633bf176462ee42b110c70e4ffe17b5753208/README.md#benchmark). + +## Citation + +```latex +@article{yolox2021, + title={{YOLOX}: Exceeding YOLO Series in 2021}, + author={Ge, Zheng and Liu, Songtao and Wang, Feng and Li, Zeming and Sun, Jian}, + journal={arXiv preprint arXiv:2107.08430}, + year={2021} +} +``` diff --git a/configs/mmdet/yolox/metafile.yml b/configs/mmdet/yolox/metafile.yml new file mode 100644 index 00000000..845cb0a4 --- /dev/null +++ b/configs/mmdet/yolox/metafile.yml @@ -0,0 +1,70 @@ +Collections: + - Name: YOLOX + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Nesterov + - Weight Decay + - Cosine Annealing Lr Updater + Training Resources: 8x TITANXp GPUs + Architecture: + - CSPDarkNet + - PAFPN + Paper: + URL: https://arxiv.org/abs/2107.08430 + Title: 'YOLOX: Exceeding YOLO Series in 2021' + README: configs/yolox/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.15.1/mmdet/models/detectors/yolox.py#L6 + Version: v2.15.1 + + +Models: + - Name: yolox_s_8x8_300e_coco + In Collection: YOLOX + Config: configs/yolox/yolox_s_8x8_300e_coco.py + Metadata: + Training Memory (GB): 7.6 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.5 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_20211121_095711-4592a793.pth + - Name: yolox_l_8x8_300e_coco + In Collection: YOLOX + Config: configs/yolox/yolox_l_8x8_300e_coco.py + Metadata: + Training Memory (GB): 19.9 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 49.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth + - Name: yolox_x_8x8_300e_coco + In Collection: YOLOX + Config: configs/yolox/yolox_x_8x8_300e_coco.py + Metadata: + Training Memory (GB): 28.1 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth + - Name: yolox_tiny_8x8_300e_coco + In Collection: YOLOX + Config: configs/yolox/yolox_tiny_8x8_300e_coco.py + Metadata: + Training Memory (GB): 3.5 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 32.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_20211124_171234-b4047906.pth diff --git a/configs/mmdet/yolox/yolox_l_8x8_300e_coco.py b/configs/mmdet/yolox/yolox_l_8x8_300e_coco.py new file mode 100644 index 00000000..dcbfa183 --- /dev/null +++ b/configs/mmdet/yolox/yolox_l_8x8_300e_coco.py @@ -0,0 +1,8 @@ +_base_ = './yolox_s_8x8_300e_coco.py' + +# model settings +model = dict( + backbone=dict(deepen_factor=1.0, widen_factor=1.0), + neck=dict( + in_channels=[256, 512, 1024], out_channels=256, num_csp_blocks=3), + bbox_head=dict(in_channels=256, feat_channels=256)) diff --git a/configs/mmdet/yolox/yolox_m_8x8_300e_coco.py b/configs/mmdet/yolox/yolox_m_8x8_300e_coco.py new file mode 100644 index 00000000..3048c95c --- /dev/null +++ b/configs/mmdet/yolox/yolox_m_8x8_300e_coco.py @@ -0,0 +1,8 @@ 
+_base_ = './yolox_s_8x8_300e_coco.py' + +# model settings +model = dict( + backbone=dict(deepen_factor=0.67, widen_factor=0.75), + neck=dict(in_channels=[192, 384, 768], out_channels=192, num_csp_blocks=2), + bbox_head=dict(in_channels=192, feat_channels=192), +) diff --git a/configs/mmdet/yolox/yolox_nano_8x8_300e_coco.py b/configs/mmdet/yolox/yolox_nano_8x8_300e_coco.py new file mode 100644 index 00000000..d33ed04b --- /dev/null +++ b/configs/mmdet/yolox/yolox_nano_8x8_300e_coco.py @@ -0,0 +1,11 @@ +_base_ = './yolox_tiny_8x8_300e_coco.py' + +# model settings +model = dict( + backbone=dict(deepen_factor=0.33, widen_factor=0.25, use_depthwise=True), + neck=dict( + in_channels=[64, 128, 256], + out_channels=64, + num_csp_blocks=1, + use_depthwise=True), + bbox_head=dict(in_channels=64, feat_channels=64, use_depthwise=True)) diff --git a/configs/mmdet/yolox/yolox_s_8x8_300e_coco.py b/configs/mmdet/yolox/yolox_s_8x8_300e_coco.py new file mode 100644 index 00000000..97ff23e8 --- /dev/null +++ b/configs/mmdet/yolox/yolox_s_8x8_300e_coco.py @@ -0,0 +1,165 @@ +_base_ = ['../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'] + +img_scale = (640, 640) # height, width + +# model settings +model = dict( + type='YOLOX', + input_size=img_scale, + random_size_range=(15, 25), + random_size_interval=10, + backbone=dict(type='CSPDarknet', deepen_factor=0.33, widen_factor=0.5), + neck=dict( + type='YOLOXPAFPN', + in_channels=[128, 256, 512], + out_channels=128, + num_csp_blocks=1), + bbox_head=dict( + type='YOLOXHead', num_classes=80, in_channels=128, feat_channels=128), + train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)), + # In order to align the source code, the threshold of the val phase is + # 0.01, and the threshold of the test phase is 0.001. 
+ test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.65))) + +# dataset settings +data_root = 'data/coco/' +dataset_type = 'CocoDataset' + +train_pipeline = [ + dict(type='Mosaic', img_scale=img_scale, pad_val=114.0), + dict( + type='RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict( + type='MixUp', + img_scale=img_scale, + ratio_range=(0.8, 1.6), + pad_val=114.0), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', flip_ratio=0.5), + # According to the official implementation, multi-scale + # training is not considered here but in the + # 'mmdet/models/detectors/yolox.py'. + dict(type='Resize', img_scale=img_scale, keep_ratio=True), + dict( + type='Pad', + pad_to_square=True, + # If the image is three-channel, the pad value needs + # to be set separately for each channel. + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) +] + +train_dataset = dict( + type='MultiImageMixDataset', + dataset=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ], + filter_empty_gt=False, + ), + pipeline=train_pipeline) + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=img_scale, + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict( + type='Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + persistent_workers=True, + train=train_dataset, + val=dict( + type=dataset_type, + ann_file=data_root + 
'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) + +# optimizer +# default 8 gpu +optimizer = dict( + type='SGD', + lr=0.01, + momentum=0.9, + weight_decay=5e-4, + nesterov=True, + paramwise_cfg=dict(norm_decay_mult=0., bias_decay_mult=0.)) +optimizer_config = dict(grad_clip=None) + +max_epochs = 300 +num_last_epochs = 15 +resume_from = None +interval = 10 + +# learning policy +lr_config = dict( + _delete_=True, + policy='YOLOX', + warmup='exp', + by_epoch=False, + warmup_by_epoch=True, + warmup_ratio=1, + warmup_iters=5, # 5 epoch + num_last_epochs=num_last_epochs, + min_lr_ratio=0.05) + +runner = dict(type='EpochBasedRunner', max_epochs=max_epochs) + +custom_hooks = [ + dict( + type='YOLOXModeSwitchHook', + num_last_epochs=num_last_epochs, + priority=48), + dict( + type='SyncNormHook', + num_last_epochs=num_last_epochs, + interval=interval, + priority=48), + dict( + type='ExpMomentumEMAHook', + resume_from=resume_from, + momentum=0.0001, + priority=49) +] +checkpoint_config = dict(interval=interval) +evaluation = dict( + save_best='auto', + # The evaluation interval is 'interval' when running epoch is + # less than ‘max_epochs - num_last_epochs’. + # The evaluation interval is 1 when running epoch is greater than + # or equal to ‘max_epochs - num_last_epochs’. + interval=interval, + dynamic_intervals=[(max_epochs - num_last_epochs, 1)], + metric='bbox') +log_config = dict(interval=50) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/mmdet/yolox/yolox_tiny_8x8_300e_coco.py b/configs/mmdet/yolox/yolox_tiny_8x8_300e_coco.py new file mode 100644 index 00000000..75931bad --- /dev/null +++ b/configs/mmdet/yolox/yolox_tiny_8x8_300e_coco.py @@ -0,0 +1,58 @@ +_base_ = './yolox_s_8x8_300e_coco.py' + +# model settings +model = dict( + random_size_range=(10, 20), + backbone=dict(deepen_factor=0.33, widen_factor=0.375), + neck=dict(in_channels=[96, 192, 384], out_channels=96), + bbox_head=dict(in_channels=96, feat_channels=96)) + +img_scale = (640, 640) # height, width + +train_pipeline = [ + dict(type='Mosaic', img_scale=img_scale, pad_val=114.0), + dict( + type='RandomAffine', + scaling_ratio_range=(0.5, 1.5), + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Resize', img_scale=img_scale, keep_ratio=True), + dict( + type='Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(416, 416), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict( + type='Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img']) + ]) +] + +train_dataset = dict(pipeline=train_pipeline) + +data = dict( + train=train_dataset, + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/mmdet/yolox/yolox_x_8x8_300e_coco.py b/configs/mmdet/yolox/yolox_x_8x8_300e_coco.py new file mode 100644 index 00000000..65c0b75c --- /dev/null +++ b/configs/mmdet/yolox/yolox_x_8x8_300e_coco.py @@ -0,0 +1,8 @@ +_base_ = './yolox_s_8x8_300e_coco.py' + +# model settings +model = dict( + backbone=dict(deepen_factor=1.33, widen_factor=1.25), + neck=dict( + in_channels=[320, 640, 1280], out_channels=320, num_csp_blocks=4), + bbox_head=dict(in_channels=320, feat_channels=320)) diff --git a/configs/mmtune/_base_/space/mmdet_model.py b/configs/mmtune/_base_/space/mmdet_model.py new file mode 100644 index 00000000..fc0ee8ac --- /dev/null +++ b/configs/mmtune/_base_/space/mmdet_model.py @@ -0,0 +1,342 @@ +sync_norm_cfg = dict(type='SyncBN', requires_grad=True) +# you need to set mode='dynamic' if you are using pytorch<=1.5.0 +fp16 = dict(loss_scale=dict(init_scale=512)) + +faster_rcnn_x101_64x4d_fpn = dict( + _delete_=True, + type='FasterRCNN', + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + 
type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100) + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05) + )) + +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth' # noqa +faster_rcnn_swin_s_p4_w7_fpn = dict( + _delete_=True, + type='FasterRCNN', + backbone=dict( + _delete_=True, + type='SwinTransformer', + 
embed_dims=96, + depths=[2, 2, 18, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + neck=dict( + type='FPN', + in_channels=[96, 192, 384, 768], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), 
+ min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100) + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05) + )) + +tood_r101_dcnv2 = dict( + _delete_=True, + type='TOOD', + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + bbox_head=dict( + type='TOODHead', + num_dcn=2, + num_classes=80, + in_channels=256, + stacked_convs=6, + feat_channels=256, + anchor_type='anchor_free', + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + initial_loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + activated=True, # use probability instead of logit as input + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + activated=True, # use probability instead of logit as input + 
beta=2.0, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0)), + train_cfg=dict( + initial_epoch=4, + initial_assigner=dict(type='ATSSAssigner', topk=9), + assigner=dict(type='TaskAlignedAssigner', topk=13), + alpha=1, + beta=6, + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +img_scale = (640, 640) # height, width + +# model settings +yolox_x_8x8 = dict( + _delete_=True, + type='YOLOX', + input_size=img_scale, + random_size_range=(15, 25), + random_size_interval=10, + backbone=dict(type='CSPDarknet', deepen_factor=1.33, widen_factor=1.25), + neck=dict( + type='YOLOXPAFPN', + in_channels=[320, 640, 1280], + out_channels=320, + num_csp_blocks=4), + bbox_head=dict( + type='YOLOXHead', num_classes=80, in_channels=320, feat_channels=320), + train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)), + # In order to align the source code, the threshold of the val phase is + # 0.01, and the threshold of the test phase is 0.001. 
+ test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.65))) + +model = dict( + type='Choice', + categories=[ + faster_rcnn_x101_64x4d_fpn, + faster_rcnn_swin_s_p4_w7_fpn, + tood_r101_dcnv2, + yolox_x_8x8, + ], + alias=[ + 'faster_rcnn_x101_64x4d_fpn', + 'faster_rcnn_swin_s_p4_w7_fpn', + 'tood_r101_dcnv2', + 'yolox_x_8x8', + ], +) diff --git a/configs/mmtune/mmdet_asynchb_nevergrad_pso.py b/configs/mmtune/mmdet_asynchb_nevergrad_pso.py new file mode 100644 index 00000000..38c51fc6 --- /dev/null +++ b/configs/mmtune/mmdet_asynchb_nevergrad_pso.py @@ -0,0 +1,16 @@ +_base_ = [ + './_base_/context/train.py', './_base_/searcher/nevergrad_pso.py', + './_base_/scheduler/asynchb.py', './_base_/space/mmdet_model.py', + './_base_/space/optimizer.py', './_base_/space/batch_size.py' +] + +space = { + 'model': {{_base_.model}}, + 'optimizer': {{_base_.optimizer}}, + 'data.samples_per_gpu': {{_base_.batch_size}}, +} + +metric = 'val/AP' +mode = 'max' +raise_on_failed_trial = False +num_samples = 256 diff --git a/mmtune/mm/tasks/__init__.py b/mmtune/mm/tasks/__init__.py index 17ac52d1..8b50fe3b 100644 --- a/mmtune/mm/tasks/__init__.py +++ b/mmtune/mm/tasks/__init__.py @@ -1,11 +1,13 @@ from .base import BaseTask from .blackbox import BloackBoxTask from .builder import TASKS, build_task_processor +from .mmdet import MMDetection from .mmseg import MMSegmentation from .mmtrainbase import MMTrainBasedTask from .sphere import Sphere __all__ = [ 'TASKS', 'build_task_processor', 'BaseTask', 'BloackBoxTask', - 'MMTrainBasedTask', 'MMSegmentation', 'MMSegmentation', 'Sphere' + 'MMTrainBasedTask', 'MMDetection', 'MMSegmentation', 'MMSegmentation', + 'Sphere' ] diff --git a/mmtune/mm/tasks/mmdet.py b/mmtune/mm/tasks/mmdet.py new file mode 100644 index 00000000..f1c9c297 --- /dev/null +++ b/mmtune/mm/tasks/mmdet.py @@ -0,0 +1,207 @@ +import argparse +import copy +import time +from os import path as osp +from typing import Optional + +import mmcv +import torch +import
torch.distributed as dist +from mmcv.runner import get_dist_info +from mmcv.utils import Config, DictAction, get_git_hash + +from .builder import TASKS +from .mmtrainbase import MMTrainBasedTask + + +@TASKS.register_module() +class MMDetection(MMTrainBasedTask): + + def add_arguments( + self, + parser: Optional[argparse.ArgumentParser] = None + ) -> argparse.ArgumentParser: + + if parser is None: + parser = argparse.ArgumentParser(description='Train a detector') + parser.add_argument('--config', help='train config file path') + parser.add_argument( + '--work-dir', help='the dir to save logs and models') + parser.add_argument( + '--resume-from', help='the checkpoint file to resume from') + parser.add_argument( + '--auto-resume', + action='store_true', + help='resume from the latest checkpoint automatically') + parser.add_argument( + '--no-validate', + action='store_true', + help='whether not to evaluate the checkpoint during training') + parser.add_argument( + '--seed', type=int, default=None, help='random seed') + parser.add_argument( + '--diff-seed', + action='store_true', + help='Whether or not set different seeds for different ranks') + parser.add_argument( + '--deterministic', + action='store_true', + help='whether to set deterministic options for CUDNN backend.') + parser.add_argument( + '--options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the ' + 'key-value pair in xxx=yyy format will be merged into config file' + ' (deprecate), change to --cfg-options instead.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the ' + 'key-value pair in xxx=yyy format will be merged into config ' + 'file. If the value to be overwritten is a list, it should be ' + 'like key="[a,b]" or key=a,b It also allows nested list/tuple ' + 'values, e.g. 
key="[(a,b),(c,d)]" Note that the quotation marks ' + 'are necessary and that no white space is allowed.') + parser.add_argument('--local_rank', type=int, default=0) + parser.add_argument( + '--auto-scale-lr', + action='store_true', + help='enable automatically scaling LR.') + + return parser + + def build_model(self, + cfg: Config, + train_cfg: Optional[Config] = None, + test_cfg: Optional[Config] = None) -> torch.nn.Module: + from mmdet.models.builder import build_detector + return build_detector(cfg, train_cfg, test_cfg) + + def build_dataset( + self, + cfg: Config, + default_args: Optional[Config] = None) -> torch.utils.data.Dataset: + from mmdet.datasets.builder import build_dataset + return build_dataset(cfg, default_args) + + def train_model(self, + model: torch.nn.Module, + dataset: torch.utils.data.Dataset, + cfg: Config, + distributed: bool = True, + validate: bool = False, + timestamp: Optional[str] = None, + meta: Optional[dict] = None) -> None: + from mmdet.apis.train import train_detector + train_detector(model, dataset, cfg, distributed, validate, timestamp, + meta) + + def run(self, *args, **kwargs): + from mmdet import __version__ + from mmdet.apis import init_random_seed, set_random_seed + from mmdet.utils import (collect_env, get_root_logger, + setup_multi_processes) + args = kwargs['args'] + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + + # work_dir is determined in this priority: + # CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + if args.resume_from is not None: 
+ cfg.resume_from = args.resume_from + + cfg.auto_resume = args.auto_resume + + # init distributed env first, since logger depends on the dist info. + distributed = True + # gpu_ids is used to calculate iter when resuming checkpoint + _, world_size = get_dist_info() + cfg.gpu_ids = range(world_size) + + # create work_dir + mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) + # dump config + cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) + # init the logger before other steps + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + log_file = osp.join(cfg.work_dir, f'{timestamp}.log') + print(cfg) + logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) + + # set multi-process settings + setup_multi_processes(cfg) + + # init the meta dict to record some important information such as + # environment info and seed, which will be logged + meta = dict() + # log env info + env_info_dict = collect_env() + env_info = '\n'.join([f'{k}: {v}' for k, v in env_info_dict.items()]) + dash_line = '-' * 60 + '\n' + logger.info('Environment info:\n' + dash_line + env_info + # noqa W504 + '\n' + dash_line) + meta['env_info'] = env_info + + # log some basic info + logger.info(f'Distributed training: {distributed}') + logger.info(f'Config:\n{cfg.pretty_text}') + + # set random seeds + seed = init_random_seed(args.seed) + seed = seed + dist.get_rank() if args.diff_seed else seed + logger.info(f'Set random seed to {seed}, ' + f'deterministic: {args.deterministic}') + set_random_seed(seed, deterministic=args.deterministic) + cfg.seed = seed + meta['seed'] = seed + meta['exp_name'] = osp.basename(args.config) + + model = self.build_model( + cfg.model, + train_cfg=cfg.get('train_cfg'), + test_cfg=cfg.get('test_cfg')) + model.init_weights() + + # SyncBN is not supported for DP + logger.info(model) + + datasets = [self.build_dataset(cfg.data.train)] + if len(cfg.workflow) == 2: + val_dataset = copy.deepcopy(cfg.data.val) + val_dataset.pipeline = 
cfg.data.train.pipeline + datasets.append(self.build_dataset(val_dataset)) + if cfg.checkpoint_config is not None: + # save mmseg version, config file content and class names in + # checkpoints as meta data + cfg.checkpoint_config.meta = dict( + mmseg_version=f'{__version__}+{get_git_hash()[:7]}', + config=cfg.pretty_text, + CLASSES=datasets[0].CLASSES, + PALETTE=datasets[0].PALETTE) + # add an attribute for visualization convenience + model.CLASSES = datasets[0].CLASSES + # passing checkpoint meta for saving best checkpoint + meta.update(cfg.checkpoint_config.meta) + self.train_model( + model, + datasets, + cfg, + distributed=True, + validate=(not args.no_validate), + timestamp=timestamp, + meta=meta) diff --git a/mmtune/mm/tasks/mmseg.py b/mmtune/mm/tasks/mmseg.py index ce8c6a97..b795df80 100644 --- a/mmtune/mm/tasks/mmseg.py +++ b/mmtune/mm/tasks/mmseg.py @@ -49,12 +49,12 @@ def add_arguments( '--cfg-options', nargs='+', action=DictAction, - help='override some settings in the used config, the key-value pair ' - 'in xxx=yyy format will be merged into config file. If the value to ' - 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' - 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' - 'Note that the quotation marks are necessary and that no white space ' - 'is allowed.') + help='override some settings in the used config, the key-value ' + 'pair in xxx=yyy format will be merged into config file. If the ' + 'value to be overwritten is a list, it should be like key="[a,b]" ' + 'or key=a,b It also allows nested list/tuple values, e.g. 
' + 'key="[(a,b),(c,d)]" Note that the quotation marks are necessary ' + 'and that no white space is allowed.') parser.add_argument( '--auto-resume', action='store_true', @@ -91,7 +91,8 @@ def train_model(self, def run(self, *args, **kwargs): from mmseg import __version__ from mmseg.apis import init_random_seed, set_random_seed - from mmseg.utils import collect_env, get_root_logger, setup_multi_processes + from mmseg.utils import (collect_env, get_root_logger, + setup_multi_processes) args = kwargs['args'] cfg = Config.fromfile(args.config) @@ -102,7 +103,8 @@ def run(self, *args, **kwargs): if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True - # work_dir is determined in this priority: CLI > segment in file > filename + # work_dir is determined in this priority: CLI > + # segment in file > filename if args.work_dir is not None: # update configs according to CLI args if args.work_dir is not None cfg.work_dir = args.work_dir diff --git a/mmtune/ray/searchers/__init__.py b/mmtune/ray/searchers/__init__.py index 61d728dd..b0ba1bd9 100644 --- a/mmtune/ray/searchers/__init__.py +++ b/mmtune/ray/searchers/__init__.py @@ -1,4 +1,4 @@ from .builder import SEARCHERS, build_searcher from .nevergrad import NevergradSearch -__all__ = ['SEARCHERS', 'build_searcher'] +__all__ = ['SEARCHERS', 'build_searcher', 'NevergradSearch'] diff --git a/mmtune/ray/searchers/nevergrad.py b/mmtune/ray/searchers/nevergrad.py index aa76641d..ed455610 100644 --- a/mmtune/ray/searchers/nevergrad.py +++ b/mmtune/ray/searchers/nevergrad.py @@ -11,7 +11,8 @@ from nevergrad.optimization import Optimizer from nevergrad.optimization.base import ConfiguredOptimizer Parameter = ng.p.Parameter - from nevergrad.optimization.optimizerlib import registry as optimizer_registry + from nevergrad.optimization.optimizerlib import (registry as + optimizer_registry) except ImportError: ng = None Optimizer = None @@ -32,7 +33,8 @@ def __init__(self, num_workers: int = 1, budget: Optional[int] 
= None, **kwargs): - assert optimizer in optimizer_registry, f'{optimizer} is not registered' + assert optimizer in optimizer_registry, ( + f'{optimizer} is not registered') optimizer = optimizer_registry[optimizer] self._budget = budget self._num_workers = num_workers diff --git a/setup.cfg b/setup.cfg index 1c55edfb..79159e19 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,19 +1,7 @@ -[flake8] -exclude = submodules -ignore = - E203, - E722, - W503 -per_file_ignores = - __init__.py: F401 -max_line_length = 120 -select = B, C, E, F, W, T4, B9 -max_complexity = 18 - [codespell] skip = *.ipynb quiet-level = 3 -ignore-words-list = DOTA,dota,formating,datas +ignore-words-list = DOTA,dota,formating,datas,tood,TOOD,winn,gool,wan,Winn [yapf] BASED_ON_STYLE = pep8 diff --git a/tools/tune.py b/tools/tune.py index 5b0e1fef..df97c0bd 100644 --- a/tools/tune.py +++ b/tools/tune.py @@ -69,7 +69,8 @@ def main(): file_name = osp.splitext(osp.basename(args.tune_config))[0] """ work_dir is determined in this priority: - CLI > segment in tune cfg file > segment in task cfg file > tune cfg filename + CLI > segment in tune cfg file > segment in task cfg file + > tune cfg filename """ args.work_dir = getattr(args, 'work_dir', '') or getattr( tune_config, 'work_dir', '') or getattr(