From 46d3dae60c5018c8940102f8ddfbe76661f250ce Mon Sep 17 00:00:00 2001 From: WYJSJTU <60090077+WYJSJTU@users.noreply.github.com> Date: Sat, 30 Apr 2022 00:12:35 +0800 Subject: [PATCH] [Feature] Add PARE (#161) * Training and testing code for PARE: Part Attention Regressor for 3D Human Body Estimation [ICCV 2021]. * Achieving 49.35mm PA-MPJPE, 81.79 MPJPE on 3DPW, compared to the original implementation with 50.9mm PA-MPJPE, 82 MPJPE. * Provided with detailed pre-train and training config. --- configs/pare/README.md | 90 +++ configs/pare/hrnet_w32_conv_pare_coco.py | 207 ++++++ .../pare/hrnet_w32_conv_pare_coco_cache.py | 210 ++++++ configs/pare/hrnet_w32_conv_pare_mix.py | 231 ++++++ configs/pare/hrnet_w32_conv_pare_mix_cache.py | 244 +++++++ .../pare/hrnet_w32_conv_pare_mix_no_mosh.py | 231 ++++++ configs/pare/metafile.yml | 32 + docs/additional_licenses.md | 112 +++ docs/model_zoo.md | 4 + docs/preprocess_dataset.md | 47 +- mmhuman3d/apis/train.py | 2 - .../renderer/torch3d_renderer/meshes.py | 2 +- .../torch3d_renderer/render_runner.py | 8 +- .../visualization/visualize_keypoints2d.py | 61 +- .../core/visualization/visualize_smpl.py | 99 +-- mmhuman3d/data/datasets/__init__.py | 19 +- .../data/datasets/human_image_dataset.py | 4 + mmhuman3d/data/datasets/pipelines/__init__.py | 31 +- .../synthetic_occlusion_augmentation.py | 137 ++++ .../models/architectures/mesh_estimator.py | 172 ++++- mmhuman3d/models/backbones/__init__.py | 3 +- mmhuman3d/models/backbones/hrnet.py | 665 ++++++++++++++++++ mmhuman3d/models/backbones/resnet.py | 19 + mmhuman3d/models/heads/__init__.py | 3 +- mmhuman3d/models/heads/pare_head.py | 615 ++++++++++++++++ mmhuman3d/models/losses/__init__.py | 7 +- mmhuman3d/models/losses/cross_entropy_loss.py | 250 +++++++ mmhuman3d/utils/geometry.py | 5 +- tests/test_datasets/test_pipelines.py | 16 +- .../test_architectures/test_mesh_estimator.py | 63 ++ .../test_models/test_backbones/test_hrnet.py | 173 +++++ .../test_models/test_heads/test_pare_head.py | 73 ++ tests/test_models/test_losses/test_loss.py | 44 +- 33 files changed, 3754 insertions(+), 125 deletions(-) create mode 100644 configs/pare/README.md create mode 100644 configs/pare/hrnet_w32_conv_pare_coco.py create mode 100644 configs/pare/hrnet_w32_conv_pare_coco_cache.py create mode 100644 configs/pare/hrnet_w32_conv_pare_mix.py create mode 100644 configs/pare/hrnet_w32_conv_pare_mix_cache.py create mode 100644 configs/pare/hrnet_w32_conv_pare_mix_no_mosh.py create mode 100644 configs/pare/metafile.yml create mode 100644 mmhuman3d/data/datasets/pipelines/synthetic_occlusion_augmentation.py create mode 100644 mmhuman3d/models/backbones/hrnet.py create mode 100644 mmhuman3d/models/heads/pare_head.py create mode 100644 mmhuman3d/models/losses/cross_entropy_loss.py create mode 100644 tests/test_models/test_backbones/test_hrnet.py create mode 100644 tests/test_models/test_heads/test_pare_head.py diff --git a/configs/pare/README.md b/configs/pare/README.md new file mode 100644 index 00000000..7ad649f4 --- /dev/null +++ b/configs/pare/README.md @@ -0,0 +1,90 @@ +# PARE + +## Introduction + +We provide the config files for PARE: [Part Attention Regressor for 3D Human Body Estimation](https://arxiv.org/abs/2104.08527). + +```BibTeX +@inproceedings{Kocabas_PARE_2021, + title = {{PARE}: Part Attention Regressor for {3D} Human Body Estimation}, + author = {Kocabas, Muhammed and Huang, Chun-Hao P. and Hilliges, Otmar and Black, Michael J.}, + booktitle = {Proc. 
International Conference on Computer Vision (ICCV)},
+  pages = {11127--11137},
+  month = oct,
+  year = {2021},
+  doi = {},
+  month_numeric = {10}
+}
+```
+
+## Notes
+
+- [SMPL](https://smpl.is.tue.mpg.de/) v1.0 is used in our experiments.
+- [J_regressor_extra.npy](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/J_regressor_extra.npy?versionId=CAEQHhiBgIDD6c3V6xciIGIwZDEzYWI5NTBlOTRkODU4OTE1M2Y4YTI0NTVlZGM1)
+- [J_regressor_h36m.npy](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/J_regressor_h36m.npy?versionId=CAEQHhiBgIDE6c3V6xciIDdjYzE3MzQ4MmU4MzQyNmRiZDA5YTg2YTI5YWFkNjRi)
+- [smpl_mean_params.npz](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/smpl_mean_params.npz?versionId=CAEQHhiBgICN6M3V6xciIDU1MzUzNjZjZGNiOTQ3OWJiZTJmNThiZmY4NmMxMTM4)
+- Pascal occluders for pretraining:
+  - [pascal_occluders.npy](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/pare/pascal_occluders.npy?versionId=CAEQOhiBgMCH2fqigxgiIDY0YzRiNThkMjU1MzRjZTliMTBhZmFmYWY0MTViMTIx)
+
+The pretrained model (hrnet_w32_conv_pare_coco.pth) can be downloaded from [here](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/pare/hrnet_w32_conv_pare_coco.pth?versionId=CAEQOhiBgMCxmv_RgxgiIDkxNWJhOWMxNDEyMzQ1OGQ4YTQ3NjgwNjA0MWUzNDE5); update the pretrained-model path in the config accordingly.
+Alternatively, you can run the COCO pretraining yourself with [hrnet_w32_conv_pare_coco.py](hrnet_w32_conv_pare_coco.py). For that stage, download the HRNet pretrained weights from [here](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/pare/hrnet_pretrain.pth?versionId=CAEQOhiBgMC26fSigxgiIGViMTFiZmJkZDljMDRhMWY4Mjc5Y2UzNzBmYzU1MGVk).
+
+Download the above resources and arrange them in the following file structure:
+
+```text
+mmhuman3d
+├── mmhuman3d
+├── docs
+├── tests
+├── tools
+├── configs
+└── data
+    ├── gmm_08.pkl
+    ├── body_models
+    │   ├── J_regressor_extra.npy
+    │   ├── J_regressor_h36m.npy
+    │   ├── smpl_mean_params.npz
+    │   └── smpl
+    │       ├── SMPL_FEMALE.pkl
+    │       ├── SMPL_MALE.pkl
+    │       └── SMPL_NEUTRAL.pkl
+    ├── pretrained_models
+    │   ├── hrnet_pretrain.pth
+    │   └── hrnet_w32_conv_pare_coco.pth
+    ├── preprocessed_datasets
+    │   ├── h36m_mosh_train.npz
+    │   ├── h36m_train.npz
+    │   ├── mpi_inf_3dhp_train.npz
+    │   ├── eft_mpii.npz
+    │   ├── eft_lspet.npz
+    │   ├── eft_coco_all.npz
+    │   └── pw3d_test.npz
+    ├── occluders
+    │   └── pascal_occluders.npy
+    └── datasets
+        ├── coco
+        ├── h36m
+        ├── lspet
+        ├── mpi_inf_3dhp
+        ├── mpii
+        └── pw3d
+```
+
+## Results and Models
+
+We evaluate PARE on 3DPW. Values are MPJPE/PA-MPJPE.
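+
+The tables below list the released checkpoints. As a quick sanity check, the architecture described by a config can also be built directly in Python. The snippet below is only a minimal sketch and not part of the released tooling: it assumes `mmcv.Config` and that a `build_architecture` helper is exposed from `mmhuman3d.models` (adjust the import if the builder lives elsewhere in your version), and it requires the data files listed above to be in place.
+
+```python
+import torch
+from mmcv import Config
+from mmhuman3d.models import build_architecture  # assumed location of the builder
+
+# Parse the PARE config and build the ImageBodyModelEstimator it describes.
+cfg = Config.fromfile('configs/pare/hrnet_w32_conv_pare_coco.py')
+model = build_architecture(cfg.model)
+model.eval()
+
+# One dummy 224x224 crop, shaped (N, 3, H, W) like the output of the train/test pipelines.
+images = torch.randn(1, 3, 224, 224)
+with torch.no_grad():
+    features = model.backbone(images)  # HRNet-W32 feature maps consumed by PareHead
+```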
+ +Trained with MoShed Human3.6M Datasets and Cache: + +| Config | 3DPW | Download | +|:------:|:-------:|:------:| +| [hrnet_w32_conv_pare_mix_cache.py](hrnet_w32_conv_pare_mix_cache.py) | 81.79 / 49.35 | [model](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/pare/with_mosh/hrnet_w32_conv_pare_mosh.pth?versionId=CAEQOhiBgIDooeHSgxgiIDkwYzViMTUyNjM1MjQ3ZDNiNzNjMjJlOGFlNjgxYjlh) | [log](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/pare/with_mosh/20220427_113717.log?versionId=CAEQOhiBgMClqr3PgxgiIGRjZWU0NzFhMmVkMDQzN2I5ZmY5Y2MxMzJiZDM3MGQ0) | + + +Trained without MoShed Human3.6M Datasets: +| Config | 3DPW | Download | +|:------:|:-------:|:------:| +| [hrnet_w32_conv_pare_mix_no_mosh.py](hrnet_w32_conv_pare_mix_no_mosh.py) | 81.81 / 50.78 | [model](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/pare/without_mosh/hrnet_w32_conv_pare.pth?versionId=CAEQOhiBgMCi4YbVgxgiIDgzYzFhMWNlNDE2NTQwN2ZiOTQ1ZGJmYTM4OTNmYWY5) | [log](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/pare/without_mosh/20220427_113844.log?versionId=CAEQOhiBgMCHwcTPgxgiIGI0NjI0M2JiM2ViMzRhMTFiMWQxZDJmMGI5MmQwMjgw) | diff --git a/configs/pare/hrnet_w32_conv_pare_coco.py b/configs/pare/hrnet_w32_conv_pare_coco.py new file mode 100644 index 00000000..6b69d829 --- /dev/null +++ b/configs/pare/hrnet_w32_conv_pare_coco.py @@ -0,0 +1,207 @@ +use_adversarial_train = True + +# evaluate +evaluation = dict(interval=10, metric=['pa-mpjpe', 'mpjpe']) +# optimizer + +optimizer = dict( + backbone=dict(type='Adam', lr=2.0e-4), + head=dict(type='Adam', lr=2.0e-4), +) +optimizer_config = dict(grad_clip=None) + +lr_config = dict(policy='Fixed', by_epoch=False) +runner = dict(type='EpochBasedRunner', max_epochs=200) + +log_config = dict( + interval=50, hooks=[ + dict(type='TextLoggerHook'), + ]) + +_base_ = ['../_base_/default_runtime.py'] +checkpoint_config = dict(interval=10) +width = 32 +downsample = False +use_conv = True +hrnet_extra = dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(width, width * 2)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(width, width * 2, width * 4)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(width, width * 2, width * 4, width * 8)), + downsample=downsample, + use_conv=use_conv, + pretrained_layers=[ + 'conv1', + 'bn1', + 'conv2', + 'bn2', + 'layer1', + 'transition1', + 'stage2', + 'transition2', + 'stage3', + 'transition3', + 'stage4', + ], + final_conv_kernel=1, + return_list=False, +) + +find_unused_parameters = True + +model = dict( + type='ImageBodyModelEstimator', + backbone=dict( + type='PoseHighResolutionNet', + extra=hrnet_extra, + num_joints=24, + init_cfg=dict( + type='Pretrained', + checkpoint='data/pretrained_models/hrnet_pretrain.pth')), + head=dict( + type='PareHead', + num_joints=24, + num_input_features=480, + smpl_mean_params='data/body_models/smpl_mean_params.npz', + num_deconv_layers=2, + num_deconv_filters=[128] * + 2, # num_deconv_filters = [num_deconv_filters] * num_deconv_layers + num_deconv_kernels=[4] * + 2, # num_deconv_kernels = [num_deconv_kernels] * num_deconv_layers + use_heatmaps='part_segm', + use_keypoint_attention=True, + backbone='hrnet_w32-conv', + ), + body_model_train=dict( + 
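+        # Note (descriptive comment only): this training-time body model
+        # outputs keypoints in the 'smpl_54' convention and converts them to
+        # the 'smpl_49' convention used for 2D/3D keypoint supervision, while
+        # body_model_test below regresses H36M joints via J_regressor_h36m.npy
+        # for MPJPE/PA-MPJPE evaluation on the 3DPW test set.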
type='SMPL', + keypoint_src='smpl_54', + keypoint_dst='smpl_49', + model_path='data/body_models/smpl', + keypoint_approximate=True, + extra_joints_regressor='data/body_models/J_regressor_extra.npy'), + body_model_test=dict( + type='SMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + convention='smpl_49', + loss_keypoints3d=dict(type='MSELoss', loss_weight=300), + loss_keypoints2d=dict(type='MSELoss', loss_weight=300), + loss_smpl_pose=dict(type='MSELoss', loss_weight=60), + loss_smpl_betas=dict(type='MSELoss', loss_weight=60 * 0.001), + loss_segm_mask=dict(type='CrossEntropyLoss', loss_weight=60), + loss_camera=dict(type='CameraPriorLoss', loss_weight=1), +) + +# dataset settings +dataset_type = 'HumanImageDataset' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data_keys = [ + 'has_smpl', 'has_keypoints3d', 'has_keypoints2d', 'smpl_body_pose', + 'smpl_global_orient', 'smpl_betas', 'smpl_transl', 'keypoints2d', + 'keypoints3d', 'sample_idx' +] +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='RandomChannelNoise', noise_factor=0.4), + dict( + type='SyntheticOcclusion', + occluders_file='data/occluders/pascal_occluders.npy'), + dict(type='RandomHorizontalFlip', flip_prob=0.5, convention='smpl_49'), + dict(type='GetRandomScaleRotation', rot_factor=30, scale_factor=0.25), + dict(type='MeshAffine', img_res=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='GetRandomScaleRotation', rot_factor=0, scale_factor=0), + dict(type='MeshAffine', img_res=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] + +inference_pipeline = [ + dict(type='MeshAffine', img_res=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict( + type='Collect', + keys=['img', 'sample_idx'], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] + +data = dict( + samples_per_gpu=64, + workers_per_gpu=0, + train=dict( + type='MixedDataset', + configs=[ + dict( + type=dataset_type, + dataset_name='coco', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_49', + ann_file='eft_coco_all.npz'), + ], + partition=[1.0], + ), + test=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), + val=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), +) diff --git a/configs/pare/hrnet_w32_conv_pare_coco_cache.py b/configs/pare/hrnet_w32_conv_pare_coco_cache.py new file mode 100644 index 00000000..5aa582ee --- /dev/null +++ 
b/configs/pare/hrnet_w32_conv_pare_coco_cache.py @@ -0,0 +1,210 @@ +use_adversarial_train = True + +# evaluate +evaluation = dict(interval=10, metric=['pa-mpjpe', 'mpjpe']) +# optimizer + +optimizer = dict( + backbone=dict(type='Adam', lr=2.0e-4), + head=dict(type='Adam', lr=2.0e-4), +) +optimizer_config = dict(grad_clip=None) + +lr_config = dict(policy='Fixed', by_epoch=False) +runner = dict(type='EpochBasedRunner', max_epochs=200) + +log_config = dict( + interval=50, hooks=[ + dict(type='TextLoggerHook'), + ]) + +_base_ = ['../_base_/default_runtime.py'] +checkpoint_config = dict(interval=10) +width = 32 +downsample = False +use_conv = True +hrnet_extra = dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(width, width * 2)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(width, width * 2, width * 4)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(width, width * 2, width * 4, width * 8)), + downsample=downsample, + use_conv=use_conv, + pretrained_layers=[ + 'conv1', + 'bn1', + 'conv2', + 'bn2', + 'layer1', + 'transition1', + 'stage2', + 'transition2', + 'stage3', + 'transition3', + 'stage4', + ], + final_conv_kernel=1, + return_list=False, +) + +find_unused_parameters = True + +model = dict( + type='ImageBodyModelEstimator', + backbone=dict( + type='PoseHighResolutionNet', + extra=hrnet_extra, + num_joints=24, + init_cfg=dict( + type='Pretrained', + checkpoint='data/pretrained_models/hrnet_pretrain.pth')), + head=dict( + type='PareHead', + num_joints=24, + num_input_features=480, + smpl_mean_params='data/body_models/smpl_mean_params.npz', + num_deconv_layers=2, + num_deconv_filters=[128] * + 2, # num_deconv_filters = [num_deconv_filters] * num_deconv_layers + num_deconv_kernels=[4] * + 2, # num_deconv_kernels = [num_deconv_kernels] * num_deconv_layers + use_heatmaps='part_segm', + use_keypoint_attention=True, + backbone='hrnet_w32-conv', + ), + body_model_train=dict( + type='SMPL', + keypoint_src='smpl_54', + keypoint_dst='smpl_49', + model_path='data/body_models/smpl', + keypoint_approximate=True, + extra_joints_regressor='data/body_models/J_regressor_extra.npy'), + body_model_test=dict( + type='SMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + convention='smpl_49', + loss_keypoints3d=dict(type='MSELoss', loss_weight=300), + loss_keypoints2d=dict(type='MSELoss', loss_weight=300), + loss_smpl_pose=dict(type='MSELoss', loss_weight=60), + loss_smpl_betas=dict(type='MSELoss', loss_weight=60 * 0.001), + loss_segm_mask=dict(type='CrossEntropyLoss', loss_weight=60), + loss_camera=dict(type='CameraPriorLoss', loss_weight=1), +) + +# dataset settings +dataset_type = 'HumanImageDataset' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data_keys = [ + 'has_smpl', 'has_keypoints3d', 'has_keypoints2d', 'smpl_body_pose', + 'smpl_global_orient', 'smpl_betas', 'smpl_transl', 'keypoints2d', + 'keypoints3d', 'sample_idx' +] +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='RandomChannelNoise', noise_factor=0.4), + dict( + type='SyntheticOcclusion', + occluders_file='data/occluders/pascal_occluders.npy'), + dict(type='RandomHorizontalFlip', 
flip_prob=0.5, convention='smpl_49'), + dict(type='GetRandomScaleRotation', rot_factor=30, scale_factor=0.25), + dict(type='MeshAffine', img_res=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='GetRandomScaleRotation', rot_factor=0, scale_factor=0), + dict(type='MeshAffine', img_res=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] + +inference_pipeline = [ + dict(type='MeshAffine', img_res=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict( + type='Collect', + keys=['img', 'sample_idx'], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] + +cache_files = {'coco': 'data/cache/coco_2014_train_smpl_49.npz'} + +data = dict( + samples_per_gpu=64, + workers_per_gpu=0, + train=dict( + type='MixedDataset', + configs=[ + dict( + type=dataset_type, + dataset_name='coco', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_49', + cache_data_path=cache_files['coco'], + ann_file='eft_coco_all.npz'), + ], + partition=[1.0], + ), + test=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), + val=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), +) diff --git a/configs/pare/hrnet_w32_conv_pare_mix.py b/configs/pare/hrnet_w32_conv_pare_mix.py new file mode 100644 index 00000000..7e4e7c47 --- /dev/null +++ b/configs/pare/hrnet_w32_conv_pare_mix.py @@ -0,0 +1,231 @@ +use_adversarial_train = True + +evaluation = dict(interval=6, metric=['pa-mpjpe', 'mpjpe']) +optimizer = dict( + backbone=dict(type='Adam', lr=5.0e-5), + head=dict(type='Adam', lr=5.0e-5), +) +optimizer_config = dict(grad_clip=None) + +lr_config = dict(policy='Fixed', by_epoch=False) +runner = dict(type='EpochBasedRunner', max_epochs=50) + +log_config = dict( + interval=50, hooks=[ + dict(type='TextLoggerHook'), + ]) + +checkpoint_config = dict(interval=6) + +_base_ = ['../_base_/default_runtime.py'] + +width = 32 +downsample = False +use_conv = True +hrnet_extra = dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(width, width * 2)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(width, width * 2, width * 4)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(width, width * 2, width * 4, width * 8)), + downsample=downsample, + use_conv=use_conv, + pretrained_layers=[ + 'conv1', + 'bn1', + 'conv2', + 'bn2', + 
'layer1', + 'transition1', + 'stage2', + 'transition2', + 'stage3', + 'transition3', + 'stage4', + ], + final_conv_kernel=1, + return_list=False, +) + +find_unused_parameters = True + +model = dict( + type='ImageBodyModelEstimator', + backbone=dict( + type='PoseHighResolutionNet', + extra=hrnet_extra, + num_joints=24, + ), + head=dict( + type='PareHead', + num_joints=24, + num_input_features=480, + smpl_mean_params='data/body_models/smpl_mean_params.npz', + num_deconv_layers=2, + num_deconv_filters=[128] * + 2, # num_deconv_filters = [num_deconv_filters] * num_deconv_layers + num_deconv_kernels=[4] * + 2, # num_deconv_kernels = [num_deconv_kernels] * num_deconv_layers + use_heatmaps='part_segm', + use_keypoint_attention=True, + backbone='hrnet_w32-conv', + ), + body_model_train=dict( + type='SMPL', + keypoint_src='smpl_54', + keypoint_dst='smpl_24', + model_path='data/body_models/smpl', + keypoint_approximate=True, + extra_joints_regressor='data/body_models/J_regressor_extra.npy'), + body_model_test=dict( + type='SMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + convention='smpl_24', + loss_keypoints3d=dict(type='MSELoss', loss_weight=300), + loss_keypoints2d=dict(type='MSELoss', loss_weight=150), + loss_smpl_pose=dict(type='MSELoss', loss_weight=60), + loss_smpl_betas=dict(type='MSELoss', loss_weight=60 * 0.001), + loss_camera=dict(type='CameraPriorLoss', loss_weight=1), + init_cfg=dict( + type='Pretrained', + checkpoint=('data/pretrained_models/hrnet_w32_conv_pare_coco.pth')), +) + +# dataset settings +dataset_type = 'HumanImageDataset' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data_keys = [ + 'has_smpl', 'has_keypoints3d', 'has_keypoints2d', 'smpl_body_pose', + 'smpl_global_orient', 'smpl_betas', 'smpl_transl', 'keypoints2d', + 'keypoints3d', 'sample_idx' +] +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='RandomChannelNoise', noise_factor=0.4), + dict(type='RandomHorizontalFlip', flip_prob=0.5, convention='smpl_24'), + dict(type='GetRandomScaleRotation', rot_factor=30, scale_factor=0.25), + dict(type='MeshAffine', img_res=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='GetRandomScaleRotation', rot_factor=0, scale_factor=0), + dict(type='MeshAffine', img_res=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] + +inference_pipeline = [ + dict(type='MeshAffine', img_res=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict( + type='Collect', + keys=['img', 'sample_idx'], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] + +data = dict( + samples_per_gpu=32, + workers_per_gpu=0, + train=dict( + type='MixedDataset', + configs=[ + dict( + type=dataset_type, + dataset_name='h36m', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_24', + ann_file='h36m_mosh_train.npz'), + dict( + type=dataset_type, + dataset_name='coco', + data_prefix='data', + pipeline=train_pipeline, + 
convention='smpl_24', + ann_file='eft_coco_all.npz'), + dict( + type=dataset_type, + dataset_name='lspet', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_24', + ann_file='eft_lspet.npz'), + dict( + type=dataset_type, + dataset_name='mpii', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_24', + ann_file='eft_mpii.npz'), + dict( + type=dataset_type, + dataset_name='mpi-inf-3dhp', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_24', + ann_file='mpi_inf_3dhp_train_mmhuman3d.npz'), + ], + partition=[0.5, 0.233, 0.046, 0.021, 0.2], + ), + test=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), + val=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), +) diff --git a/configs/pare/hrnet_w32_conv_pare_mix_cache.py b/configs/pare/hrnet_w32_conv_pare_mix_cache.py new file mode 100644 index 00000000..ecfc367d --- /dev/null +++ b/configs/pare/hrnet_w32_conv_pare_mix_cache.py @@ -0,0 +1,244 @@ +use_adversarial_train = True + +evaluation = dict(interval=6, metric=['pa-mpjpe', 'mpjpe']) +optimizer = dict( + backbone=dict(type='Adam', lr=5.0e-5), + head=dict(type='Adam', lr=5.0e-5), +) +optimizer_config = dict(grad_clip=None) + +lr_config = dict(policy='Fixed', by_epoch=False) +runner = dict(type='EpochBasedRunner', max_epochs=50) + +log_config = dict( + interval=50, hooks=[ + dict(type='TextLoggerHook'), + ]) + +checkpoint_config = dict(interval=6) + +_base_ = ['../_base_/default_runtime.py'] + +width = 32 +downsample = False +use_conv = True +hrnet_extra = dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(width, width * 2)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(width, width * 2, width * 4)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(width, width * 2, width * 4, width * 8)), + downsample=downsample, + use_conv=use_conv, + pretrained_layers=[ + 'conv1', + 'bn1', + 'conv2', + 'bn2', + 'layer1', + 'transition1', + 'stage2', + 'transition2', + 'stage3', + 'transition3', + 'stage4', + ], + final_conv_kernel=1, + return_list=False, +) + +find_unused_parameters = True + +model = dict( + type='ImageBodyModelEstimator', + backbone=dict( + type='PoseHighResolutionNet', + extra=hrnet_extra, + num_joints=24, + ), + head=dict( + type='PareHead', + num_joints=24, + num_input_features=480, + smpl_mean_params='data/body_models/smpl_mean_params.npz', + num_deconv_layers=2, + num_deconv_filters=[128] * + 2, # num_deconv_filters = [num_deconv_filters] * num_deconv_layers + num_deconv_kernels=[4] * + 2, # num_deconv_kernels = [num_deconv_kernels] * num_deconv_layers + use_heatmaps='part_segm', + use_keypoint_attention=True, + backbone='hrnet_w32-conv', + ), + body_model_train=dict( + type='SMPL', + keypoint_src='smpl_54', + 
keypoint_dst='smpl_24', + model_path='data/body_models/smpl', + keypoint_approximate=True, + extra_joints_regressor='data/body_models/J_regressor_extra.npy'), + body_model_test=dict( + type='SMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + convention='smpl_24', + loss_keypoints3d=dict(type='MSELoss', loss_weight=300), + loss_keypoints2d=dict(type='MSELoss', loss_weight=150), + loss_smpl_pose=dict(type='MSELoss', loss_weight=60), + loss_smpl_betas=dict(type='MSELoss', loss_weight=60 * 0.001), + loss_camera=dict(type='CameraPriorLoss', loss_weight=1), + init_cfg=dict( + type='Pretrained', + checkpoint=('data/pretrained_models/hrnet_w32_conv_pare_coco.pth')), +) + +# dataset settings +dataset_type = 'HumanImageDataset' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data_keys = [ + 'has_smpl', 'has_keypoints3d', 'has_keypoints2d', 'smpl_body_pose', + 'smpl_global_orient', 'smpl_betas', 'smpl_transl', 'keypoints2d', + 'keypoints3d', 'sample_idx' +] +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='RandomChannelNoise', noise_factor=0.4), + dict(type='RandomHorizontalFlip', flip_prob=0.5, convention='smpl_24'), + dict(type='GetRandomScaleRotation', rot_factor=30, scale_factor=0.25), + dict(type='MeshAffine', img_res=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='GetRandomScaleRotation', rot_factor=0, scale_factor=0), + dict(type='MeshAffine', img_res=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] + +inference_pipeline = [ + dict(type='MeshAffine', img_res=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict( + type='Collect', + keys=['img', 'sample_idx'], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] + +cache_files = { + 'h36m': 'data/cache/h36m_mosh_train_smpl_24.npz', + 'mpi-inf-3dhp': 'data/cache/mpi_inf_3dhp_train_smpl_24.npz', + 'lsp': 'data/cache/lsp_train_smpl_24.npz', + 'lspet': 'data/cache/lspet_train_smpl_24.npz', + 'mpii': 'data/cache/mpii_train_smpl_24.npz', + 'coco': 'data/cache/coco_2014_train_smpl_24.npz' +} +data = dict( + samples_per_gpu=32, + workers_per_gpu=0, + train=dict( + type='MixedDataset', + configs=[ + dict( + type=dataset_type, + dataset_name='h36m', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_24', + cache_data_path=cache_files['h36m'], + ann_file='h36m_mosh_train.npz'), + dict( + type=dataset_type, + dataset_name='coco', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_24', + cache_data_path=cache_files['coco'], + ann_file='eft_coco_all.npz'), + dict( + type=dataset_type, + dataset_name='lspet', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_24', + cache_data_path=cache_files['lspet'], + ann_file='eft_lspet.npz'), + dict( + type=dataset_type, + dataset_name='mpii', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_24', + cache_data_path=cache_files['mpii'], + ann_file='eft_mpii.npz'), + dict( 
+ type=dataset_type, + dataset_name='mpi-inf-3dhp', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_24', + cache_data_path=cache_files['mpi-inf-3dhp'], + ann_file='mpi_inf_3dhp_train_mmhuman3d.npz'), + ], + partition=[0.5, 0.233, 0.046, 0.021, 0.2], + ), + test=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), + val=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), +) diff --git a/configs/pare/hrnet_w32_conv_pare_mix_no_mosh.py b/configs/pare/hrnet_w32_conv_pare_mix_no_mosh.py new file mode 100644 index 00000000..c7b91719 --- /dev/null +++ b/configs/pare/hrnet_w32_conv_pare_mix_no_mosh.py @@ -0,0 +1,231 @@ +use_adversarial_train = True + +evaluation = dict(interval=6, metric=['pa-mpjpe', 'mpjpe']) +optimizer = dict( + backbone=dict(type='Adam', lr=5.0e-5), + head=dict(type='Adam', lr=5.0e-5), +) +optimizer_config = dict(grad_clip=None) + +lr_config = dict(policy='Fixed', by_epoch=False) +runner = dict(type='EpochBasedRunner', max_epochs=50) + +log_config = dict( + interval=50, hooks=[ + dict(type='TextLoggerHook'), + ]) + +checkpoint_config = dict(interval=6) + +_base_ = ['../_base_/default_runtime.py'] + +width = 32 +downsample = False +use_conv = True +hrnet_extra = dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(width, width * 2)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(width, width * 2, width * 4)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(width, width * 2, width * 4, width * 8)), + downsample=downsample, + use_conv=use_conv, + pretrained_layers=[ + 'conv1', + 'bn1', + 'conv2', + 'bn2', + 'layer1', + 'transition1', + 'stage2', + 'transition2', + 'stage3', + 'transition3', + 'stage4', + ], + final_conv_kernel=1, + return_list=False, +) + +find_unused_parameters = True + +model = dict( + type='ImageBodyModelEstimator', + backbone=dict( + type='PoseHighResolutionNet', + extra=hrnet_extra, + num_joints=24, + ), + head=dict( + type='PareHead', + num_joints=24, + num_input_features=480, + smpl_mean_params='data/body_models/smpl_mean_params.npz', + num_deconv_layers=2, + num_deconv_filters=[128] * + 2, # num_deconv_filters = [num_deconv_filters] * num_deconv_layers + num_deconv_kernels=[4] * + 2, # num_deconv_kernels = [num_deconv_kernels] * num_deconv_layers + use_heatmaps='part_segm', + use_keypoint_attention=True, + backbone='hrnet_w32-conv', + ), + body_model_train=dict( + type='SMPL', + keypoint_src='smpl_54', + keypoint_dst='smpl_24', + model_path='data/body_models/smpl', + keypoint_approximate=True, + extra_joints_regressor='data/body_models/J_regressor_extra.npy'), + body_model_test=dict( + type='SMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + 
joints_regressor='data/body_models/J_regressor_h36m.npy'), + convention='smpl_24', + loss_keypoints3d=dict(type='MSELoss', loss_weight=300), + loss_keypoints2d=dict(type='MSELoss', loss_weight=150), + loss_smpl_pose=dict(type='MSELoss', loss_weight=60), + loss_smpl_betas=dict(type='MSELoss', loss_weight=60 * 0.001), + loss_camera=dict(type='CameraPriorLoss', loss_weight=1), + init_cfg=dict( + type='Pretrained', + checkpoint=('data/pretrained_models/hrnet_w32_conv_pare_coco.pth')), +) + +# dataset settings +dataset_type = 'HumanImageDataset' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data_keys = [ + 'has_smpl', 'has_keypoints3d', 'has_keypoints2d', 'smpl_body_pose', + 'smpl_global_orient', 'smpl_betas', 'smpl_transl', 'keypoints2d', + 'keypoints3d', 'sample_idx' +] +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='RandomChannelNoise', noise_factor=0.4), + dict(type='RandomHorizontalFlip', flip_prob=0.5, convention='smpl_24'), + dict(type='GetRandomScaleRotation', rot_factor=30, scale_factor=0.25), + dict(type='MeshAffine', img_res=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='GetRandomScaleRotation', rot_factor=0, scale_factor=0), + dict(type='MeshAffine', img_res=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] + +inference_pipeline = [ + dict(type='MeshAffine', img_res=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict( + type='Collect', + keys=['img', 'sample_idx'], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] + +data = dict( + samples_per_gpu=32, + workers_per_gpu=0, + train=dict( + type='MixedDataset', + configs=[ + dict( + type=dataset_type, + dataset_name='h36m', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_24', + ann_file='h36m_train.npz'), + dict( + type=dataset_type, + dataset_name='coco', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_24', + ann_file='eft_coco_all.npz'), + dict( + type=dataset_type, + dataset_name='lspet', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_24', + ann_file='eft_lspet.npz'), + dict( + type=dataset_type, + dataset_name='mpii', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_24', + ann_file='eft_mpii.npz'), + dict( + type=dataset_type, + dataset_name='mpi-inf-3dhp', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_24', + ann_file='mpi_inf_3dhp_train_mmhuman3d.npz'), + ], + partition=[0.5, 0.233, 0.046, 0.021, 0.2], + ), + test=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), + val=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + 
dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), +) diff --git a/configs/pare/metafile.yml b/configs/pare/metafile.yml new file mode 100644 index 00000000..4638a40d --- /dev/null +++ b/configs/pare/metafile.yml @@ -0,0 +1,32 @@ +Collections: + - Name: SPIN + Metadata: + Training Data: + - COCO + - Human3.6M + - LSP-Extended + - LSP + - MPI-INF-3DHP + - MPII + - 3DPW + Architecture: + - PoseHighResolutionNet + - PareHead + Paper: + URL: https://arxiv.org/abs/2104.08527 + Title: "PARE: Part Attention Regressor for 3D Human Body Estimation" + README: configs/pare/README.md + +Models: + - Name: hrnet_w32_conv_pare_mix + In Collection: PARE + Config: configs/spin/hrnet_w32_conv_pare_mix.py + Metadata: + Epochs: 50 + Results: + - Task: Human Pose and Shape Estimation + Dataset: 3DPW + Metrics: + MPJPE: 81.74 + PA-MPJPE: 48.69 + Weights: https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/pare/with_mosh/hrnet_w32_conv_pare_mosh.pth?versionId=CAEQOhiBgIDooeHSgxgiIDkwYzViMTUyNjM1MjQ3ZDNiNzNjMjJlOGFlNjgxYjlh diff --git a/docs/additional_licenses.md b/docs/additional_licenses.md index fc89b6fb..6828146b 100644 --- a/docs/additional_licenses.md +++ b/docs/additional_licenses.md @@ -189,3 +189,115 @@ Redistribution and use in source and binary forms, with or without modification, 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +## PARE + +License + +Software Copyright License for non-commercial scientific research purposes +Please read carefully the following terms and conditions and any accompanying documentation before you download +and/or use the PARE model, data and software, (the "Model & Software"), including 3D meshes, software, and scripts. +By downloading and/or using the Model & Software (including downloading, cloning, installing, and any other use +of this github repository), you acknowledge that you have read these terms and conditions, understand them, and +agree to be bound by them. If you do not agree with these terms and conditions, you must not download and/or use +the Model & Software. Any infringement of the terms of this agreement will automatically terminate your rights +under this License + +Ownership / Licensees +The Model & Software and the associated materials has been developed at the + +Max Planck Institute for Intelligent Systems (hereinafter "MPI"). + +Any copyright or patent right is owned by and proprietary material of the + +Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (hereinafter “MPG”; MPI and MPG hereinafter +collectively “Max-Planck”) + +hereinafter the “Licensor”. 
+ +This software includes the SMPL Body Model. By downloading this software, you are agreeing to be bound by the terms of the SMPL Model License + + https://smpl.is.tue.mpg.de/modellicense + +which is necessary to create SMPL body models. + +SMPL bodies that are generated with PARE can be distributed freely under the SMPL Body License + + https://smpl.is.tue.mpg.de/bodylicense + +License Grant +Licensor grants you (Licensee) personally a single-user, non-exclusive, non-transferable, free of charge right: + +To install the Model & Software on computers owned, leased or otherwise controlled by you and/or your organization; +To use the Model & Software for the sole purpose of performing non-commercial scientific research, non-commercial +education, or non-commercial artistic projects; +Any other use, in particular any use for commercial purposes, is prohibited. This includes, without limitation, +incorporation in a commercial product, use in a commercial service, or production of other artifacts for +commercial purposes. The Model & Software may not be reproduced, modified and/or made available in any form to +any third party without Max-Planck’s prior written permission. + +The Model & Software may not be used for pornographic purposes or to generate pornographic material whether +commercial or not. This license also prohibits the use of the Model & Software to train methods/algorithms/neural +networks/etc. for commercial use of any kind. By downloading the Model & Software, +you agree not to reverse engineer it. + +No Distribution +The Model & Software and the license herein granted shall not be copied, shared, distributed, re-sold, offered +for re-sale, transferred or sub-licensed in whole or in part except that you may make one copy for archive +purposes only. + +Disclaimer of Representations and Warranties +You expressly acknowledge and agree that the Model & Software results from basic research, is provided “AS IS”, +may contain errors, and that any use of the Model & Software is at your sole risk. LICENSOR MAKES NO REPRESENTATIONS +OR WARRANTIES OF ANY KIND CONCERNING THE MODEL & SOFTWARE, NEITHER EXPRESS NOR IMPLIED, AND THE ABSENCE OF ANY +LEGAL OR ACTUAL DEFECTS, WHETHER DISCOVERABLE OR NOT. Specifically, and not to limit the foregoing, licensor +makes no representations or warranties (i) regarding the merchantability or fitness for a particular purpose of +the Model & Software, (ii) that the use of the Model & Software will not infringe any patents, copyrights or other +intellectual property rights of a third party, and (iii) that the use of the Model & Software will not cause any +damage of any kind to you or a third party. + +Limitation of Liability +Because this Model & Software License Agreement qualifies as a donation, according to Section 521 of the German +Civil Code (Bürgerliches Gesetzbuch – BGB) Licensor as a donor is liable for intent and gross negligence only. +If the Licensor fraudulently conceals a legal or material defect, they are obliged to compensate the Licensee +for the resulting damage. + +Licensor shall be liable for loss of data only up to the amount of typical recovery costs which would have +arisen had proper and regular data backup measures been taken. For the avoidance of doubt Licensor shall be +liable in accordance with the German Product Liability Act in the event of product liability. The foregoing +applies also to Licensor’s legal representatives or assistants in performance. Any further liability shall be excluded. 
+Patent claims generated through the usage of the Model & Software cannot be directed towards the copyright holders. +The Model & Software is provided in the state of development the licensor defines. If modified or extended by +Licensee, the Licensor makes no claims about the fitness of the Model & Software and is not responsible +for any problems such modifications cause. + +No Maintenance Services +You understand and agree that Licensor is under no obligation to provide either maintenance services, +update services, notices of latent defects, or corrections of defects with regard to the Model & Software. +Licensor nevertheless reserves the right to update, modify, or discontinue the Model & Software at any time. + +Defects of the Model & Software must be notified in writing to the Licensor with a comprehensible description +of the error symptoms. The notification of the defect should enable the reproduction of the error. +The Licensee is encouraged to communicate any use, results, modification or publication. + +Publications using the Model & Software +You acknowledge that the Model & Software is a valuable scientific resource and agree to appropriately reference +the following paper in any publication making use of the Model & Software. + +Citation: + +@inproceedings{Kocabas_PARE_2021, + title = {{PARE}: Part Attention Regressor for {3D} Human Body Estimation}, + author = {Kocabas, Muhammed and Huang, Chun-Hao P. and Hilliges, Otmar and Black, Michael J.}, + booktitle = {Proc. International Conference on Computer Vision (ICCV)}, + pages = {11127--11137}, + month = oct, + year = {2021}, + doi = {}, + month_numeric = {10} +} + +Commercial licensing opportunities +For commercial uses of the Model & Software, please send email to ps-license@tue.mpg.de + +This Agreement shall be governed by the laws of the Federal Republic of Germany except for the UN Sales Convention. diff --git a/docs/model_zoo.md b/docs/model_zoo.md index 638fd9ad..5c520145 100644 --- a/docs/model_zoo.md +++ b/docs/model_zoo.md @@ -21,3 +21,7 @@ Please refer to [VIBE](https://github.com/open-mmlab/mmhuman3d/tree/main/configs ### HybrIK Please refer to [HybrIK](https://github.com/open-mmlab/mmhuman3d/tree/main/configs/hybrik/) for details. + +### PARE + +Please refer to [PARE](https://github.com/open-mmlab/mmhuman3d/tree/main/configs/pare/) for details. 
diff --git a/docs/preprocess_dataset.md b/docs/preprocess_dataset.md index 0e2b7b8a..efe938e3 100644 --- a/docs/preprocess_dataset.md +++ b/docs/preprocess_dataset.md @@ -304,6 +304,51 @@ mmhuman3d └── hybrik_pw3d_test.npz ``` +For PARE training, the following datasets are required: + - [Human3.6M](#human36m) + - [Human3.6M Mosh](#human36mmosh) + - [MPI-INF-3DHP](#mpi-inf-3dhp) + - [EFT-COCO](#EFT) + - [EFT-MPII](#EFT) + - [EFT-LSPET](#EFT) + - [PW3D](#pw3d) + + +Convert datasets with the following `dataset-names`: +``` +h36m, coco, mpii, lspet, mpi-inf-3dhp, pw3d +``` + +**Alternatively**, you may download the preprocessed files directly: +- [h36m_train.npz](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/datasets/h36m_train.npz?versionId=CAEQHhiBgMDrrfbS6xciIGY2NjMxMjgwMWQzNjRkNWJhYTNkZTYyYWUxNWQ4ZTE5) +- [mpi_inf_3dhp_train.npz](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/datasets/mpi_inf_3dhp_train.npz?versionId=CAEQHhiBgMD3q_bS6xciIGQwYjc4NTRjYTllMzRkODU5NTNiZDQyOTBlYmRhODg5) +- [eft_mpii.npz](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/datasets/eft_mpii.npz?versionId=CAEQOhiBgMCXlty_gxgiIDYxNDc5YTIzZjBjMDRhMGM5ZjBiZmYzYjFjMTU1ZTRm) +- [eft_lspet.npz](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/datasets/eft_lspet.npz?versionId=CAEQOhiBgMC339u_gxgiIDZlNzk1YjMxMWRmMzRkNWJiNjg1OTI2Mjg5OTA1YzJh +) +- [eft_coco_all.npz](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/datasets/eft_coco_all.npz?versionId=CAEQOhiBgID3iuS_gxgiIDgwYzU4NTc3ZWRkNDQyNGJiYzU4MGViYTFhYTFmMmUx) +- [pw3d_test.npz](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/datasets/pw3d_test.npz?versionId=CAEQHhiBgMDaq_bS6xciIGVjY2YzZGJkNjNmMjQ2NGU4OTZkYjMwMjhhYWM1Y2I0) + + +The preprocessed datasets should have this structure: +```text +mmhuman3d +├── mmhuman3d +├── docs +├── tests +├── tools +├── configs +└── data + ├── datasets + └── preprocessed_datasets + ├── h36m_mosh_train.npz + ├── h36m_train.npz + ├── mpi_inf_3dhp_train.npz + ├── eft_mpii.npz + ├── eft_lspet.npz + ├── eft_coco_all.npz + └── pw3d_test.npz +``` + ## Folder structure ### AGORA @@ -681,7 +726,7 @@ h36m_p1=dict( -For data preparation of [Human3.6M](http://vision.imar.ro/human3.6m/description.php) for HMR and SPIN training, we use the [MoShed](https://mosh.is.tue.mpg.de/) data provided in [HMR](https://github.com/akanazawa/hmr) for training. However, due to license limitations, we are not allowed to redistribute the data. Even if you do not have access to these parameters, you can still generate the preprocessed h36m npz file without mosh parameters using our [converter](https://github.com/open-mmlab/mmhuman3d/tree/main/mmhuman3d/data/data_converters/h36m.py). +For data preparation of [Human3.6M](http://vision.imar.ro/human3.6m/description.php) for HMR, SPIN and PARE training, we use the [MoShed](https://mosh.is.tue.mpg.de/) data provided in [HMR](https://github.com/akanazawa/hmr) for training. However, due to license limitations, we are not allowed to redistribute the data. Even if you do not have access to these parameters, you can still generate the preprocessed h36m npz file without mosh parameters using our [converter](https://github.com/open-mmlab/mmhuman3d/tree/main/mmhuman3d/data/data_converters/h36m.py). You will need to extract images from raw videos for training. Do note that preprocessing can take a long time if image extraction is required. 
To do so, modify the `h36m_p1` config in [DATASET_CONFIG](https://github.com/open-mmlab/mmhuman3d/blob/main/tools/convert_datasets.py): diff --git a/mmhuman3d/apis/train.py b/mmhuman3d/apis/train.py index aca92d5a..207f6a43 100644 --- a/mmhuman3d/apis/train.py +++ b/mmhuman3d/apis/train.py @@ -50,7 +50,6 @@ def train_model(model, # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] - data_loaders = [ build_dataloader( ds, @@ -95,7 +94,6 @@ def train_model(model, # build runner optimizer = build_optimizers(model, cfg.optimizer) - if cfg.get('runner') is None: cfg.runner = { 'type': 'EpochBasedRunner', diff --git a/mmhuman3d/core/visualization/renderer/torch3d_renderer/meshes.py b/mmhuman3d/core/visualization/renderer/torch3d_renderer/meshes.py index 5bb4cc04..3115dabf 100644 --- a/mmhuman3d/core/visualization/renderer/torch3d_renderer/meshes.py +++ b/mmhuman3d/core/visualization/renderer/torch3d_renderer/meshes.py @@ -7,7 +7,7 @@ from pytorch3d.renderer.mesh.textures import TexturesBase from pytorch3d.structures import Meshes, list_to_padded, padded_to_list -from mmhuman3d.models import SMPL, SMPLX +from mmhuman3d.models.body_models import SMPL, SMPLX from mmhuman3d.utils.mesh_utils import \ join_meshes_as_batch as _join_meshes_as_batch from .builder import build_renderer diff --git a/mmhuman3d/core/visualization/renderer/torch3d_renderer/render_runner.py b/mmhuman3d/core/visualization/renderer/torch3d_renderer/render_runner.py index 1777a666..43e38c70 100644 --- a/mmhuman3d/core/visualization/renderer/torch3d_renderer/render_runner.py +++ b/mmhuman3d/core/visualization/renderer/torch3d_renderer/render_runner.py @@ -29,6 +29,7 @@ def render(renderer: Union[nn.Module, dict], batch_size: int = 5, return_tensor: bool = False, no_grad: bool = False, + verbose: bool = True, **forward_params): if isinstance(renderer, dict): @@ -96,8 +97,11 @@ def render(renderer: Union[nn.Module, dict], if isinstance(forward_params[k], np.ndarray): forward_params.update( {k: torch.tensor(forward_params[k]).to(device)}) - - for i in trange(math.ceil(num_frames // batch_size)): + if verbose: + iter_func = trange + else: + iter_func = range + for i in iter_func(math.ceil(num_frames // batch_size)): indexes = list( range(i * batch_size, min((i + 1) * batch_size, len(meshes)))) foward_params_batch = {} diff --git a/mmhuman3d/core/visualization/visualize_keypoints2d.py b/mmhuman3d/core/visualization/visualize_keypoints2d.py index fec67eac..ee56cb8d 100644 --- a/mmhuman3d/core/visualization/visualize_keypoints2d.py +++ b/mmhuman3d/core/visualization/visualize_keypoints2d.py @@ -413,29 +413,30 @@ def update_frame_list(frame_list, origin_frames, img_format, start, end): def visualize_kp2d( - kp2d: np.ndarray, - output_path: Optional[str] = None, - frame_list: Optional[List[str]] = None, - origin_frames: Optional[str] = None, - image_array: Optional[np.ndarray] = None, - limbs: Optional[Union[np.ndarray, List[int]]] = None, - palette: Optional[Iterable[int]] = None, - data_source: str = 'coco', - mask: Optional[Union[list, np.ndarray]] = None, - img_format: str = '%06d.png', - start: int = 0, - end: Optional[int] = None, - overwrite: bool = False, - with_file_name: bool = True, - resolution: Optional[Union[Tuple[int, int], list]] = None, - fps: Union[float, int] = 30, - draw_bbox: bool = False, - with_number: bool = False, - pop_parts: Iterable[str] = None, - disable_tqdm: bool = False, - disable_limbs: bool = False, - return_array: Optional[bool] = False, - keypoints_factory: dict 
= KEYPOINTS_FACTORY + kp2d: np.ndarray, + output_path: Optional[str] = None, + frame_list: Optional[List[str]] = None, + origin_frames: Optional[str] = None, + image_array: Optional[np.ndarray] = None, + limbs: Optional[Union[np.ndarray, List[int]]] = None, + palette: Optional[Iterable[int]] = None, + data_source: str = 'coco', + mask: Optional[Union[list, np.ndarray]] = None, + img_format: str = '%06d.png', + start: int = 0, + end: int = -1, + overwrite: bool = False, + with_file_name: bool = True, + resolution: Optional[Union[Tuple[int, int], list]] = None, + fps: Union[float, int] = 30, + draw_bbox: bool = False, + with_number: bool = False, + pop_parts: Iterable[str] = None, + disable_tqdm: bool = False, + disable_limbs: bool = False, + return_array: Optional[bool] = False, + keypoints_factory: dict = KEYPOINTS_FACTORY, + remove_raw_file: bool = True, ) -> Union[None, np.ndarray]: """Visualize 2d keypoints to a video or into a folder of frames. @@ -467,9 +468,7 @@ def visualize_kp2d( Defaults to None. img_format (str, optional): input image format. Default to '%06d.png', start (int, optional): start frame index. Defaults to 0. - end (int, optional): end frame index. Exclusive. - Could be positive int or negative int or None. - None represents include all the frames. + end (int, optional): end frame index. Defaults to -1. overwrite (bool, optional): whether replace the origin frames. Defaults to False. with_file_name (bool, optional): whether write origin frame name on @@ -512,9 +511,8 @@ def visualize_kp2d( assert kp2d.ndim == 4 num_frames, num_person = kp2d.shape[0], kp2d.shape[1] # slice the input array temporally - end = (min(num_frames - 1, end) + - num_frames) % num_frames if end is not None else num_frames - kp2d = kp2d[start:end] + end = (min(num_frames - 1, end) + num_frames) % num_frames + kp2d = kp2d[start:end + 1] if image_array is not None: origin_frames = None @@ -554,9 +552,10 @@ def visualize_kp2d( if disable_limbs: limbs_target, limbs_palette = None, None else: + # *** changed by wyj *** limbs_target, limbs_palette = _prepare_limb_palette( limbs, palette, pop_parts, data_source, mask) - + # limbs_target, limbs_palette = limbs, palette canvas_producer = _CavasProducer(frame_list, resolution, kp2d, image_array) out_image_array = [] @@ -608,7 +607,7 @@ def visualize_kp2d( images_to_video( input_folder=output_temp_folder, output_path=output_path, - remove_raw_file=True, + remove_raw_file=remove_raw_file, img_format=img_format, fps=fps) diff --git a/mmhuman3d/core/visualization/visualize_smpl.py b/mmhuman3d/core/visualization/visualize_smpl.py index 74e73d2a..14df2bef 100644 --- a/mmhuman3d/core/visualization/visualize_smpl.py +++ b/mmhuman3d/core/visualization/visualize_smpl.py @@ -350,7 +350,6 @@ def _prepare_mesh(poses, betas, transl, verts, start, end, body_model): elif verts.ndim == 4: joints = torch.einsum('fpik,ji->fpjk', [verts, body_model.J_regressor]) - num_verts = body_model.NUM_VERTS assert verts.shape[-2] == num_verts, 'Wrong input verts shape.' 
num_frames = verts.shape[0] @@ -427,52 +426,54 @@ def _prepare_colors(palette, render_choice, num_person, num_verts, model_type): def render_smpl( - # smpl parameters - poses: Optional[Union[torch.Tensor, np.ndarray, dict]] = None, - betas: Optional[Union[torch.Tensor, np.ndarray]] = None, - transl: Optional[Union[torch.Tensor, np.ndarray]] = None, - verts: Optional[Union[torch.Tensor, np.ndarray]] = None, - body_model: Optional[nn.Module] = None, - body_model_config: Optional[dict] = None, - # camera parameters - R: Optional[Union[torch.Tensor, np.ndarray]] = None, - T: Optional[Union[torch.Tensor, np.ndarray]] = None, - K: Optional[Union[torch.Tensor, np.ndarray]] = None, - orig_cam: Optional[Union[torch.Tensor, np.ndarray]] = None, - Ks: Optional[Union[torch.Tensor, np.ndarray]] = None, - in_ndc: bool = True, - convention: str = 'pytorch3d', - projection: Literal['weakperspective', 'perspective', 'fovperspective', - 'orthographics', 'fovorthographics'] = 'perspective', - orbit_speed: Union[float, Tuple[float, float]] = 0.0, - # render choice parameters - render_choice: Literal['lq', 'mq', 'hq', 'silhouette', 'depth', 'normal', - 'pointcloud', 'part_silhouette'] = 'hq', - palette: Union[List[str], str, np.ndarray, torch.Tensor] = 'white', - texture_image: Union[torch.Tensor, np.ndarray] = None, - resolution: Optional[Union[List[int], Tuple[int, int]]] = None, - start: int = 0, - end: Optional[int] = None, - alpha: float = 1.0, - no_grad: bool = True, - batch_size: int = 10, - device: Union[torch.device, str] = 'cuda', - # file io parameters - return_tensor: bool = False, - output_path: str = None, - origin_frames: Optional[str] = None, - frame_list: Optional[List[str]] = None, - image_array: Optional[Union[np.ndarray, torch.Tensor]] = None, - img_format: str = '%06d.png', - overwrite: bool = False, - mesh_file_path: Optional[str] = None, - read_frames_batch: bool = False, - # visualize keypoints - plot_kps: bool = False, - kp3d: Optional[Union[np.ndarray, torch.Tensor]] = None, - mask: Optional[Union[np.ndarray, List[int]]] = None, - vis_kp_index: bool = False, -) -> Union[None, torch.Tensor]: + # smpl parameters + poses: Optional[Union[torch.Tensor, np.ndarray, dict]] = None, + betas: Optional[Union[torch.Tensor, np.ndarray]] = None, + transl: Optional[Union[torch.Tensor, np.ndarray]] = None, + verts: Optional[Union[torch.Tensor, np.ndarray]] = None, + body_model: Optional[nn.Module] = None, + body_model_config: Optional[dict] = None, + # camera parameters + R: Optional[Union[torch.Tensor, np.ndarray]] = None, + T: Optional[Union[torch.Tensor, np.ndarray]] = None, + K: Optional[Union[torch.Tensor, np.ndarray]] = None, + orig_cam: Optional[Union[torch.Tensor, np.ndarray]] = None, + Ks: Optional[Union[torch.Tensor, np.ndarray]] = None, + in_ndc: bool = True, + convention: str = 'pytorch3d', + projection: Literal['weakperspective', 'perspective', 'fovperspective', + 'orthographics', + 'fovorthographics'] = 'perspective', + orbit_speed: Union[float, Tuple[float, float]] = 0.0, + # render choice parameters + render_choice: Literal['lq', 'mq', 'hq', 'silhouette', 'depth', + 'normal', 'pointcloud', + 'part_silhouette'] = 'hq', + palette: Union[List[str], str, np.ndarray, torch.Tensor] = 'white', + texture_image: Union[torch.Tensor, np.ndarray] = None, + resolution: Optional[Union[List[int], Tuple[int, int]]] = None, + start: int = 0, + end: Optional[int] = None, + alpha: float = 1.0, + no_grad: bool = True, + batch_size: int = 10, + device: Union[torch.device, str] = 'cuda', + # file io 
parameters + return_tensor: bool = False, + output_path: str = None, + origin_frames: Optional[str] = None, + frame_list: Optional[List[str]] = None, + image_array: Optional[Union[np.ndarray, torch.Tensor]] = None, + img_format: str = '%06d.png', + overwrite: bool = False, + mesh_file_path: Optional[str] = None, + read_frames_batch: bool = False, + # visualize keypoints + plot_kps: bool = False, + kp3d: Optional[Union[np.ndarray, torch.Tensor]] = None, + mask: Optional[Union[np.ndarray, List[int]]] = None, + vis_kp_index: bool = False, + verbose: bool = False) -> Union[None, torch.Tensor]: """Render SMPL or SMPL-X mesh or silhouette into differentiable tensors, and export video or images. @@ -721,6 +722,9 @@ def render_smpl( Whether plot keypoint index number on human mesh. Defaults to False. + # visualize render progress + verbose (bool, optional): + Whether print the progress bar for rendering. Returns: Union[None, torch.Tensor]: return the rendered image tensors or None. """ @@ -1039,6 +1043,7 @@ def render_smpl( output_path=output_path, return_tensor=return_tensor, no_grad=no_grad, + verbose=verbose, **render_data) if remove_folder: diff --git a/mmhuman3d/data/datasets/__init__.py b/mmhuman3d/data/datasets/__init__.py index 27c0ec5e..3e7d0f47 100644 --- a/mmhuman3d/data/datasets/__init__.py +++ b/mmhuman3d/data/datasets/__init__.py @@ -11,8 +11,19 @@ from .samplers import DistributedSampler __all__ = [ - 'BaseDataset', 'HumanImageDataset', 'build_dataloader', 'build_dataset', - 'Compose', 'DistributedSampler', 'ConcatDataset', 'RepeatDataset', - 'DATASETS', 'PIPELINES', 'MixedDataset', 'AdversarialDataset', - 'MeshDataset', 'HumanVideoDataset', 'HybrIKHumanImageDataset' + 'BaseDataset', + 'HumanImageDataset', + 'build_dataloader', + 'build_dataset', + 'Compose', + 'DistributedSampler', + 'ConcatDataset', + 'RepeatDataset', + 'DATASETS', + 'PIPELINES', + 'MixedDataset', + 'AdversarialDataset', + 'MeshDataset', + 'HumanVideoDataset', + 'HybrIKHumanImageDataset', ] diff --git a/mmhuman3d/data/datasets/human_image_dataset.py b/mmhuman3d/data/datasets/human_image_dataset.py index 56e6ce74..85c70c32 100644 --- a/mmhuman3d/data/datasets/human_image_dataset.py +++ b/mmhuman3d/data/datasets/human_image_dataset.py @@ -192,12 +192,16 @@ def prepare_raw_data(self, idx: int): if 'keypoints2d' in self.human_data: info['keypoints2d'] = self.human_data['keypoints2d'][idx] + info['has_keypoints2d'] = 1 else: info['keypoints2d'] = np.zeros((self.num_keypoints, 3)) + info['has_keypoints2d'] = 0 if 'keypoints3d' in self.human_data: info['keypoints3d'] = self.human_data['keypoints3d'][idx] + info['has_keypoints3d'] = 1 else: info['keypoints3d'] = np.zeros((self.num_keypoints, 4)) + info['has_keypoints3d'] = 0 if 'smpl' in self.human_data: smpl_dict = self.human_data['smpl'] diff --git a/mmhuman3d/data/datasets/pipelines/__init__.py b/mmhuman3d/data/datasets/pipelines/__init__.py index d77921d7..7ccccc55 100644 --- a/mmhuman3d/data/datasets/pipelines/__init__.py +++ b/mmhuman3d/data/datasets/pipelines/__init__.py @@ -17,6 +17,7 @@ RandomOcclusion, ) from .loading import LoadImageFromFile +from .synthetic_occlusion_augmentation import SyntheticOcclusion from .transforms import ( CenterCrop, ColorJitter, @@ -29,10 +30,28 @@ ) __all__ = [ - 'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToPIL', 'ToNumpy', - 'Transpose', 'Collect', 'LoadImageFromFile', 'CenterCrop', - 'RandomHorizontalFlip', 'ColorJitter', 'Lighting', 'RandomChannelNoise', - 'GetRandomScaleRotation', 'MeshAffine', 'HybrIKRandomFlip', 
'HybrIKAffine',
-    'GenerateHybrIKTarget', 'RandomDPG', 'RandomOcclusion',
-    'NewKeypointsSelection', 'Normalize'
+    'Compose',
+    'to_tensor',
+    'ToTensor',
+    'ImageToTensor',
+    'ToPIL',
+    'ToNumpy',
+    'Transpose',
+    'Collect',
+    'LoadImageFromFile',
+    'CenterCrop',
+    'RandomHorizontalFlip',
+    'ColorJitter',
+    'Lighting',
+    'RandomChannelNoise',
+    'GetRandomScaleRotation',
+    'MeshAffine',
+    'HybrIKRandomFlip',
+    'HybrIKAffine',
+    'GenerateHybrIKTarget',
+    'RandomDPG',
+    'RandomOcclusion',
+    'NewKeypointsSelection',
+    'Normalize',
+    'SyntheticOcclusion',
 ]
diff --git a/mmhuman3d/data/datasets/pipelines/synthetic_occlusion_augmentation.py b/mmhuman3d/data/datasets/pipelines/synthetic_occlusion_augmentation.py
new file mode 100644
index 00000000..7a9da6a5
--- /dev/null
+++ b/mmhuman3d/data/datasets/pipelines/synthetic_occlusion_augmentation.py
@@ -0,0 +1,137 @@
+"""This script is modified from
+https://github.com/isarandi/synthetic-occlusion.
+
+For the original license, please see docs/additional_licenses.md.
+"""
+import os.path
+import random
+
+import cv2
+import numpy as np
+
+from ..builder import PIPELINES
+
+
+def load_pascal_occluders(occluders_file):
+    """Load the Pascal VOC occluders from the occluders file."""
+
+    if os.path.isfile(occluders_file):
+        return np.load(occluders_file, allow_pickle=True)
+    else:
+        raise NotImplementedError()
+
+
+def occlude_with_pascal_objects(im, occluders):
+    """Returns an augmented version of `im`, containing some occluders from
+    the Pascal VOC dataset."""
+
+    result = im.copy()
+    width_height = np.asarray([im.shape[1], im.shape[0]])
+    im_scale_factor = min(width_height) / 256
+    count = np.random.randint(1, 8)
+
+    # logger.debug(f'Number of augmentation objects: {count}')
+
+    for _ in range(count):
+        occluder = random.choice(occluders)
+
+        center = np.random.uniform([0, 0], width_height)
+        random_scale_factor = np.random.uniform(0.2, 1.0)
+        scale_factor = random_scale_factor * im_scale_factor
+
+        # logger.debug(f'occluder size: {occluder.shape},
+        # scale_f: {scale_factor}, img_scale: {im_scale_factor}')
+        occluder = resize_by_factor(occluder, scale_factor)
+
+        paste_over(im_src=occluder, im_dst=result, center=center)
+
+    return result
+
+
+def paste_over(im_src, im_dst, center):
+    """Pastes `im_src` onto `im_dst` at a specified position, with alpha
+    blending, in place.
+
+    Locations outside the bounds of `im_dst`
+    are handled as expected (only a part or none of `im_src` becomes visible).
+
+    Args:
+        im_src: The RGBA image to be pasted onto `im_dst`.
+            Its size can be arbitrary. Its alpha channel controls the
+            blending at each pixel: larger alpha values mean more
+            visibility for `im_src`.
+        im_dst: The target image.
+        center: coordinates in `im_dst` where
+            the center of `im_src` should be placed.
+ """ + + width_height_src = np.asarray([im_src.shape[1], im_src.shape[0]]) + width_height_dst = np.asarray([im_dst.shape[1], im_dst.shape[0]]) + + center = np.round(center).astype(np.int32) + raw_start_dst = center - width_height_src // 2 + raw_end_dst = raw_start_dst + width_height_src + + start_dst = np.clip(raw_start_dst, 0, width_height_dst) + end_dst = np.clip(raw_end_dst, 0, width_height_dst) + region_dst = im_dst[start_dst[1]:end_dst[1], start_dst[0]:end_dst[0]] + + start_src = start_dst - raw_start_dst + end_src = width_height_src + (end_dst - raw_end_dst) + region_src = im_src[start_src[1]:end_src[1], start_src[0]:end_src[0]] + color_src = region_src[..., 0:3] + alpha = region_src[..., 3:].astype(np.float32) / 255 + + im_dst[start_dst[1]:end_dst[1], start_dst[0]:end_dst[0]] = ( + alpha * color_src + (1 - alpha) * region_dst) + + +def resize_by_factor(im, factor): + """Returns a copy of `im` resized by `factor`, using bilinear interp for up + and area interp for downscaling.""" + new_size = tuple( + np.round(np.array([im.shape[1], im.shape[0]]) * factor).astype(int)) + interp = cv2.INTER_LINEAR if factor > 1.0 else cv2.INTER_AREA + return cv2.resize(im, new_size, fx=factor, fy=factor, interpolation=interp) + + +def list_filepaths(dirpath): + """list the file paths.""" + names = os.listdir(dirpath) + paths = [os.path.join(dirpath, name) for name in names] + return sorted(filter(os.path.isfile, paths)) + + +@PIPELINES.register_module() +class SyntheticOcclusion: + """Data augmentation with synthetic occlusion. + + Required keys: 'img' + Modifies key: 'img' + Args: + flip_prob (float): probability of the image being flipped. Default: 0.5 + flip_pairs (list[int]): list of left-right keypoint pairs for flipping + occ_aug_dataset (str): name of occlusion dataset. Default: pascal + pascal_voc_root_path (str): the path to pascal voc dataset, + which can generate occluders file. + occluders_file (str): occluders file. + """ + + def __init__(self, occluders_file='', occluders=None): + self.occluders = None + if occluders is not None: + self.occluders = occluders + + else: + self.occluders = load_pascal_occluders( + occluders_file=occluders_file, ) + + def __call__(self, results): + """Perform data augmentation with random channel noise.""" + img = results['img'] + + img = occlude_with_pascal_objects(img, self.occluders) + + results['img'] = img + return results diff --git a/mmhuman3d/models/architectures/mesh_estimator.py b/mmhuman3d/models/architectures/mesh_estimator.py index 568ddc47..4c789139 100644 --- a/mmhuman3d/models/architectures/mesh_estimator.py +++ b/mmhuman3d/models/architectures/mesh_estimator.py @@ -2,7 +2,9 @@ from typing import Optional, Tuple, Union import torch +import torch.nn.functional as F +import mmhuman3d.core.visualization.visualize_smpl as visualize_smpl from mmhuman3d.core.conventions.keypoints_mapping import get_keypoint_idx from mmhuman3d.models.utils import FitsDict from mmhuman3d.utils.geometry import ( @@ -70,6 +72,8 @@ class BodyModelEstimator(BaseArchitecture, metaclass=ABCMeta): camera parameters. Default: None loss_adv (dict | None, optional): Losses config for adversial training. Default: None. + loss_segm_mask (dict | None, optional): Losses config for predicted + part segmentation. Default: None. init_cfg (dict or list[dict], optional): Initialization config dict. Default: None. 
""" @@ -90,6 +94,7 @@ def __init__(self, loss_smpl_betas: Optional[Union[dict, None]] = None, loss_camera: Optional[Union[dict, None]] = None, loss_adv: Optional[Union[dict, None]] = None, + loss_segm_mask: Optional[Union[dict, None]] = None, init_cfg: Optional[Union[list, dict, None]] = None): super(BodyModelEstimator, self).__init__(init_cfg) self.backbone = build_backbone(backbone) @@ -109,12 +114,13 @@ def __init__(self, self.loss_keypoints2d = build_loss(loss_keypoints2d) self.loss_keypoints3d = build_loss(loss_keypoints3d) + self.loss_vertex = build_loss(loss_vertex) self.loss_smpl_pose = build_loss(loss_smpl_pose) self.loss_smpl_betas = build_loss(loss_smpl_betas) self.loss_adv = build_loss(loss_adv) self.loss_camera = build_loss(loss_camera) - + self.loss_segm_mask = build_loss(loss_segm_mask) set_requires_grad(self.body_model_train, False) set_requires_grad(self.body_model_test, False) @@ -410,8 +416,11 @@ def optimize_generator(self, predictions: dict): loss = dict(adv_loss=loss_adv) return loss - def compute_keypoints3d_loss(self, pred_keypoints3d: torch.Tensor, - gt_keypoints3d: torch.Tensor): + def compute_keypoints3d_loss( + self, + pred_keypoints3d: torch.Tensor, + gt_keypoints3d: torch.Tensor, + has_keypoints3d: Optional[torch.Tensor] = None): """Compute loss for 3d keypoints.""" keypoints3d_conf = gt_keypoints3d[:, :, 3].float().unsqueeze(-1) keypoints3d_conf = keypoints3d_conf.repeat(1, 1, 3) @@ -431,19 +440,39 @@ def compute_keypoints3d_loss(self, pred_keypoints3d: torch.Tensor, pred_keypoints3d = pred_keypoints3d - pred_pelvis[:, None, :] loss = self.loss_keypoints3d( pred_keypoints3d, gt_keypoints3d, reduction_override='none') - valid_pos = keypoints3d_conf > 0 - if keypoints3d_conf[valid_pos].numel() == 0: - return torch.Tensor([0]).type_as(gt_keypoints3d) - loss = torch.sum(loss * keypoints3d_conf) - loss /= keypoints3d_conf[valid_pos].numel() + + # If has_keypoints3d is not None, then computes the losses on the + # instances that have ground-truth keypoints3d. + # But the zero confidence keypoints will be included in mean. + # Otherwise, only compute the keypoints3d + # which have positive confidence. 
+ + # has_keypoints3d is None when the key has_keypoints3d + # is not in the datasets + if has_keypoints3d is None: + + valid_pos = keypoints3d_conf > 0 + if keypoints3d_conf[valid_pos].numel() == 0: + return torch.Tensor([0]).type_as(gt_keypoints3d) + loss = torch.sum(loss * keypoints3d_conf) + loss /= keypoints3d_conf[valid_pos].numel() + else: + + keypoints3d_conf = keypoints3d_conf[has_keypoints3d == 1] + if keypoints3d_conf.shape[0] == 0: + return torch.Tensor([0]).type_as(gt_keypoints3d) + loss = loss[has_keypoints3d == 1] + loss = (loss * keypoints3d_conf).mean() return loss - def compute_keypoints2d_loss(self, - pred_keypoints3d: torch.Tensor, - pred_cam: torch.Tensor, - gt_keypoints2d: torch.Tensor, - img_res: Optional[int] = 224, - focal_length: Optional[int] = 5000): + def compute_keypoints2d_loss( + self, + pred_keypoints3d: torch.Tensor, + pred_cam: torch.Tensor, + gt_keypoints2d: torch.Tensor, + img_res: Optional[int] = 224, + focal_length: Optional[int] = 5000, + has_keypoints2d: Optional[torch.Tensor] = None): """Compute loss for 2d keypoints.""" keypoints2d_conf = gt_keypoints2d[:, :, 2].float().unsqueeze(-1) keypoints2d_conf = keypoints2d_conf.repeat(1, 1, 2) @@ -462,11 +491,28 @@ def compute_keypoints2d_loss(self, gt_keypoints2d = 2 * gt_keypoints2d / (img_res - 1) - 1 loss = self.loss_keypoints2d( pred_keypoints2d, gt_keypoints2d, reduction_override='none') - valid_pos = keypoints2d_conf > 0 - if keypoints2d_conf[valid_pos].numel() == 0: - return torch.Tensor([0]).type_as(gt_keypoints2d) - loss = torch.sum(loss * keypoints2d_conf) - loss /= keypoints2d_conf[valid_pos].numel() + + # If has_keypoints2d is not None, then computes the losses on the + # instances that have ground-truth keypoints2d. + # But the zero confidence keypoints will be included in mean. + # Otherwise, only compute the keypoints2d + # which have positive confidence. + # has_keypoints2d is None when the key has_keypoints2d + # is not in the datasets + + if has_keypoints2d is None: + valid_pos = keypoints2d_conf > 0 + if keypoints2d_conf[valid_pos].numel() == 0: + return torch.Tensor([0]).type_as(gt_keypoints2d) + loss = torch.sum(loss * keypoints2d_conf) + loss /= keypoints2d_conf[valid_pos].numel() + else: + keypoints2d_conf = keypoints2d_conf[has_keypoints2d == 1] + if keypoints2d_conf.shape[0] == 0: + return torch.Tensor([0]).type_as(gt_keypoints2d) + loss = loss[has_keypoints2d == 1] + loss = (loss * keypoints2d_conf).mean() + return loss def compute_vertex_loss(self, pred_vertices: torch.Tensor, @@ -514,6 +560,69 @@ def compute_camera_loss(self, cameras: torch.Tensor): loss = self.loss_camera(cameras) return loss + def compute_part_segmentation_loss(self, + pred_heatmap: torch.Tensor, + gt_vertices: torch.Tensor, + gt_keypoints2d: torch.Tensor, + gt_model_joints: torch.Tensor, + has_smpl: torch.Tensor, + img_res: Optional[int] = 224, + focal_length: Optional[int] = 500): + """Compute loss for part segmentations.""" + device = gt_keypoints2d.device + gt_keypoints2d_valid = gt_keypoints2d[has_smpl == 1] + batch_size = gt_keypoints2d_valid.shape[0] + + gt_vertices_valid = gt_vertices[has_smpl == 1] + gt_model_joints_valid = gt_model_joints[has_smpl == 1] + + if batch_size == 0: + return torch.Tensor([0]).type_as(gt_keypoints2d) + gt_cam_t = estimate_translation( + gt_model_joints_valid, + gt_keypoints2d_valid, + focal_length=focal_length, + img_size=img_res, + ) + + K = torch.eye(3) + K[0, 0] = focal_length + K[1, 1] = focal_length + K[2, 2] = 1 + K[0, 2] = img_res / 2. + K[1, 2] = img_res / 2. 
+ K = K[None, :, :] + + R = torch.eye(3)[None, :, :] + device = gt_keypoints2d.device + gt_sem_mask = visualize_smpl.render_smpl( + verts=gt_vertices_valid, + R=R, + K=K, + T=gt_cam_t, + render_choice='part_silhouette', + resolution=img_res, + return_tensor=True, + body_model=self.body_model_train, + device=device, + in_ndc=False, + convention='pytorch3d', + projection='perspective', + no_grad=True, + batch_size=batch_size, + verbose=False, + ) + gt_sem_mask = torch.flip(gt_sem_mask, [1, 2]).squeeze(-1).detach() + pred_heatmap_valid = pred_heatmap[has_smpl == 1] + ph, pw = pred_heatmap_valid.size(2), pred_heatmap_valid.size(3) + h, w = gt_sem_mask.size(1), gt_sem_mask.size(2) + if ph != h or pw != w: + pred_heatmap_valid = F.interpolate( + input=pred_heatmap_valid, size=(h, w), mode='bilinear') + + loss = self.loss_segm_mask(pred_heatmap_valid, gt_sem_mask) + return loss + def compute_losses(self, predictions: dict, targets: dict): """Compute losses.""" pred_betas = predictions['pred_shape'].view(-1, 10) @@ -559,14 +668,29 @@ def compute_losses(self, predictions: dict, targets: dict): global_orient=gt_pose[:, :3], num_joints=gt_keypoints2d.shape[1]) gt_vertices = gt_output['vertices'] - + gt_model_joints = gt_output['joints'] + if 'has_keypoints3d' in targets: + has_keypoints3d = targets['has_keypoints3d'].squeeze(-1) + else: + has_keypoints3d = None + if 'has_keypoints2d' in targets: + has_keypoints2d = targets['has_keypoints2d'].squeeze(-1) + else: + has_keypoints2d = None + if 'pred_segm_mask' in predictions: + pred_segm_mask = predictions['pred_segm_mask'] losses = {} if self.loss_keypoints3d is not None: losses['keypoints3d_loss'] = self.compute_keypoints3d_loss( - pred_keypoints3d, gt_keypoints3d) + pred_keypoints3d, + gt_keypoints3d, + has_keypoints3d=has_keypoints3d) if self.loss_keypoints2d is not None: losses['keypoints2d_loss'] = self.compute_keypoints2d_loss( - pred_keypoints3d, pred_cam, gt_keypoints2d) + pred_keypoints3d, + pred_cam, + gt_keypoints2d, + has_keypoints2d=has_keypoints2d) if self.loss_vertex is not None: losses['vertex_loss'] = self.compute_vertex_loss( pred_vertices, gt_vertices, has_smpl) @@ -578,6 +702,10 @@ def compute_losses(self, predictions: dict, targets: dict): pred_betas, gt_betas, has_smpl) if self.loss_camera is not None: losses['camera_loss'] = self.compute_camera_loss(pred_cam) + if self.loss_segm_mask is not None: + losses['loss_segm_mask'] = self.compute_part_segmentation_loss( + pred_segm_mask, gt_vertices, gt_keypoints2d, gt_model_joints, + has_smpl) return losses diff --git a/mmhuman3d/models/backbones/__init__.py b/mmhuman3d/models/backbones/__init__.py index 5a34fb2a..5e22c407 100644 --- a/mmhuman3d/models/backbones/__init__.py +++ b/mmhuman3d/models/backbones/__init__.py @@ -1,3 +1,4 @@ +from .hrnet import PoseHighResolutionNet from .resnet import ResNet, ResNetV1d -__all__ = ['ResNet', 'ResNetV1d'] +__all__ = ['ResNet', 'ResNetV1d', 'PoseHighResolutionNet'] diff --git a/mmhuman3d/models/backbones/hrnet.py b/mmhuman3d/models/backbones/hrnet.py new file mode 100644 index 00000000..1a670fbd --- /dev/null +++ b/mmhuman3d/models/backbones/hrnet.py @@ -0,0 +1,665 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import warnings + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmcv.runner import BaseModule, ModuleList, Sequential +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from .resnet import BasicBlock, Bottleneck + + +class HRModule(BaseModule): + """High-Resolution Module for HRNet. + + In this module, every branch has 4 BasicBlocks/Bottlenecks. Fusion/Exchange + is in this module. + """ + + def __init__(self, + num_branches, + blocks, + num_blocks, + in_channels, + num_channels, + multiscale_output=True, + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + block_init_cfg=None, + init_cfg=None): + super(HRModule, self).__init__(init_cfg) + self.block_init_cfg = block_init_cfg + self._check_branches(num_branches, num_blocks, in_channels, + num_channels) + + self.in_channels = in_channels + self.num_branches = num_branches + + self.multiscale_output = multiscale_output + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + self.with_cp = with_cp + self.branches = self._make_branches(num_branches, blocks, num_blocks, + num_channels) + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU(inplace=False) + + def _check_branches(self, num_branches, num_blocks, in_channels, + num_channels): + if num_branches != len(num_blocks): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_BLOCKS({len(num_blocks)})' + raise ValueError(error_msg) + + if num_branches != len(num_channels): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_CHANNELS({len(num_channels)})' + raise ValueError(error_msg) + + if num_branches != len(in_channels): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_INCHANNELS({len(in_channels)})' + raise ValueError(error_msg) + + def _make_one_branch(self, + branch_index, + block, + num_blocks, + num_channels, + stride=1): + downsample = None + if stride != 1 or \ + self.in_channels[branch_index] != \ + num_channels[branch_index] * block.expansion: + downsample = nn.Sequential( + build_conv_layer( + self.conv_cfg, + self.in_channels[branch_index], + num_channels[branch_index] * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer(self.norm_cfg, num_channels[branch_index] * + block.expansion)[1]) + + layers = [] + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + stride, + downsample=downsample, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + init_cfg=self.block_init_cfg)) + self.in_channels[branch_index] = \ + num_channels[branch_index] * block.expansion + for i in range(1, num_blocks[branch_index]): + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + init_cfg=self.block_init_cfg)) + + return Sequential(*layers) + + def _make_branches(self, num_branches, block, num_blocks, num_channels): + branches = [] + + for i in range(num_branches): + branches.append( + self._make_one_branch(i, block, num_blocks, num_channels)) + + return ModuleList(branches) + + def _make_fuse_layers(self): + if self.num_branches == 1: + return None + + num_branches = self.num_branches + in_channels = self.in_channels + fuse_layers = [] + num_out_branches = num_branches if self.multiscale_output else 1 + for i in range(num_out_branches): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + 
nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, in_channels[i])[1], + nn.Upsample( + scale_factor=2**(j - i), mode='nearest'))) + elif j == i: + fuse_layer.append(None) + else: + conv_downsamples = [] + for k in range(i - j): + if k == i - j - 1: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[i])[1])) + else: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[j])[1], + nn.ReLU(inplace=False))) + fuse_layer.append(nn.Sequential(*conv_downsamples)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def forward(self, x): + """Forward function.""" + if self.num_branches == 1: + return [self.branches[0](x[0])] + + for i in range(self.num_branches): + x[i] = self.branches[i](x[i]) + + x_fuse = [] + for i in range(len(self.fuse_layers)): + y = 0 + for j in range(self.num_branches): + if i == j: + y += x[j] + else: + y += self.fuse_layers[i][j](x[j]) + x_fuse.append(self.relu(y)) + return x_fuse + + +@BACKBONES.register_module() +class PoseHighResolutionNet(BaseModule): + """HRNet backbone. + `High-Resolution Representations for Labeling Pixels and Regions + arXiv: `_. + Args: + extra (dict): Detailed configuration for each stage of HRNet. + There must be 4 stages, the configuration for each stage must have + 5 keys: + - num_modules(int): The number of HRModule in this stage. + - num_branches(int): The number of branches in the HRModule. + - block(str): The type of convolution block. + - num_blocks(tuple): The number of blocks in each branch. + The length must be equal to num_branches. + - num_channels(tuple): The number of channels in each branch. + The length must be equal to num_branches. + in_channels (int): Number of input image channels. Default: 3. + conv_cfg (dict): Dictionary to construct and config conv layer. + norm_cfg (dict): Dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: True. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: False. + multiscale_output (bool): Whether to output multi-level features + produced by multiple branches. If False, only the first level + feature will be output. Default: True. + num_joints(int): the number of output for the final layer. Default: 24. + pretrained (str, optional): Model pretrained path. Default: None. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. 
+ """ + + blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck} + + def __init__(self, + extra, + in_channels=3, + conv_cfg=None, + norm_cfg=dict(type='BN'), + norm_eval=True, + with_cp=False, + num_joints=24, + zero_init_residual=False, + multiscale_output=True, + pretrained=None, + init_cfg=None): + super(PoseHighResolutionNet, self).__init__(init_cfg) + + self.pretrained = pretrained + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be specified at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + val=1, + layer=['_BatchNorm', 'GroupNorm']) + ] + else: + raise TypeError('pretrained must be a str or None') + + # Assert configurations of 4 stages are in extra + assert 'stage1' in extra and 'stage2' in extra \ + and 'stage3' in extra and 'stage4' in extra + # Assert whether the length of `num_blocks` and `num_channels` are + # equal to `num_branches` + for i in range(4): + cfg = extra[f'stage{i + 1}'] + assert len(cfg['num_blocks']) == cfg['num_branches'] and \ + len(cfg['num_channels']) == cfg['num_branches'] + + self.extra = extra + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + self.zero_init_residual = zero_init_residual + + # stem net + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) + self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2) + + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + 64, + kernel_size=3, + stride=2, + padding=1, + bias=False) + + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + self.conv_cfg, + 64, + 64, + kernel_size=3, + stride=2, + padding=1, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.relu = nn.ReLU(inplace=True) + + # stage 1 + self.stage1_cfg = self.extra['stage1'] + num_channels = self.stage1_cfg['num_channels'][0] + block_type = self.stage1_cfg['block'] + num_blocks = self.stage1_cfg['num_blocks'][0] + + block = self.blocks_dict[block_type] + stage1_out_channels = num_channels * block.expansion + self.layer1 = self._make_layer(block, 64, num_channels, num_blocks) + + # stage 2 + self.stage2_cfg = self.extra['stage2'] + num_channels = self.stage2_cfg['num_channels'] + block_type = self.stage2_cfg['block'] + + block = self.blocks_dict[block_type] + num_channels = [channel * block.expansion for channel in num_channels] + self.transition1 = self._make_transition_layer([stage1_out_channels], + num_channels) + self.stage2, pre_stage_channels = self._make_stage( + self.stage2_cfg, num_channels) + + # stage 3 + self.stage3_cfg = self.extra['stage3'] + num_channels = self.stage3_cfg['num_channels'] + block_type = self.stage3_cfg['block'] + + block = self.blocks_dict[block_type] + num_channels = [channel * block.expansion for channel in num_channels] + self.transition2 = self._make_transition_layer(pre_stage_channels, + num_channels) + self.stage3, pre_stage_channels = self._make_stage( + self.stage3_cfg, num_channels) + + # stage 4 + self.stage4_cfg = self.extra['stage4'] + num_channels = self.stage4_cfg['num_channels'] + block_type = self.stage4_cfg['block'] + + block = self.blocks_dict[block_type] + num_channels = [channel * block.expansion for channel in 
num_channels] + self.transition3 = self._make_transition_layer(pre_stage_channels, + num_channels) + self.stage4, pre_stage_channels = self._make_stage( + self.stage4_cfg, num_channels, multiscale_output=multiscale_output) + # self.pretrained_layers = extra['pretrained_layers'] + self.final_layer = build_conv_layer( + cfg=self.conv_cfg, + in_channels=pre_stage_channels[0], + out_channels=num_joints, + kernel_size=extra['final_conv_kernel'], + stride=1, + padding=1 if extra['final_conv_kernel'] == 3 else 0) + if extra['downsample'] and extra['use_conv']: + self.downsample_stage_1 = self._make_downsample_layer( + 3, num_channel=self.stage2_cfg['num_channels'][0]) + self.downsample_stage_2 = self._make_downsample_layer( + 2, num_channel=self.stage2_cfg['num_channels'][-1]) + self.downsample_stage_3 = self._make_downsample_layer( + 1, num_channel=self.stage3_cfg['num_channels'][-1]) + elif not extra['downsample'] and extra['use_conv']: + self.upsample_stage_2 = self._make_upsample_layer( + 1, num_channel=self.stage2_cfg['num_channels'][-1]) + self.upsample_stage_3 = self._make_upsample_layer( + 2, num_channel=self.stage3_cfg['num_channels'][-1]) + self.upsample_stage_4 = self._make_upsample_layer( + 3, num_channel=self.stage4_cfg['num_channels'][-1]) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: the normalization layer named "norm2" """ + return getattr(self, self.norm2_name) + + def _make_transition_layer(self, num_channels_pre_layer, + num_channels_cur_layer): + num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + num_channels_pre_layer[i], + num_channels_cur_layer[i], + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + num_channels_cur_layer[i])[1], + nn.ReLU(inplace=True))) + else: + transition_layers.append(None) + else: + conv_downsamples = [] + for j in range(i + 1 - num_branches_pre): + in_channels = num_channels_pre_layer[-1] + out_channels = num_channels_cur_layer[i] \ + if j == i - num_branches_pre else in_channels + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, out_channels)[1], + nn.ReLU(inplace=True))) + transition_layers.append(nn.Sequential(*conv_downsamples)) + + return nn.ModuleList(transition_layers) + + def _make_layer(self, block, inplanes, planes, blocks, stride=1): + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = nn.Sequential( + build_conv_layer( + self.conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer(self.norm_cfg, planes * block.expansion)[1]) + + layers = [] + block_init_cfg = None + if self.pretrained is None and not hasattr( + self, 'init_cfg') and self.zero_init_residual: + if block is BasicBlock: + block_init_cfg = dict( + type='Constant', val=0, override=dict(name='norm2')) + elif block is Bottleneck: + block_init_cfg = dict( + type='Constant', val=0, override=dict(name='norm3')) + layers.append( + block( + inplanes, + planes, + stride, + 
downsample=downsample, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + init_cfg=block_init_cfg, + )) + inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block( + inplanes, + planes, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + init_cfg=block_init_cfg)) + + return Sequential(*layers) + + def _make_stage(self, layer_config, in_channels, multiscale_output=True): + num_modules = layer_config['num_modules'] + num_branches = layer_config['num_branches'] + num_blocks = layer_config['num_blocks'] + num_channels = layer_config['num_channels'] + block = self.blocks_dict[layer_config['block']] + + hr_modules = [] + block_init_cfg = None + if self.pretrained is None and not hasattr( + self, 'init_cfg') and self.zero_init_residual: + if block is BasicBlock: + block_init_cfg = dict( + type='Constant', val=0, override=dict(name='norm2')) + elif block is Bottleneck: + block_init_cfg = dict( + type='Constant', val=0, override=dict(name='norm3')) + + for i in range(num_modules): + # multi_scale_output is only used for the last module + if not multiscale_output and i == num_modules - 1: + reset_multiscale_output = False + else: + reset_multiscale_output = True + + hr_modules.append( + HRModule( + num_branches, + block, + num_blocks, + in_channels, + num_channels, + reset_multiscale_output, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + block_init_cfg=block_init_cfg)) + + return Sequential(*hr_modules), in_channels + + def _make_upsample_layer(self, num_layers, num_channel, kernel_size=3): + layers = [] + for i in range(num_layers): + layers.append( + nn.Upsample( + scale_factor=2, mode='bilinear', align_corners=True)) + layers.append( + build_conv_layer( + cfg=self.conv_cfg, + in_channels=num_channel, + out_channels=num_channel, + kernel_size=kernel_size, + stride=1, + padding=1, + bias=False, + )) + layers.append(build_norm_layer(self.norm_cfg, num_channel)[1]) + layers.append(nn.ReLU(inplace=True)) + + return nn.Sequential(*layers) + + def _make_downsample_layer(self, num_layers, num_channel, kernel_size=3): + layers = [] + for i in range(num_layers): + layers.append( + build_conv_layer( + cfg=self.conv_cfg, + in_channels=num_channel, + out_channels=num_channel, + kernel_size=kernel_size, + stride=2, + padding=1, + bias=False, + )) + layers.append(build_norm_layer(self.norm_cfg, num_channel)[1]) + layers.append(nn.ReLU(inplace=True)) + + return nn.Sequential(*layers) + + def forward(self, x): + """Forward function.""" + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.norm2(x) + x = self.relu(x) + x = self.layer1(x) + + x_list = [] + for i in range(self.stage2_cfg['num_branches']): + if self.transition1[i] is not None: + x_list.append(self.transition1[i](x)) + else: + x_list.append(x) + y_list = self.stage2(x_list) + + x_list = [] + for i in range(self.stage3_cfg['num_branches']): + if self.transition2[i] is not None: + x_list.append(self.transition2[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage3(x_list) + + x_list = [] + for i in range(self.stage4_cfg['num_branches']): + if self.transition3[i] is not None: + x_list.append(self.transition3[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage4(x_list) + if self.extra['return_list']: + return y_list + elif self.extra['downsample']: + if self.extra['use_conv']: + # Downsampling with strided convolutions + x1 = self.downsample_stage_1(y_list[0]) + x2 
= self.downsample_stage_2(y_list[1]) + x3 = self.downsample_stage_3(y_list[2]) + x = torch.cat([x1, x2, x3, y_list[3]], 1) + else: + # Downsampling with interpolation + x0_h, x0_w = y_list[3].size(2), y_list[3].size(3) + x1 = F.interpolate( + y_list[0], + size=(x0_h, x0_w), + mode='bilinear', + align_corners=True) + x2 = F.interpolate( + y_list[1], + size=(x0_h, x0_w), + mode='bilinear', + align_corners=True) + x3 = F.interpolate( + y_list[2], + size=(x0_h, x0_w), + mode='bilinear', + align_corners=True) + x = torch.cat([x1, x2, x3, y_list[3]], 1) + else: + if self.extra['use_conv']: + # Upsampling with interpolations + convolutions + x1 = self.upsample_stage_2(y_list[1]) + x2 = self.upsample_stage_3(y_list[2]) + x3 = self.upsample_stage_4(y_list[3]) + x = torch.cat([y_list[0], x1, x2, x3], 1) + else: + # Upsampling with interpolation + x0_h, x0_w = y_list[0].size(2), y_list[0].size(3) + x1 = F.interpolate( + y_list[1], + size=(x0_h, x0_w), + mode='bilinear', + align_corners=True) + x2 = F.interpolate( + y_list[2], + size=(x0_h, x0_w), + mode='bilinear', + align_corners=True) + x3 = F.interpolate( + y_list[3], + size=(x0_h, x0_w), + mode='bilinear', + align_corners=True) + x = torch.cat([y_list[0], x1, x2, x3], 1) + return x + + def train(self, mode=True): + """Convert the model into training mode will keeping the normalization + layer freezed.""" + super(PoseHighResolutionNet, self).train(mode) + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmhuman3d/models/backbones/resnet.py b/mmhuman3d/models/backbones/resnet.py index cb2c514c..185aea3c 100644 --- a/mmhuman3d/models/backbones/resnet.py +++ b/mmhuman3d/models/backbones/resnet.py @@ -652,3 +652,22 @@ class ResNetV1d(ResNet): def __init__(self, **kwargs): super(ResNetV1d, self).__init__( deep_stem=True, avg_down=True, **kwargs) + + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding.""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias=False, + dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution.""" + return nn.Conv2d( + in_planes, out_planes, kernel_size=1, stride=stride, bias=False) diff --git a/mmhuman3d/models/heads/__init__.py b/mmhuman3d/models/heads/__init__.py index 95e8bacf..eacdb399 100644 --- a/mmhuman3d/models/heads/__init__.py +++ b/mmhuman3d/models/heads/__init__.py @@ -1,4 +1,5 @@ from .hmr_head import HMRHead from .hybrik_head import HybrIKHead +from .pare_head import PareHead -__all__ = ['HMRHead', 'HybrIKHead'] +__all__ = ['HMRHead', 'HybrIKHead', 'PareHead'] diff --git a/mmhuman3d/models/heads/pare_head.py b/mmhuman3d/models/heads/pare_head.py new file mode 100644 index 00000000..32796b7a --- /dev/null +++ b/mmhuman3d/models/heads/pare_head.py @@ -0,0 +1,615 @@ +"""This script is modified from [PARE](https://github.com/ +mkocabas/PARE/tree/master/pare/models/layers). + +Original license please see docs/additional_licenses.md. +""" +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.runner.base_module import BaseModule +from torch.nn.modules.utils import _pair + +from mmhuman3d.utils.geometry import rot6d_to_rotmat +from ..builder import HEADS + + +class LocallyConnected2d(nn.Module): + """Locally Connected Layer. + + Args: + in_channels (int): + the in channel of the features. 
+ out_channels (int): + the out channel of the features. + output_size (List[int]): + the output size of the features. + kernel_size (int): + the size of the kernel. + stride (int): + the stride of the kernel. + Returns: + attended_features (torch.Tensor): + attended feature maps + """ + + def __init__(self, + in_channels, + out_channels, + output_size, + kernel_size, + stride, + bias=False): + super(LocallyConnected2d, self).__init__() + output_size = _pair(output_size) + self.weight = nn.Parameter( + torch.randn(1, out_channels, in_channels, output_size[0], + output_size[1], kernel_size**2), + requires_grad=True, + ) + if bias: + self.bias = nn.Parameter( + torch.randn(1, out_channels, output_size[0], output_size[1]), + requires_grad=True) + else: + self.register_parameter('bias', None) + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + + def forward(self, x): + _, c, h, w = x.size() + kh, kw = self.kernel_size + dh, dw = self.stride + x = x.unfold(2, kh, dh).unfold(3, kw, dw) + x = x.contiguous().view(*x.size()[:-2], -1) + # Sum in in_channel and kernel_size dims + out = (x.unsqueeze(1) * self.weight).sum([2, -1]) + if self.bias is not None: + out += self.bias + return out + + +class KeypointAttention(nn.Module): + """Keypoint Attention Layer. + + Args: + use_conv (bool): + whether to use conv for the attended feature map. + Default: False + in_channels (List[int]): + the in channel of shape_cam features and pose features. + Default: (256, 64) + out_channels (List[int]): + the out channel of shape_cam features and pose features. + Default: (256, 64) + Returns: + attended_features (torch.Tensor): + attended feature maps + """ + + def __init__(self, + use_conv=False, + in_channels=(256, 64), + out_channels=(256, 64), + act='softmax', + use_scale=False): + super(KeypointAttention, self).__init__() + self.use_conv = use_conv + self.in_channels = in_channels + self.out_channels = out_channels + self.act = act + self.use_scale = use_scale + if use_conv: + self.conv1x1_pose = nn.Conv1d( + in_channels[0], out_channels[0], kernel_size=1) + self.conv1x1_shape_cam = nn.Conv1d( + in_channels[1], out_channels[1], kernel_size=1) + + def forward(self, features, heatmaps): + batch_size, num_joints, height, width = heatmaps.shape + + if self.use_scale: + scale = 1.0 / np.sqrt(height * width) + heatmaps = heatmaps * scale + + if self.act == 'softmax': + normalized_heatmap = F.softmax( + heatmaps.reshape(batch_size, num_joints, -1), dim=-1) + elif self.act == 'sigmoid': + normalized_heatmap = torch.sigmoid( + heatmaps.reshape(batch_size, num_joints, -1)) + features = features.reshape(batch_size, -1, height * width) + + attended_features = torch.matmul(normalized_heatmap, + features.transpose(2, 1)) + attended_features = attended_features.transpose(2, 1) + + if self.use_conv: + if attended_features.shape[1] == self.in_channels[0]: + attended_features = self.conv1x1_pose(attended_features) + else: + attended_features = self.conv1x1_shape_cam(attended_features) + + return attended_features + + +def interpolate(feat, uv): + """ + Args: + feat (torch.Tensor): [B, C, H, W] image features + uv (torch.Tensor): [B, 2, N] uv coordinates + in the image plane, range [-1, 1] + Returns: + samples[:, :, :, 0] (torch.Tensor): + [B, C, N] image features at the uv coordinates + """ + if uv.shape[-1] != 2: + uv = uv.transpose(1, 2) # [B, N, 2] + uv = uv.unsqueeze(2) # [B, N, 1, 2] + # NOTE: for newer PyTorch, it seems that training + # results are degraded due to implementation diff in F.grid_sample + # 
for old versions, simply remove the aligned_corners argument. + if int(torch.__version__.split('.')[1]) < 4: + samples = torch.nn.functional.grid_sample(feat, uv) # [B, C, N, 1] + else: + samples = torch.nn.functional.grid_sample( + feat, uv, align_corners=True) # [B, C, N, 1] + return samples[:, :, :, 0] # [B, C, N] + + +def _softmax(tensor, temperature, dim=-1): + return F.softmax(tensor * temperature, dim=dim) + + +def softargmax2d( + heatmaps, + temperature=None, + normalize_keypoints=True, +): + """Softargmax layer for heatmaps.""" + dtype, device = heatmaps.dtype, heatmaps.device + if temperature is None: + temperature = torch.tensor(1.0, dtype=dtype, device=device) + batch_size, num_channels, height, width = heatmaps.shape + x = torch.arange( + 0, width, device=device, + dtype=dtype).reshape(1, 1, 1, width).expand(batch_size, -1, height, -1) + y = torch.arange( + 0, height, device=device, + dtype=dtype).reshape(1, 1, height, 1).expand(batch_size, -1, -1, width) + # Should be Bx2xHxW + points = torch.cat([x, y], dim=1) + normalized_heatmap = _softmax( + heatmaps.reshape(batch_size, num_channels, -1), + temperature=temperature.reshape(1, -1, 1), + dim=-1) + + # Should be BxJx2 + keypoints = ( + normalized_heatmap.reshape(batch_size, -1, 1, height * width) * + points.reshape(batch_size, 1, 2, -1)).sum(dim=-1) + + if normalize_keypoints: + # Normalize keypoints to [-1, 1] + keypoints[:, :, 0] = (keypoints[:, :, 0] / (width - 1) * 2 - 1) + keypoints[:, :, 1] = (keypoints[:, :, 1] / (height - 1) * 2 - 1) + + return keypoints, normalized_heatmap.reshape(batch_size, -1, height, width) + + +@HEADS.register_module() +class PareHead(BaseModule): + + def __init__( + self, + num_joints=24, + num_input_features=480, + softmax_temp=1.0, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + num_camera_params=3, + num_features_smpl=64, + final_conv_kernel=1, + pose_mlp_num_layers=1, + shape_mlp_num_layers=1, + pose_mlp_hidden_size=256, + shape_mlp_hidden_size=256, + bn_momentum=0.1, + use_heatmaps='part_segm', + use_keypoint_attention=False, + use_postconv_keypoint_attention=False, + keypoint_attention_act='softmax', # softmax, sigmoid + use_scale_keypoint_attention=False, + backbone='hrnet_w32-conv', # hrnet, resnet + smpl_mean_params=None, + deconv_with_bias=False, + ): + """PARE parameters regressor head. This class is modified from. + + [PARE](hhttps://github.com/ + mkocabas/PARE/blob/master/pare/models/head/pare_head.py). Original + license please see docs/additional_licenses.md. + + Args: + num_joints (int): + Number of joints, should be 24 for smpl. + num_input_features (int): + Number of input featuremap channels. + softmax_temp (float): + Softmax tempreture + num_deconv_layers (int): + Number of deconvolution layers. + num_deconv_filters (List[int]): + Number of filters for each deconvolution layer, + len(num_deconv_filters) == num_deconv_layers. + num_deconv_kernels (List[int]): + Kernel size for each deconvolution layer, + len(num_deconv_kernels) == num_deconv_layers. + num_camera_params (int): + Number of predicted camera parameter dimension. + num_features_smpl (int): + Number of feature map channels. + final_conv_kernel (int): + Kernel size for the final deconvolution feature map channels. + pose_mlp_num_layers (int): + Number of mpl layers for pose parameter regression. + shape_mlp_num_layers (int): + Number of mpl layers for pose parameter regression. + pose_mlp_hidden_size (int): + Hidden size for pose mpl layers. 
+ shape_mlp_hidden_size (int): + Hidden size for pose mpl layers. + bn_momemtum (float): + Momemtum for batch normalization. + use_heatmaps (str): + Types of heat maps to use. + use_keypoint_attention (bool) + Whether to use attention based on heat maps. + keypoint_attention_act (str): + Types of activation function for attention layers. + use_scale_keypoint_attention (str): + Whether to scale the attention + according to the size of the attention map. + deconv_with_bias (bool) + Whether to deconv with bias. + backbone (str): + Types of the backbone. + smpl_mean_params (str): + File name of the mean SMPL parameters + """ + + super(PareHead, self).__init__() + self.backbone = backbone + self.num_joints = num_joints + self.deconv_with_bias = deconv_with_bias + self.use_heatmaps = use_heatmaps + self.pose_mlp_num_layers = pose_mlp_num_layers + self.shape_mlp_num_layers = shape_mlp_num_layers + self.pose_mlp_hidden_size = pose_mlp_hidden_size + self.shape_mlp_hidden_size = shape_mlp_hidden_size + self.use_keypoint_attention = use_keypoint_attention + + self.num_input_features = num_input_features + self.bn_momentum = bn_momentum + if self.use_heatmaps == 'part_segm': + + self.use_keypoint_attention = True + + if backbone.startswith('hrnet'): + + self.keypoint_deconv_layers = self._make_conv_layer( + num_deconv_layers, + num_deconv_filters, + (3, ) * num_deconv_layers, + ) + self.num_input_features = num_input_features + self.smpl_deconv_layers = self._make_conv_layer( + num_deconv_layers, + num_deconv_filters, + (3, ) * num_deconv_layers, + ) + else: + # part branch that estimates 2d keypoints + + conv_fn = self._make_deconv_layer + + self.keypoint_deconv_layers = conv_fn( + num_deconv_layers, + num_deconv_filters, + num_deconv_kernels, + ) + # reset inplanes to 2048 -> final resnet layer + self.num_input_features = num_input_features + self.smpl_deconv_layers = conv_fn( + num_deconv_layers, + num_deconv_filters, + num_deconv_kernels, + ) + + pose_mlp_inp_dim = num_deconv_filters[-1] + smpl_final_dim = num_features_smpl + shape_mlp_inp_dim = num_joints * smpl_final_dim + + self.keypoint_final_layer = nn.Conv2d( + in_channels=num_deconv_filters[-1], + out_channels=num_joints + + 1 if self.use_heatmaps in ('part_segm', + 'part_segm_pool') else num_joints, + kernel_size=final_conv_kernel, + stride=1, + padding=1 if final_conv_kernel == 3 else 0, + ) + + self.smpl_final_layer = nn.Conv2d( + in_channels=num_deconv_filters[-1], + out_channels=smpl_final_dim, + kernel_size=final_conv_kernel, + stride=1, + padding=1 if final_conv_kernel == 3 else 0, + ) + + # temperature for softargmax function + self.register_buffer('temperature', torch.tensor(softmax_temp)) + mean_params = np.load(smpl_mean_params) + init_pose = torch.from_numpy(mean_params['pose'][:]).unsqueeze(0) + init_shape = torch.from_numpy( + mean_params['shape'][:].astype('float32')).unsqueeze(0) + init_cam = torch.from_numpy(mean_params['cam']).unsqueeze(0) + self.register_buffer('init_pose', init_pose) + self.register_buffer('init_shape', init_shape) + self.register_buffer('init_cam', init_cam) + + self.pose_mlp_inp_dim = pose_mlp_inp_dim + self.shape_mlp_inp_dim = shape_mlp_inp_dim + + self.shape_mlp = self._get_shape_mlp(output_size=10) + self.cam_mlp = self._get_shape_mlp(output_size=num_camera_params) + + self.pose_mlp = self._get_pose_mlp( + num_joints=num_joints, output_size=6) + + self.keypoint_attention = KeypointAttention( + use_conv=use_postconv_keypoint_attention, + in_channels=(self.pose_mlp_inp_dim, smpl_final_dim), + 
out_channels=(self.pose_mlp_inp_dim, smpl_final_dim), + act=keypoint_attention_act, + use_scale=use_scale_keypoint_attention, + ) + + def _get_shape_mlp(self, output_size): + """mlp layers for shape regression.""" + if self.shape_mlp_num_layers == 1: + return nn.Linear(self.shape_mlp_inp_dim, output_size) + + module_list = [] + for i in range(self.shape_mlp_num_layers): + if i == 0: + module_list.append( + nn.Linear(self.shape_mlp_inp_dim, + self.shape_mlp_hidden_size)) + elif i == self.shape_mlp_num_layers - 1: + module_list.append( + nn.Linear(self.shape_mlp_hidden_size, output_size)) + else: + module_list.append( + nn.Linear(self.shape_mlp_hidden_size, + self.shape_mlp_hidden_size)) + return nn.Sequential(*module_list) + + def _get_pose_mlp(self, num_joints, output_size): + """mlp layers for pose regression.""" + if self.pose_mlp_num_layers == 1: + + return LocallyConnected2d( + in_channels=self.pose_mlp_inp_dim, + out_channels=output_size, + output_size=[num_joints, 1], + kernel_size=1, + stride=1, + ) + + module_list = [] + for i in range(self.pose_mlp_num_layers): + if i == 0: + module_list.append( + LocallyConnected2d( + in_channels=self.pose_mlp_inp_dim, + out_channels=self.pose_mlp_hidden_size, + output_size=[num_joints, 1], + kernel_size=1, + stride=1, + )) + elif i == self.pose_mlp_num_layers - 1: + module_list.append( + LocallyConnected2d( + in_channels=self.pose_mlp_hidden_size, + out_channels=output_size, + output_size=[num_joints, 1], + kernel_size=1, + stride=1, + )) + else: + module_list.append( + LocallyConnected2d( + in_channels=self.pose_mlp_hidden_size, + out_channels=self.pose_mlp_hidden_size, + output_size=[num_joints, 1], + kernel_size=1, + stride=1, + )) + return nn.Sequential(*module_list) + + def _get_deconv_cfg(self, deconv_kernel): + """get deconv padding, output padding according to kernel size.""" + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + + return deconv_kernel, padding, output_padding + + def _make_conv_layer(self, num_layers, num_filters, num_kernels): + """make convolution layers.""" + assert num_layers == len(num_filters), \ + 'ERROR: num_conv_layers is different len(num_conv_filters)' + assert num_layers == len(num_kernels), \ + 'ERROR: num_conv_layers is different len(num_conv_filters)' + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + layers.append( + nn.Conv2d( + in_channels=self.num_input_features, + out_channels=planes, + kernel_size=kernel, + stride=1, + padding=padding, + bias=self.deconv_with_bias)) + layers.append(nn.BatchNorm2d(planes, momentum=self.bn_momentum)) + layers.append(nn.ReLU(inplace=True)) + self.num_input_features = planes + + return nn.Sequential(*layers) + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + """make deconvolution layers.""" + assert num_layers == len(num_filters), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + assert num_layers == len(num_kernels), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + layers.append( + nn.ConvTranspose2d( + in_channels=self.num_input_features, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + 
output_padding=output_padding, + bias=self.deconv_with_bias)) + layers.append(nn.BatchNorm2d(planes, momentum=self.bn_momentum)) + layers.append(nn.ReLU(inplace=True)) + # if self.use_self_attention: + # layers.append(SelfAttention(planes)) + self.num_input_features = planes + + return nn.Sequential(*layers) + + def forward(self, features): + batch_size = features.shape[0] + + init_pose = self.init_pose.expand(batch_size, -1) # N, Jx6 + init_shape = self.init_shape.expand(batch_size, -1) + init_cam = self.init_cam.expand(batch_size, -1) + + output = {} + + part_feats = self._get_2d_branch_feats(features) + + part_attention = self._get_part_attention_map(part_feats, output) + + smpl_feats = self._get_3d_smpl_feats(features, part_feats) + + point_local_feat, cam_shape_feats = self._get_local_feats( + smpl_feats, part_attention, output) + + pred_pose, pred_shape, pred_cam = self._get_final_preds( + point_local_feat, cam_shape_feats, init_pose, init_shape, init_cam) + + pred_rotmat = rot6d_to_rotmat(pred_pose).reshape(batch_size, 24, 3, 3) + + output.update({ + 'pred_pose': pred_rotmat, + 'pred_cam': pred_cam, + 'pred_shape': pred_shape, + }) + return output + + def _get_local_feats(self, smpl_feats, part_attention, output): + # 1x1 conv + """get keypoints and camera features from backbone features.""" + + cam_shape_feats = self.smpl_final_layer(smpl_feats) + + if self.use_keypoint_attention: + point_local_feat = self.keypoint_attention(smpl_feats, + part_attention) + cam_shape_feats = self.keypoint_attention(cam_shape_feats, + part_attention) + else: + point_local_feat = interpolate(smpl_feats, output['pred_kp2d']) + cam_shape_feats = interpolate(cam_shape_feats, output['pred_kp2d']) + return point_local_feat, cam_shape_feats + + def _get_2d_branch_feats(self, features): + """get part features from backbone features.""" + part_feats = self.keypoint_deconv_layers(features) + + return part_feats + + def _get_3d_smpl_feats(self, features, part_feats): + """get smpl feature maps from backbone features.""" + + smpl_feats = self.smpl_deconv_layers(features) + + return smpl_feats + + def _get_part_attention_map(self, part_feats, output): + """get attention map from part feature map.""" + heatmaps = self.keypoint_final_layer(part_feats) + + if self.use_heatmaps == 'part_segm': + + output['pred_segm_mask'] = heatmaps + # remove the the background channel + heatmaps = heatmaps[:, 1:, :, :] + else: + pred_kp2d, _ = softargmax2d(heatmaps, self.temperature) + output['pred_kp2d'] = pred_kp2d + output['pred_heatmaps_2d'] = heatmaps + return heatmaps + + def _get_final_preds(self, pose_feats, cam_shape_feats, init_pose, + init_shape, init_cam): + """get final preds.""" + return self._pare_get_final_preds(pose_feats, cam_shape_feats, + init_pose, init_shape, init_cam) + + def _pare_get_final_preds(self, pose_feats, cam_shape_feats, init_pose, + init_shape, init_cam): + """get final preds.""" + pose_feats = pose_feats.unsqueeze(-1) # + + if init_pose.shape[-1] == 6: + # This means init_pose comes from a previous iteration + init_pose = init_pose.transpose(2, 1).unsqueeze(-1) + else: + # This means init pose comes from mean pose + init_pose = init_pose.reshape(init_pose.shape[0], 6, + -1).unsqueeze(-1) + + shape_feats = cam_shape_feats + + shape_feats = torch.flatten(shape_feats, start_dim=1) + + pred_pose = self.pose_mlp(pose_feats) + pred_cam = self.cam_mlp(shape_feats) + pred_shape = self.shape_mlp(shape_feats) + + pred_pose = pred_pose.squeeze(-1).transpose(2, 1) # N, J, 6 + return pred_pose, pred_shape, 
pred_cam diff --git a/mmhuman3d/models/losses/__init__.py b/mmhuman3d/models/losses/__init__.py index c2c54872..747ead35 100644 --- a/mmhuman3d/models/losses/__init__.py +++ b/mmhuman3d/models/losses/__init__.py @@ -1,3 +1,4 @@ +from .cross_entropy_loss import CrossEntropyLoss from .gan_loss import GANLoss from .mse_loss import KeypointMSELoss, MSELoss from .prior_loss import ( @@ -22,7 +23,7 @@ __all__ = [ 'reduce_loss', 'weight_reduce_loss', 'weighted_loss', 'convert_to_one_hot', 'MSELoss', 'L1Loss', 'SmoothL1Loss', 'GANLoss', 'JointPriorLoss', - 'PoseRegLoss', 'ShapePriorLoss', 'KeypointMSELoss', 'CameraPriorLoss', - 'SmoothJointLoss', 'SmoothPelvisLoss', 'SmoothTranslationLoss', - 'MaxMixturePrior', 'LimbLengthLoss' + 'ShapePriorLoss', 'KeypointMSELoss', 'CameraPriorLoss', 'SmoothJointLoss', + 'SmoothPelvisLoss', 'SmoothTranslationLoss', 'MaxMixturePrior', + 'CrossEntropyLoss', 'PoseRegLoss', 'LimbLengthLoss' ] diff --git a/mmhuman3d/models/losses/cross_entropy_loss.py b/mmhuman3d/models/losses/cross_entropy_loss.py new file mode 100644 index 00000000..ec8f6aeb --- /dev/null +++ b/mmhuman3d/models/losses/cross_entropy_loss.py @@ -0,0 +1,250 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..builder import LOSSES +from .utils import weight_reduce_loss + + +def cross_entropy(pred, + label, + weight=None, + reduction='mean', + avg_factor=None, + class_weight=None, + ignore_index=-100): + """Calculate the CrossEntropy loss. + + Args: + pred (torch.Tensor): The prediction with shape (N, C), C is the number + of classes. + label (torch.Tensor): The learning label of the prediction. + weight (torch.Tensor, optional): Sample-wise loss weight. + reduction (str, optional): The method used to reduce the loss. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + class_weight (list[float], optional): The weight for each class. + ignore_index (int | None): The label index to be ignored. + If None, it will be set to default value. Default: -100. + + Returns: + torch.Tensor: The calculated loss + """ + # The default value of ignore_index is the same as F.cross_entropy + ignore_index = -100 if ignore_index is None else ignore_index + # element-wise losses + loss = F.cross_entropy( + pred, + label, + weight=class_weight, + reduction='none', + ignore_index=ignore_index) + + # apply weights and do the reduction + if weight is not None: + weight = weight.float() + loss = weight_reduce_loss( + loss, weight=weight, reduction=reduction, avg_factor=avg_factor) + + return loss + + +def _expand_onehot_labels(labels, label_weights, label_channels, ignore_index): + """Expand onehot labels to match the size of prediction.""" + bin_labels = labels.new_full((labels.size(0), label_channels), 0) + valid_mask = (labels >= 0) & (labels != ignore_index) + inds = torch.nonzero( + valid_mask & (labels < label_channels), as_tuple=False) + + if inds.numel() > 0: + bin_labels[inds, labels[inds]] = 1 + + valid_mask = valid_mask.view(-1, 1).expand(labels.size(0), + label_channels).float() + if label_weights is None: + bin_label_weights = valid_mask + else: + bin_label_weights = label_weights.view(-1, 1).repeat(1, label_channels) + bin_label_weights *= valid_mask + + return bin_labels, bin_label_weights + + +def binary_cross_entropy(pred, + label, + weight=None, + reduction='mean', + avg_factor=None, + class_weight=None, + ignore_index=-100): + """Calculate the binary CrossEntropy loss. 
+
+    Args:
+        pred (torch.Tensor): The prediction with shape (N, 1).
+        label (torch.Tensor): The learning label of the prediction.
+        weight (torch.Tensor, optional): Sample-wise loss weight.
+        reduction (str, optional): The method used to reduce the loss.
+            Options are "none", "mean" and "sum".
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+        class_weight (list[float], optional): The weight for each class.
+        ignore_index (int | None): The label index to be ignored.
+            If None, it will be set to default value. Default: -100.
+
+    Returns:
+        torch.Tensor: The calculated loss.
+    """
+    # The default value of ignore_index is the same as F.cross_entropy
+    ignore_index = -100 if ignore_index is None else ignore_index
+    if pred.dim() != label.dim():
+        label, weight = _expand_onehot_labels(label, weight, pred.size(-1),
+                                              ignore_index)
+
+    # weighted element-wise losses
+    if weight is not None:
+        weight = weight.float()
+    loss = F.binary_cross_entropy_with_logits(
+        pred, label.float(), pos_weight=class_weight, reduction='none')
+    # do the reduction for the weighted loss
+    loss = weight_reduce_loss(
+        loss, weight, reduction=reduction, avg_factor=avg_factor)
+
+    return loss
+
+
+def mask_cross_entropy(pred,
+                       target,
+                       label,
+                       reduction='mean',
+                       avg_factor=None,
+                       class_weight=None,
+                       ignore_index=None):
+    """Calculate the CrossEntropy loss for masks.
+
+    Args:
+        pred (torch.Tensor): The prediction with shape (N, C, *), C is the
+            number of classes. The trailing * indicates arbitrary shape.
+        target (torch.Tensor): The learning label of the prediction.
+        label (torch.Tensor): ``label`` indicates the class label of the mask
+            corresponding object. This will be used to select the mask of the
+            class which the object belongs to when the mask prediction is not
+            class-agnostic.
+        reduction (str, optional): The method used to reduce the loss.
+            Options are "none", "mean" and "sum".
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+        class_weight (list[float], optional): The weight for each class.
+        ignore_index (None): Placeholder, to be consistent with other loss.
+            Default: None.
+
+    Returns:
+        torch.Tensor: The calculated loss.
+
+    Example:
+        >>> N, C = 3, 11
+        >>> H, W = 2, 2
+        >>> pred = torch.randn(N, C, H, W) * 1000
+        >>> target = torch.rand(N, H, W)
+        >>> label = torch.randint(0, C, size=(N,))
+        >>> reduction = 'mean'
+        >>> avg_factor = None
+        >>> class_weights = None
+        >>> loss = mask_cross_entropy(pred, target, label, reduction,
+        >>>                           avg_factor, class_weights)
+        >>> assert loss.shape == (1,)
+    """
+    assert ignore_index is None, 'BCE loss does not support ignore_index'
+    # TODO: handle these two reserved arguments
+    assert reduction == 'mean' and avg_factor is None
+    num_rois = pred.size()[0]
+    inds = torch.arange(0, num_rois, dtype=torch.long, device=pred.device)
+    pred_slice = pred[inds, label].squeeze(1)
+    return F.binary_cross_entropy_with_logits(
+        pred_slice, target, weight=class_weight, reduction='mean')[None]
+
+
+@LOSSES.register_module()
+class CrossEntropyLoss(nn.Module):
+
+    def __init__(self,
+                 use_sigmoid=False,
+                 use_mask=False,
+                 reduction='mean',
+                 class_weight=None,
+                 ignore_index=None,
+                 loss_weight=1.0):
+        """CrossEntropyLoss.
+
+        Args:
+            use_sigmoid (bool, optional): Whether the prediction uses sigmoid
+                or softmax. Defaults to False.
+            use_mask (bool, optional): Whether to use mask cross entropy loss.
+                Defaults to False.
+            reduction (str, optional): The method used to reduce the loss.
Defaults to 'mean'. + Options are "none", "mean" and "sum". + class_weight (list[float], optional): Weight of each class. + Defaults to None. + ignore_index (int | None): The label index to be ignored. + Defaults to None. + loss_weight (float, optional): Weight of the loss. Defaults to 1.0. + """ + super(CrossEntropyLoss, self).__init__() + assert (use_sigmoid is False) or (use_mask is False) + self.use_sigmoid = use_sigmoid + self.use_mask = use_mask + self.reduction = reduction + self.loss_weight = loss_weight + self.class_weight = class_weight + self.ignore_index = ignore_index + + if self.use_sigmoid: + self.cls_criterion = binary_cross_entropy + elif self.use_mask: + self.cls_criterion = mask_cross_entropy + else: + self.cls_criterion = cross_entropy + + def forward(self, + cls_score, + label, + weight=None, + avg_factor=None, + reduction_override=None, + ignore_index=None, + **kwargs): + """Forward function. + + Args: + cls_score (torch.Tensor): The prediction. + label (torch.Tensor): The learning label of the prediction. + weight (torch.Tensor, optional): Sample-wise loss weight. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The method used to reduce the + loss. Options are "none", "mean" and "sum". + ignore_index (int | None): The label index to be ignored. + If not None, it will override the default value. Default: None. + Returns: + torch.Tensor: The calculated loss. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if ignore_index is None: + ignore_index = self.ignore_index + + if self.class_weight is not None: + class_weight = cls_score.new_tensor( + self.class_weight, device=cls_score.device) + else: + class_weight = None + loss_cls = self.loss_weight * self.cls_criterion( + cls_score, + label, + weight, + class_weight=class_weight, + reduction=reduction, + avg_factor=avg_factor, + ignore_index=ignore_index, + **kwargs) + return loss_cls diff --git a/mmhuman3d/utils/geometry.py b/mmhuman3d/utils/geometry.py index a1820a59..28439e4e 100644 --- a/mmhuman3d/utils/geometry.py +++ b/mmhuman3d/utils/geometry.py @@ -60,7 +60,10 @@ def rot6d_to_rotmat(x): Output: (B,3,3) Batch of corresponding rotation matrices """ - x = x.view(-1, 3, 2) + if isinstance(x, torch.Tensor): + x = x.reshape(-1, 3, 2) + elif isinstance(x, np.ndarray): + x = x.view(-1, 3, 2) a1 = x[:, :, 0] a2 = x[:, :, 1] b1 = F.normalize(a1) diff --git a/tests/test_datasets/test_pipelines.py b/tests/test_datasets/test_pipelines.py index f0a9afb9..4e0daf61 100644 --- a/tests/test_datasets/test_pipelines.py +++ b/tests/test_datasets/test_pipelines.py @@ -1,7 +1,10 @@ import numpy as np import pytest -from mmhuman3d.data.datasets.pipelines import LoadImageFromFile +from mmhuman3d.data.datasets.pipelines import ( + LoadImageFromFile, + SyntheticOcclusion, +) test_image_path = 'tests/data/dataset_sample/3DPW/imageFiles/' \ 'courtyard_arguing_00/image_00000.jpg' @@ -43,3 +46,14 @@ def test_load_image_from_file_smc(): assert isinstance(results['img'], np.ndarray) assert results['img_shape'] == results['ori_shape'] == (1920, 1440) assert isinstance(results['img_norm_cfg'], dict) + + +def test_synthetic_occlusion(): + results = {'img': None} + results['img'] = np.ones((224, 224, 3)) + occluders = [np.zeros((18, 18, 4))] + occluders[0][2:5, 2:5, 3] = 255 + pipeline = SyntheticOcclusion(occluders=occluders) + + results = pipeline(results) + 
assert results['img'].shape == (224, 224, 3) diff --git a/tests/test_models/test_architectures/test_mesh_estimator.py b/tests/test_models/test_architectures/test_mesh_estimator.py index 0abf650a..b888c6e0 100644 --- a/tests/test_models/test_architectures/test_mesh_estimator.py +++ b/tests/test_models/test_architectures/test_mesh_estimator.py @@ -6,6 +6,7 @@ VideoBodyModelEstimator, ) from mmhuman3d.models.builder import build_body_model +from mmhuman3d.utils.geometry import project_points def test_image_body_mesh_estimator(): @@ -176,6 +177,15 @@ def test_compute_keypoints3d_loss(): loss = model.compute_keypoints3d_loss(pred_keypoints3d, gt_keypoints3d) assert loss > 0 + has_keypoints3d = torch.ones(32) + loss = model.compute_keypoints3d_loss( + pred_keypoints3d, gt_keypoints3d, has_keypoints3d=has_keypoints3d) + assert loss > 0 + has_keypoints3d = torch.zeros(32) + loss = model.compute_keypoints3d_loss( + pred_keypoints3d, gt_keypoints3d, has_keypoints3d=has_keypoints3d) + assert loss == 0 + def test_compute_keypoints2d_loss(): model = ImageBodyModelEstimator( @@ -197,6 +207,22 @@ def test_compute_keypoints2d_loss(): gt_keypoints2d) assert loss > 0 + has_keypoints2d = torch.ones((32)) + loss = model.compute_keypoints2d_loss( + pred_keypoints3d, + pred_cam, + gt_keypoints2d, + has_keypoints2d=has_keypoints2d) + assert loss > 0 + + has_keypoints2d = torch.zeros((32)) + loss = model.compute_keypoints2d_loss( + pred_keypoints3d, + pred_cam, + gt_keypoints2d, + has_keypoints2d=has_keypoints2d) + assert loss == 0 + def test_compute_vertex_loss(): model = ImageBodyModelEstimator( @@ -234,6 +260,43 @@ def test_compute_smpl_pose_loss(): assert loss > 0 +def test_compute_part_segm_loss(): + N = 1 + random_body_pose = torch.rand((N, 69)) + body_model_train = dict( + type='SMPL', + keypoint_src='smpl_54', + keypoint_dst='smpl_49', + model_path='data/body_models/smpl', + extra_joints_regressor='data/body_models/J_regressor_extra.npy') + body_model = build_body_model(body_model_train) + + body_model_output = body_model(body_pose=random_body_pose, ) + gt_model_joins = body_model_output['joints'].detach() + cam = torch.ones(N, 3) + gt_keypoints2d = project_points( + gt_model_joins, cam, focal_length=5000, img_res=224) + loss_segm_mask = dict(type='CrossEntropyLoss', loss_weight=60) + + gt_keypoints2d = torch.cat([gt_keypoints2d, torch.ones(N, 49, 1)], dim=-1) + model = ImageBodyModelEstimator( + body_model_train=body_model_train, + loss_segm_mask=loss_segm_mask, + ) + gt_vertices = torch.randn(N, 6890, 3) + pred_heatmap = torch.zeros(N, 25, 224, 224) + pred_heatmap[:, 0, :, :] = 1 + has_smpl = torch.ones((N)) + + loss = model.compute_part_segmentation_loss( + pred_heatmap, + gt_vertices, + has_smpl=has_smpl, + gt_keypoints2d=gt_keypoints2d, + gt_model_joints=gt_model_joins) + assert loss > 0 + + def test_compute_smpl_betas_loss(): model = ImageBodyModelEstimator( convention='smpl_54', diff --git a/tests/test_models/test_backbones/test_hrnet.py b/tests/test_models/test_backbones/test_hrnet.py new file mode 100644 index 00000000..6ee5f1fc --- /dev/null +++ b/tests/test_models/test_backbones/test_hrnet.py @@ -0,0 +1,173 @@ +import pytest +import torch + +from mmhuman3d.models.backbones.hrnet import HRModule, PoseHighResolutionNet +from mmhuman3d.models.backbones.resnet import BasicBlock, Bottleneck + + +def all_zeros(modules): + """Check if the weight(and bias) is all zero.""" + weight_zero = torch.equal(modules.weight.data, + torch.zeros_like(modules.weight.data)) + if hasattr(modules, 'bias'): + bias_zero 
= torch.equal(modules.bias.data, + torch.zeros_like(modules.bias.data)) + else: + bias_zero = True + + return weight_zero and bias_zero + + +@pytest.mark.parametrize('block', [BasicBlock, Bottleneck]) +def test_hrmodule(block): + # Test multiscale forward + num_channles = (32, 64) + in_channels = [c * block.expansion for c in num_channles] + hrmodule = HRModule( + num_branches=2, + blocks=block, + in_channels=in_channels, + num_blocks=(4, 4), + num_channels=num_channles, + ) + + feats = [ + torch.randn(1, in_channels[0], 64, 64), + torch.randn(1, in_channels[1], 32, 32) + ] + feats = hrmodule(feats) + + assert len(feats) == 2 + assert feats[0].shape == torch.Size([1, in_channels[0], 64, 64]) + assert feats[1].shape == torch.Size([1, in_channels[1], 32, 32]) + + # Test single scale forward + num_channles = (32, 64) + in_channels = [c * block.expansion for c in num_channles] + hrmodule = HRModule( + num_branches=2, + blocks=block, + in_channels=in_channels, + num_blocks=(4, 4), + num_channels=num_channles, + multiscale_output=False, + ) + + feats = [ + torch.randn(1, in_channels[0], 64, 64), + torch.randn(1, in_channels[1], 32, 32) + ] + feats = hrmodule(feats) + + assert len(feats) == 1 + assert feats[0].shape == torch.Size([1, in_channels[0], 64, 64]) + + +def test_hrnet_backbone(): + # only have 3 stages + extra = dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + return_list=True, + downsample=False, + use_conv=True, + final_conv_kernel=1, + ) + + with pytest.raises(AssertionError): + # HRNet now only support 4 stages + PoseHighResolutionNet(extra=extra) + extra['stage4'] = dict( + num_modules=3, + num_branches=3, # should be 4 + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256)) + + with pytest.raises(AssertionError): + # len(num_blocks) should equal num_branches + PoseHighResolutionNet(extra=extra) + + extra['stage4']['num_branches'] = 4 + + # Test hrnetv2p_w32 + model = PoseHighResolutionNet(extra=extra) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 256, 256) + feats = model(imgs) + assert len(feats) == 4 + assert feats[0].shape == torch.Size([1, 32, 64, 64]) + assert feats[3].shape == torch.Size([1, 256, 8, 8]) + + # Test single scale output + model = PoseHighResolutionNet(extra=extra, multiscale_output=False) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 256, 256) + feats = model(imgs) + assert len(feats) == 1 + assert feats[0].shape == torch.Size([1, 32, 64, 64]) + + extra['return_list'] = False + model = PoseHighResolutionNet(extra=extra) + model.train() + + imgs = torch.randn(1, 3, 256, 256) + feats = model(imgs) + assert feats.shape == torch.Size([1, 480, 64, 64]) + extra['use_conv'] = False + model = PoseHighResolutionNet(extra=extra) + model.init_weights() + imgs = torch.randn(1, 3, 256, 256) + feats = model(imgs) + assert feats.shape == torch.Size([1, 480, 64, 64]) + + extra['downsample'] = True + model = PoseHighResolutionNet(extra=extra) + model.init_weights() + imgs = torch.randn(1, 3, 256, 256) + feats = model(imgs) + assert feats.shape == torch.Size([1, 480, 8, 8]) + + extra['use_conv'] = True + model = PoseHighResolutionNet(extra=extra) + model.init_weights() + imgs = torch.randn(1, 3, 256, 256) + 
feats = model(imgs) + assert feats.shape == torch.Size([1, 480, 8, 8]) + extra['use_conv'] = False + + model = PoseHighResolutionNet(extra=extra, zero_init_residual=True) + model.init_weights() + + model.train() + init_cfg = {type: 'Pretrained'} + pretrained = '.' + with pytest.raises(AssertionError): + # # len(num_blocks) should equal num_branches + PoseHighResolutionNet( + extra=extra, init_cfg=init_cfg, pretrained=pretrained) + with pytest.raises(TypeError): + # # len(num_blocks) should equal num_branches + PoseHighResolutionNet(extra=extra, pretrained=1) + + PoseHighResolutionNet(extra=extra, pretrained=pretrained) diff --git a/tests/test_models/test_heads/test_pare_head.py b/tests/test_models/test_heads/test_pare_head.py new file mode 100644 index 00000000..ddb8d6ce --- /dev/null +++ b/tests/test_models/test_heads/test_pare_head.py @@ -0,0 +1,73 @@ +import numpy as np +import pytest +import torch + +from mmhuman3d.models.heads import PareHead + + +@pytest.mark.parametrize('deconv_with_bias', [True, False]) +def test_pare_head(deconv_with_bias): + + # generate weight file for SMPL model. + + # initialize models + head = PareHead( + backbone='hrnet_w32-conv', + use_keypoint_attention=True, + smpl_mean_params='data/body_models/smpl_mean_params.npz', + deconv_with_bias=deconv_with_bias) + + # mock inputs + batch_size = 4 + input_shape = (batch_size, 480, 64, 64) + features = _demo_head_inputs(input_shape) + features = torch.tensor(features).float() + + predictions = head(features) + pred_keys = ['pred_pose', 'pred_cam', 'pred_shape'] + + for k in pred_keys: + assert k in predictions + assert predictions[k].shape[0] == batch_size + + +def test_pare_head_no_attention(): + + # generate weight file for SMPL model. + + # initialize models + head = PareHead( + backbone='hrnet_w32-conv', + use_keypoint_attention=False, + use_heatmaps='', + smpl_mean_params='data/body_models/smpl_mean_params.npz', + ) + + # mock inputs + batch_size = 4 + input_shape = (batch_size, 480, 64, 64) + features = _demo_head_inputs(input_shape) + features = torch.tensor(features).float() + + predictions = head(features) + pred_keys = ['pred_pose', 'pred_cam', 'pred_shape'] + + for k in pred_keys: + assert k in predictions + assert predictions[k].shape[0] == batch_size + + +def _demo_head_inputs(input_shape=(1, 480, 56, 56)): + """Create a superset of inputs needed to run test or train batches. 
+ + Args: + input_shape (tuple): + input batch dimensions + """ + (N, C, H, W) = input_shape + + rng = np.random.RandomState(0) + + features = rng.rand(*input_shape) + + return features diff --git a/tests/test_models/test_losses/test_loss.py b/tests/test_models/test_losses/test_loss.py index fc8c2c4f..8e322968 100644 --- a/tests/test_models/test_losses/test_loss.py +++ b/tests/test_models/test_losses/test_loss.py @@ -1,10 +1,10 @@ import pytest import torch -from mmhuman3d.models.losses import L1Loss, MSELoss +from mmhuman3d.models.losses import CrossEntropyLoss, L1Loss, MSELoss -@pytest.mark.parametrize('loss_class', [MSELoss, L1Loss]) +@pytest.mark.parametrize('loss_class', [MSELoss, L1Loss, CrossEntropyLoss]) def test_loss_with_reduction_override(loss_class): pred = torch.rand((10, 3)) target = torch.rand((10, 3)), @@ -52,3 +52,43 @@ def test_regression_losses(loss_class): loss_class()( pred, target, avg_factor=10, reduction_override=reduction_override) assert isinstance(loss, torch.Tensor) + + +@pytest.mark.parametrize('use_sigmoid', [True, False]) +@pytest.mark.parametrize('reduction', ['sum', 'mean', None]) +def test_loss_with_ignore_index(use_sigmoid, reduction): + # Test cross_entropy loss + + loss_class = CrossEntropyLoss( + use_sigmoid=use_sigmoid, + use_mask=False, + ignore_index=255, + ) + pred = torch.rand((10, 5)) + target = torch.randint(0, 5, (10, )) + + ignored_indices = torch.randint(0, 10, (2, ), dtype=torch.long) + target[ignored_indices] = 255 + + # Test loss forward with default ignore + loss_with_ignore = loss_class(pred, target, reduction_override=reduction) + assert isinstance(loss_with_ignore, torch.Tensor) + + # Test loss forward with forward ignore + target[ignored_indices] = 255 + loss_with_forward_ignore = loss_class( + pred, target, ignore_index=255, reduction_override=reduction) + assert isinstance(loss_with_forward_ignore, torch.Tensor) + + # Verify correctness + + loss = loss_class(pred, target, reduction_override=reduction) + + assert torch.allclose(loss, loss_with_ignore) + assert torch.allclose(loss, loss_with_forward_ignore) + + # test ignore all target + pred = torch.rand((10, 5)) + target = torch.ones((10, ), dtype=torch.long) * 255 + loss = loss_class(pred, target, reduction_override=reduction) + assert loss == 0
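
For quick reference, below is a minimal usage sketch of the new `CrossEntropyLoss`, mirroring `test_loss_with_ignore_index` above. The tensor shapes, sample values and the ignore label `255` are illustrative only; in the PARE training setup the same loss is built from a config dict (e.g. `loss_segm_mask = dict(type='CrossEntropyLoss', loss_weight=60)` as used in the mesh estimator test).

```python
import torch

from mmhuman3d.models.losses import CrossEntropyLoss

# Softmax cross entropy; samples labelled 255 are excluded from the loss.
criterion = CrossEntropyLoss(ignore_index=255, loss_weight=1.0)

pred = torch.rand(10, 5)              # (N, C) class scores
target = torch.randint(0, 5, (10, ))  # (N, ) integer class labels
target[:2] = 255                      # mark two samples as ignored

loss = criterion(pred, target)        # 'mean' reduction by default
loss_sum = criterion(pred, target, reduction_override='sum')
print(loss, loss_sum)
```

As the test above also verifies, `ignore_index` can alternatively be passed per call through the forward's `ignore_index` argument, in which case it overrides the value given at construction time.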